import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

/**
 * Small scraping demo: downloads one HTML page and prints the text found in
 * its {@code <h1>} element and in its {@code <div id="content">} element.
 */
public class Pachong {

    /** Browser-like User-Agent header value sent with the page request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36";

    /**
     * Fetches a fixed chapter URL, then prints the title and the cleaned-up
     * body text to standard output.
     *
     * <p>If an expected HTML marker is absent, a message is written to
     * standard error instead of failing with an index exception. I/O failures
     * are reported through {@link IOException#printStackTrace()}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://www.biqooge.com/3_3319/2111837.html";
        try {
            // html holds the raw page source that was fetched.
            String html = crawl(url);

            // The title is stored inside the <h1> tag, e.g. <h1> Chapter 19 ... </h1>,
            // so the text between the opening and closing tag is the title.
            String title = extractBetween(html, "<h1>", "</h1>");
            if (title == null) {
                System.err.println("Title markers <h1>...</h1> not found in " + url);
            } else {
                System.out.println(title);
            }

            // The body text is stored inside <div id="content"> ... </div>.
            String content = extractBetween(html, "<div id=\"content\">", "</div>");
            if (content == null) {
                System.err.println("Content markers <div id=\"content\">...</div> not found in " + url);
                return;
            }

            // The raw body still contains &nbsp; entities and <br /> tags, which are
            // not wanted in the output: drop the former, turn the latter into line
            // breaks, then collapse triple line breaks into a single one.
            content = content.replace("&nbsp;", "");
            content = content.replace("<br />", "\n");
            content = content.replace("\n\n\n", "\n");
            System.out.println(content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the resource at {@code url} and returns it as text.
     *
     * <p>The content is read line by line; every line in the result is
     * terminated by a single {@code "\n"}, including the last one.
     *
     * @param url address to open; any scheme supported by {@link URL} works
     * @return the downloaded text
     * @throws IOException if the URL is malformed, or the connection cannot be
     *                     opened or read
     */
    public static String crawl(String url) throws IOException {
        URLConnection connection = new URL(url).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);

        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the connection's stream) even
        // when readLine() fails part-way through.
        // NOTE(review): the bytes are decoded with the JVM default charset; if the
        // site serves another encoding the text will be garbled - confirm and pass
        // an explicit charset to InputStreamReader.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append("\n");
            }
        }
        return page.toString();
    }

    /**
     * Returns the text between the first occurrence of {@code open} and the
     * first occurrence of {@code close} that follows it.
     *
     * @param text  string to search
     * @param open  marker that precedes the wanted text
     * @param close marker that follows the wanted text
     * @return the enclosed text, or {@code null} if either marker is missing
     */
    private static String extractBetween(String text, String open, String close) {
        int openAt = text.indexOf(open);
        if (openAt < 0) {
            return null;
        }
        int from = openAt + open.length();
        int to = text.indexOf(close, from);
        if (to < 0) {
            return null;
        }
        return text.substring(from, to);
    }
}


/**
 * Manual check for {@code downImage}: passes a fixed sample image URL and the
 * file name {@code test.jpg} to it. An {@link IOException} is not rethrown;
 * its stack trace is printed instead.
 */
public static void testDownImage() {
    final String imageUrl = "https://www.biqooge.com/files/article/image/3/3319/3319s.jpg";
    final String targetName = "test.jpg";

    try {
        // Fetch the image and store it under the chosen file name.
        downImage(imageUrl, targetName);
    } catch (IOException downloadFailure) {
        downloadFailure.printStackTrace();
    }
}


    /**
     * Downloads the resource at {@code url} and writes its bytes, unchanged,
     * to the file called {@code name}.
     *
     * <p>The connection's input stream is opened before the output file, so a
     * failed request does not create the file.
     *
     * @param url  address of the resource to download
     * @param name path of the file to write
     * @throws IOException if the URL is malformed, the connection cannot be
     *                     opened or read, or the file cannot be written
     */
    public static void downImage(String url, String name) throws IOException {
        URLConnection connection = new URL(url).openConnection();
        connection.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36");

        // try-with-resources closes both streams even when the copy fails
        // part-way, so neither the connection nor the file handle leaks.
        try (InputStream inputStream = connection.getInputStream();
             FileOutputStream out = new FileOutputStream(new File(name))) {
            // Copy in chunks rather than one byte per read()/write() call.
            byte[] buffer = new byte[8192];
            int read;
            while ((read = inputStream.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }

// 0 comments
//
// No comments yet...