- Java
Java爬虫
- 2024-6-14 15:23:19 @
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
/**
 * Minimal crawler example: downloads one chapter page and prints the chapter
 * title (the text inside {@code <h1>}) followed by the chapter body (the text
 * inside {@code <div id="content">}).
 */
public class Pachong {

    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36";

    /**
     * Fetches the hard-coded chapter URL and prints its title and cleaned body
     * to standard output. A missing title or body is reported on standard
     * error instead of failing with an index exception.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://www.biqooge.com/3_3319/2111837.html";
        try {
            // html is the raw page source.
            String html = crawl(url);

            // The title is stored in the h1 tag, e.g. <h1> ... </h1>.
            String title = extractBetween(html, "<h1>", "</h1>");
            if (title == null) {
                System.err.println("No <h1> title found in " + url);
            } else {
                System.out.println(title);
            }

            // The body is stored in <div id="content"> ... </div>.
            String content = extractBetween(html, "<div id=\"content\">", "</div>");
            if (content == null) {
                System.err.println("No <div id=\"content\"> body found in " + url);
            } else {
                System.out.println(cleanContent(content));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the resource at {@code url} and returns it as text, with every
     * line terminated by {@code '\n'}.
     *
     * <p>The text is decoded with the charset named in the response's
     * Content-Type header when one is present and supported; otherwise the
     * platform default charset is used.
     *
     * @param url address of the page to download
     * @return the page source
     * @throws IOException if the connection cannot be opened or read
     */
    public static String crawl(String url) throws IOException {
        URLConnection connection = new URL(url).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream)
        // even when reading fails part-way through.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                connection.getInputStream(), charsetOf(connection.getContentType())))) {
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append('\n');
            }
        }
        return page.toString();
    }

    /**
     * Picks the charset named by a Content-Type header value such as
     * {@code "text/html; charset=GBK"}.
     *
     * @param contentType header value, may be {@code null}
     * @return the declared charset, or the platform default when the header is
     *         absent, declares no charset, or names one this JVM cannot use
     */
    static Charset charsetOf(String contentType) {
        if (contentType != null) {
            String key = "charset=";
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (param.regionMatches(true, 0, key, 0, key.length())) {
                    String name = param.substring(key.length()).replace("\"", "").trim();
                    try {
                        return Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Malformed or unsupported charset name: use the default below.
                    }
                }
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Returns the text between the first occurrence of {@code open} and the
     * first occurrence of {@code close} that follows it.
     *
     * @param html  text to search
     * @param open  opening marker
     * @param close closing marker
     * @return the enclosed text, or {@code null} when either marker is missing
     */
    static String extractBetween(String html, String open, String close) {
        int openAt = html.indexOf(open);
        if (openAt < 0) {
            return null;
        }
        int from = openAt + open.length();
        int to = html.indexOf(close, from);
        if (to < 0) {
            return null;
        }
        return html.substring(from, to);
    }

    /**
     * Turns the raw body markup into plain text: removes non-breaking-space
     * padding, converts {@code <br />} to a newline and collapses each run of
     * three newlines into one.
     *
     * @param raw markup taken from inside the content div
     * @return the cleaned text
     */
    static String cleanContent(String raw) {
        // NOTE(review): the original stripped a single blank character here;
        // it looks like an "&nbsp;" entity that was lost when the snippet was
        // published - confirm. Stripping ordinary spaces would also turn
        // "<br />" into "<br/>" and defeat the next replacement, so only the
        // entity and the literal U+00A0 character are removed.
        return raw.replace("&nbsp;", "")
                .replace("\u00A0", "")
                .replace("<br />", "\n")
                .replace("\n\n\n", "\n");
    }
}
/**
 * Manual check for {@code downImage}: downloads a fixed cover image and
 * stores it as {@code test.jpg}. An I/O failure is reported by printing the
 * stack trace.
 */
public static void testDownImage() {
    final String imageUrl = "https://www.biqooge.com/files/article/image/3/3319/3319s.jpg";
    final String targetFile = "test.jpg";
    try {
        // Fetch the image and write it to the target file.
        downImage(imageUrl, targetFile);
    } catch (IOException failure) {
        failure.printStackTrace();
    }
}
/**
 * Downloads the resource at {@code url} and writes its bytes to the file
 * named {@code name}, replacing any existing content.
 *
 * @param url  address of the resource to download
 * @param name path of the file to write
 * @throws IOException if the download or the file write fails
 */
public static void downImage(String url, String name) throws IOException {
    URLConnection connection = new URL(url).openConnection();
    connection.setRequestProperty("User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36");
    // try-with-resources closes both streams even when the copy fails midway.
    try (InputStream inputStream = connection.getInputStream();
            FileOutputStream out = new FileOutputStream(new File(name))) {
        // Copy in blocks instead of issuing one read and one write per byte.
        byte[] buffer = new byte[8192];
        int read;
        while ((read = inputStream.read(buffer)) != -1) {
            out.write(buffer, 0, read);
        }
    }
}
0 条评论
目前还没有评论...