Java 中利用 jsoup 获取 CSDN 网页数据
Posted Acmen-zym
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了java中利用jsoup获取csdn网页数据相关的知识,希望对你有一定的参考价值。
Maven依赖
<!-- jsoup: the library the Java code below uses (Jsoup.connect / select) to fetch and query pages. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
Gradle依赖
// The `compile` configuration was deprecated and then removed in Gradle 7;
// `implementation` is its replacement for ordinary compile + runtime dependencies.
implementation group: 'org.jsoup', name: 'jsoup', version: '1.13.1'
// Entry snippet: run the crawl and report failures on stderr.
try {
    visitBlog();
} catch (IOException e) {
    // A listing page could not be fetched.
    e.printStackTrace();
} catch (InterruptedException e) {
    // Restore the interrupt flag so code further up the stack can still observe
    // that this thread was asked to stop.
    Thread.currentThread().interrupt();
    e.printStackTrace();
}
private void visitBlog() throws IOException, InterruptedException
Map<String, String> urlTitleMap = new LinkedHashMap<>();//key是Url value是标题
// ----------------------------------------------遍历每一页 获取文章链接----------------------------------------------
for (int i = 1; i < Integer.MAX_VALUE; i++)
String url = String.format(pageStartUrl, i);
System.out.println("开始获取博客的第 " + i + " 页的文章 url:" + url);
Document document = Jsoup.connect(url).timeout(10000)
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/90.0.4430.212 Safari/537.36")
.get();
Elements articleMeListEs = document.select("#articleMeList-blog");
String articleListText = articleMeListEs.text();
//包含这两个则认定为没有文章了
if (articleListText.contains("空空如也"))
System.out.println("博客的第 " + i + " 页没有找到文章跳出抓取文章链接处理");
break;
Elements articleEs = articleMeListEs.select("div.article-list").select("div.article-item-box").select("a");
articleEs.forEach(e ->
String href = e.attr("href");
//用于移除标题开头使用,例如:原创 搭一个Gradle多模块管理的,typeText=原创
String typeText = e.select(".article-type").text() + " ";
String aText = e.text();
//得到一个有效的标题以上面为例标题为:搭一个Gradle多模块管理的,typeText=原创
String title = aText.replace(typeText, "");
urlTitleMap.put(href, title);
);
System.out.println("获取博客的第 " + i + " 页的文章完毕,获取当前数量:" + urlTitleMap.size());
// ---------------------------------------------------多线程访问每个链接---------------------------------------------------
ExecutorService executor = Executors.newCachedThreadPool();
int threadCount = 4; // 根据当前电脑核数创建 并发线程数量
for (int i = 0; i < threadCount; i++)
executor.execute(new PreviewThread(urlTitleMap));
class PreviewThread implements Runnable
private Map<String, String> urlTitleMap;
public PreviewThread(Map<String, String> urlTitleMap)
this.urlTitleMap = urlTitleMap;
@Override
public void run()
while (true)
for (Map.Entry<String, String> entry : urlTitleMap.entrySet())
String url = entry.getKey();
try
Document articleDoc = Jsoup.connect(url).timeout(10000).referrer("https://blog.csdn.net/qq_42623400/article/list/")
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36").get();
String articleTitle = articleDoc.select("#articleContentId").text();
if (entry.getValue().equals(articleTitle))
System.out.println(Thread.currentThread().getName() + "成功访问文章 url:" + url + " title:" + articleTitle);
else
System.out.println(Thread.currentThread().getName() + "访问文章失败 url:" + url);
int sleep = new Random().nextInt(600) + 300;
Thread.sleep(sleep);
catch (IOException e)
e.printStackTrace();
catch (InterruptedException e)
e.printStackTrace();
以上是关于java中利用jsoup获取csdn网页数据的主要内容,如果未能解决你的问题,请参考以下文章