Java使用Jsoup获得新闻联播所有文字稿
Posted <・)))><<
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Java使用Jsoup获得新闻联播所有文字稿相关的知识,希望对你有一定的参考价值。
Jsoup的maven坐标:
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
Java代码:
package com.zifeiy.test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class XinwenGetter {
private static List<String> urlList = new ArrayList<String>();
private static OutputStreamWriter out;
private static void getUrlList() throws IOException {
for (int i = 1; i <= 44; i ++) {
String url = null;
if (i == 0) {
url = "http://www.xwlbo.com/txt.html";
} else {
url = "http://www.xwlbo.com/txt_" + i + ".html";
}
Document doc = Jsoup.connect(url).get();
Elements xwlistElements = doc.getElementsByClass("xwlist");
Elements aElements = xwlistElements.get(0).select("a");
for (Element element : aElements) {
String resUrl = element.attr("href");
urlList.add(resUrl);
}
}
}
private static void solve(String url) throws IOException {
Document doc = Jsoup.connect(url).get();
System.out.println("handling " + doc.title() + " ...");
out.write("<h3>" + doc.title() + "</h3>
");
Elements textElements = doc.getElementsByClass("text_content");
Elements pElements = textElements.get(0).select("p");
for (Element pElement : pElements) {
// System.out.println(pElement);
out.write(pElement.toString() + "
");
}
out.write("<hr>
");
}
public static void main(String[] args) throws IOException {
getUrlList();
File file = new File("D:/新闻联播大全.html");
if (file.exists() == true) file.delete();
out = new OutputStreamWriter(new FileOutputStream(file, true), "UTF-8");
for (String url: urlList) {
solve(url);
}
out.close();
}
}
以上是关于Java使用Jsoup获得新闻联播所有文字稿的主要内容,如果未能解决你的问题,请参考以下文章:
java 利用jsoup 如何去除一段代码中的所有html标签,只留纯文本