java 爬取微信公众号文章 - 搜狗微信搜索
Posted 馥钰
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了java 爬取微信公众号文章 - 搜狗微信搜索相关的知识,希望对你有一定的参考价值。
需求描述:参照图,1、2、3、4步骤!
https://weixin.sogou.com/weixin?type=1&s_from=input&query=36%E6%B0%AA&ie=utf8&_sug_=n&_sug_type_=
HttpTool工具类、Jsoup maven 依赖参见文章 https://blog.csdn.net/liyanlei5858/article/details/117450118
Jsoup 入门参考 https://www.cnblogs.com/zhangyinhua/p/8037599.html
1、获取SNUID ,目的是为了绕过搜狗的验证码页面
/**
 * Fetches a fresh SNUID cookie value from sogou.com; the SNUID is required
 * to get past Sogou's captcha/verification page on subsequent Weixin
 * searches. The SNUID is rate-limited (time + request-count), so refresh
 * it before each query batch.
 *
 * @return the SNUID cookie value, or null if the request failed or no
 *         SNUID cookie was set
 */
public String getSnuid() {
    String url = "https://www.sogou.com/web?query=333&_asf=www.sogou.com&_ast=1488955851&w=01019900&p=40040100&ie=utf8&from=index-nologin";
    int timeout = 30000;
    String snuid = null;
    CookieStore cookieStore = new BasicCookieStore();
    RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build();
    // try-with-resources: the original never closed the client (connection leak)
    try (CloseableHttpClient httpClient = HttpClients.custom()
            .setDefaultRequestConfig(globalConfig)
            .setDefaultCookieStore(cookieStore)
            .build()) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        // seed cookies Sogou expects on a "browser-like" request
        httpGet.setHeader("Cookie",
                "ABTEST=0|1488956269|v17;IPLOC=CN3301;SUID=E9DA81B7290B940A0000000058BFAB6D;phpSESSID=rfrcqafv5v74hbgpt98ah20vf3;SUIR=1488956269");
        // close the response so the pooled connection is released; we only
        // need the Set-Cookie side effect captured by cookieStore
        httpClient.execute(httpGet).close();
        for (Cookie c : cookieStore.getCookies()) {
            if ("SNUID".equals(c.getName())) {
                snuid = c.getValue();
            }
        }
    } catch (Exception e) {
        // keep the full stack trace instead of printStackTrace()
        logger.error("failed to fetch SNUID from sogou.com", e);
    }
    return snuid;
}
2、抓取公众号搜索数据
https://weixin.sogou.com/weixin?type=1&s_from=input&query=36%E6%B0%AA&ie=utf8&_sug_=n&_sug_type_=
/**
 * End-to-end check of steps 1-2: search Sogou Weixin for an official
 * account by name and print the jump link ("/link?url=...") of its latest
 * article.
 */
@Test
public void wechatLatestArticleLink() {
    // official-account name to search for
    String wechatName = "36氪";
    String url = "https://weixin.sogou.com/weixin?type=1&s_from=input&query={query}&ie=utf8&_sug_=n&_sug_type_=";
    Map<String, String> paramMap = new HashMap<>();
    paramMap.put("query", wechatName); // fills the {query} placeholder (URL-encoded by UriTemplate)
    URI uri = new UriTemplate(url).expand(paramMap);
    url = uri.toString();
    // step 1: fresh SNUID to get past Sogou's captcha check
    String snuid = getSnuid();
    // step 2: fetch the account-search result page with the SNUID cookie
    String webchatListResp = HttpTool.get(url, getSoGouHeaderMap(snuid));
    // parse the latest article's jump link, e.g.
    // "/link?url=...&type=1&query=...&token=..."
    String link = parseArticleLink(webchatListResp, wechatName);
    System.out.println(link);
}
/**
 * Builds the request headers Sogou expects: a browser-like User-Agent plus
 * a Cookie header carrying the SNUID obtained from {@code getSnuid()}.
 *
 * @param snuid current SNUID cookie value (rate-limited; refresh per query)
 * @return header name to header value map (exactly two entries)
 */
public static Map<String, String> getSoGouHeaderMap(String snuid) {
    // the original wrapped an empty LinkedHashMap in a HashMap — a no-op copy
    Map<String, String> map = new HashMap<>();
    map.put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/90.0.4430.212 Safari/537.36");
    map.put("Cookie", "SNUID=" + snuid + ";");
    return map;
}
/**
 * Extracts the latest-article jump link for the given official-account
 * name from a Sogou Weixin account-search result page.
 *
 * @param html       raw HTML of the Sogou account-search result page
 * @param wechatName exact account name to match against each hit's title
 * @return absolute jump link ("https://weixin.sogou.com/link?url=..."),
 *         or null when no matching entry is found
 */
public String parseArticleLink(String html, String wechatName) {
    String articleLink = null;
    try {
        Document doc = Jsoup.parse(html);
        // each <li> under ul.news-list2 is one account search hit;
        // a plain for-each replaces the original O(n^2) while/Elements.next() walk
        for (Element liElt : doc.select("ul.news-list2 li")) {
            Element wechatElt = liElt.selectFirst("p.tit");
            // pick the hit whose title equals the queried account name
            if (wechatElt != null && wechatName.equals(wechatElt.text())) {
                // the latest article's anchor carries a "uigs" attribute
                Element aElt = liElt.selectFirst("dl a[uigs]");
                if (aElt != null) {
                    articleLink = aElt.attr("href");
                    break;
                }
            }
        }
    } catch (Exception e) {
        // pass the exception itself so the stack trace is logged,
        // not just e.getMessage()
        logger.error("解析微信公众号最新文章跳转链接出错", e);
    }
    // Sogou returns a relative "/link?..." path; make it absolute
    if (articleLink != null && articleLink.startsWith("/link")) {
        articleLink = "https://weixin.sogou.com" + articleLink;
    }
    return articleLink;
}
3、根据解析出的link,获取微信文章链接,并爬取微信文章内容
/**
 * Steps 3-4: resolve a Sogou "/link?url=..." jump URL to the real
 * mp.weixin.qq.com article URL, then download the article HTML.
 */
@Test
public void downloadWechatArticle() {
    String url = "https://weixin.sogou.com/link?url=dn9a_-gY295K0Rci_xozVXfdMkSQTLW6cwJThYulHEtVjXrGTiVgSzeeEp_hLeB2zp7xyoWxqBjrJXD1_0k9nVqXa8Fplpd9zzyQJTTG4l0_9qb01isMvlZ64lZu95gQIMueGIVv5d1dqVpT_pMk4qISHGPazlJzIO6ZGefK-ffNEG4BwMubzDfho2FueAI7Qy-z8-8yTfqhcyncUMxqF1VyI0elEgkkrNr7tAJl89TTtvLRtCj8lfXLq_u3s55PQ7F9M3IWtLPS-e4Yz84xMA..&type=1&query=36%E6%B0%AA&token=6709931A4AEFECA0CACC0A62E51D7A8ACA9C78CD60B5FE30";
    String snuid = getSnuid();
    // the jump page does not redirect directly; it assembles the real
    // article URL in inline JavaScript, which getWechatArticleUrl() parses
    String linkResp = HttpTool.get(url, getSoGouHeaderMap(snuid));
    String articleUrl = getWechatArticleUrl(linkResp);
    System.out.println(articleUrl);
    // guard: getWechatArticleUrl returns null on parse failure — the
    // original passed null straight into HttpTool.get (NPE risk)
    if (articleUrl == null) {
        System.out.println("failed to parse article url from link response");
        return;
    }
    // fetch the actual WeChat article content
    String articleResp = HttpTool.get(articleUrl);
    System.out.println(articleResp);
}
/**
 * Extracts the real mp.weixin.qq.com article URL from the Sogou jump-page
 * response. The page assembles the URL in inline JavaScript as a series of
 * {@code url += '...'} fragments ending with {@code &new=1}; this method
 * cuts out that span and strips the JS concatenation glue between the
 * fragments.
 *
 * @param resp raw HTML/JS of the Sogou "/link?..." jump page
 * @return the reassembled article URL, or null when it cannot be found
 */
public String getWechatArticleUrl(String resp) {
    // stdlib blank check, equivalent to StringUtils.isBlank for null/empty/whitespace
    if (resp == null || resp.trim().isEmpty()) {
        return null;
    }
    final String endMarker = "&new=1";
    // the URL has been observed over http; also accept https in case Sogou switches
    int startIndex = resp.indexOf("http://mp.w");
    if (startIndex == -1) {
        startIndex = resp.indexOf("https://mp.w");
    }
    if (startIndex == -1) {
        return null;
    }
    // search for the end marker AFTER the start — the original searched from 0
    // and could make endIndex < startIndex, throwing StringIndexOutOfBoundsException
    int endIndex = resp.indexOf(endMarker, startIndex);
    if (endIndex == -1) {
        return null;
    }
    // endMarker.length() replaces the magic number 6
    String url = resp.substring(startIndex, endIndex + endMarker.length());
    // remove the JS concatenation glue between fragments: '; url += '
    url = url.replaceAll("'; url \\+\\= '", "");
    return url;
}
参考文章
https://blog.csdn.net/xionggegehao/article/details/102832092
以上是关于java 爬取微信公众号文章 - 搜狗微信搜索的主要内容,如果未能解决你的问题,请参考以下文章
23个Python爬虫开源项目代码:爬取微信淘宝豆瓣知乎微博等