天涯帖子备份
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了天涯帖子备份相关的知识,希望对你有一定的参考价值。
"""Back up every page of a Tianya mobile forum thread as local HTML files.

The script asks for a thread id, reads the page count from the first page's
"last page" link, then downloads each page in a background thread, removes
scripts and several page-chrome elements, and writes the cleaned HTML into
the ``m/`` directory.
"""
import os
import threading
import time
import urllib.request as request

from bs4 import BeautifulSoup

# Page URLs are built as <url_s><thread id>-<page number>.shtml.
url_s = 'http://bbs.tianya.cn/m/post-develop-'
url_e = '-1.shtml'
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')

# Directory (relative to the working directory) that receives the saved pages.
OUTPUT_DIR = "m"
# New workers are only started while threading.active_count() is at or below
# this value (the count includes the main thread).
MAX_ACTIVE_THREADS = 10
# Seconds to wait on a single HTTP request so a stalled connection cannot
# block the final join forever.
REQUEST_TIMEOUT = 30


def _fetch_soup(url_site):
    """Download *url_site* with the configured User-Agent and parse it."""
    opener = request.build_opener()
    opener.addheaders = [headers]
    # The context manager closes the HTTP response once the body is read.
    with opener.open(url_site, timeout=REQUEST_TIMEOUT) as response:
        content = response.read()
    return BeautifulSoup(content, "html.parser")


def del_extract(dd, name=None):
    """Detach every element in *dd* from its parse tree.

    Args:
        dd: Iterable of elements exposing ``extract()`` (e.g. the result of
            ``find_all``). An empty iterable is a no-op.
        name: Unused; kept so existing callers that pass it keep working.
    """
    for element in dd:
        element.extract()


def cut_str(s):
    """Drop the first three characters of the first element's ``href``.

    Only ``s[0]`` is touched. Elements without an ``href`` (or that do not
    support item access) are left unchanged.
    NOTE(review): the three characters are presumably a path prefix such as
    ``/m/`` so that saved pages link to each other - confirm against the site.

    Args:
        s: Sequence of elements, typically the result of ``find_all``.
    """
    if not s:
        return
    first = s[0]
    try:
        href = first['href']
    except (KeyError, TypeError):
        # No href attribute, or not a tag-like object: nothing to rewrite.
        return
    if href:
        first['href'] = href[3:]


def get_total_page(url_site):
    """Return the page count advertised by the "last page" link.

    The count is the number after the final ``-`` in the link's ``href``,
    before the first ``.``.

    Args:
        url_site: URL of a page of the thread.

    Returns:
        The parsed page count, or 0 when the page has no
        ``u-btn last-btn`` element.

    Raises:
        IndexError: If the page has no ``post`` / ``j-post-content`` element.
        ValueError: If the link's ``href`` does not end in a number.
    """
    soup = _fetch_soup(url_site)
    post = soup.find_all(class_='post', id="j-post-content")[0]
    last_links = post.find_all(class_='u-btn last-btn')
    if not last_links:
        return 0
    href = str(last_links[0]['href'])
    return int(href.split(".")[0].split("-")[-1])


def dealwith_page(url_site):
    """Download one page, strip page chrome, and save it under ``m/``.

    The file name is the last path component of *url_site*.

    Args:
        url_site: URL of the page to back up.

    Raises:
        IndexError: If the page has no ``post`` / ``j-post-content`` element.
    """
    soup = _fetch_soup(url_site)

    # Remove scripts and page-level chrome from the whole document.
    del_extract(soup.find_all("script"))
    for css_class in ("ty-m-nav", "meta f-cf", "ft"):
        del_extract(soup.find_all(class_=css_class))

    post = soup.find_all(class_='post', id="j-post-content")[0]
    for css_class in ("u-like", "post-func-close", "u-like hot-list"):
        del_extract(post.find_all(class_=css_class))

    # Trim the href prefix on the pagination controls.
    for css_class in ('u-btn off first-btn', 'u-btn pre-btn', 'page-txt',
                      'u-btn last-btn', 'u-btn next-btn'):
        cut_str(post.find_all(class_=css_class))

    name = str(url_site).split('/')[-1]
    content = soup.prettify()
    # exist_ok avoids FileExistsError when several workers reach this line
    # at the same time.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(os.path.join(OUTPUT_DIR, name), 'w', encoding="utf-8") as fw:
        fw.write(content)


def main_fun():
    """Prompt for a thread id and back up all of its pages concurrently."""
    print("please input the id of tianyaer (eg.2165689):")
    url_t = input("> ").strip()
    total = get_total_page(url_s + url_t + url_e)
    # get_total_page reports 0 when the first page has no "last page" link;
    # that page was fetched successfully to get here, so still back it up.
    total = max(total, 1)

    workers = []
    for page_number in range(1, total + 1):
        url_page = url_s + url_t + "-" + str(page_number) + ".shtml"
        print(url_page)
        # Throttle: wait until the number of live threads drops.
        while threading.active_count() > MAX_ACTIVE_THREADS:
            time.sleep(1)
        worker = threading.Thread(
            target=dealwith_page, args=(url_page,), daemon=True
        )
        worker.start()
        workers.append(worker)

    # Daemon threads are killed at interpreter exit, so wait for every
    # download to finish before returning.
    for worker in workers:
        worker.join()


if __name__ == "__main__":
    main_fun()
以上是关于天涯帖子备份的主要内容,如果未能解决你的问题,请参考以下文章: