天涯帖子备份

Posted

tags:

篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了天涯帖子备份相关的知识，希望对你有一定的参考价值。

from bs4 import BeautifulSoup
import urllib.request as request
import os
import time
import threading

# URL template for the mobile Tianya forum: full page URL is
# url_s + <author/thread id> + "-" + <page number> + ".shtml";
# url_e is the suffix for page 1, used to fetch the page count.
url_s = "http://bbs.tianya.cn/m/post-develop-"
url_e = "-1.shtml"

# Header pair passed to OpenerDirector.addheaders; a desktop UA avoids
# the site serving a degraded/blocked response to the default urllib UA.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11")


def del_extract(dd, name=None):
    """Remove every tag in *dd* from its parse tree via Tag.extract().

    dd:   an iterable of bs4 tags (typically a find_all() result).
    name: unused; kept only for backward compatibility with existing callers.
    """
    # Iterating an empty result set is a no-op, so no length guard is needed.
    for ds in dd:
        ds.extract()

def cut_str(s):
    """Rewrite the href of the first tag in *s* from a site path to a
    local relative path by dropping its first 3 characters (the "/m/"
    prefix), so saved pages link to each other on disk.

    s: a find_all() result; may be empty, in which case nothing happens.
    """
    if len(s) > 0:
        try:
            href = s[0]["href"]  # string key, not a bare name
            if len(href) > 0:
                s[0]["href"] = href[3:]
        except (KeyError, TypeError):
            # Tag has no href attribute (or an unexpected value) — best
            # effort only; leave the tag unchanged.
            pass
    
def get_total_page(url_site):
    """Fetch the first page of a thread and return its total page count.

    Parses the "last page" button inside the post container; its href
    ends in "...-<total>.shtml", so the count is the last dash-separated
    field before the extension. Returns 0 when the button is absent
    (single-page thread).
    """
    opener = request.build_opener()
    opener.addheaders = [headers]
    content = opener.open(url_site).read()
    soup = BeautifulSoup(content, "html.parser")
    d = soup.find_all(class_="post", id="j-post-content")[0]

    data = d.find_all(class_="u-btn last-btn")
    total = 0
    if len(data) > 0:
        href = data[0]["href"]
        # e.g. ".../post-develop-2165689-37.shtml" -> 37
        total = int(str(href).split(".")[0].split("-")[-1])
    return total
    
    
    
def dealwith_page(url_site):
    """Download one thread page, strip site chrome, fix navigation
    links to be local-relative, and save the result under m/<page name>.

    url_site: full page URL; the trailing path segment becomes the file
    name, so saved pages cross-link correctly after cut_str() trims the
    "/m/" prefix from the nav buttons.
    """
    opener = request.build_opener()
    opener.addheaders = [headers]
    content = opener.open(url_site).read()

    soup = BeautifulSoup(content, "html.parser")

    # Drop scripts and page chrome that is useless in an offline copy.
    del_extract(soup.find_all("script"))
    del_extract(soup.find_all(class_="ty-m-nav"))
    del_extract(soup.find_all(class_="meta f-cf"))
    del_extract(soup.find_all(class_="ft"))
    d = soup.find_all(class_="post", id="j-post-content")[0]

    del_extract(d.find_all(class_="u-like"))
    del_extract(d.find_all(class_="post-func-close"))
    del_extract(d.find_all(class_="u-like hot-list"))

    # Rewrite every pager button so prev/next links work from disk.
    cut_str(d.find_all(class_="u-btn off first-btn"))
    cut_str(d.find_all(class_="u-btn pre-btn"))
    cut_str(d.find_all(class_="page-txt"))
    cut_str(d.find_all(class_="u-btn last-btn"))
    cut_str(d.find_all(class_="u-btn next-btn"))

    name = str(url_site).split("/")[-1]
    content = soup.prettify()
    if not os.path.exists("m/"):
        os.makedirs("m")
    with open("m/" + name, "w", encoding="utf-8") as fw:
        fw.write(content)
    


def main_fun():
    """Prompt for a thread id, then download every page of the thread,
    using up to 10 concurrent worker threads."""
    print("please input the id of tianyaer (eg.2165689):")
    url_t = input("> ")
    url_page = url_s + url_t + url_e
    total = get_total_page(url_page)
    for n in range(total):
        url_page = url_s + str(url_t) + "-" + str(n + 1) + ".shtml"
        print(url_page)
        # Throttle: keep at most ~10 downloader threads alive at once.
        while threading.active_count() > 10:
            time.sleep(1)

        t1 = threading.Thread(target=dealwith_page, args=(url_page,))
        # setDaemon() is deprecated (removed in Python 3.13); set the
        # attribute directly so workers don't block interpreter exit.
        t1.daemon = True
        t1.start()
        
main_fun()   

 

以上是关于天涯帖子备份的主要内容，如果未能解决你的问题，请参考以下文章：

调用模板化成员函数:帮助我理解另一个 *** 帖子中的代码片段

Xcode 4.6 的备份代码片段

关于比特币的帖子

PHP Wordpress查询从帖子/页面中提取片段

GraphQL + Relay 现代片段给了我一系列空帖子

在完成帖子执行之前切换到另一个片段时,应用程序崩溃,并显示此错误