Crawling Wang Yin's Blog and Generating PDFs
Posted by jiangwenwen1
This script is still rough and there is room for improvement.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = 'jiangwenwen'
import pdfkit
import time
import requests
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# Request headers
ua = UserAgent()
headers = {
    "cache-control": "no-cache",
    "Host": "www.yinwang.org",
    "User-Agent": ua.random,
    "Referer": "http://www.yinwang.org/",
}
# IP proxy pool
ip_pool = ['123.55.114.217:9999',
           '110.52.235.91:9999',
           '183.163.43.61:9999',
           '119.101.126.52:9999',
           '119.101.124.165:9999',
           '119.101.125.38:9999',
           '119.101.125.84:9999',
           '110.52.235.80:9999',
           '119.101.125.49:9999',
           '110.52.235.162:9999',
           '119.101.124.23:9999',
           ]
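Free proxies like these go stale quickly, so it can help to probe the whole pool once at startup and keep only the ones that still respond. The concurrent prefilter below is my own addition, not part of the original script; it is a minimal sketch that reuses the same 3-second timeout as get_proxy further down.

from concurrent.futures import ThreadPoolExecutor

# Probe every candidate in parallel and drop the dead ones.
def is_alive(ip):
    try:
        requests.get("http://www.yinwang.org/",
                     proxies={"http": "http://{}".format(ip)}, timeout=3)
        return True
    except requests.RequestException:
        return False

with ThreadPoolExecutor(max_workers=len(ip_pool)) as pool:
    ip_pool = [ip for ip, ok in zip(ip_pool, pool.map(is_alive, ip_pool)) if ok]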
# Render a single article to PDF
def print_pdf(url, file_name):
    start = time.time()
    print("Printing...")
    headers["User-Agent"] = ua.random
    print("User-Agent: {0}".format(headers["User-Agent"]))
    content = requests.get(url, headers=headers, timeout=3,
                           proxies=get_proxy(ip_pool)).text
    pdfkit.from_string(content, file_name)
    end = time.time()
    print("Printed successfully in %0.2f seconds" % (end - start))
# Return the first working proxy from the pool (None if none respond)
def get_proxy(ip_pool):
    for ip in ip_pool:
        url = "http://www.yinwang.org/"
        # Use requests to check whether this proxy is usable
        try:
            requests.get(url, proxies={"http": "http://{}".format(ip)}, timeout=3)
        except requests.RequestException:
            continue
        else:
            proxies = {
                "http": "http://{}".format(ip),
                "https": "http://{}".format(ip),
            }
            return proxies
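One caveat: if every proxy in the pool is dead, get_proxy returns None, and requests.get(..., proxies=None) silently falls back to a direct connection. If you would rather stop than crawl without a proxy, a small guard like this (my addition, not in the original) works:

proxies = get_proxy(ip_pool)
if proxies is None:
    # Fail loudly instead of silently crawling without a proxy.
    raise RuntimeError("No working proxy left in ip_pool")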
response = requests.get("http://www.yinwang.org/", headers=headers,
                        proxies=get_proxy(ip_pool))
soup = BeautifulSoup(response.content, 'html.parser')
tags = soup.find_all("li", class_="list-group-item title")
for child in tags:
    article_url = "http://www.yinwang.org" + child.a.get('href')
    # "桌面" is the Desktop folder; the backslash must be escaped
    # (the original "桌面\" swallowed the closing quote)
    article_file_name = "桌面\\" + child.a.string + ".pdf"
    print_pdf(article_url, article_file_name)
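The script imports random but never uses it. One natural use is a randomized pause between articles so the crawl does not hammer the site; the 1-3 second range below is an arbitrary choice:

for child in tags:
    article_url = "http://www.yinwang.org" + child.a.get('href')
    article_file_name = "桌面\\" + child.a.string + ".pdf"
    print_pdf(article_url, article_file_name)
    # Polite randomized delay between downloads.
    time.sleep(random.uniform(1, 3))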