爬虫工具代码
Posted dog-and-cat
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫工具代码相关的知识,希望对你有一定的参考价值。
#工具函数,整体测试不行
import requests
import time
‘‘‘
def get_html(url):
# 代理服务器
print("开始下载url : {}".format(url))
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
# 代理隧道验证信息
proxyUser = "H58G6G30137G865D"
proxyPass = "043F1F63DA9899C8"
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
"host": proxyHost,
"port": proxyPort,
"user": proxyUser,
"pass": proxyPass,
}
proxies = {
"http": proxyMeta,
"https": proxyMeta,
}
from fake_useragent import UserAgent
ua = UserAgent()
print(ua.random)
headers = {
"User-Agent": ua.random
}
resp = requests.get(url, proxies=proxies, headers=headers)
resp = resp.content.decode("utf-8")
return resp
‘‘‘
# Abuyun proxy IP; TODO: replace the hard-coded proxy with an IP-pool lookup.
def get_html_0(url, timeout=10):
    """Download *url* through a fixed HTTP proxy with a random User-Agent.

    Args:
        url: Address to fetch.
        timeout: Per-request timeout in seconds (new, defaults to 10 so a
            dead proxy cannot hang the caller forever).

    Returns:
        The raw ``requests.Response`` object; callers decode it themselves
        (see the commented-out ``.content.decode('utf-8')`` usage below).
    """
    print("开始下载url : {}".format(url))
    # BUG FIX: the original key was the literal string "http:" (colon inside
    # the quotes), so plain-HTTP requests silently bypassed the proxy.
    proxies = {
        "http": "http://117.95.199.208:9999",
        "https": "https://117.95.199.208:9999",
    }
    # Third-party dependency; imported lazily so the module still imports
    # when fake_useragent is absent and this helper is never called.
    from fake_useragent import UserAgent
    ua = UserAgent()
    print(ua.random)
    headers = {"User-Agent": ua.random}
    # Debug probe: echo the IP the proxy presents.  The curly quotes in the
    # original URL literal were a SyntaxError; fixed to plain ASCII quotes.
    r = requests.get('http://icanhazip.com/', proxies=proxies, timeout=timeout)
    print(r.text)
    resp = requests.get(url, proxies=proxies, headers=headers, timeout=timeout)
    return resp
#menu_text = get_html_0(url).content.decode(‘utf-8‘)
#time.sleep(random.randint(0,2))
# 用来try测试远程服务器的连接状况 (probe the remote server's connectivity).
# BUG FIX: this loop originally executed at module import time against an
# undefined ``url`` (immediate NameError), looped forever on failure, used a
# bare ``except:`` that swallowed even KeyboardInterrupt, and contained curly
# quotes (U+2018) that were a SyntaxError.  It is now a reusable function
# with a bounded retry count and a narrow exception type.
def wait_for_server(url, max_retries=10, delay=3):
    """Fetch *url*, retrying until a request succeeds or retries run out.

    Args:
        url: Address whose availability is being probed.
        max_retries: Maximum number of attempts before giving up.
        delay: Seconds to sleep after each failed attempt.

    Returns:
        The response body text of the first successful request.

    Raises:
        requests.RequestException: the last network error, when every
            attempt fails.
    """
    last_error = None
    for _ in range(max_retries):
        try:
            return requests.get(url).text
        except requests.RequestException as exc:  # network errors only
            last_error = exc
            time.sleep(delay)
            print('间隔休眠时间,再次处理')
    raise last_error
以上是关于爬虫工具代码的主要内容,如果未能解决你的问题,请参考以下文章