How to Design a Python Crawler That Simulates Logging In to Zhihu


Editor's note: this article was compiled by the editors at 小常识网 (cha138.com). It introduces how to design a Python crawler that simulates logging in to Zhihu; we hope you find it a useful reference.

Reference technique A: a requests-based client that maintains a logged-in session, with the captcha read by eye and the cookies cached to disk.

import requests
import time
import json
import os
import re
import sys
import subprocess
from bs4 import BeautifulSoup as BS


class ZhiHuClient(object):
    """Utility class that maintains a logged-in Zhihu Session (2015-11-11).

    Usage:
        client = ZhiHuClient()

        # Call login() once on first use to generate the cookie file;
        # later runs can skip this step.
        client.login("username", "password")

        # Use this session for any further requests; see the requests docs.
        session = client.getSession()
    """

    # URL parameter selecting the account type
    TYPE_PHONE_NUM = "phone_num"
    TYPE_EMAIL = "email"

    loginURL = r"http://www.zhihu.com/login/{0}"
    homeURL = r"http://www.zhihu.com"
    captchaURL = r"http://www.zhihu.com/captcha.gif"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
    }

    captchaFile = os.path.join(sys.path[0], "captcha.gif")
    cookieFile = os.path.join(sys.path[0], "cookie")

    def __init__(self):
        os.chdir(sys.path[0])  # use the script's directory as the working directory
        self.__session = requests.Session()
        self.__session.headers = self.headers  # accessed via self so the class can be renamed safely

        # If a cookie file already exists, log in with it directly
        self.__cookie = self.__loadCookie()
        if self.__cookie:
            print("Cookie file found, logging in with cookies")
            self.__session.cookies.update(self.__cookie)
            soup = BS(self.open(r"http://www.zhihu.com/").text, "html.parser")
            print("Logged in as: %s" % soup.find("span", class_="name").getText())
        else:
            print("No cookie file found; please call login() once!")

    def login(self, username, password):
        """
        Returned when the captcha is wrong:
            {'errcode': 1991829, 'r': 1,
             'data': {'captcha': '请提交正确的验证码 :('},
             'msg': '请提交正确的验证码 :('}
        Returned on success:
            {'r': 0, 'msg': '登陆成功'}
        """
        self.__username = username
        self.__password = password
        self.__loginURL = self.loginURL.format(self.__getUsernameType())

        # Open any page to obtain the _xsrf token required for login
        html = self.open(self.homeURL).text
        soup = BS(html, "html.parser")
        _xsrf = soup.find("input", {"name": "_xsrf"})["value"]

        # Download the captcha image
        while True:
            captcha = self.open(self.captchaURL).content
            with open(self.captchaFile, "wb") as output:
                output.write(captcha)

            # Recognize it by eye
            print("=" * 50)
            print("Captcha image opened, please read it!")
            subprocess.call(self.captchaFile, shell=True)
            captcha = input("Enter the captcha: ")
            os.remove(self.captchaFile)

            # Send the POST request
            data = {
                "_xsrf": _xsrf,
                "password": self.__password,
                "remember_me": "true",
                self.__getUsernameType(): self.__username,
                "captcha": captcha,
            }
            res = self.__session.post(self.__loginURL, data=data)
            print("=" * 50)
            # print(res.text)  # debug output
            if res.json()["r"] == 0:
                print("Login succeeded")
                self.__saveCookie()
                break
            else:
                print("Login failed")
                print("Error message --->", res.json()["msg"])

    def __getUsernameType(self):
        """Determine the username type.

        Testing shows the site treats pure digits as phone_num
        and anything else as email.
        """
        if self.__username.isdigit():
            return self.TYPE_PHONE_NUM
        return self.TYPE_EMAIL

    def __saveCookie(self):
        """Serialize the cookies to a file, i.e. dump the dict as a string."""
        with open(self.cookieFile, "w") as output:
            cookies = self.__session.cookies.get_dict()
            json.dump(cookies, output)
            print("=" * 50)
            print("Cookie file written to the script directory:", self.cookieFile)

    def __loadCookie(self):
        """Read the cookie file and return the deserialized dict, or None if absent."""
        if os.path.exists(self.cookieFile):
            print("=" * 50)
            with open(self.cookieFile, "r") as f:
                cookie = json.load(f)
                return cookie
        return None

    def open(self, url, delay=0, timeout=10):
        """Open a page and return the Response object."""
        if delay:
            time.sleep(delay)
        return self.__session.get(url, timeout=timeout)

    def getSession(self):
        return self.__session


if __name__ == '__main__':
    client = ZhiHuClient()

    # Call login() once on first use to generate the cookie file;
    # later runs can skip this step.
    # client.login("username", "password")

    # Use this session for any further requests; see the requests docs.
    session = client.getSession()
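Once the cookie file exists, the authenticated session can be reused for ordinary scraping. Here is a minimal sketch of that follow-up step; the question_link class name is an assumption about Zhihu's markup of that era, not something from the original answer:

from bs4 import BeautifulSoup as BS

client = ZhiHuClient()
session = client.getSession()

# Fetch the logged-in home page and list the question titles on the feed.
# "question_link" is a guessed CSS class; inspect the live page to confirm.
html = session.get("http://www.zhihu.com", timeout=10).text
soup = BS(html, "html.parser")
for link in soup.find_all("a", class_="question_link"):
    print(link.get_text(strip=True))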

Python Crawler: Zhihu, Selenium-simulated login + requests.Session() with the harvested cookies

The code is as follows:

# coding:utf-8
from selenium import webdriver
import requests
import sys
import time
from lxml import etree
# reload(sys)
# sys.setdefaultencoding('utf-8')


class Zhihu:
    def __init__(self, homeurl):
        self.homeurl = homeurl

    def GetCookies(self):
        # Log in through a real browser, then harvest its cookies.
        browser = webdriver.Chrome()
        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("13060882373")
        browser.find_element_by_css_selector(".SignFlow-password input").send_keys("XXXXXX")
        browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
        time.sleep(3)  # wait for the login redirect to finish
        # js = "window.scrollTo(0, document.body.scrollHeight);"
        # browser.execute_script(js)
        # time.sleep(3)
        cookies = browser.get_cookies()
        browser.quit()
        return cookies

    def Crawl(self):
        # Copy the browser cookies into a plain requests session.
        s = requests.Session()
        s.headers.clear()
        for cookie in self.GetCookies():
            s.cookies.set(cookie["name"], cookie["value"])
        html = s.get(self.homeurl).text
        html_tree = etree.HTML(html)
        items = html_tree.xpath('//*[@id="root"]/div/main/div/div/div[1]/div[2]/div//div[@class="ContentItem AnswerItem"]/@data-zop')
        for item in items:
            # data-zop holds a JSON string; json.loads(item) would be safer than eval
            content = eval(item)
            authorName = content["authorName"]
            title = content["title"]
            print(authorName + " answered: " + title)


zhihu = Zhihu("https://www.zhihu.com/")
zhihu.Crawl()
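Driving a real browser on every run is slow. A natural refinement, not part of the original post, is to cache the Selenium cookies on disk the same way the first approach caches its cookie file. A sketch under that assumption (the file name and helper are hypothetical):

import json
import os

COOKIE_FILE = "zhihu_cookies.json"  # hypothetical cache path

def load_or_fetch_cookies(zhihu):
    # Reuse cached cookies when present; otherwise drive the
    # browser once and cache the result for later runs.
    if os.path.exists(COOKIE_FILE):
        with open(COOKIE_FILE) as f:
            return json.load(f)
    cookies = zhihu.GetCookies()
    with open(COOKIE_FILE, "w") as f:
        json.dump(cookies, f)
    return cookies

Crawl() would then iterate over load_or_fetch_cookies(self) instead of calling self.GetCookies() directly, falling back to the browser only when the cache is missing or stale.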

 

The above covers the main content of how to design a Python crawler that simulates logging in to Zhihu. If it did not solve your problem, the following articles may help:

How to design a Python crawler that simulates logging in to Zhihu?

Python Crawler: Zhihu, Selenium-simulated login + requests.Session() with the harvested cookies

HttpClient simulated login to Zhihu

A universal Python method for simulated login: Weibo | Zhihu

Crawling Zhihu with Python, and my take on crawlers vs. anti-crawling

Scrapy simulated login to Zhihu: scraping trending topics