How to Design a Python Crawler That Simulates Logging In to Zhihu
Editor's note: This article was compiled by the editors of Xiaochangshi (cha138.com). It covers how to design a Python crawler that simulates logging in to Zhihu, and is offered in the hope that it will be a useful reference.
Answer A:

```python
import requests
import time
import json
import os
import sys
import subprocess
from bs4 import BeautifulSoup as BS


class ZhiHuClient(object):
    """Utility class that maintains a logged-in Zhihu Session (2015-11-11).

    Usage:

        client = ZhiHuClient()

        # On first use, call login() once to generate the cookie file;
        # this step can be skipped afterwards.
        client.login("username", "password")

        # Use this session for all further requests (see the requests docs).
        session = client.getSession()
    """

    # The URL parameter is the account type.
    TYPE_PHONE_NUM = "phone_num"
    TYPE_EMAIL = "email"
    loginURL = r"http://www.zhihu.com/login/{0}"
    homeURL = r"http://www.zhihu.com"
    captchaURL = r"http://www.zhihu.com/captcha.gif"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "Upgrade-Insecure-Requests": "1",
    }

    captchaFile = os.path.join(sys.path[0], "captcha.gif")
    cookieFile = os.path.join(sys.path[0], "cookie")

    def __init__(self):
        os.chdir(sys.path[0])  # use the script's directory as the working directory
        self.__session = requests.Session()
        self.__session.headers = self.headers  # access the class attribute via self in case the class is renamed
        # If a cookie file already exists, log in with it directly.
        self.__cookie = self.__loadCookie()
        if self.__cookie:
            print("Cookie file found; logging in with cookies")
            self.__session.cookies.update(self.__cookie)
            soup = BS(self.open(r"http://www.zhihu.com/").text, "html.parser")
            print("Logged in as: %s" % soup.find("span", class_="name").getText())
        else:
            print("No cookie file found; please call login() once!")

    def login(self, username, password):
        """
        On a wrong captcha the server returns:
            {'errcode': 1991829, 'r': 1,
             'data': {'captcha': '请提交正确的验证码 :('},
             'msg': '请提交正确的验证码 :('}
        On success it returns:
            {'r': 0, 'msg': '登陆成功'}
        """
        self.__username = username
        self.__password = password
        self.__loginURL = self.loginURL.format(self.__getUsernameType())
        # Open any page to obtain the _xsrf token required for login.
        html = self.open(self.homeURL).text
        soup = BS(html, "html.parser")
        _xsrf = soup.find("input", {"name": "_xsrf"})["value"]
        while True:
            # Download the captcha image.
            captcha = self.open(self.captchaURL).content
            with open(self.captchaFile, "wb") as output:
                output.write(captcha)
            # Have a human read it.
            print("=" * 50)
            print("Captcha image opened; please read it!")
            subprocess.call(self.captchaFile, shell=True)
            captcha = input("Enter the captcha: ")
            os.remove(self.captchaFile)
            # Send the POST request.
            data = {
                "_xsrf": _xsrf,
                "password": self.__password,
                "remember_me": "true",
                self.__getUsernameType(): self.__username,
                "captcha": captcha,
            }
            res = self.__session.post(self.__loginURL, data=data)
            print("=" * 50)
            # print(res.text)  # debug output
            if res.json()["r"] == 0:
                print("Login succeeded")
                self.__saveCookie()
                break
            else:
                print("Login failed")
                print("Error message --->", res.json()["msg"])

    def __getUsernameType(self):
        """Determine the account type.
        Empirically, the site treats an all-digit name as phone_num and
        anything else as email.
        """
        if self.__username.isdigit():
            return self.TYPE_PHONE_NUM
        return self.TYPE_EMAIL

    def __saveCookie(self):
        """Serialize the cookies to a file, i.e. dump the dict as a string."""
        with open(self.cookieFile, "w") as output:
            cookies = self.__session.cookies.get_dict()
            json.dump(cookies, output)
            print("=" * 50)
            print("Cookie file written next to the script:", self.cookieFile)

    def __loadCookie(self):
        """Read the cookie file and return the deserialized dict, or None."""
        if os.path.exists(self.cookieFile):
            print("=" * 50)
            with open(self.cookieFile, "r") as f:
                return json.load(f)
        return None

    def open(self, url, delay=0, timeout=10):
        """Fetch a page and return the Response object."""
        if delay:
            time.sleep(delay)
        return self.__session.get(url, timeout=timeout)

    def getSession(self):
        return self.__session


if __name__ == '__main__':
    client = ZhiHuClient()

    # On first use, call login() once to generate the cookie file;
    # this step can be skipped afterwards.
    # client.login("username", "password")

    # Use this session for all further requests (see the requests docs).
    session = client.getSession()
```
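Once the cookie file exists, `getSession()` returns an ordinary `requests.Session`, so it is cheap to sanity-check the login state before crawling. Below is a minimal sketch of such a check; it assumes the same 2015-era Zhihu markup the constructor relies on (a `<span class="name">` element holding the logged-in user's name), and `is_logged_in` is a hypothetical helper, not part of the original class:

```python
import requests
from bs4 import BeautifulSoup


def is_logged_in(session):
    """Heuristic login check mirroring ZhiHuClient.__init__: fetch the
    homepage and look for the logged-in user's name. Assumes the 2015-era
    markup with a <span class="name"> element (hypothetical helper)."""
    resp = session.get("http://www.zhihu.com", timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")
    return soup.find("span", class_="name") is not None

# Usage sketch:
# client = ZhiHuClient()
# if not is_logged_in(client.getSession()):
#     client.login("username", "password")
```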
Python crawler: Zhihu Selenium-simulated login + fetching cookies for requests.Session()
The code is as follows:
```python
# coding:utf-8
from selenium import webdriver
import requests
import time
from lxml import etree


class Zhihu:
    def __init__(self, homeurl):
        self.homeurl = homeurl

    def GetCookies(self):
        # Log in through a real browser; find_element_by_css_selector is the
        # Selenium 3 API (Selenium 4 uses find_element(By.CSS_SELECTOR, ...)).
        browser = webdriver.Chrome()
        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("13060882373")
        browser.find_element_by_css_selector(".SignFlow-password input").send_keys("XXXXXX")
        browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
        time.sleep(3)
        # Optionally scroll to the bottom to trigger lazy loading:
        # js = "window.scrollTo(0, document.body.scrollHeight);"
        # browser.execute_script(js)
        # time.sleep(3)
        cookies = browser.get_cookies()
        browser.quit()
        return cookies

    def Crawl(self):
        # Hand the browser's cookies to a plain requests.Session for fast crawling.
        s = requests.Session()
        s.headers.clear()
        for cookie in self.GetCookies():
            s.cookies.set(cookie['name'], cookie['value'])
        html = s.get(self.homeurl).text
        html_tree = etree.HTML(html)
        items = html_tree.xpath('//*[@id="root"]/div/main/div/div/div[1]/div[2]/div//div[@class="ContentItem AnswerItem"]/@data-zop')
        for item in items:
            content = eval(item)  # data-zop holds a JSON-like dict literal
            authorName = content['authorName']
            title = content['title']
            print(authorName + " answered: " + title)


zhihu = Zhihu('https://www.zhihu.com/')
zhihu.Crawl()
```
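One fragile spot in `Crawl()` is the `eval(item)` call: `data-zop` is a JSON attribute, and `eval` accepts it only while it contains nothing but strings and numbers; a JSON `true`, `false`, or `null` would raise a `NameError`. Below is a minimal sketch of the safer `json.loads` variant, keeping the `authorName` and `title` keys the code above already reads (the `parse_zop` helper name is an assumption):

```python
import json


def parse_zop(attr):
    """Parse Zhihu's data-zop attribute, which holds a JSON object.
    json.loads also copes with JSON-only literals (true/false/null)
    that would make eval() fail."""
    content = json.loads(attr)
    return content["authorName"], content["title"]

# Usage inside Crawl(), replacing the eval() call:
# for item in items:
#     authorName, title = parse_zop(item)
#     print(authorName + " answered: " + title)
```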
The above is the main content on how to design a Python crawler that simulates logging in to Zhihu. If it did not solve your problem, please refer to the related articles that follow.