How to scrape a website that requires login first with Python
Posted: 2013-12-01 02:56:27
【Question】: First of all, I think it is worth mentioning that I know there are a bunch of similar questions, but none of them works for me...
I am new to Python, HTML and web scraping. I am trying to scrape user information from a website that requires logging in first. In my tests I use scraping my own email settings from GitHub as an example. The login page is 'https://github.com/login' and the target page is 'https://github.com/settings/emails'.
Here is the list of methods I have tried:
##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling its session
br.open('https://github.com/login')
for f in br.forms():
    print f
br.select_form(nr=0)
# User credentials
br.form['login'] = 'myusername'
br.form['password'] = 'mypwd'
# Login
br.submit()
br.open('https://github.com/settings/emails').read()
################ Method 2
import urllib, urllib2, cookielib
username = 'myusername'
password = 'mypwd'
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
login_data = urllib.urlencode({'username' : username, 'j_password' : password})
opener.open('https://github.com/login', login_data)
resp = opener.open('https://github.com/settings/emails')
print resp.read()
############# Method 3
import urllib
opener = urllib.FancyURLopener()
print opener.open('http://myusername:mypwd@github.com/settings/emails').read()
########## Method 4
import mechanize
import cookielib
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
br.addheaders = [('User-agent', 'Chrome')]
br.add_password('https://github.com/settings/emails', 'myusername', 'mypwd')
br.open('https://github.com/settings/emails')
print br.response().read()
############ Methods 5
from requests import session
payload = {
    'action': 'login',
    'username': 'myusername',
    'password': 'mypwd'
}
with session() as c:
    c.post('https://github.com/login', data=payload)
    request = c.get('https://github.com/settings/emails')
    print request.headers
    print request.text
########### Method 6
import requests
from requests.packages.urllib3 import add_stderr_logger
import sys
from bs4 import BeautifulSoup as bs
add_stderr_logger()
s = requests.Session()
s.headers['User-Agent'] = 'Chrome'
username = 'myusername'
password = 'mypwd'
url = 'https://github.com/login'
# after examining the HTML of the website you're trying to log into
# set name_form to the name of the form element that contains the name and
# set password_form to the name of the form element that will contain the password
login = {'login': username, 'password': password}
login_response = s.post(url, data=login)
for r in login_response.history:
    if r.status_code == 401:  # 401 means authentication failed
        print 'error!'
        sys.exit(1)  # abort
pdf_response = s.get('https://github.com/settings/emails') # Your cookies and headers are automatically included
soup = bs(pdf_response.content)
I have also read some discussions about the differences between HTTP authentication and cookies. They still did not work.
Any help would be appreciated. Thanks a lot.
【Answer 1】: The classic way to solve this problem is:

1. Launch a browser, go to the site and find the login page.
2. Inspect the page source to find out:
   a. which form is the login form (a page can have many forms, but usually one of them is the login form);
   b. which field names are used for the username and the password (these can vary a lot);
   c. whether there are other fields that must also be submitted (such as an authentication token).
3. Write a Scrapy spider that replicates the form submission using FormRequest (a minimal hand-written sketch follows this list).
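Done by hand, step 3 is usually only a few lines. The following is a sketch, not part of the original answer: it uses Scrapy's FormRequest.from_response, which also copies hidden inputs (such as a CSRF/authenticity token) into the request. The field names 'login' and 'password' and the use of formnumber=1 are assumptions that should be verified against the actual page source.

# A minimal hand-written login spider (sketch). from_response picks a form on the
# page, pre-fills its hidden inputs, and overrides the fields given in formdata.
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest

class ManualLoginSpider(BaseSpider):
    name = 'manual-login'
    start_urls = ['https://github.com/login']

    def parse(self, response):
        return FormRequest.from_response(
            response,
            formnumber=1,  # assumed: form 0 may be a search box; check the page source
            formdata={'login': 'myusername', 'password': 'mypwd'},  # assumed field names
            callback=self.after_login)

    def after_login(self, response):
        # the session cookies are set now; request logged-in pages from here
        self.log('landed on %s after login' % response.url)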
Being fans of automation, we figured we could write some code to automate point 2 (which is actually the most time-consuming part), and the result is loginform, a library that automatically fills in a login form given the login page, username and password. Below is the code of a simple spider that uses loginform to log in to the site automatically.
githubloginspider.py
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy.http.request import Request
from loginform import fill_login_form
from scrapy import log
from scraping.articles import ArticleItem
class GitHubLogin(BaseSpider):

    name = 'GitHubLogin'
    allowed_domains = ['github.com']
    start_urls = ['http://github.com/login']
    login_user = 'ranvijay5686'
    login_pass = ''

    def parse(self, response):
        (args, url, method) = fill_login_form(response.url,
                response.body, self.login_user, self.login_pass)
        return FormRequest(url, method=method, formdata=args,
                           callback=self.after_login)

    def after_login(self, response):
        # for link in response.xpath("//*[@id='site-container']/div[2]/div[4]/p/a/@href").extract():
        item = ArticleItem()
        item['title'] = 'ranvijay'
        log.msg('*************** : '
                + str(response.xpath("//form[@class='subnav-search left']/input/@value"
                      ).extract()))
        item['url'] = \
            response.xpath("//*[@id='site-container']/div[1]/div/div/span/span/text()"
                           ).extract()
        yield item
items.py
from scrapy.item import Item, Field
class ArticleItem(Item):
    title = Field()
    url = Field()
loginform.py
import sys
from argparse import ArgumentParser
from collections import defaultdict
from lxml import html
__version__ = '1.0' # also update setup.py
def _form_score(form):
    score = 0
    # In case of user/pass or user/pass/remember-me
    if len(form.inputs.keys()) in (2, 3):
        score += 10

    typecount = defaultdict(int)
    for x in form.inputs:
        type_ = (x.type if isinstance(x, html.InputElement) else 'other')
        typecount[type_] += 1

    if typecount['text'] > 1:
        score += 10
    if not typecount['text']:
        score -= 10

    if typecount['password'] == 1:
        score += 10
    if not typecount['password']:
        score -= 10

    if typecount['checkbox'] > 1:
        score -= 10
    if typecount['radio']:
        score -= 10

    return score

def _pick_form(forms):
    """Return the form most likely to be a login form"""
    return sorted(forms, key=_form_score, reverse=True)[0]

def _pick_fields(form):
    """Return the most likely field names for username and password"""
    userfield = passfield = emailfield = None
    for x in form.inputs:
        if not isinstance(x, html.InputElement):
            continue
        type_ = x.type
        if type_ == 'password' and passfield is None:
            passfield = x.name
        elif type_ == 'text' and userfield is None:
            userfield = x.name
        elif type_ == 'email' and emailfield is None:
            emailfield = x.name
    return (userfield or emailfield, passfield)

def submit_value(form):
    """Returns the value for the submit input, if any"""
    for x in form.inputs:
        if x.type == 'submit' and x.name:
            return [(x.name, x.value)]
    else:
        return []

def fill_login_form(
    url,
    body,
    username,
    password,
    ):
    doc = html.document_fromstring(body, base_url=url)
    form = _pick_form(doc.xpath('//form'))
    (userfield, passfield) = _pick_fields(form)
    form.fields[userfield] = username
    form.fields[passfield] = password
    form_values = form.form_values() + submit_value(form)
    return (form_values, form.action or form.base_url, form.method)

def main():
    ap = ArgumentParser()
    ap.add_argument('-u', '--username', default='username')
    ap.add_argument('-p', '--password', default='secret')
    ap.add_argument('url')
    args = ap.parse_args()

    try:
        import requests
    except ImportError:
        print 'requests library is required to use loginform as a tool'

    r = requests.get(args.url)
    (values, action, method) = fill_login_form(args.url, r.text,
            args.username, args.password)

    print '''url: {0}
method: {1}
payload:'''.format(action, method)
    for (k, v) in values:
        print '- {0}: {1}'.format(k, v)

if __name__ == '__main__':
    sys.exit(main())
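Not part of the original answer, but as a quick sanity check the helper can also be called directly outside Scrapy (it only needs lxml, plus requests to fetch the page). The field names and action URL it returns depend entirely on the login page being inspected:

# Stand-alone use of fill_login_form (sketch); the credentials are placeholders.
import requests
from loginform import fill_login_form

r = requests.get('https://github.com/login')
args, action, method = fill_login_form('https://github.com/login', r.text, 'myuser', 'mypwd')
print args    # list of (name, value) pairs, including any hidden token fields
print action  # the URL the form posts to
print method  # usually 'POST' for login forms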
【Answer 2】: Would love to add my solution alongside the others. This answer mainly follows the hacky/lazy approach I take with everything I do; I went with it mostly because I was too lazy to handle cookies, session data and so on.
This solution is most useful if you want to scrape several pages of a site after logging in with a single account's credentials (e.g. all of your Pinterest boards). It is not suitable if you want to automate authentication across multiple accounts.
So my solution is selenium together with a firefox profile.
Create a new firefox profile, note where it is stored, open firefox with that profile and log in to the website manually (see the firefox documentation for details on profiles). Then use selenium with this profile: the selenium session will pick up the cookies and session data from the firefox profile, so your authentication is preserved. I came up with this mechanism when I needed to scrape a few Pinterest pages; below I have added a few lines of code showing how to use the profile. Adapt the code to your needs.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
#replace with your firefox profile
fp=webdriver.FirefoxProfile('C:/Users/SJ/AppData/Roaming/Mozilla/Firefox/Profiles/hlsfrs2o.scrape')
#enter your url here
url=""
driver = webdriver.Firefox(fp)
driver.get(url)
html_source = driver.page_source
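A possible follow-up, not in the original answer: once the profile-backed driver is authenticated, its cookies can be copied into a requests session so that further pages are fetched without driving the browser. A rough sketch, which only works as long as the site accepts plain cookie-based requests:

# Reuse the selenium session's cookies with requests (sketch).
import requests

s = requests.Session()
for cookie in driver.get_cookies():
    s.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))

resp = s.get(url)  # any page that requires the logged-in session
print resp.status_code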
【Answer 3】: This is what worked for me:
##################################### Method 1
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling its session
br.open('https://github.com/login')
# View available forms
for f in br.forms():
    print f
# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)
# User credentials
br.form['login'] = 'mylogin'
br.form['password'] = 'mypass'
# Login
br.submit()
print(br.open('https://github.com/settings/emails').read())
You were not far off!
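Not part of the original answer, but since BeautifulSoup is already imported at the top of this snippet, the fetched page can be parsed right away. A sketch using the old BeautifulSoup 3 API to match that import; which tags to look for depends on the actual markup of the settings page:

# Parse the emails page that was just fetched (sketch; inspect the real markup first).
html = br.open('https://github.com/settings/emails').read()
soup = BeautifulSoup(html)
for li in soup.findAll('li'):
    print ' '.join(li.findAll(text=True))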
【Comments】: Can you give me more details? I did look through the script, but I don't see my username and password anywhere. From the first method I also got this form: br.open('github.com/login'); for f in br.forms(): print f — and I got the following information: