爬虫BS4—淘女郎

Posted wskxy

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫BS4—淘女郎相关的知识,希望对你有一定的参考价值。

1. 修改请求头(User-Agent)

用一个单独的 py 文件 getheaders.py,每次调用随机返回一个 User-Agent 作为请求头

getheaders文件

import random

# One User-Agent string per line; headers() picks one of them at random.
headerstr = """Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999 """


def headers():
    """Return one User-Agent string chosen at random from ``headerstr``.

    Each line of ``headerstr`` is one candidate. Surrounding whitespace is
    stripped and blank lines are ignored, so the trailing space before the
    closing quotes of ``headerstr`` never ends up in the header value.

    Raises:
        IndexError: if ``headerstr`` contains no non-blank line.
    """
    candidates = [line.strip() for line in headerstr.splitlines() if line.strip()]
    return random.choice(candidates)




2.主文件

# coding:utf-8
from bs4 import BeautifulSoup
import urllib2
from getheaders import headers
from json import loads
import re
import os


def reqobject():  # build the search request object; nothing is sent yet
    """Create the request for the model search URL.

    The request is only constructed here; no network access happens until
    the caller passes it to ``urllib2.urlopen``.

    Returns:
        urllib2.Request: request carrying a ``user-agent`` header whose value
        comes from ``headers()``.
    """
    # 1. Instantiate the request object (no connection is opened yet).
    req = urllib2.Request("https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8")
    # 2. Decorate the request with a User-Agent header.
    req.add_header('user-agent', headers())
    return req


def getUrlList(page=1, page_size=100):  # fetch one page of user records
    """Return the list of user records from one page of the search result.

    Args:
        page: value sent as ``currentPage`` (defaults to 1, the value that
            was previously hard-coded).
        page_size: value sent as ``pageSize`` (defaults to 100, the value
            that was previously hard-coded).

    Returns:
        The ``data.searchDOList`` value of the decoded JSON response.

    Raises:
        urllib2.URLError: if the request fails.
        KeyError: if the response JSON lacks ``data`` or ``searchDOList``.
    """
    req = reqobject()
    # Attach the form parameters; adding data turns the request into a POST.
    form = ('q&viewFlag=A&sortType=default&searchStyle=&searchRegion=city%3A'
            '&searchFansNum=&currentPage={}&pageSize={}').format(page, page_size)
    req.add_data(form)
    # Send the request and always release the connection afterwards.
    response = urllib2.urlopen(req)
    try:
        raw = response.read()
    finally:
        response.close()
    # decode('gbk') turns the GBK bytes into unicode. The codec has to match
    # the charset announced in the response's Content-Type header (or, if
    # absent, the <meta charset="gbk" /> tag in the page source).
    # encode('utf-8') then turns that unicode into UTF-8 bytes; encode only
    # works on unicode, hence the decode first.
    html = raw.decode('gbk').encode('utf-8')
    # The body is a JSON document; parse it into a dict to pick values out.
    json_dict = loads(html)
    return json_dict['data']['searchDOList']


def getInfo(userid):  # fetch the user's "aiShow" profile page
    """Download the ``aiShow.htm`` page of *userid*.

    Args:
        userid: identifier interpolated into the ``userId`` query parameter.

    Returns:
        The page body, transcoded from GBK to UTF-8 bytes.

    Raises:
        urllib2.URLError: if the request fails.
    """
    req = urllib2.Request("https://mm.taobao.com/self/aiShow.htm?&userId=%s" % userid)
    req.add_header('user-agent', headers())
    response = urllib2.urlopen(req)
    try:
        raw = response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()
    return raw.decode('gbk').encode('utf-8')


def getNeedInf(html, user_id=None):  # extract the fields we need and save them
    """Parse a profile page and write a summary text file for the user.

    The name, follow count, fan count and the text of the ``J_ScaleImg`` div
    are extracted and written to ``images/<user_id>/<user_id>.txt``.

    Args:
        html: profile page markup as returned by ``getInfo``.
        user_id: id used for the output directory and file name. When it is
            omitted, the module-level ``userid`` variable is used instead,
            which is what this function always relied on implicitly.

    Raises:
        NameError: if ``user_id`` is omitted and no global ``userid`` exists.
        IndexError, AttributeError: if the page lacks the expected elements.
    """
    if user_id is None:
        # Backward compatibility: existing callers pass only ``html`` and set
        # a global ``userid`` before calling.
        user_id = userid
    soup = BeautifulSoup(html, 'html.parser')
    stat_links = soup.select('dl > dt > a')
    name = soup.select('dl > dd > a')[0].text.encode('utf-8')
    follow = stat_links[1].text.encode('utf-8')
    fens = stat_links[2].text.encode('utf-8')
    detail = soup.find('div', attrs={'id': 'J_ScaleImg'}).get_text().strip().encode('utf-8')
    content = "姓名:{} 关注:{} 粉丝:{}\n{}".format(name, follow, fens, detail)
    # os.path.join keeps the layout portable instead of hard-coding "\\".
    user_dir = os.path.join("images", str(user_id))
    if not os.path.exists(user_dir):
        os.makedirs(user_dir)
    print('Start downloading...')
    print('getInf:{}'.format(str(user_id)))
    with open(os.path.join(user_dir, "{}.txt".format(str(user_id))), 'wb') as f:
        f.write(content)


def getAlbumList(userid):  # collect the cover-image URLs of the user's albums
    """Return full-size cover image URLs for every album of *userid*.

    The album list page embeds thumbnails whose URL ends in
    ``jpg_240x240xz.jpg``, e.g.
    ``img.alicdn.com/imgextra/i1/176817195/TB1jFcMKFXXXXblXFXXXXXXXXXX_!!0-tstar.jpg_240x240xz.jpg``.
    The large variant differs only in that suffix (``jpg_620x10000.jpg``), so
    the regex captures everything before the suffix and the large suffix is
    appended afterwards.

    Args:
        userid: identifier appended to the album list URL.

    Returns:
        list of ``http://...jpg_620x10000.jpg`` URLs, in page order.

    Raises:
        urllib2.URLError: if the request fails.
    """
    req = urllib2.Request("https://mm.taobao.com/self/album/open_album_list.htm?&user_id%20=" + str(userid))  # album list page
    req.add_header('user-agent', headers())
    response = urllib2.urlopen(req)
    try:
        raw = response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()
    html = raw.decode('gbk').encode('utf-8')
    # The thumbnail suffix sits outside the capture group so it is dropped.
    rel = r'<img src="//(.*?)jpg_240x240xz.jpg" width="125" height="125">'
    return ["http://" + prefix + "jpg_620x10000.jpg" for prefix in re.findall(rel, html)]


def getimages(userid, urls):  # download each image URL into the user's folder
    """Download every URL in *urls* to ``images/<userid>/<n>.jpg``.

    Files are numbered from 1 in the order of *urls*. A typical URL looks like
    ``http://img.alicdn.com/imgextra/i3/865838484/TB1_n_XKVXXXXb5XXXXXXXXXXXX_!!865838484-0-tstar.jpg_620x10000``.

    Args:
        userid: identifier used as the sub-directory name.
        urls: iterable of image URLs.

    Raises:
        urllib2.URLError: if any download fails.
    """
    # Build the path portably and create the folder here, so this function
    # does not depend on another one having created it first.
    user_dir = os.path.join("images", str(userid))
    if not os.path.isdir(user_dir):
        os.makedirs(user_dir)
    for index, url in enumerate(urls, 1):
        req = urllib2.Request(url)
        req.add_header('user-agent', headers())
        response = urllib2.urlopen(req)
        try:
            payload = response.read()
        finally:
            # Release the connection even when read() raises.
            response.close()
        with open(os.path.join(user_dir, '{}.jpg'.format(index)), 'wb') as f:
            f.write(payload)
        print("getImage: %s" % url)
    print("End of download...")


# Driver: for every user on the first result page, save the profile summary
# and download all album cover images.
users = getUrlList()
# The output root only needs to be created once, not on every iteration.
if not os.path.exists("images"):
    os.mkdir("images")
for user in users:
    try:
        # NOTE: ``userid`` must remain a module-level name; getNeedInf() is
        # called with the page markup only and reads the id from this global.
        userid = user['userId']
        html = getInfo(userid)
        getNeedInf(html)
        urls = getAlbumList(userid)
        getimages(userid, urls)
    except urllib2.URLError as e:
        # A network failure for one user must not abort the whole crawl.
        print(e.reason)
 
 

以上是关于爬虫BS4—淘女郎的主要内容,如果未能解决你的问题,请参考以下文章

爬虫实例——爬取淘女郎的相册(通过谷歌浏览器的开发者工具找出规律快速爬取)

Python爬虫实战:爬取淘女郎照片

Ajax数据的爬取(淘女郎为例)

Python Selenium 爬取淘女郎照片

python3 爬淘女郎

python网络爬虫学习利用Pyspider+Phantomjs爬取淘宝模特图片