爬虫实例——爬取淘女郎相册(通过 selenium + PhantomJS + BeautifulSoup 爬取)
Posted 昨夜星辰
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫实例——爬取淘女郎相册(通过 selenium + PhantomJS + BeautifulSoup 爬取)相关的知识,希望对你有一定的参考价值。
环境
操作系统:CentOS 6.7 32-bit
Python版本:2.6.6
第三方插件
selenium
PhantomJS
BeautifulSoup
代码
# -*- coding: utf-8 -*- import sys reload(sys) sys.setdefaultencoding(‘utf-8‘) ‘‘‘ 作者:昨夜星辰 ‘‘‘ import re import os import time import shutil import requests import subprocess from bs4 import BeautifulSoup from selenium import webdriver # 拼接url def joint_url(string): return ‘https:‘ + string # 判断文件夹是否存在,如果存在就删除,否则就创建。 def create_folder(path): if os.path.exists(path): if os.path.isdir(path): shutil.rmtree(path) else: os.remove(path) os.mkdir(path) root_folder = ‘淘女郎‘ create_folder(root_folder) url = ‘https://mm.taobao.com/json/request_top_list.htm?page=1‘ browser = webdriver.PhantomJS() browser.get(url) bs = BeautifulSoup(browser.page_source, ‘lxml‘) for top in bs(‘p‘, ‘top‘): mm_url = joint_url(top.find(‘a‘)[‘href‘]) mm_name = top.find(‘a‘).text mm_age = top.find(‘em‘).text mm_city = top.find(‘span‘).text mm_folder = ‘%s/%s‘ % (root_folder, mm_name) create_folder(mm_folder) print ‘发现一位美眉,她叫做%s,今年%s,住在%s,现在开始爬取她的个人页面……‘ % (mm_name, mm_age, mm_city) browser.get(mm_url) bs1 = BeautifulSoup(browser.page_source, ‘lxml‘) base_info = bs1.find(‘ul‘, ‘mm-p-info-cell clearfix‘) info_list = base_info(‘span‘) result = [] result.append(‘昵称:‘ + info_list[0].text) result.append(‘生日:‘ + info_list[1].text.strip()) result.append(‘所在城市:‘ + info_list[2].text) result.append(‘职业:‘ + info_list[3].text) result.append(‘血型:‘ + info_list[4].text) result.append(‘学校/专业:‘ + info_list[5].text) result.append(‘风格:‘ + info_list[6].text) result.append(‘身高:‘ + base_info.find(‘li‘, ‘mm-p-small-cell mm-p-height‘).find(‘p‘).text) result.append(‘体重:‘ + base_info.find(‘li‘, ‘mm-p-small-cell mm-p-weight‘).find(‘p‘).text) result.append(‘三围:‘ + base_info.find(‘li‘, ‘mm-p-small-cell mm-p-size‘).find(‘p‘).text) result.append(‘罩杯:‘ + base_info.find(‘li‘, ‘mm-p-small-cell mm-p-bar‘).find(‘p‘).text) result.append(‘鞋码:‘ + base_info.find(‘li‘, ‘mm-p-small-cell mm-p-shose‘).find(‘p‘).text) print ‘资料收集完毕,正在保存她的个人资料……‘ filename = ‘%s/%s.txt‘ % (mm_folder, mm_name) with open(filename, ‘w‘) as f: 
f.write(‘\r\n‘.join(result)) print ‘保存完毕!现在开始爬取她的个人相册……‘ album_menu_url = joint_url(bs1.find(‘ul‘, ‘mm-p-menu‘).find(‘a‘)[‘href‘]) browser.get(album_menu_url) time.sleep(3) bs2 = BeautifulSoup(browser.page_source, ‘lxml‘) album_number = 1 for album_info in bs2(‘div‘, ‘mm-photo-cell-middle‘): album_url = joint_url(album_info.find(‘h4‘).find(‘a‘)[‘href‘]) album_name = album_info.find(‘h4‘).find(‘a‘).text.strip() album_size = album_info.find(‘span‘, ‘mm-pic-number‘).text print ‘现在开始爬取她的第%d个相册,相册名为:《%s》%s……‘ % (album_number, album_name, album_size) browser.get(album_url) js1 = ‘return document.body.scrollHeight‘ js2 = ‘window.scrollTo(0, document.body.scrollHeight)‘ old_scroll_height = 0 while(browser.execute_script(js1) > old_scroll_height): old_scroll_height = browser.execute_script(js1) browser.execute_script(js2) time.sleep(3) bs3 = BeautifulSoup(browser.page_source, ‘lxml‘) photo_number = 1 for photo_area in bs3(‘div‘, ‘mm-photoimg-area‘): print ‘现在开始下载她这个相册的第%d张图片……‘ % photo_number, photo_url = joint_url(photo_area.find(‘a‘)[‘href‘]) browser.get(photo_url) bs4 = BeautifulSoup(browser.page_source, ‘lxml‘) big_img_url = joint_url(bs4.find(‘img‘, id=‘J_MmBigImg‘)[‘src‘]) content = requests.get(big_img_url).content filename = ‘%s/%d.jpg‘ % (mm_folder, photo_number) with open(filename, ‘wb‘) as f: f.write(content) print ‘下载完毕!‘ photo_number += 1 album_number += 1
以上是关于爬虫实例——爬取淘女郎相册(通过 selenium + PhantomJS + BeautifulSoup 爬取)的主要内容,如果未能解决你的问题,请参考以下文章