用selenium爬取淘宝美食
Posted regit
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了用selenium爬取淘宝美食相关的知识,希望对你有一定的参考价值。
1. spider.py文件如下
"""Scrape Taobao search results with Selenium and store them in MongoDB.

The script opens taobao.com, searches for ``KEYWORD``, walks through every
result page using the pager's "jump to page" box, parses each product card
with PyQuery and inserts one document per product into MongoDB.
"""
__author__ = 'Administrator'

import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import KEYWORD, MON_TABLE, MONGO_DB, MONGO_URL, SERVICE_ARGS

# How many extra attempts search()/next_page() make after a timeout before
# giving up and letting the TimeoutException propagate.
MAX_RETRIES = 3

# Extracts the first run of digits from the pager's "total" text.
_TOTAL_PAGES_RE = re.compile(r'(\d+)')

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.Chrome()
# To use PhantomJS instead of Chrome:
#   1. Install PhantomJS.
#   2. Pass the custom options from config (no image loading, disk cache on):
#        browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
#   3. Set a window size:
#        browser.set_window_size(1400, 900)

wait = WebDriverWait(browser, 10)


def search(retries=MAX_RETRIES):
    """Open Taobao, search for ``KEYWORD`` and scrape the first result page.

    Args:
        retries: Number of additional attempts made after a timeout.

    Returns:
        The text of the pager's "total" element, which contains the page count.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    retries = max(retries, 0)
    for attempt in range(retries + 1):
        try:
            browser.get('https://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
                )
            )
            search_box.send_keys(KEYWORD)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
                )
            )
            # The first result page is already loaded; scrape it right away.
            get_products()
            return total.text
        except TimeoutException:
            # WebDriverWait raises selenium's TimeoutException, not the
            # builtin TimeoutError. Retry a bounded number of times.
            if attempt == retries:
                raise


def next_page(page_number, retries=MAX_RETRIES):
    """Jump to ``page_number`` via the pager's input box and scrape that page.

    Args:
        page_number: 1-based result page to load.
        retries: Number of additional attempts made after a timeout.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    retries = max(retries, 0)
    for attempt in range(retries + 1):
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            page_box.clear()
            page_box.send_keys(str(page_number))
            submit.click()
            # The pager highlights the active page; wait for that highlight to
            # show the requested number to confirm the jump succeeded.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            if attempt == retries:
                raise


def get_products():
    """Parse every product card on the current page and store each one."""
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')
        )
    )
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal count text
            # (presumably a fixed 3-character suffix - confirm against the page).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert ``result`` into the ``MON_TABLE`` collection (best effort).

    Failures are reported on stdout and do not abort the crawl.

    Args:
        result: The product document (a dict) to store.
    """
    try:
        # Collection.insert() is deprecated and removed in PyMongo 4.
        db[MON_TABLE].insert_one(result)
        print('存储到MONGODB成功', result)
    except Exception as exc:
        print('存储到MONGODB失败', result, exc)


def main():
    """Run the full crawl: search, read the page count, then visit each page."""
    try:
        total_text = search()
        match = _TOTAL_PAGES_RE.search(total_text)
        if match is None:
            raise ValueError('cannot parse page count from %r' % (total_text,))
        total = int(match.group(1))
        # Page 1 was scraped by search(); continue from page 2.
        for page in range(2, total + 1):
            next_page(page)
    except Exception as exc:
        print('出错了', exc)
    finally:
        # quit() ends the whole driver session; close() only closes the window.
        browser.quit()


if __name__ == '__main__':
    main()
2. config.py
"""Settings for the Taobao spider."""
__author__ = 'Administrator'

# MongoDB connection settings.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MON_TABLE = 'product'

# PhantomJS command-line options: skip image loading and enable the disk cache.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# Search keyword typed into the Taobao search box.
KEYWORD = '美食'
以上是关于用selenium爬取淘宝美食的主要内容,如果未能解决你的问题,请参考以下文章:
使用selenium结合PhantomJS爬取淘宝美食并存储到MongoDB