Scraping Taobao Data with Selenium + Chrome

Posted by smbl


After working through Cui Qingcai's (崔庆才) "Python3 爬虫三大案例实战分享" (three practical Python 3 web-scraping case studies) on NetEase Cloud Classroom (网易云课堂), I wrote the following code modeled on his example. I haven't learned PhantomJS or MongoDB yet, so they are left out for now and pandas is used for storage instead; sketches of both substitutions appear after the code.

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from pyquery import PyQuery as pq
import pandas as pd

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)  # wait up to 10 seconds for elements to appear
totaldata = []

def search():
    """Open Taobao, search for 鸡蛋 (eggs), scrape page 1, and return the pager text."""
    global totaldata
    try:
        browser.get("https://www.taobao.com")
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button"))
        )
        input.send_keys("鸡蛋")
        submit.click()
        # The "共 xx 页" element that reports the total number of result pages
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total"))
        )
        totaldata.extend(get_products())
        return total.text
    except TimeoutException:
        return search()  # retry on timeout

def next_page(page_number):
    """Jump to the given result page via the pager's page-number input box."""
    global totaldata
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input"))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit"))
        )
        input.clear()
        input.send_keys(page_number)
        submit.click()
        # Wait until the highlighted page number matches the one we asked for
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
            str(page_number)
        ))
        totaldata.extend(get_products())
    except TimeoutException:
        return next_page(page_number)  # retry on timeout

def get_products():
    """Parse the current result page with pyquery and return a list of product dicts."""
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    html = browser.page_source
    doc = pq(html)
    items = doc("#mainsrp-itemlist .items .item").items()
    data = []
    for item in items:
        product = {
            "image": item.find(".pic .img").attr("src"),
            "price": item.find(".price").text().replace("\n", ""),
            "deal": item.find(".deal-cnt").text()[:-3],  # drop the trailing "人付款"
            "title": item.find(".title").text().replace("\n", ""),
            "shop": item.find(".shop").text(),
            "location": item.find(".location").text()
        }
        data.append(product)
    return data

def main():
    # search() already scrapes page 1, so call it only once to avoid duplicate rows
    total = search()
    total = int(re.compile(r"(\d+)").search(total).group(1))
    for i in range(2, total + 1):
        next_page(i)
    df = pd.DataFrame(totaldata)
    df.to_excel("taobaoeggs.xlsx")
    browser.close()

if __name__ == "__main__":
    main()
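
On the PhantomJS substitution: PhantomJS was used in the course to run the browser without a visible window, but it has since been deprecated, and Chrome's own headless mode is the usual replacement. A minimal sketch of that swap, assuming a Selenium version that accepts the options keyword when constructing the driver:

from selenium import webdriver

# Run Chrome with no visible window, in place of PhantomJS.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")  # often recommended alongside headless mode

browser = webdriver.Chrome(options=options)

The rest of the script is unchanged: browser exposes the same interface either way.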
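
Likewise, the original course stored each product in MongoDB rather than in a pandas DataFrame. A minimal sketch of that storage step with pymongo, assuming a MongoDB server on localhost's default port (the "taobao" database and "products" collection names are made up for illustration):

import pymongo

# Assumes a local MongoDB server on the default port 27017.
client = pymongo.MongoClient("localhost", 27017)
db = client["taobao"]  # hypothetical database name

def save_to_mongo(product):
    # Insert one product dict, as produced by get_products().
    try:
        db["products"].insert_one(product)  # hypothetical collection name
        print("saved to mongo:", product)
    except Exception:
        print("failed to save:", product)

With this in place, get_products() could call save_to_mongo(product) on each item instead of (or in addition to) appending everything to totaldata.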

 


