在学习了网易云课堂上崔庆才老师的《Python3爬虫三大案例实战分享》之后，我模仿着写了一段代码。PhantomJS 和 MongoDB 还没学，暂时没放进去：浏览器直接用 Chrome，数据存储先用 pandas 代替。
# Taobao search-result scraper.
#
# Drives a Chrome session through Selenium, parses every result page with
# PyQuery and writes the collected product rows to an Excel workbook via pandas.
import re

import pandas as pd
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Shared browser session and its explicit-wait helper (10 s per condition).
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
# One dict per product, accumulated across all scraped pages.
totaldata = []

# Number of additional attempts made after a step times out.
MAX_RETRIES = 3
SEARCH_KEYWORD = "鸡蛋"
OUTPUT_FILE = "taobaoeggs.xlsx"

_ITEM_SELECTOR = "#mainsrp-itemlist .items .item"
_PAGER_SELECTOR = "#mainsrp-pager > div > div > div"


def search(keyword=SEARCH_KEYWORD, max_retries=MAX_RETRIES):
    """Open Taobao, submit a search and scrape the first result page.

    The products of the first page are appended to ``totaldata``.

    Args:
        keyword: Text typed into the search box.
        max_retries: How many times the whole step is repeated after a
            ``TimeoutException`` before giving up.

    Returns:
        The text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: If the step still times out after all retries.
    """
    for attempts_left in range(max_retries, -1, -1):
        try:
            browser.get("https://www.taobao.com")
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")
                )
            )
            search_box.send_keys(keyword)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, _PAGER_SELECTOR + " > div.total")
                )
            )
            totaldata.extend(get_products())
            return total.text
        except TimeoutException:
            # Bounded retry: reload and try again instead of recursing forever.
            if attempts_left == 0:
                raise


def next_page(page_number, max_retries=MAX_RETRIES):
    """Jump to ``page_number`` through the pager form and scrape that page.

    The products of the page are appended to ``totaldata``.

    Args:
        page_number: Page number typed into the pager's input field.
        max_retries: How many times the step is repeated after a
            ``TimeoutException`` before giving up.

    Raises:
        TimeoutException: If the step still times out after all retries.
    """
    for attempts_left in range(max_retries, -1, -1):
        try:
            page_input = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, _PAGER_SELECTOR + " > div.form > input")
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, _PAGER_SELECTOR + " > div.form > span.btn.J_Submit")
                )
            )
            page_input.clear()
            page_input.send_keys(str(page_number))
            submit.click()
            # The highlighted pager entry showing the requested number is the
            # signal that the new page has been rendered.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, _PAGER_SELECTOR + " > ul > li.item.active > span"),
                    str(page_number),
                )
            )
            totaldata.extend(get_products())
            return
        except TimeoutException:
            if attempts_left == 0:
                raise


def get_products():
    """Parse the currently loaded result page into a list of product dicts.

    Waits until at least one item element is present, then reads the page
    source with PyQuery.

    Returns:
        A list of dicts with the keys ``image``, ``price``, ``deal``,
        ``title``, ``shop`` and ``location``.
    """
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, _ITEM_SELECTOR)))
    doc = pq(browser.page_source)
    return [
        {
            "image": item.find(".pic .img").attr("src"),
            "price": item.find(".price").text().replace("\n", ""),
            # Drops the last three characters of the deal counter text
            # (presumably a fixed suffix such as "人付款" -- verify on the page).
            "deal": item.find(".deal-cnt").text()[:-3],
            "title": item.find(".title").text().replace("\n", ""),
            "shop": item.find(".shop").text(),
            "location": item.find(".location").text(),
        }
        for item in doc(_ITEM_SELECTOR).items()
    ]


def _parse_total_pages(total_text):
    """Return the first integer found in the pager's total text."""
    match = re.search(r"(\d+)", total_text)
    if match is None:
        raise ValueError(f"no page count found in pager text: {total_text!r}")
    return int(match.group(1))


def main():
    """Scrape every result page for the search and export the rows to Excel."""
    try:
        # search() already scrapes page 1, so it must be called exactly once;
        # a second call would append the first page's products again.
        total_pages = _parse_total_pages(search())
        for page_number in range(2, total_pages + 1):
            next_page(page_number)
    finally:
        # Always release the browser, and keep whatever was scraped so far
        # even when a later page failed.
        browser.quit()
        if totaldata:
            pd.DataFrame(totaldata).to_excel(OUTPUT_FILE)


if __name__ == "__main__":
    main()