Scraping Dangdang Book Information with a Crawler
Posted by 可是我不配
A crawler I had been putting off for a long time.
Code first; the write-up will be filled in bit by bit.
# -*- coding: utf-8 -*-

import re
import json

import requests
import urllib2
import xlwt
from bs4 import BeautifulSoup


def getJsonText(url):
    """Download the comment-list Ajax response as text."""
    try:
        r = requests.get(url, timeout=1)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.exceptions.RequestException:
        print 'failed to fetch', url
        return ''


def getgood(url):
    """Fetch one product page and return its comment statistics."""
    html = urllib2.urlopen(url).read()

    # Pull the three Ajax parameters out of the product page with regexes
    productId = re.search(r'"productId":"(\d+)"', html).group(1)
    categoryPath = re.search(r'"categoryPath":"([\d.]+)"', html).group(1)
    mainProductId = re.search(r'"mainProductId":"(\d+)"', html).group(1)

    # Build the Ajax URL that the comment widget on the page calls
    json_url = ('http://product.dangdang.com/index.php?r=comment%2Flist'
                '&productId={productId}&categoryPath={categoryPath}'
                '&mainProductId={mainProductId}&mediumId=0&pageIndex=1'
                '&sortType=1&filterType=1&isSystem=1&tagId=0'
                '&tagFilterCount=0').format(
        productId=productId, categoryPath=categoryPath,
        mainProductId=mainProductId)

    # Download and parse the JSON comment summary
    json_html = json.loads(getJsonText(json_url))
    summary = json_html['data']['list']['summary']
    data = {}
    data['all_comment_num'] = summary['total_comment_num']           # total comments
    data['good_comment_num'] = summary['total_crazy_count']          # positive
    data['middle_comment_num'] = summary['total_indifferent_count']  # neutral
    data['bad_comment_num'] = summary['total_detest_count']          # negative
    data['good_rate'] = summary['goodRate']                          # positive rate
    return data


def main():
    wb = xlwt.Workbook()
    sheet1 = wb.add_sheet("Sheet")
    # Header row: No. / title / price / discount / comment count /
    # positive / neutral / negative / positive rate
    sheet1.write(0, 0, unicode('序号', 'utf-8'))
    sheet1.write(0, 1, unicode('书名', 'utf-8'))
    sheet1.write(0, 2, unicode('价格', 'utf-8'))
    sheet1.write(0, 3, unicode('折扣', 'utf-8'))
    sheet1.write(0, 4, unicode('评论数', 'utf-8'))
    sheet1.write(0, 5, unicode('好评', 'utf-8'))
    sheet1.write(0, 6, unicode('中评', 'utf-8'))
    sheet1.write(0, 7, unicode('差评', 'utf-8'))
    sheet1.write(0, 8, unicode('好评率', 'utf-8'))

    # The 24-hour bestseller list has 25 pages of 20 books each
    for page in range(25):
        url = 'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-%d' % (page + 1)
        soup = BeautifulSoup(urllib2.urlopen(url).read(), 'lxml')

        bookname = soup.find_all('div', attrs={'class': 'name'})
        bookstar = soup.find_all('div', attrs={'class': 'star'})
        bookprice = soup.find_all('div', attrs={'class': 'price'})
        bookoff = soup.find_all('span', attrs={'class': 'price_s'})

        for i in range(20):
            bookurl = bookname[i].find('a')['href']
            data = getgood(bookurl)
            print (str(page * 20 + i + 1) + " "
                   + bookname[i].find('a')['title'] + " "      # title
                   + bookprice[i].find('span').text[1:] + " "  # price (strip the currency sign)
                   + bookoff[i].text[:-1] + " "                # discount (strip the trailing 折)
                   + bookstar[i].find('a').text[:-3] + " "     # comment count (strip the 条评论 suffix)
                   + str(data['good_comment_num']) + " "       # positive
                   + str(data['middle_comment_num']) + " "     # neutral
                   + str(data['bad_comment_num']) + " "        # negative
                   + str(data['good_rate'])                    # positive rate
                   )

            row = page * 20 + i + 1
            sheet1.write(row, 0, row)
            sheet1.write(row, 1, bookname[i].find('a')['title'])
            sheet1.write(row, 2, bookprice[i].find('span').text[1:])
            sheet1.write(row, 3, bookoff[i].text[:-1])
            sheet1.write(row, 4, bookstar[i].find('a').text[:-3])
            sheet1.write(row, 5, data['good_comment_num'])
            sheet1.write(row, 6, data['middle_comment_num'])
            sheet1.write(row, 7, data['bad_comment_num'])
            sheet1.write(row, 8, data['good_rate'])
        # Save after every page so partial results survive a crash
        wb.save('test.xls')


if __name__ == '__main__':
    main()
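While the full write-up is still pending, here is a minimal standalone sketch of the core trick the crawler relies on: pulling productId and categoryPath out of a product page with regexes and calling the comment-list Ajax endpoint directly. The product URL below is a hypothetical placeholder, and it reuses productId as mainProductId for simplicity (the two usually coincide for plain books); treat this as an assumption-laden sketch, not a tested recipe.

# Minimal sketch: fetch the comment summary for a single product.
# NOTE: the product URL passed in at the bottom is a hypothetical placeholder.
import re
import json
import requests

def comment_summary(product_url):
    html = requests.get(product_url, timeout=5).text
    pid = re.search(r'"productId":"(\d+)"', html).group(1)
    path = re.search(r'"categoryPath":"([\d.]+)"', html).group(1)
    # Same endpoint as above; mainProductId is assumed equal to productId here
    ajax = ('http://product.dangdang.com/index.php?r=comment%2Flist'
            '&productId={0}&categoryPath={1}&mainProductId={0}'
            '&mediumId=0&pageIndex=1&sortType=1&filterType=1'
            '&isSystem=1&tagId=0&tagFilterCount=0').format(pid, path)
    return json.loads(requests.get(ajax, timeout=5).text)['data']['list']['summary']

summary = comment_summary('http://product.dangdang.com/23761944.html')  # placeholder URL
print summary['total_comment_num'], summary['goodRate']

If either re.search returns None, Dangdang has changed its page markup and the patterns need updating; that is also the first thing to check if the full crawler above starts throwing AttributeError in getgood.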