# 豆瓣3.21
# coding: utf-8
"""Collect Douban book info and cover images and write them to MySQL.

Running this file as a script fetches the tag page at ``url``, and for
every ``<li>`` under ``#subject_list`` it:

1. prints the title, author, publisher, rating and cover-image URL,
2. saves the cover image as ``<IMAGE_DIR>/<title>.jpg``,
3. inserts one row into the ``douban`` table and commits it.
"""

import json
import os
import re
from urllib import request

url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"
headers = {
    "Host": "book.douban.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/64.0.3282.140 Safari/537.36"
    ),
}

# Directory the cover images are written to (relative to the working dir).
IMAGE_DIR = "douban"

# Placeholders are filled in by the driver, never by string formatting, so
# quotes inside scraped text cannot break or inject into the statement.
INSERT_SQL = (
    "insert into douban(title,author,chubanshe,pingfen) "
    "values (%s,%s,%s,%s)"
)

# Path separators, characters rejected by common file systems, and control
# characters; each one is replaced when a title is used as a file name.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def first_or_default(values: list, default=""):
    """Return ``values[0]``, or ``default`` when ``values`` is empty.

    XPath queries return a (possibly empty) list; indexing it blindly
    raises ``IndexError`` for entries that lack the queried node.
    """
    return values[0] if values else default


def safe_filename(title: str) -> str:
    """Return ``title`` with file-name-unsafe characters replaced by ``_``.

    An empty title yields ``"untitled"`` so the result is never empty.
    """
    return _UNSAFE_FILENAME_CHARS.sub("_", title) or "untitled"


def parse_info(text: str) -> tuple:
    """Split a ``/``-separated info line into ``(author, publisher)``.

    The author is the first field with newlines and spaces removed; the
    publisher is the second field with surrounding whitespace stripped,
    or ``""`` when the line has no second field.
    """
    # NOTE(review): the second '/'-separated field is taken as the
    # publisher; confirm this holds for entries that list additional
    # contributors (e.g. a translator) before the publisher.
    fields = text.split("/")
    author = fields[0].replace("\n", "").replace(" ", "")
    publisher = fields[1].strip() if len(fields) > 1 else ""
    return author, publisher


def extract_book(li):
    """Extract one book's fields from a list-item element.

    ``li`` must offer an ``xpath`` method (an lxml element). The XPath
    expressions are relative to ``li`` itself. Returns a dict with the keys
    ``title``, ``author``, ``publisher``, ``rating`` and ``image_url``
    (missing values are ``""``), or ``None`` when the item has no title.
    """
    title = first_or_default(li.xpath(r'div[2]/h2/a/@title')).replace(" ", "")
    if not title:
        return None
    info_text = first_or_default(li.xpath(r'div[2]/div[1]/text()'))
    author, publisher = parse_info(info_text)
    return {
        "title": title,
        "author": author,
        "publisher": publisher,
        "rating": first_or_default(li.xpath(r'div[2]/div[2]/span[2]/text()')),
        "image_url": first_or_default(li.xpath(r'div[1]/a/img/@src')),
    }


def fetch_book_nodes():
    """Download the tag page and return the ``<li>`` elements of the list."""
    # Imported here (not at module level) so the pure helpers above stay
    # importable on machines without the scraping dependencies installed.
    from lxml import etree

    req = request.Request(url=url, headers=headers, method="GET")
    with request.urlopen(req) as response:
        content = response.read().decode("utf-8")
    return etree.HTML(content).xpath(r'//*[@id="subject_list"]/ul/li')


def download_image(image_url: str, title: str) -> str:
    """Save the image at ``image_url`` under ``IMAGE_DIR``; return its path."""
    os.makedirs(IMAGE_DIR, exist_ok=True)
    with request.urlopen(image_url) as response:
        data = response.read()
    path = os.path.join(IMAGE_DIR, safe_filename(title) + ".jpg")
    with open(path, "wb") as f:
        f.write(data)
    return path


def main():
    """Scrape the page, store the covers and insert one row per book."""
    import pymysql  # see the note in fetch_book_nodes about local imports

    nodes = fetch_book_nodes()
    # One connection for the whole run, always closed on the way out.
    db = pymysql.connect(host='localhost', port=3306, user="root",
                         password='root', db='douban', charset='utf8')
    try:
        with db.cursor() as cur:
            for li in nodes:
                book = extract_book(li)
                if book is None:
                    continue
                print(book["title"])
                print(book["author"])
                print(book["publisher"])
                print(book["rating"])
                print(book["image_url"])
                if book["image_url"]:
                    download_image(book["image_url"], book["title"])
                cur.execute(INSERT_SQL, (book["title"], book["author"],
                                         book["publisher"], book["rating"]))
                # Commit per row so finished books survive a later failure.
                db.commit()
    finally:
        db.close()


if __name__ == "__main__":
    main()