Scraping Douban Classic Literature (with Database Storage)
Posted by dolfamingo
The crawler below opens the Classic Literature tag page on Douban Books, follows each book's link to its detail page, extracts the title, author, rating, content introduction, and author introduction with XPath, and stores each record in a local SQLite database. The code is as follows:
# coding:utf-8
import cPickle
import random
import requests
from lxml import etree
import time
import re
import sys
import codecs
import sqlite3


class Spider:
    def __init__(self):
        # Open the SQLite database and load a pool of User-Agent strings
        # (a pickled list) to rotate between requests.
        self.con = sqlite3.connect(r'BookInformation.db')
        self.cur = self.con.cursor()
        self.home = 'https://book.douban.com/tag/%E5%8F%A4%E5%85%B8%E6%96%87%E5%AD%A6'
        self.Referer = 'https://book.douban.com/'
        self.user_agent_list = []
        self.books_list = []
        with open('user_agent.txt', 'rb') as f:
            self.user_agent_list = cPickle.load(f)

    def GetHeaders(self):
        # Build request headers with a randomly chosen User-Agent.
        UserAgent = random.choice(self.user_agent_list)
        headers = {'Referer': self.Referer, 'User-Agent': UserAgent}
        return headers

    def SaveBook(self, info):
        # Insert one book record; the ? placeholders let sqlite3 escape values.
        sql = 'INSERT INTO BookInfo VALUES(?,?,?,?,?)'
        info_list = (info["Name"], info["Author"], info["Rating"],
                     info["ContentIntro"], info["AuthorIntro"])
        self.cur.execute(sql, info_list)
        self.con.commit()

    def Crawl(self):
        # Fetch the tag listing page, then visit each book's detail page.
        html = requests.get(self.home, headers=self.GetHeaders()).text
        html_tree = etree.HTML(html)
        booksList = html_tree.xpath('/html/body/div[3]/div[1]/div/div[1]/div/ul/li')
        num = 0
        for book in booksList:
            time.sleep(1)  # throttle requests to avoid getting blocked
            bookUrl = book.xpath('div[2]/h2/a')[0].get('href')
            pageHtml = requests.get(bookUrl, headers=self.GetHeaders()).text
            page_tree = etree.HTML(pageHtml)
            book_info = self.GetPage(page_tree)
            print book_info['Name']
            self.SaveBook(book_info)
            # self.books_list.append(book_info)
            # f = codecs.open('text.txt', 'a', encoding='utf-8')
            # f.write(book_info['AuthorIntro'])
            # f.close()
            # print book_info['AuthorIntro']
            num = num + 1
            if num == 5:  # stop after five books for this demo run
                break

    def GetPage(self, page_tree):
        # Extract every field, falling back to an empty string when a
        # field is missing from the page.
        book_info = {}
        try:
            book_info['Name'] = self.GetName(page_tree)
        except:
            book_info['Name'] = ''
        try:
            book_info['Author'] = self.GetAuthor(page_tree)
        except:
            book_info['Author'] = ''
        try:
            book_info['Rating'] = self.GetRating(page_tree)
        except:
            book_info['Rating'] = ''
        try:
            book_info['ContentIntro'] = self.GetContentIntro(page_tree)
        except:
            book_info['ContentIntro'] = ''
        try:
            book_info['AuthorIntro'] = self.GetAuthorIntro(page_tree)
        except:
            book_info['AuthorIntro'] = ''
        return book_info

    def GetName(self, page_tree):
        return page_tree.xpath('/html/body/div[3]/h1/span')[0].text

    def GetAuthor(self, page_tree):
        # Some pages list several authors as <a> tags inside a <span>;
        # join those with '/'. Otherwise fall back to the single <a>.
        author_list = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[1]/a')
        result = ''
        if len(author_list) != 0:
            names = []
            for author in author_list:
                names.append(author.text.strip())
            result = '/'.join(names)
        else:
            result = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/a')[0].text.strip()
        return re.sub(r'\s+', ' ', result)  # collapse runs of whitespace

    def GetRating(self, page_tree):
        return page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[2]/div/div[2]/strong')[0].text.strip()

    def GetContentIntro(self, page_tree):
        # The last div.intro under #link-report holds the full description;
        # earlier ones are the truncated preview.
        para_div = page_tree.xpath('//*[@id="link-report"]//div[@class="intro"]')
        result = ''
        if len(para_div) != 0:
            para_para = para_div[len(para_div) - 1].xpath('p')
            for para in para_para:
                result = result + ' ' + para.text + ' '
        return result

    def GetAuthorIntro(self, page_tree):
        para_div = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[3]/div[@class="indent "]//div[@class="intro"]')
        result = ''
        if len(para_div) != 0:
            para_para = para_div[len(para_div) - 1].xpath('p')
            for para in para_para:
                result = result + ' ' + para.text + ' '
        return result

    # def GetCatalogue(self, page_tree):
    #     pass
    #
    # def GetTag(self, page_tree):
    #     pass
    #
    # def GetShortCommentary(self, page_tree):
    #     pass


if __name__ == '__main__':
    s = Spider()
    s.Crawl()
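Note that the script assumes two files already exist and creates neither itself: BookInformation.db must contain a five-column BookInfo table, and user_agent.txt must hold a pickled list of User-Agent strings. Below is a minimal one-time setup sketch; the column names and the sample User-Agent strings are illustrative assumptions (the crawler only relies on column order, which matches its INSERT statement):

# coding:utf-8
# One-time setup: create the SQLite table and pickle a User-Agent pool.
# Column names are assumed; the crawler only depends on their order.
import cPickle
import sqlite3

con = sqlite3.connect(r'BookInformation.db')
con.execute('''CREATE TABLE IF NOT EXISTS BookInfo (
    Name TEXT,
    Author TEXT,
    Rating TEXT,
    ContentIntro TEXT,
    AuthorIntro TEXT
)''')
con.commit()
con.close()

# Any list of real browser User-Agent strings will do; these are examples.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
]
with open('user_agent.txt', 'wb') as f:
    cPickle.dump(user_agents, f)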
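Once a crawl finishes, a quick query confirms what was stored. A minimal sketch against the assumed schema above:

# coding:utf-8
# Print the stored records to verify the crawl worked.
import sqlite3

con = sqlite3.connect(r'BookInformation.db')
for name, author, rating in con.execute('SELECT Name, Author, Rating FROM BookInfo'):
    print name, '|', author, '|', rating
con.close()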