爬取豆瓣古典文学(数据库存储)

Posted dolfamingo

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取豆瓣古典文学(数据库存储)相关的知识,希望对你有一定的参考价值。

 

 

代码如下:

  1 # coding:utf-8
  2 import cPickle
  3 import random
  4 import requests
  5 from lxml import etree
  6 import time
  7 import re
  8 import sys
  9 import codecs
 10 import sqlite3
 11 
 12 class Spider:
 13     def __init__(self):
 14         self.con = sqlite3.connect(rBookInformation.db)
 15         self.cur = self.con.cursor()
 16         self.home = https://book.douban.com/tag/%E5%8F%A4%E5%85%B8%E6%96%87%E5%AD%A6
 17         self.Referer = https://book.douban.com/
 18         self.user_agent_list = []
 19         self.books_list = []
 20         with open(user_agent.txt, rb) as f:
 21             self.user_agent_list = cPickle.load(f)
 22 
 23     def GetHeaders(self):
 24         UserAgent = random.choice(self.user_agent_list)
 25         headers = {Referer: self.Referer, User-Agent: UserAgent}
 26         return headers
 27 
 28     def SaveBook(self,info):
 29         sql = INSERT INTO BookInfo VALUES(?,?,?,?,?)
 30         info_list = (info["Name"],info["Author"],info["Rating"],info["ContentIntro"],info["AuthorIntro"])
 31         self.cur.execute(sql, info_list)
 32         self.con.commit()
 33 
 34     def Crawl(self):
 35         html = requests.get(self.home,headers=self.GetHeaders()).text
 36         html_tree = etree.HTML(html)
 37         booksList = html_tree.xpath(/html/body/div[3]/div[1]/div/div[1]/div/ul/li)
 38         num = 0
 39         for book in booksList:
 40             time.sleep(1)
 41             bookUrl = book.xpath(div[2]/h2/a)[0].get(href)
 42             pageHtml = requests.get(bookUrl,headers=self.GetHeaders()).text
 43             page_tree = etree.HTML(pageHtml)
 44             book_info = self.GetPage(page_tree)
 45             print book_info[Name]
 46             self.SaveBook(book_info)
 47             # self.books_list.append(book_info)
 48             # f = codecs.open(‘text.txt‘,‘a‘,encoding=‘utf-8‘)
 49             # f.write(book_info[‘AuthorIntro‘])
 50             # f.close()
 51             # print book_info[‘AuthorIntro‘]
 52             num = num+1
 53             if num==5:
 54                 break
 55 
 56 
 57     def GetPage(self, page_tree):
 58         book_info = {}
 59         try:
 60             Name = self.GetName(page_tree)
 61             book_info[Name] = Name
 62         except:
 63             book_info[Name] = ‘‘
 64         try:
 65             Author = self.GetAuthor(page_tree)
 66             book_info[Author] = Author
 67         except:
 68             book_info[Author] = ‘‘
 69         try:
 70             Rating = self.GetRating(page_tree)
 71             book_info[Rating] = Rating
 72         except:
 73             book_info[Rating] = ‘‘
 74         try:
 75             ContentIntro = self.GetContentIntro(page_tree)
 76             book_info[ContentIntro] = ContentIntro
 77         except:
 78             book_info[ContentIntro] = ‘‘
 79         try:
 80             AuthorIntro = self.GetAuthorIntro(page_tree)
 81             book_info[AuthorIntro] = AuthorIntro
 82         except:
 83             book_info[AuthorIntro] = ‘‘
 84 
 85 
 86         return book_info
 87 
 88     def GetName(self, page_tree):
 89         return page_tree.xpath(/html/body/div[3]/h1/span)[0].text
 90 
 91     def GetAuthor(self,page_tree):
 92         author_list = page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[1]/a)
 93         result = ‘‘
 94         if len(author_list) is not 0:
 95             list = []
 96             for author in author_list:
 97                 list.append(author.text.strip())
 98             result = /.join(list)
 99         else:
100             result = page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/a)[0].text.strip()
101         return re.sub(rs+, ,result)
102 
103 
104     def GetRating(self, page_tree):
105         return page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[2]/div/div[2]/strong)[0].text.strip()
106 
107     def GetContentIntro(self, page_tree):
108         para_div = page_tree.xpath(//*[@id="link-report"]//div[@class="intro"])
109         result = ‘‘
110         if len(para_div) is not 0:
111             para_para = para_div[len(para_div)-1].xpath(p)
112             for para in para_para:
113                 result = result+	+para.text+

114         return result
115 
116     def GetAuthorIntro(self, page_tree):
117         para_div = page_tree.xpath(/html/body/div[3]/div[2]/div/div[1]/div[3]/div[@class="indent "]//div[@class="intro"])
118         result = ‘‘
119         if len(para_div) is not 0:
120             para_para = para_div[len(para_div) - 1].xpath(p)
121             for para in para_para:
122                 result = result + 	 + para.text + 

123         return result
124 
125     # def GetCatalogue(self, page_tree):
126     #     pass
127     #
128     # def GetTag(self, page_tree):
129     #     pass
130     #
131     # def GetShortCommentary(self, page_tree):
132     #     pass
133 
if __name__ == '__main__':
    # Entry point: build the spider (opens the DB and the UA pickle) and crawl.
    spider = Spider()
    spider.Crawl()

 

以上是关于爬取豆瓣古典文学(数据库存储)的主要内容,如果未能解决你的问题,请参考以下文章

爬取豆瓣网图书TOP250的信息

用Scrapy爬虫爬取豆瓣电影排行榜数据,存储到Mongodb数据库

使用mongodb保存爬取豆瓣电影的数据

如何用python爬取豆瓣读书的数据

Forward团队-爬虫豆瓣top250项目-需求分析

Forward团队-爬虫豆瓣top250项目-需求分析