Scraping Douban Movies into MongoDB, with Anti-Anti-Scraping Measures

Posted by cuzz


1. The spider code is as follows:

doubanmovie.py

# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem

class DoubamovieSpider(scrapy.Spider):
    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    offset = 0
    url = "https://movie.douban.com/top250?start="
    start_urls = (
            url + str(offset),
    )

    def parse(self, response):
        movies = response.xpath("//div[@class='info']")

        for each in movies:
            # Create a fresh item per movie so entries don't share state
            item = DoubanItem()
            # Title
            item["title"] = each.xpath(".//span[@class='title'][1]/text()").extract()[0]
            # Details (director, cast, year, genre)
            item["bd"] = each.xpath(".//div[@class='bd']/p/text()").extract()[0]
            # Rating
            item["star"] = each.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract()[0]
            # One-line quote (missing for some entries)
            quote = each.xpath(".//p[@class='quote']/span/text()").extract()
            if len(quote) != 0:
                item["quote"] = quote[0]
            yield item

        # Page through the Top 250 list, 25 entries per page
        if self.offset < 225:
            self.offset += 25
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
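The spider is run from the project root with scrapy crawl doubanmovie. The hard-coded offset < 225 works because the Top 250 list has exactly ten pages; a more robust sketch follows the pager link instead, assuming the page exposes its "next" link under span.next > a (verify against the live markup before relying on it):

# Inside parse(), replace the offset bookkeeping with:
next_page = response.xpath('//span[@class="next"]/a/@href').extract_first()
if next_page is not None:
    yield scrapy.Request(response.urljoin(next_page), callback=self.parse)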

items.py

import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    # Title
    title = scrapy.Field()
    # Details
    bd = scrapy.Field()
    # Rating
    star = scrapy.Field()
    # One-line quote
    quote = scrapy.Field()
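scrapy.Item instances behave like dicts, which is why the spider assigns item["title"] and the pipeline below can call dict(item). A quick check in a Python shell (with hypothetical values):

from douban.items import DoubanItem

item = DoubanItem(title="Example", star="9.7")
print(item["title"], item["star"])  # Example 9.7
print(dict(item))                   # {'title': 'Example', 'star': '9.7'}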

2. Change the storage target in the pipeline file (pipelines.py):

import pymongo
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

class DoubanPipeline(object):
    def __init__(self):
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]

        # Create the MongoDB client connection
        client = pymongo.MongoClient(host=host, port=port)
        # Select the database
        mydb = client[dbname]
        # Collection that stores the scraped data
        self.sheet = mydb[sheetname]

    def process_item(self, item, spider):
        data = dict(item)
        self.sheet.insert_one(data)
        return item
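Note that the original import, from scrapy.conf import settings, was removed in later Scrapy releases; the version above reads the project settings via scrapy.utils.project.get_project_settings. The more idiomatic pattern is to let Scrapy hand the settings to the pipeline through the from_crawler hook; a minimal sketch of the same pipeline written that way:

import pymongo

class DoubanPipeline(object):
    def __init__(self, host, port, dbname, sheetname):
        client = pymongo.MongoClient(host=host, port=port)
        self.sheet = client[dbname][sheetname]

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook with the running crawler; its .settings
        # object holds the merged project settings.
        s = crawler.settings
        return cls(s.get("MONGODB_HOST"), s.getint("MONGODB_PORT"),
                   s.get("MONGODB_DBNAME"), s.get("MONGODB_SHEETNAME"))

    def process_item(self, item, spider):
        self.sheet.insert_one(dict(item))
        return item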

3. Create a new middleware file, middlewares.py, for the anti-anti-scraping measures (random User-Agent and random proxy):

# -*- coding:utf-8 -*-

import random
import base64

from douban.settings import USER_AGENTS
from douban.settings import PROXIES

# Pick a random User-Agent for every request
class RandomUserAgent(object):
    def process_request(self, request, spider):
        useragent = random.choice(USER_AGENTS)
        # print(useragent)
        request.headers.setdefault("User-Agent", useragent)

# Route each request through a randomly chosen proxy
class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)

        if not proxy["user_passwd"]:
            # Proxy without account authentication
            request.meta["proxy"] = "http://" + proxy["ip_port"]
        else:
            # Base64-encode the credentials for HTTP Basic auth
            base64_userpasswd = base64.b64encode(
                proxy["user_passwd"].encode("utf-8")).decode("ascii")
            # The header format the proxy server expects
            request.headers["Proxy-Authorization"] = "Basic " + base64_userpasswd
            request.meta["proxy"] = "http://" + proxy["ip_port"]
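The "Basic " + base64 value built above is ordinary HTTP Basic authentication applied to the proxy. A standalone snippet (with placeholder credentials, not a real account) shows exactly what ends up in the header:

import base64

creds = "user:passwd"  # placeholder credentials
token = base64.b64encode(creds.encode("utf-8")).decode("ascii")
print("Proxy-Authorization: Basic " + token)
# -> Proxy-Authorization: Basic dXNlcjpwYXNzd2Q=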

4. Project settings (settings.py):

# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = "douban"

SPIDER_MODULES = ["douban.spiders"]
NEWSPIDER_MODULE = "douban.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"

# Obey robots.txt rules
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "douban.middlewares.RandomUserAgent": 100,
    "douban.middlewares.RandomProxy": 200,
}

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)",
    "Opera/9.27 (Windows NT 5.2; U; zh-cn)",
    "Opera/8.0 (Macintosh; PPC Mac OS X; U; en)",
    "Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0",
    "Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13"
]

PROXIES = [
        {"ip_port": "121.42.140.113:16816", "user_passwd": "mr_mao_hacker:sffqry9r"},
        #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
        #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
        #{"ip_port": "121.42.140.113:16816", "user_passwd": ""},
]

#LOG_FILE = "douban.log"
#LOG_LEVEL = "DEBUG"
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "douban.pipelines.DoubanPipeline": 300,
}

# MongoDB host
MONGODB_HOST = "127.0.0.1"

# MongoDB port
MONGODB_PORT = 27017

# Database name
MONGODB_DBNAME = "Douban"

# Collection that stores the scraped data
MONGODB_SHEETNAME = "doubanmovies"

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
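Once a crawl finishes, the stored data can be checked directly with pymongo; a small verification sketch, assuming the host, database, and collection names configured above:

import pymongo

client = pymongo.MongoClient("127.0.0.1", 27017)
collection = client["Douban"]["doubanmovies"]

# A full Top 250 crawl should yield 250 documents
print(collection.count_documents({}))
for doc in collection.find().limit(3):
    print(doc["title"], doc["star"])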

 
