爬取网易
Posted jianxiang
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取网易相关的知识,希望对你有一定的参考价值。
爬取国内、国际、军事、航空四个板块对应的标题、缩略图、关键字、发布时间和 url。
# -*- coding: utf-8 -*-
"""Spider that walks four NetEase news sections and yields one item per article.

The spider is fed its start URL through Redis (``redis_key``) and shares a
single Selenium-driven Chrome instance, exposed as ``self.bro``, with the
project's downloader middleware.
"""
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem
from scrapy_redis.spiders import RedisSpider


class WangyiSpider(RedisSpider):
    """Crawl the domestic, international, military and aviation sections."""

    name = 'wangyi'
    # allowed_domains = ['www.xxxx.com']
    # start_urls = ['https://news.163.com']
    redis_key = 'wangyi'

    # Positions of the domestic / international / military / aviation entries
    # inside the navigation <li> list of the front page.
    SECTION_INDEXES = (3, 4, 6, 7)

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and start the shared browser.

        Extra positional/keyword arguments are forwarded to the base class so
        that spider arguments passed by Scrapy keep working.
        """
        super(WangyiSpider, self).__init__(*args, **kwargs)
        # Instantiate the browser exactly once for the whole crawl.
        self.bro = webdriver.Chrome()

    def closed(self, spider):
        """Quit the browser once the whole crawl has finished.

        Scrapy invokes this hook with a single positional argument; the
        parameter keeps its original name for backward compatibility.
        """
        print('爬虫结束')
        self.bro.quit()

    def parse(self, response):
        """Yield one request per section found in the front-page navigation.

        The section title is forwarded to ``parseSecond`` through
        ``meta['title']``.
        """
        lis = response.xpath('//div[@class="ns_area list"]/ul/li')
        for index in self.SECTION_INDEXES:
            # The layout may change; skip a missing entry instead of raising
            # IndexError and losing the remaining sections.
            if index >= len(lis):
                self.logger.warning(
                    'navigation entry %d not found on %s', index, response.url)
                continue
            li = lis[index]
            url = li.xpath('./a/@href').extract_first()
            title = li.xpath('./a/text()').extract_first()
            if not url:
                # scrapy.Request rejects a missing URL.
                continue
            # Request the section page; its article list (headline, thumbnail,
            # keywords, url) is parsed in parseSecond.
            yield scrapy.Request(url=url, callback=self.parseSecond,
                                 meta={'title': title})

    def parseSecond(self, response):
        """Parse a section page and request every article listed on it.

        A partially filled ``WangyiproItem`` travels to ``getContent`` through
        ``meta['item']``.
        """
        # Section title handed over by parse(); identical for every article.
        title = response.meta['title']
        div_list = response.xpath(
            '//div[@class="data_row news_article clearfix "]')
        for div in div_list:
            head = div.xpath(
                './/div[@class="news_title"]/h3/a/text()').extract_first()
            url = div.xpath(
                './/div[@class="news_title"]/h3/a/@href').extract_first()
            imgUrl = div.xpath('./a/img/@src').extract_first()
            tag_parts = div.xpath('.//div[@class="news_tag"]//text()').extract()
            # Drop the surrounding whitespace of every text node, then glue
            # the pieces into a single keyword string.
            tag = "".join(part.strip(' \n \t') for part in tag_parts)

            if not url:
                # Without a link there is no article page to fetch.
                continue

            # Store everything parsed so far on the item.
            item = WangyiproItem()
            item['head'] = head
            item['url'] = url
            item['imgUrl'] = imgUrl
            item['tag'] = tag
            item['title'] = title

            # Fetch the article page to collect its body text.
            yield scrapy.Request(url=url, callback=self.getContent,
                                 meta={'item': item})

    def getContent(self, response):
        """Attach the article body text to the item and yield it."""
        # Item created in parseSecond().
        item = response.meta['item']
        # Concatenate the text of every paragraph of the article body.
        content_list = response.xpath(
            '//div[@class="post_text"]/p/text()').extract()
        item['content'] = "".join(content_list)
        yield item
import scrapy


class WangyiproItem(scrapy.Item):
    """Container for one scraped news article."""

    # Headline of the article.
    head = scrapy.Field()
    # Link to the article page.
    url = scrapy.Field()
    # Source URL of the thumbnail image.
    imgUrl = scrapy.Field()
    # Keyword text of the article, joined into one string.
    tag = scrapy.Field()
    # Title of the news section the article was listed under.
    title = scrapy.Field()
    # Body text of the article.
    content = scrapy.Field()
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.http import HtmlResponse
import time
# from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
import random

# Proxy pools, keyed by the scheme of the request they are used for.
PROXY_http = [
    '153.180.102.104:80',
    '195.208.131.189:56055',
]
PROXY_https = [
    '120.83.49.90:9000',
    '95.189.112.214:35508',
]


class RandomUserAgent(object):
    """Downloader middleware that gives each request a random User-Agent."""

    def process_request(self, request, spider):
        """Pick a random entry of ``USER_AGENT_LIST`` for ``request``.

        Does nothing when the setting is missing or empty.
        """
        ua_list = spider.settings.get("USER_AGENT_LIST")
        if not ua_list:
            return None
        ua = random.choice(ua_list)
        # Assign instead of setdefault(): a header that was already filled in
        # earlier in the middleware chain would otherwise win and the random
        # value would never be used.
        request.headers['User-Agent'] = ua
        return None


class Proxy(object):
    """Downloader middleware that routes each request through a random proxy."""

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` according to the request's scheme."""
        # request.url looks like "http://www.xxx.com"; the part before the
        # first colon is the scheme.
        scheme = request.url.split(':')[0]
        if scheme == 'https':
            ip = random.choice(PROXY_https)
            request.meta['proxy'] = 'https://' + ip
        else:
            ip = random.choice(PROXY_http)
            request.meta['proxy'] = 'http://' + ip
        return None


class WangyiproDownloaderMiddleware(object):
    """Replace the responses of the section pages with browser-rendered HTML."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    # Section pages whose article lists are loaded dynamically.
    DYNAMIC_URLS = (
        'http://news.163.com/domestic/',
        'http://news.163.com/world/',
        'http://news.163.com/air/',
        'http://war.163.com/',
    )

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        """Intercept a response on its way from the downloader to the spider.

        Args:
            request: the request that produced ``response``.
            response: the intercepted response.
            spider: the running spider; its ``bro`` attribute is the browser
                used for rendering.

        Returns:
            A new ``HtmlResponse`` built from the browser's page source for
            the section pages, otherwise ``response`` unchanged.
        """
        if request.url not in self.DYNAMIC_URLS:
            return response

        spider.bro.get(url=request.url)
        # Scroll to the bottom so the lazily loaded entries are requested.
        js = 'window.scrollTo(0,document.body.scrollHeight)'
        spider.bro.execute_script(js)
        # Give the browser some time to finish loading the data.
        time.sleep(2)
        # Page source now includes the dynamically loaded news entries.
        page_text = spider.bro.page_source
        # Hand the rendered page to the spider instead of the raw response.
        return HtmlResponse(url=spider.bro.current_url, body=page_text,
                            encoding='utf-8', request=request)
# -*- coding: utf-8 -*-

# Scrapy settings for wangyiPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'wangyiPro'

SPIDER_MODULES = ['wangyiPro.spiders']
NEWSPIDER_MODULE = 'wangyiPro.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wangyiPro (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wangyiPro.middlewares.WangyiproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# NOTE(review): this block is commented out, so no project downloader
# middleware is enabled by this file -- confirm that is intentional.
#DOWNLOADER_MIDDLEWARES = {
#    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'wangyiPro.pipelines.WangyiproPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Pool of User-Agent strings. Every entry is written as two adjacent string
# literals that Python concatenates, so there must be no comma between the
# two halves of an entry.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
以上是关于爬取网易的主要内容,如果未能解决你的问题,请参考以下文章