scrapy实战1分布式爬取有缘网:
Posted hcw_19
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了scrapy实战1分布式爬取有缘网:相关的知识,希望对你有一定的参考价值。
直接上代码:
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class YouyuanwangItem(scrapy.Item):
    """Container for one scraped youyuan.com dating profile."""
    # Avatar image URL
    header_url = scrapy.Field()
    # Username
    username = scrapy.Field()
    # Inner monologue (self-description text)
    monologue = scrapy.Field()
    # Photo-album image URLs (stored as one comma-joined string)
    pic_urls = scrapy.Field()
    # Place of origin
    place_from = scrapy.Field()
    # Education level
    education = scrapy.Field()
    # Age
    age = scrapy.Field()
    # Height
    height = scrapy.Field()
    # Salary
    salary = scrapy.Field()
    # Hobbies (comma-joined)
    hobby = scrapy.Field()
    # Site of origin: youyuan
    source = scrapy.Field()
    # Source URL of the profile page
    source_url = scrapy.Field()
    # Spider name
    spider = scrapy.Field()
spiders > youyuan.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
from youyuanwang.items import YouyuanwangItem


# class YouyuanSpider(CrawlSpider):
class youyuan(RedisCrawlSpider):
    """Distributed crawler for youyuan.com profile pages.

    Start URLs are pushed into the ``youyuan:start_urls`` redis key;
    request scheduling and deduplication are handled by scrapy-redis,
    so many workers can crawl cooperatively.
    """
    name = 'youyuan'
    # Start URLs come from redis instead of a static start_urls list.
    redis_key = 'youyuan:start_urls'

    def __init__(self, *args, **kwargs):
        # Dynamically define the allowed domains list from the
        # ``-a domain=a.com,b.com`` command-line argument.
        domain = kwargs.pop('domain', '')
        # BUG FIX: must be a real list. In Python 3 ``filter()`` returns
        # a one-shot iterator, which breaks Scrapy's offsite middleware
        # after the first pass over it.
        self.allowed_domains = [d for d in domain.split(',') if d]
        super(youyuan, self).__init__(*args, **kwargs)

    # Search-result list pages (Beijing, female, 18-25). Used only as a
    # springboard: followed, never parsed.
    page_links = LinkExtractor(
        allow=r"http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")
    # Personal profile pages: requests are queued in redis; responses go
    # to parse_profile_page and are not followed further.
    profile_page = LinkExtractor(allow=r"http://www.youyuan.com/\d+-profile/")

    rules = (
        Rule(page_links),
        Rule(profile_page, callback="parse_profile_page", follow=False),
    )

    @staticmethod
    def _extract_first(response, xpath):
        """Return the first stripped xpath match, or "" when none."""
        values = response.xpath(xpath).extract()
        return values[0].strip() if values else ""

    def parse_profile_page(self, response):
        """Build and yield one YouyuanwangItem from a profile page."""
        item = YouyuanwangItem()
        # 个人头像链接 / avatar URL
        item['header_url'] = self.get_header_url(response)
        # 用户名 / username
        item['username'] = self.get_username(response)
        # 籍贯 / place of origin
        item['place_from'] = self.get_place_from(response)
        # 学历 / education
        item['education'] = self.get_education(response)
        # 年龄 / age
        item['age'] = self.get_age(response)
        # 身高 / height
        item['height'] = self.get_height(response)
        # 工资 / salary
        item['salary'] = self.get_salary(response)
        # 兴趣爱好 / hobbies
        item['hobby'] = self.get_hobby(response)
        # 相册图片链接 / album picture URLs
        item['pic_urls'] = self.get_pic_urls(response)
        # 内心独白 / inner monologue
        item['monologue'] = self.get_monologue(response)
        # 个人主页源url / source profile URL
        item['source_url'] = response.url
        # 网站来源 youyuan / site of origin
        item['source'] = "youyuan"
        # 爬虫名 / spider name
        item['spider'] = "youyuan"
        yield item

    def get_header_url(self, response):
        """Extract the avatar image URL."""
        return self._extract_first(
            response, '//dl[@class="personal_cen"]/dt/img/@src')

    def get_username(self, response):
        """Extract the username."""
        return self._extract_first(
            response,
            '//dl[@class="personal_cen"]/dd//div[@class="main"]/strong/text()')

    def get_age(self, response):
        """Extract the age (second whitespace token of the "local" line)."""
        text = self._extract_first(
            response, '//dl[@class="personal_cen"]//p[@class="local"]/text()')
        parts = text.split()
        # BUG FIX: guard against malformed pages instead of raising
        # IndexError when the line has fewer than two tokens.
        return parts[1] if len(parts) > 1 else ""

    def get_height(self, response):
        """Extract the height."""
        return self._extract_first(
            response,
            '//div[@class="pre_data"]/ul/li[2]/div/ol[2]/li[2]/span/text()')

    def get_salary(self, response):
        """Extract the salary."""
        return self._extract_first(
            response,
            '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[4]/span/text()')

    def get_hobby(self, response):
        """Extract hobbies, comma-joined with embedded spaces removed."""
        hobbies = response.xpath(
            '//dl[@class="personal_cen"]//ol[@class="hoby"]//li/text()').extract()
        return ",".join(hobbies).replace(" ", "").strip()

    def get_pic_urls(self, response):
        """Extract album picture URLs as one comma-joined string."""
        urls = response.xpath(
            '//div[@class="ph_show"]/ul/li/a/img/@src').extract()
        return ",".join(urls)

    def get_monologue(self, response):
        """Extract the inner-monologue text."""
        return self._extract_first(
            response, '//div[@class="pre_data"]/ul/li/p/text()')

    def get_place_from(self, response):
        """Extract the place of origin."""
        return self._extract_first(
            response,
            '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[1]/span/text()')

    def get_education(self, response):
        """Extract the education level."""
        return self._extract_first(
            response,
            '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[3]/span/text()')
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from youyuanwang.items import YouyuanwangItem


class YouyuanSpider(CrawlSpider):
    """Single-node crawler for youyuan.com profile pages.

    Non-distributed variant: starts from a hard-coded search page
    instead of pulling start URLs from redis.
    """
    name = 'youyuan'
    allowed_domains = ['www.youyuan.com']
    # Entry point: first search-result page (Beijing, female, 18-25).
    start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']

    # List pages are springboards: followed, never parsed.
    page_links = LinkExtractor(
        allow=r"http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/")
    # Profile pages go to parse_profile_page and are not followed.
    profile_page = LinkExtractor(allow=r"http://www.youyuan.com/\d+-profile/")

    rules = (
        Rule(page_links),
        Rule(profile_page, callback="parse_profile_page", follow=False),
    )

    def parse_profile_page(self, response):
        """Assemble and yield one YouyuanwangItem from a profile page."""
        item = YouyuanwangItem()
        item['header_url'] = self.get_header_url(response)   # avatar URL
        item['username'] = self.get_username(response)       # username
        item['place_from'] = self.get_place_from(response)   # place of origin
        item['education'] = self.get_education(response)     # education
        item['age'] = self.get_age(response)                 # age
        item['height'] = self.get_height(response)           # height
        item['salary'] = self.get_salary(response)           # salary
        item['hobby'] = self.get_hobby(response)             # hobbies
        item['pic_urls'] = self.get_pic_urls(response)       # album pictures
        item['monologue'] = self.get_monologue(response)     # inner monologue
        item['source_url'] = response.url                    # profile source URL
        item['source'] = "youyuan"                           # site of origin
        item['spider'] = "youyuan"                           # spider name
        yield item

    def get_header_url(self, response):
        """Avatar image URL, or "" when absent."""
        found = response.xpath(
            '//dl[@class="personal_cen"]/dt/img/@src').extract()
        return found[0].strip() if found else ""

    def get_username(self, response):
        """Username, or "" when absent."""
        found = response.xpath(
            '//dl[@class="personal_cen"]/dd//div[@class="main"]/strong/text()').extract()
        return found[0].strip() if found else ""

    def get_age(self, response):
        """Age: second whitespace token of the "local" line, or ""."""
        found = response.xpath(
            '//dl[@class="personal_cen"]//p[@class="local"]/text()').extract()
        return found[0].split()[1] if found else ""

    def get_height(self, response):
        """Height, or "" when absent."""
        found = response.xpath(
            '//div[@class="pre_data"]/ul/li[2]/div/ol[2]/li[2]/span/text()').extract()
        return found[0].strip() if found else ""

    def get_salary(self, response):
        """Salary, or "" when absent."""
        found = response.xpath(
            '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[4]/span/text()').extract()
        return found[0].strip() if found else ""

    def get_hobby(self, response):
        """Hobbies comma-joined, embedded spaces removed; "" when absent."""
        found = response.xpath(
            '//dl[@class="personal_cen"]//ol[@class="hoby"]//li/text()').extract()
        return ",".join(found).replace(" ", "").strip()

    def get_pic_urls(self, response):
        """Album picture URLs as one comma-joined string; "" when absent."""
        found = response.xpath(
            '//div[@class="ph_show"]/ul/li/a/img/@src').extract()
        return ",".join(found)

    def get_monologue(self, response):
        """Inner-monologue text, or "" when absent."""
        found = response.xpath(
            '//div[@class="pre_data"]/ul/li/p/text()').extract()
        return found[0].strip() if found else ""

    def get_place_from(self, response):
        """Place of origin, or "" when absent."""
        found = response.xpath(
            '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[1]/span/text()').extract()
        return found[0].strip() if found else ""

    def get_education(self, response):
        """Education level, or "" when absent."""
        found = response.xpath(
            '//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[3]/span/text()').extract()
        return found[0].strip() if found else ""
pipelines.py