scrapy实战1分布式爬取有缘网:

Posted hcw_19

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了scrapy实战1分布式爬取有缘网:相关的知识,希望对你有一定的参考价值。

直接上代码:

items.py

 1 # -*- coding: utf-8 -*-
 2 
 3 # Define here the models for your scraped items
 4 #
 5 # See documentation in:
 6 # http://doc.scrapy.org/en/latest/topics/items.html
 7 
 8 import scrapy
 9 
10 
class YouyuanwangItem(scrapy.Item):
    """Item describing one scraped youyuan.com user profile."""
    # Avatar (profile picture) URL
    header_url = scrapy.Field()
    # Display name
    username = scrapy.Field()
    # "Inner monologue" self-description text
    monologue = scrapy.Field()
    # Photo-album image URLs, joined into one comma-separated string
    pic_urls = scrapy.Field()
    # Hometown / place of origin
    place_from = scrapy.Field()
    # Education level
    education = scrapy.Field()
    # Age
    age = scrapy.Field()
    # Height
    height = scrapy.Field()
    # Salary range
    salary = scrapy.Field()
    # Hobbies, comma-separated
    hobby = scrapy.Field()
    # Source-site tag ("youyuan")
    source = scrapy.Field()
    # URL of the profile page this item came from
    source_url = scrapy.Field()
    # Name of the spider that produced the item
    spider = scrapy.Field()
View Code

spiders > youyuan.py

  1 # -*- coding: utf-8 -*-
  2 import scrapy
  3 from scrapy.linkextractors import LinkExtractor
  4 from scrapy.spiders import Rule
  5 from scrapy_redis.spiders import RedisCrawlSpider
  6 from youyuanwang.items import YouyuanwangItem
  7 
  8 
  9 
 10 # class YouyuanSpider(CrawlSpider):
 11 class youyuan(RedisCrawlSpider):
 12     name = \'youyuan\'
 13     # allowed_domains = [\'www.youyuan.com\']
 14     # 有缘网的列表页
 15     # start_urls = [\'http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/\']
 16     redis_key = \'youyuan:start_urls\'
 17     #动态域范围的获取
 18     def __init__(self, *args, **kwargs):
 19         # Dynamically define the allowed domains list.
 20         domain = kwargs.pop(\'domain\', \'\')
 21         self.allowed_domains = filter(None, domain.split(\',\'))
 22         super(youyuan, self).__init__(*args, **kwargs)
 23     #匹配全国
 24     #list_page = LinkExtractor(allow=(r\'http://www.youyuan.com/find/.+\'))
 25     # 只匹配北京、18~25岁、女性 的 搜索页面匹配规则,根据response提取链接
 26     page_links=LinkExtractor(allow=r"http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\\d+/")
 27     # 个人主页 匹配规则,根据response提取链接
 28     profile_page=LinkExtractor(allow=r"http://www.youyuan.com/\\d+-profile/")
 29 
 30     rules = (
 31         # 匹配列表页成功,跟进链接,跳板
 32         Rule(page_links),
 33         # 匹配个人主页的链接,形成request保存到redis中等待调度,一旦有响应则调用parse_profile_page()回调函数处理,不做继续跟进
 34         Rule(profile_page,callback="parse_profile_page",follow=False)
 35     )
 36 
 37     # 处理个人主页信息,得到我们要的数据
 38     def parse_profile_page(self, response):
 39         item=YouyuanwangItem()
 40         # 个人头像链接
 41         item[\'header_url\']=self.get_header_url(response)
 42         # 用户名
 43         item[\'username\']=self.get_username(response)
 44         #籍贯
 45         item[\'place_from\']=self.get_place_from(response)
 46         #学历
 47         item[\'education\']=self.get_education(response)
 48 
 49         # 年龄
 50         item[\'age\']=self.get_age(response)
 51         # 身高
 52         item[\'height\']=self.get_height(response)
 53         # 工资
 54         item[\'salary\']=self.get_salary(response)
 55         # 兴趣爱好
 56         item[\'hobby\']=self.get_hobby(response)
 57         # 相册图片链接
 58         item[\'pic_urls\'] = self.get_pic_urls(response)
 59         # 内心独白
 60         item[\'monologue\'] = self.get_monologue(response)
 61         # 个人主页源url
 62         item[\'source_url\']=response.url
 63         # 网站来源 youyuan
 64         item[\'source\']="youyuan"
 65         # 爬虫名
 66         item[\'spider\']="youyuan"
 67         yield item
 68    #提取头像地址
 69     def get_header_url(self,response):
 70         header=response.xpath(\'//dl[@class="personal_cen"]/dt/img/@src\').extract()
 71         if len(header):
 72             header_url=header[0]
 73         else:
 74             header_url= ""
 75         return header_url.strip()
 76     #提取用户名
 77     def get_username(self,response):
 78         username=response.xpath(\'//dl[@class="personal_cen"]/dd//div[@class="main"]/strong/text()\').extract()
 79         if len(username):
 80             username=username[0]
 81         else:
 82             username=""
 83         return username.strip()
 84     #提取年龄
 85     def get_age(self,response):
 86         age=response.xpath(\'//dl[@class="personal_cen"]//p[@class="local"]/text()\').extract()
 87         if len(age):
 88             age=age[0].split()[1]
 89         else:
 90             age=""
 91         return age
 92     #提取身高
 93     def get_height(self,response):
 94         height=response.xpath(\'//div[@class="pre_data"]/ul/li[2]/div/ol[2]/li[2]/span/text()\').extract()
 95         if len(height):
 96             height=height[0]
 97         else:
 98             height=""
 99 
100         return height.strip()
101     #提取工资
102     def get_salary(self,response):
103         salary=response.xpath(\'//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[4]/span/text()\').extract()
104         if len(salary):
105             salary=salary[0]
106         else:
107             salary=""
108         return salary.strip()
109     #提取兴趣爱好
110     def get_hobby(self,response):
111         hobby=response.xpath(\'//dl[@class="personal_cen"]//ol[@class="hoby"]//li/text()\').extract()
112         if len(hobby):
113             hobby=",".join(hobby).replace(" ","")
114         else:
115             hobby=""
116         return hobby.strip()
117     #提取相册图片
118     def get_pic_urls(self,response):
119         pic_urls=response.xpath(\'//div[@class="ph_show"]/ul/li/a/img/@src\').extract()
120         if len(pic_urls):
121             pic_urls=",".join(pic_urls)
122             #将相册url列表转换成字符串
123         else:
124             pic_urls=""
125         return pic_urls
126     #提取内心独白
127     def get_monologue(self,response):
128         monologue=response.xpath(\'//div[@class="pre_data"]/ul/li/p/text()\').extract()
129         if len(monologue):
130             monologue=monologue[0]
131         else:
132             monologue=""
133         return monologue.strip()
134     #提取籍贯
135     def get_place_from(self,response):
136         place_from=response.xpath(\'//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[1]/span/text()\').extract()
137         if len(place_from):
138             place_from=place_from[0]
139         else:
140             place_from=""
141         return place_from.strip()
142     #提取学历
143     def get_education(self,response):
144         education=response.xpath(\'//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[3]/span/text()\').extract()
145         if len(education):
146             education=education[0]
147         else:
148             education=""
149         return education.strip()
View Code

 

  1 # -*- coding: utf-8 -*-
  2 import scrapy
  3 from scrapy.linkextractors import LinkExtractor
  4 from scrapy.spiders import Rule,CrawlSpider
  5 #from scrapy_redis.spiders import RedisCrawlSpider
  6 from youyuanwang.items import YouyuanwangItem
  7 
  8 
  9 class YouyuanSpider(CrawlSpider):
 10 #class YouyuanSpider(RedisCrawlSpider):
 11     name = \'youyuan\'
 12     allowed_domains = [\'www.youyuan.com\']
 13     # 有缘网的列表页
 14     start_urls = [\'http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/\']
 15     #redis_key = \'YouyuanSpider:start_urls\'
 16     #动态域范围的获取
 17     # def __init__(self, *args, **kwargs):
 18     #     # Dynamically define the allowed domains list.
 19     #     domain = kwargs.pop(\'domain\', \'\')
 20     #     self.allowed_domains = filter(None, domain.split(\',\'))
 21     #     super(YouyuanSpider, self).__init__(*args, **kwargs)
 22     #匹配全国
 23     #list_page = LinkExtractor(allow=(r\'http://www.youyuan.com/find/.+\'))
 24     # 只匹配北京、18~25岁、女性 的 搜索页面匹配规则,根据response提取链接
 25     page_links=LinkExtractor(allow=r"http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\\d+/")
 26     # 个人主页 匹配规则,根据response提取链接
 27     profile_page=LinkExtractor(allow=r"http://www.youyuan.com/\\d+-profile/")
 28 
 29     rules = (
 30         # 匹配列表页成功,跟进链接,跳板
 31         Rule(page_links),
 32         # 匹配个人主页的链接,形成request保存到redis中等待调度,一旦有响应则调用parse_profile_page()回调函数处理,不做继续跟进
 33         Rule(profile_page,callback="parse_profile_page",follow=False)
 34     )
 35 
 36     # 处理个人主页信息,得到我们要的数据
 37     def parse_profile_page(self, response):
 38         item=YouyuanwangItem()
 39         # 个人头像链接
 40         item[\'header_url\']=self.get_header_url(response)
 41         # 用户名
 42         item[\'username\']=self.get_username(response)
 43         #籍贯
 44         item[\'place_from\']=self.get_place_from(response)
 45         #学历
 46         item[\'education\']=self.get_education(response)
 47 
 48         # 年龄
 49         item[\'age\']=self.get_age(response)
 50         # 身高
 51         item[\'height\']=self.get_height(response)
 52         # 工资
 53         item[\'salary\']=self.get_salary(response)
 54         # 兴趣爱好
 55         item[\'hobby\']=self.get_hobby(response)
 56         # 相册图片链接
 57         item[\'pic_urls\'] = self.get_pic_urls(response)
 58         # 内心独白
 59         item[\'monologue\'] = self.get_monologue(response)
 60         # 个人主页源url
 61         item[\'source_url\']=response.url
 62         # 网站来源 youyuan
 63         item[\'source\']="youyuan"
 64         # 爬虫名
 65         item[\'spider\']="youyuan"
 66         yield item
 67    #提取头像地址
 68     def get_header_url(self,response):
 69         header=response.xpath(\'//dl[@class="personal_cen"]/dt/img/@src\').extract()
 70         if len(header):
 71             header_url=header[0]
 72         else:
 73             header_url= ""
 74         return header_url.strip()
 75     #提取用户名
 76     def get_username(self,response):
 77         username=response.xpath(\'//dl[@class="personal_cen"]/dd//div[@class="main"]/strong/text()\').extract()
 78         if len(username):
 79             username=username[0]
 80         else:
 81             username=""
 82         return username.strip()
 83     #提取年龄
 84     def get_age(self,response):
 85         age=response.xpath(\'//dl[@class="personal_cen"]//p[@class="local"]/text()\').extract()
 86         if len(age):
 87             age=age[0].split()[1]
 88         else:
 89             age=""
 90         return age
 91     #提取身高
 92     def get_height(self,response):
 93         height=response.xpath(\'//div[@class="pre_data"]/ul/li[2]/div/ol[2]/li[2]/span/text()\').extract()
 94         if len(height):
 95             height=height[0]
 96         else:
 97             height=""
 98 
 99         return height.strip()
100     #提取工资
101     def get_salary(self,response):
102         salary=response.xpath(\'//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[4]/span/text()\').extract()
103         if len(salary):
104             salary=salary[0]
105         else:
106             salary=""
107         return salary.strip()
108     #提取兴趣爱好
109     def get_hobby(self,response):
110         hobby=response.xpath(\'//dl[@class="personal_cen"]//ol[@class="hoby"]//li/text()\').extract()
111         if len(hobby):
112             hobby=",".join(hobby).replace(" ","")
113         else:
114             hobby=""
115         return hobby.strip()
116     #提取相册图片
117     def get_pic_urls(self,response):
118         pic_urls=response.xpath(\'//div[@class="ph_show"]/ul/li/a/img/@src\').extract()
119         if len(pic_urls):
120             pic_urls=",".join(pic_urls)
121             #将相册url列表转换成字符串
122         else:
123             pic_urls=""
124         return pic_urls
125     #提取内心独白
126     def get_monologue(self,response):
127         monologue=response.xpath(\'//div[@class="pre_data"]/ul/li/p/text()\').extract()
128         if len(monologue):
129             monologue=monologue[0]
130         else:
131             monologue=""
132         return monologue.strip()
133     #提取籍贯
134     def get_place_from(self,response):
135         place_from=response.xpath(\'//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[1]/span/text()\').extract()
136         if len(place_from):
137             place_from=place_from[0]
138         else:
139             place_from=""
140         return place_from.strip()
141     #提取学历
142     def get_education(self,response):
143         education=response.xpath(\'//div[@class="pre_data"]/ul/li[2]/div/ol[1]/li[3]/span/text()\').extract()
144         if len(education):
145             education=education[0]
146         else:
147             education=""
148         return education.strip()
View Code

pipelines.py