原生爬虫(爬取熊猫直播人气排名)
Posted by ksyoon
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了原生爬虫(爬取熊猫直播人气排名)相关的知识,希望对你有一定的参考价值。
'''
Scrape the Panda TV listing page and print the streamers ranked by
viewer count, highest first.
'''

import re

from urllib import request


class Spider():
    '''
    Fetch the listing page, pull out each streamer's name and viewer
    count with regular expressions, and print them as a ranking.
    '''
    url = 'https://www.panda.tv/all?pdt=1.27.psbar-menu.0.1oj9bbkfjbh'
    # [\w\W]*? matches any character (newlines included), non-greedily,
    # so each capture stops at the first closing tag that follows it.
    root_pattern = r'<div class="video-info">([\w\W]*?)</div>'
    name_pattern = r'</i>([\w\W]*?)</span>'
    number_pattern = r'<span class="video-number">([\w\W]*?)</span>'

    def __fetch_content(self) -> str:
        '''Download the listing page and return it decoded as UTF-8 text.'''
        # The context manager closes the HTTP response even if read() or
        # decode() raises.
        with request.urlopen(self.url) as response:
            raw = response.read()
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        '''
        Extract the raw name/number matches of every "video-info" card.

        Returns a list of dicts {'name': [...], 'number': [...]} whose
        values are the (possibly empty) lists returned by re.findall.
        '''
        anchors = []
        for html in re.findall(self.root_pattern, htmls):
            anchors.append({
                'name': re.findall(self.name_pattern, html),
                'number': re.findall(self.number_pattern, html),
            })
        return anchors

    def __refine(self, anchors):
        '''
        Reduce each raw anchor to {'name': str, 'number': str}.

        The name is stripped of surrounding whitespace. Cards in which
        either pattern found nothing are skipped instead of raising
        IndexError on the [0] lookup.
        '''
        refined = []
        for anchor in anchors:
            names = anchor['name']
            numbers = anchor['number']
            if not names or not numbers:
                continue
            refined.append({'name': names[0].strip(), 'number': numbers[0]})
        return refined

    def __sort(self, anchors):
        '''Return the anchors as a list sorted by viewer count, descending.'''
        # sorted() is ascending by default, hence reverse=True.
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor) -> float:
        '''
        Turn the displayed viewer count into a number usable as a sort key.

        Handles a decimal part ("1.5万" -> 15000.0) and returns 0.0 when
        the text contains no digits at all.
        '''
        text = anchor['number']
        found = re.search(r'\d+(?:\.\d+)?', text)
        if found is None:
            return 0.0
        number = float(found.group())
        # The page abbreviates ten thousand with the character below.
        if '万' in text:
            number *= 10000
        return number

    def __show(self, anchors):
        '''Print one "rank<N>:<name> <number>" line per anchor.'''
        for rank, anchor in enumerate(anchors, start=1):
            print('rank' + str(rank) + ':' + anchor['name'] + ' ' + anchor['number'])

    def go(self):
        '''Run the whole pipeline: fetch, parse, clean, sort, print.'''
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__refine(anchors)
        anchors = self.__sort(anchors)
        self.__show(anchors)


spider = Spider()

# Only hit the network when executed as a script, not when imported.
if __name__ == '__main__':
    spider.go()
以上是关于原生爬虫(爬取熊猫直播人气排名)的主要内容,如果未能解决你的问题,请参考以下文章