Crawler Basics 5-1: Applying Regular Expressions in Web Scraping
1. Scraping jokes from 百思不得姐 (budejie.com)
import requests
import re


def parse_url(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=10)
    text = response.text
    # 1. Regex rule: put .*? after the opening tag so the pattern skips
    #    the intervening markup; re.DOTALL lets . match newlines too
    contents = re.findall(r'<div class="j-r-list-c-desc">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # 2. Build a list holding the cleaned jokes
    jokes = []
    for content in contents:
        # dz is a plain str; strip the <br> tags from it
        dz = re.sub(r"<br.*?>", "", content)
        # 3. Append the cleaned joke to the list
        jokes.append(dz)
    for joke in jokes:
        # 4. Append each joke to a text file, one per line
        with open('3.txt', 'a', encoding='utf-8') as f:
            f.write(joke)
            f.write('\n')


def main():
    for x in range(1, 10):
        url = 'http://www.budejie.com/text/%s' % x
        parse_url(url)


if __name__ == '__main__':
    main()
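The comment at step 1 is the core lesson of this example. Here is a minimal sketch, using an invented HTML snippet (not Budejie's real markup), showing exactly what re.DOTALL changes:

import re

html = '<div class="j-r-list-c-desc">\n<a href="/joke/1">First joke text</a>\n</div>'

# Without re.DOTALL, . does not match the newline after the div tag,
# so the pattern never reaches <a> and nothing is found
print(re.findall(r'<div class="j-r-list-c-desc">.*?<a.*?>(.*?)</a>', html))
# -> []

# With re.DOTALL, .*? can cross line breaks and the joke text is captured
print(re.findall(r'<div class="j-r-list-c-desc">.*?<a.*?>(.*?)</a>', html, re.DOTALL))
# -> ['First joke text']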
2. Scraping the Douban new-books page
import re
import requests
import os

# Example proxy from the original post; free proxies like this expire
# quickly, so substitute a live one. requests expects lowercase scheme keys.
PROXY = {
    'https': 'https://116.209.55.208:9999'
}


def spider():
    url = 'https://book.douban.com/latest?icn=index-latestbook-all'
    response = requests.get(url, proxies=PROXY)
    # Parse the HTML page with response.text; response.content (bytes)
    # is only needed later, when writing the image data itself
    html = response.text
    titles = re.findall(r'<div class="detail-frame">.*?<a.*?>(.*?)</a>', html, re.DOTALL)
    # For the covers, a regex on the src attribute is enough;
    # skip the first <img> on the page, which is not a book cover
    imgs = re.findall('img src="(.*?)"', html, re.DOTALL)[1:]
    for title, img in zip(titles, imgs):
        with open('pic/' + title + '.jpg', 'wb') as f:
            f.write(requests.get(img).content)


if __name__ == '__main__':
    if not os.path.exists('pic'):
        os.mkdir('pic')
    spider()
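One fragility in this script: the scraped title goes straight into the filename, and book titles can contain characters such as ':' or '/' that are invalid in paths. A hypothetical helper (safe_filename is my own name, not part of the original post) that could wrap title before open():

import re

def safe_filename(title):
    # Trim surrounding whitespace and replace characters that are
    # invalid in Windows/Unix filenames with an underscore
    return re.sub(r'[\\/:*?"<>|]', '_', title.strip())

# Usage inside spider():
#     with open('pic/' + safe_filename(title) + '.jpg', 'wb') as f:
#         ...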
3. Scraping classical poems from gushiwen.org
import requests
import re


def parse_page(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    text = response.text
    # Poem titles
    titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Dynasty of the author
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Author name
    authors = re.findall(r'<p class="source">.*?<a.*?><a.*?>(.*?)</a>', text, re.DOTALL)
    # Poem body
    content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    contents = []
    for content in content_tags:
        # Strip any remaining HTML tags from the body
        x = re.sub(r'<.*?>', "", content)
        contents.append(x.strip())
    poems = []
    for title, dynasty, author, content in zip(titles, dynasties, authors, contents):
        poem = {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content
        }
        poems.append(poem)
    for poem in poems:
        print(poem)
        print('=' * 50)


def main():
    for x in range(1, 20):
        url = 'https://www.gushiwen.org/default_%s.aspx' % x
        parse_page(url)


if __name__ == '__main__':
    main()
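The tag-stripping step with re.sub deserves a closer look: the non-greedy <.*?> is what keeps the poem text intact. A small sketch using an invented snippet shaped like a contson body:

import re

content = '床前明月光,<br />疑是地上霜。<br />举头望明月,<br />低头思故乡。'

# The non-greedy <.*?> removes each tag individually; a greedy <.*>
# would instead wipe out everything between the first '<' and the last '>'
print(re.sub(r'<.*?>', '', content))
# -> 床前明月光,疑是地上霜。举头望明月,低头思故乡。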
4. Scraping images from 校花网 (xiaohuar.com)
import requests
import re
import os

# Folder for the downloaded images
FileName = 'downloads'


# Save one image
def SaveImage(image, name="temp"):
    # Path the image is written to
    fpath = os.path.join(FileName, name + '.jpg')
    response = requests.get("http://www.xiaohuar.com/d/file/" + image).content
    # Write the image bytes
    with open(fpath, 'wb+') as f:
        f.write(response)


# Collect the (name, image URL) pairs on the current page
def GetImage(fanyeUrl):
    # Request the page
    page = requests.get(fanyeUrl)
    # The site is GBK-encoded
    page.encoding = 'gbk'
    # Regex for (name, relative image path) pairs; note the escaped dot in \.jpg
    imglist = re.findall(r'alt="(.*?)" src="/d/file/(.*?\.jpg)"', page.text)
    # Save every image on the page
    for name, url in imglist:
        print(url, name)
        SaveImage(url, name)


# Create the download folder if it does not exist yet
if not os.path.exists(os.path.join(os.getcwd(), FileName)):
    os.mkdir(os.path.join(os.getcwd(), FileName))

# Start from the first listing page, then walk through the next pages
fanyeUrl = 'http://www.xiaohuar.com/list-1-0.html'
for faye in range(1, 5):
    GetImage(fanyeUrl)
    fanyeUrl = 'http://www.xiaohuar.com/list-1-%s.html' % faye
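One small fix worth calling out: the original pattern wrote .*?.jpg with an unescaped dot, which matches any character before "jpg". A short sketch with made-up paths showing why the backslash matters:

import re

paths = ['a/b/girl1.jpg', 'a/b/trap_jpg.png']

# An unescaped dot matches any character, so '.jpg' also matches '_jpg'
print([p for p in paths if re.search(r'.jpg', p)])
# -> ['a/b/girl1.jpg', 'a/b/trap_jpg.png']

# Escaping the dot restricts the match to a literal '.jpg' extension
print([p for p in paths if re.search(r'\.jpg', p)])
# -> ['a/b/girl1.jpg']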