How to crawl a news website with Python 3


Opening note: this article was compiled by the editors of cha138.com (小常识网). It mainly covers how to crawl a news website with Python 3, and we hope it is of some reference value to you.

#coding=utf-8
import re                # regular expressions
import bs4               # Beautiful Soup 4 HTML parser
import urllib.request    # network access (urllib2 in Python 2)
import News              # our own news structure, defined below
import codecs            # codecs.open writes the output file with an explicit encoding

# Collect all article links from the home page
def GetAllUrl(home):
    html = urllib.request.urlopen(home).read().decode('utf-8')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    pattern = r'http://\w+\.baijia\.baidu\.com/article/\w+'
    links = soup.find_all('a', href=re.compile(pattern))
    for link in links:
        url_set.add(link['href'])

def GetNews(url):
    global NewsCount, MaxNewsCount    # global counters for the number of articles
    while len(url_set) != 0:
        try:
            # take a link that has not been crawled yet
            url = url_set.pop()
            url_old.add(url)

            # download the page
            html = urllib.request.urlopen(url).read().decode('utf-8')

            # parse it
            soup = bs4.BeautifulSoup(html, 'html.parser')
            pattern = r'http://\w+\.baijia\.baidu\.com/article/\w+'   # link pattern
            links = soup.find_all('a', href=re.compile(pattern))

            # queue any new article URLs found on this page
            for link in links:
                if link['href'] not in url_old:
                    url_set.add(link['href'])

            # extract the article fields
            article = News.News()
            article.url = url                                            # URL
            page = soup.find('div', id='page')
            article.title = page.find('h1').get_text()                   # title
            info = page.find('div', class_='article-info')
            article.author = info.find('a', class_='name').get_text()    # author
            article.date = info.find('span', class_='time').get_text()   # date
            article.about = page.find('blockquote').get_text()
            pnode = page.find('div', class_='article-detail').find_all('p')
            article.content = ''
            for node in pnode:                   # collect the article paragraphs
                article.content += node.get_text() + '\n'

            SaveNews(article)

            print(NewsCount)
        except Exception as e:
            print(e)
            continue
        else:
            print(article.title)
            NewsCount += 1
        finally:
            # stop once enough articles have been collected
            if NewsCount == MaxNewsCount:
                break

def SaveNews(Object):
    file.write("【" + Object.title + "】" + "\t")
    file.write(Object.author + "\t" + Object.date + "\n")
    file.write(Object.content + "\n" + "\n")

url_set = set()    # URLs waiting to be crawled
url_old = set()    # URLs already crawled

NewsCount = 0
MaxNewsCount = 3

home = 'http://baijia.baidu.com/'    # starting page

GetAllUrl(home)

file = codecs.open("D:\\test.txt", "a+", "utf-8")    # output file

for url in url_set:
    GetNews(url)
    # stop once enough articles have been collected
    if NewsCount == MaxNewsCount:
        break

file.close()
The news article structure (News.py):

#coding: utf-8
# Definition of the article class
class News(object):
    def __init__(self):
        self.url = None
        self.title = None
        self.author = None
        self.date = None
        self.about = None
        self.content = None
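
As a quick, hypothetical illustration (not part of the original script), the News object is just a plain container that the crawler fills in field by field before handing it to SaveNews. Assuming the class above is saved as News.py next to the crawler, it can be used like this:

import News

article = News.News()
article.url = 'http://baijia.baidu.com/article/example'   # placeholder values
article.title = 'Example headline'
article.author = 'Example author'
article.date = '2016-01-01'
article.content = 'First paragraph.\nSecond paragraph.\n'

print(article.title, article.author)
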
Reference answer A: Python modules used:

import re                # regular expressions
import bs4               # Beautiful Soup 4 HTML parser
import urllib.request    # network access
import News              # the news structure defined above
import codecs            # codecs.open writes the output file with an explicit encoding

A hands-on Python crawler case study: scraping news articles

Preface

The text and images in this article come from the internet and are for learning and exchange purposes only, with no commercial use; the copyright belongs to the original authors. If there is any problem, please contact us promptly so it can be handled.

This is a simple Python news-collection case: from the list page to the detail pages, and finally saving the data as txt files. The site's page structure is fairly regular, simple and clear, so collecting and saving the news content is straightforward.


Libraries used

requests, time, re, UserAgent (from fake_useragent), etree (from lxml)

import requests,time,re
from fake_useragent import UserAgent
from lxml import etree
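
Before the full class-based version below, here is a minimal sketch (with a placeholder URL) of how these three pieces fit together: fake_useragent supplies a random User-Agent header, requests downloads the page, and lxml's etree parses it so XPath queries can be run against it.

import requests
from fake_useragent import UserAgent
from lxml import etree

ua = UserAgent()
headers = {'User-Agent': ua.random}      # rotate the browser identity on each request

html = requests.get('https://example.com/news/', headers=headers, timeout=10).text
tree = etree.HTML(html)                  # build an element tree for XPath queries
print(tree.xpath('//title/text()'))      # e.g. the page title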

The list page


On the list page, the article links are extracted with XPath:

href_list = req.xpath('//ul[@class="news-list"]/li/a/@href')
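
The hrefs collected this way are site-relative; the full script below simply prefixes https://yz.chsi.com.cn to each one. An equivalent sketch using urllib.parse.urljoin instead of string concatenation (the href value here is only a placeholder) looks like this:

from urllib.parse import urljoin

base = 'https://yz.chsi.com.cn/kyzx/jyxd/'
href_list = ['/kyzx/jyxd/202007/example.shtml']            # placeholder relative href
detail_urls = [urljoin(base, href) for href in href_list]
print(detail_urls)   # ['https://yz.chsi.com.cn/kyzx/jyxd/202007/example.shtml']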

The detail page


XPath parsing of the detail-page content:

h2 = req.xpath('//div[@class="title-box"]/h2/text()')[0]
author = req.xpath('//div[@class="title-box"]/span[@class="news-from"]/text()')[0]
details = req.xpath('//div[@class="content-l detail"]/p/text()')

Formatting the body text:

detail = '\n'.join(details)    # join the paragraphs with newlines

Formatting the title by replacing characters that are illegal in file names:

pattern = r"[/\:*?\"<>|]"                  # characters not allowed in file names
new_title = re.sub(pattern, "_", title)    # replace them with underscores
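
For example, with a made-up headline (not taken from the site), the clean-up turns the slash, colon and question mark into underscores so the title can safely be used as a file name:

import re

title = '2021/2022 enrolment: what changed?'
pattern = r"[/\:*?\"<>|]"
new_title = re.sub(pattern, "_", title)
print(new_title)   # 2021_2022 enrolment_ what changed_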

Saving the data as a txt file:

def save(self, h2, author, detail):
    with open(f'{h2}.txt', 'w', encoding='utf-8') as f:
        f.write('%s%s%s%s%s' % (h2, '\n', detail, '\n', author))

    print(f"Saved {h2}.txt successfully!")

Iterating over the collection: yield makes get_tasks a generator, so each article is fetched and saved lazily as the caller loops over the results:

def get_tasks(self):
    data_list = self.parse_home_list(self.url)
    for item in data_list:
        yield item

Program run output (screenshot in the original post).

Collected results (screenshot in the original post).

Attached source code for reference:

# -*- coding: UTF-8 -*-
# Collect postgraduate-admissions (考研) news from yz.chsi.com.cn (研招网)
# 2020-07-10, by WeChat: huguo00289

import requests, time, re
from fake_useragent import UserAgent
from lxml import etree


class RandomHeaders(object):
    ua = UserAgent()

    @property
    def random_headers(self):
        return {
            'User-Agent': self.ua.random,
        }


class Spider(RandomHeaders):
    def __init__(self, url):
        self.url = url

    def parse_home_list(self, url):
        response = requests.get(url, headers=self.random_headers).content.decode('utf-8')
        req = etree.HTML(response)
        href_list = req.xpath('//ul[@class="news-list"]/li/a/@href')
        print(href_list)
        for href in href_list:
            item = self.parse_detail(f'https://yz.chsi.com.cn{href}')
            yield item

    def parse_detail(self, url):
        print(f">> Crawling {url}")
        try:
            response = requests.get(url, headers=self.random_headers).content.decode('utf-8')
            time.sleep(2)
        except Exception as e:
            print(e.args)
            self.parse_detail(url)
        else:
            req = etree.HTML(response)
            try:
                h2 = req.xpath('//div[@class="title-box"]/h2/text()')[0]
                h2 = self.validate_title(h2)
                author = req.xpath('//div[@class="title-box"]/span[@class="news-from"]/text()')[0]
                details = req.xpath('//div[@class="content-l detail"]/p/text()')
                detail = '\n'.join(details)
                print(h2, author, detail)
                self.save(h2, author, detail)
                return h2, author, detail
            except IndexError:
                print(">>> Error while collecting, waiting 5s before retrying..")
                time.sleep(5)
                self.parse_detail(url)

    @staticmethod
    def validate_title(title):
        pattern = r"[/\:*?\"<>|]"
        new_title = re.sub(pattern, "_", title)    # replace illegal file-name characters with underscores
        return new_title

    def save(self, h2, author, detail):
        with open(f'{h2}.txt', 'w', encoding='utf-8') as f:
            f.write('%s%s%s%s%s' % (h2, '\n', detail, '\n', author))

        print(f"Saved {h2}.txt successfully!")

    def get_tasks(self):
        data_list = self.parse_home_list(self.url)
        for item in data_list:
            yield item


if __name__ == "__main__":
    url = "https://yz.chsi.com.cn/kyzx/jyxd/"
    spider = Spider(url)
    for data in spider.get_tasks():
        print(data)

That covers the main content on how to crawl a news website with Python 3. If it did not solve your problem, please refer to the following articles:

Python 3 from scratch, crawling Toutiao (今日头条) news: setting up the development environment

Python 3 crawler: scraping campus news from East China Jiaotong University (华东交大)

A code snippet for making a Scrapy spider exit on demand (Python 3)

Crawling news websites: workflow and details of an asynchronous crawler implementation

Python 3 web crawler (2.1): scraping beauty photos from Duitang (堆糖)

Python 3.5: scraping movie data from a website