Python爬虫实战:定时爬取微博热榜信息并存入SqlServer,不再错过每条热搜
Posted 大数据_小袁
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python爬虫实战:定时爬取微博热榜信息并存入SqlServer,不再错过每条热搜相关的知识,希望对你有一定的参考价值。
🌹前言
博主开始更新爬虫实战教程了,期待你的关注!!!
第一篇:Python爬虫实战(一):翻页爬取数据存入SqlServer
第二篇:Python爬虫实战(二):爬取快代理构建代理IP池
第三篇:Python爬虫实战(三):定时爬取微博热榜信息并存入SqlServer
点赞收藏博主更有创作动力哟,以后常更!!!
爬取目标(效果展示)
我们要爬取的网页是:https://weibo.com/newlogin?tabtype=search&url=https%3A%2F%2Fweibo.com%2F
效果展示:
爬取的内容是:标题、榜单、热度值、新闻类型、时间戳、url地址等
准备工作
建表:
CREATE TABLE "WB_HotList" (
"id" INT IDENTITY(1,1) PRIMARY key,
"batch" NVARCHAR(MAX),
"daydate" SMALLDATETIME,
"star_word" NVARCHAR(MAX),
"title" NVARCHAR(MAX),
"category" NVARCHAR(MAX),
"num" NVARCHAR(MAX),
"subject_querys" NVARCHAR(MAX),
"flag" NVARCHAR(MAX),
"icon_desc" NVARCHAR(MAX),
"raw_hot" NVARCHAR(MAX),
"mid" NVARCHAR(MAX),
"emoticon" NVARCHAR(MAX),
"icon_desc_color" NVARCHAR(MAX),
"realpos" NVARCHAR(MAX),
"onboard_time" SMALLDATETIME,
"topic_flag" NVARCHAR(MAX),
"ad_info" NVARCHAR(MAX),
"fun_word" NVARCHAR(MAX),
"note" NVARCHAR(MAX),
"rank" NVARCHAR(MAX),
"url" NVARCHAR(MAX)
)
为防止,字段给的不够,直接给个MAX!
代码分析
第一步
发送请求,获取网页信息
微博提供了数据的接口,所以我们直接访问接口就行,如下图(json格式):
接口地址:https://weibo.com/ajax/statuses/hot_band
def __init__(self) :
self.url = "https://weibo.com/ajax/statuses/hot_band"
self.headers = "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
# 发送请求,获取相应
def parse_url(self):
response = requests.get(self.url,headers=self.headers)
time.sleep(2) # 休息两秒
return response.content.decode()
第二步
解析数据,提取我们所需要的数据
接口中的数据如下(只需提取我们所需要的):
for i in range(50):
ban_list = json_data['data']['band_list'][i]
batch = f'第a批'
try:
star_word = ban_list['star_word']
except Exception as e:
print(e)
try:
title = ban_list['word']
except Exception as e:
print(e)
try:
category = ban_list['category']
except Exception as e:
print(e)
try:
num = ban_list['num']
except Exception as e:
print(e)
try:
subject_querys = ban_list['subject_querys']
except Exception as e:
print(e)
try:
flag = ban_list['flag']
except Exception as e:
print(e)
try:
icon_desc = ban_list['icon_desc']
except Exception as e:
print(e)
try:
raw_hot = ban_list['raw_hot']
except Exception as e:
print(e)
try:
mid = ban_list['mid']
except Exception as e:
print(e)
try:
emoticon = ban_list['emoticon']
except Exception as e:
print(e)
try:
icon_desc_color = ban_list['icon_desc_color']
except Exception as e:
print(e)
try:
realpos = ban_list['realpos']
except Exception as e:
print(e)
try:
onboard_time = ban_list['onboard_time']
onboard_time = datetime.datetime.fromtimestamp(onboard_time)
except Exception as e:
print(e)
try:
topic_flag = ban_list['topic_flag']
except Exception as e:
print(e)
try:
ad_info = ban_list['ad_info']
except Exception as e:
print(e)
try:
fun_word = ban_list['fun_word']
except Exception as e:
print(e)
try:
note = ban_list['note']
except Exception as e:
print(e)
try:
rank = ban_list['rank'] + 1
except Exception as e:
print(e)
try:
url = json_data['data']['band_list'][i]['mblog']['text']
url = re.findall('href="(.*?)"',url)[0]
第三步
数据库的batch用于判断,每次插入的批次(50个一批),如果爬虫断了,写个方法还能接着上次的批次
如图:
# 把数据库batch列存入列表并返回(用于判断批次号)
def batch(self):
conn=pymssql.connect('.', 'sa', 'yuan427', 'test')
cursor=conn.cursor()
cursor.execute("select batch from WB_HotList") #向数据库发送SQL命令
rows=cursor.fetchall()
batchlist=[]
for list in rows:
batchlist.append(list[0])
return batchlist
第四步
把数据存入数据库
# 连接数据库服务,创建游标对象
db = pymssql.connect('.', 'sa', 'yuan427', 'test') #服务器名,账户,密码,数据库名
if db:
print("连接成功!")
cursor= db.cursor()
try:
# 插入sql语句
sql = "insert into test4(batch,daydate,star_word,title,category,num,subject_querys,flag,icon_desc,raw_hot,mid,emoticon,icon_desc_color,realpos,onboard_time, \\
topic_flag,ad_info,fun_word,note,rank,url) values (%s,getdate(),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# 执行插入操作
cursor.execute(sql,(batch,star_word,title,category,num,subject_querys,flag,icon_desc,raw_hot,mid,emoticon,icon_desc_color,realpos,onboard_time,topic_flag,ad_info, \\
fun_word,note,rank,url))
db.commit()
print('成功载入......' )
except Exception as e:
db.rollback()
print(str(e))
# 关闭游标,断开数据库
cursor.close()
db.close()
完整代码
import requests,pymssql,time,json,re,datetime
from threading import Timer
class Spider:
def __init__(self) :
self.url = "https://weibo.com/ajax/statuses/hot_band"
self.headers = "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
# 发送请求,获取相应
def parse_url(self):
response = requests.get(self.url,headers=self.headers)
time.sleep(2)
return response.content.decode()
# 解析数据,入库
def parse_data(self,data,a):
json_data = json.loads(data)
# 连接数据库服务,创建游标对象
db = pymssql.connect('.', 'sa', 'yuan427', 'test') #服务器名,账户,密码,数据库名
cursor= db.cursor()
for i in range(50):
ban_list = json_data['data']['band_list'][i]
batch = f'第a批'
try:
star_word = ban_list['star_word']
except Exception as e:
print(e)
try:
title = ban_list['word']
except Exception as e:
print(e)
try:
category = ban_list['category']
except Exception as e:
print(e)
try:
num = ban_list['num']
except Exception as e:
print(e)
try:
subject_querys = ban_list['subject_querys']
except Exception as e:
print(e)
try:
flag = ban_list['flag']
except Exception as e:
print(e)
try:
icon_desc = ban_list['icon_desc']
except Exception as e:
print(e)
try:
raw_hot = ban_list['raw_hot']
except Exception as e:
print(e)
try:
mid = ban_list['mid']
except Exception as e:
print(e)
try:
emoticon = ban_list['emoticon']
except Exception as e:
print(e)
try:
icon_desc_color = ban_list['icon_desc_color']
except Exception as e:
print(e)
try:
realpos = ban_list['realpos']
except Exception as e:
print(e)
try:
onboard_time = ban_list['onboard_time']
onboard_time = datetime.datetime.fromtimestamp(onboard_time)
except Exception as e:
print(e)
try:
topic_flag = ban_list['topic_flag']
except Exception as e:
print(e)
try:
ad_info = ban_list['ad_info']
except Exception as e:
print(e)
try:
fun_word = ban_list['fun_word']
except Exception as e:
print(e)
try:
note = ban_list['note']
except Exception as e:
print(e)
try:
rank = ban_list['rank'] + 1
except Exception as e:
print(e)
try:
url = json_data['data']['band_list'][i]['mblog']['text']
url = re.findall('href="(.*?)"',url)[0]
except Exception as e:
print(e)
try:
# 插入sql语句
sql = "insert into test4(batch,daydate,star_word,title,category,num,subject_querys,flag,icon_desc,raw_hot,mid,emoticon,icon_desc_color,realpos,onboard_time, \\
topic_flag,ad_info,fun_word,note,rank,url) values (%s,getdate(),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# 执行插入操作
cursor.execute(sql,(batch,star_word,title,category,num,subject_querys,flag,icon_desc,raw_hot,mid,emoticon,icon_desc_color,realpos,onboard_time,topic_flag,ad_info, \\
fun_word,note,rank,url))
db.commit()
print('成功载入......' )
except Exception as e:
db.rollback()
print(str(e))
# 关闭游标,断开数据库
cursor.close()
db.close()
# 把数据库batch列存入列表并返回(用于判断批次号)
def batch(self):
conn=pymssql.connect('.', 'sa', 'yuan427', 'test')
cursor=conn.cursor()
cursor.execute("select batch from WB_HotList") #向数据库发送SQL命令
rows=cursor.fetchall()
batchlist=[]
for list in rows:
batchlist.append(list[0])
return batchlist
# 实现主要逻辑
def run(self, a):
# 根据数据库批次号给定a的值
batchlist = self.batch()
if len(batchlist) != 0:
batch = batchlist[len(batchlist) -1]
a = re.findall('第(.*?)批',batch)
a = int(a[0]) + 1
data = self.parse_url()
self.parse_data(data,a)
a +=1
# 定时调用
t = Timer(1800, self.run, (a, )) # 1800表示1800秒,半小时调用一次
t.start()
if __name__ == "__main__":
spider = Spider()
spider.run(1)
启动
因为需要一直运行,所以就在 cmd 挂着
运行成功后,去数据库看看:
O了O了!!!
有讲的不对的地方,希望各位大佬指正!!!,如果有不明白的地方评论区留言回复!兄弟们来个点赞有空就更新爬虫实战!!!
以上是关于Python爬虫实战:定时爬取微博热榜信息并存入SqlServer,不再错过每条热搜的主要内容,如果未能解决你的问题,请参考以下文章