# 爬虫 - 博客爬取并入库
# Posted by afrafre
# tags:
# 篇首语: 本文由小常识网 (cha138.com) 小编为大家整理, 主要介绍了爬虫 - 博客爬取并入库相关的知识, 希望对你有一定的参考价值。
'''
对崔庆才的个人博客上的文章基本信息的爬取 (共41页)
https://cuiqingcai.com/page/1
标题、链接、浏览的数目、评论的数目以及喜欢的人数
'''
import re
import requests
import logging
from lxml import etree
import pymysql
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class DBconnect(object):
    """Thin wrapper around a pymysql connection for inserting scraped rows."""

    def __init__(self):
        # NOTE(review): credentials are empty placeholders — fill in before running.
        self.conn = pymysql.connect(host='localhost', port=3306, user='', password='', db='spider')
        self.cursor = self.conn.cursor()

    def save(self, table, data):
        """Insert one row (dict) or many rows (list of dicts) into `table`.

        Values are passed as driver parameters (%s placeholders) so pymysql
        escapes them; the original interpolated str(tuple(...)) into the SQL,
        which broke on quotes and rendered None as the invalid literal `None`,
        and was SQL-injection-prone. Rolls back and logs on any failure.
        """
        print('数据类型', type(data))
        try:
            # Normalize to a list of uniform dicts (as built by parse_content).
            rows = [data] if isinstance(data, dict) else data
            if isinstance(rows, list) and rows:
                columns = ', '.join(rows[0].keys())
                placeholders = ', '.join(['%s'] * len(rows[0]))
                sql = "insert ignore into {} ({}) values ({});".format(table, columns, placeholders)
                self.cursor.executemany(sql, [tuple(r.values()) for r in rows])
            self.conn.commit()
        except Exception as e:
            logging.error(e)
            self.conn.rollback()
class BlogSpider():
    """Crawl cuiqingcai.com listing pages and persist article metadata to MySQL."""

    def __init__(self):
        self.base_url = 'https://cuiqingcai.com/page/'
        # The blog listing spans 41 pages (see module docstring).
        self.total_page = 41

    def parse_url(self, url):
        """Fetch `url` and return its HTML text.

        verify=False skips TLS certificate checks (kept from the original —
        consider removing once the site's certificate chain validates).
        """
        res = requests.get(url, verify=False, timeout=10)
        return res.text

    def parse_content(self, html):
        """Extract per-article fields from one listing page's HTML.

        Returns a list of dicts with keys: title, author, publish_time,
        page_view, comment (matching the `blogs` table created in save_data).
        Fields the original extracted but never stored (category, synopsis,
        picture, likes) are no longer parsed, avoiding spurious IndexErrors.
        """
        tree = etree.HTML(html)
        articles = tree.xpath("//div[@class='content']/article")
        data_list = []
        for article in articles:
            titles = article.xpath("./header/h2/a/text()")
            title = titles[0] if titles else None
            author = article.xpath('./p/span[1]/a/text()')[0]
            publish_time = article.xpath("./p/span[2]/text()")[0]
            page_view = article.xpath("./p/span[3]/text()")[0]
            # BUG FIX: the original used 'd+' (a literal letter d), which never
            # matched the view/comment counts; r'\d+' extracts the digits.
            page_view = int(re.findall(r'\d+', page_view)[0])
            comment = article.xpath("./p/span[4]/a/text()")[0]
            comment = int(re.findall(r'\d+', comment)[0])
            data_list.append({'title': title, 'author': author,
                              'publish_time': publish_time,
                              'page_view': page_view, 'comment': comment})
        return data_list

    def save_data(self, data_list):
        """Create the `blogs` table if needed and insert `data_list` into it."""
        db = DBconnect()
        # The unique key lives inside CREATE TABLE IF NOT EXISTS: the original
        # ran a separate `alter table ... add unique key` on every call, which
        # raised a duplicate-key-name error from the second page onward.
        sql = """
            create table if not exists blogs(
                title varchar(100) not null,
                author varchar(30) not null,
                publish_time varchar(30) not null,
                page_view int(6) not null,
                comment int(6) not null,
                unique key(publish_time)
            );
        """
        db.cursor.execute(sql)
        db.save(table='blogs', data=data_list)

    def run(self):
        """Crawl every listing page, parse it, and store the results."""
        # range end derived from total_page (the original hard-coded 41,
        # which also skipped the last page).
        for i in range(1, self.total_page + 1):
            url = self.base_url + str(i)
            str_html = self.parse_url(url)
            data_list = self.parse_content(str_html)
            print(data_list)
            self.save_data(data_list)
        return {'status_code': '200'}
if __name__ == '__main__':
    # Entry point: crawl all listing pages and persist the results.
    spider = BlogSpider()
    spider.run()
# 以上是关于爬虫 - 博客爬取并入库的主要内容。如果未能解决你的问题, 请参考以下文章:
# Python 爬虫实战, Scrapy 实战, 爬取并简单分析安居客租房信息