爬取素材库直接存入mysql数据库
Posted 耀扬
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取素材库直接存入mysql数据库相关的知识,希望对你有一定的参考价值。
爬取素材库,并将结果直接存入 MySQL 数据库。
代码中包含将 HTML 源码存入数据库时所需的转义函数,
并会去除源码中的 HTML 注释语句。
import re
import requests
import random
import time
from bs4 import BeautifulSoup
import pymysql
# Characters that get a backslash prefix before HTML source is stored in the database.
_TRANSFER_ESCAPES = {
    '"': '\\"',
    "'": "\\'",
    "\\": "\\\\",
}


def transferContent(content):
    """Backslash-escape quotes and backslashes in *content*.

    Each double quote, single quote and backslash in the input is replaced
    by the same character preceded by a backslash; every other character is
    copied through unchanged (via ``str()``, as the original loop did).

    Args:
        content: The text to escape (an iterable of characters), or ``None``.

    Returns:
        The escaped string, or ``None`` when *content* is ``None``.
    """
    if content is None:
        return None
    # One pass with a lookup table and a single join, instead of building
    # the result with repeated ``+=`` on a string.
    return "".join(_TRANSFER_ESCAPES.get(c, str(c)) for c in content)
# Pool of desktop-browser User-Agent strings to choose from.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Pick one entry at random. The choice happens once, when this statement
# runs, so the same User-Agent is reused for as long as `headers` is used.
UA = random.choice(user_agent_list)

# HTTP request headers carrying the chosen User-Agent string.
headers = {'User-Agent': UA}
# Open the MySQL connection that receives the scraped rows.
# NOTE(review): the credentials are hard-coded here; consider loading them
# from configuration or the environment before sharing this script.
conn = pymysql.connect(user='root', password='1234', host='127.0.0.1', database='sucai')

# Listing-page URL prefix; the page number is appended to it.
url = 'https://www.***.com/***_0_'

# Matches "<!" followed by anything up to the next ">". Compiled once here
# rather than once per scraped item.
# NOTE(review): a comment whose text contains ">" is only partially removed,
# and declarations such as "<!DOCTYPE ...>" match as well.
re_comment = re.compile(r'<![^>]*>')

try:
    for x in range(26, 724):
        # Random 1-5 second pause before each page request.
        time.sleep(random.randint(1, 5))

        with requests.get(url + str(x), headers=headers, timeout=5) as response:
            # Fail loudly on HTTP errors instead of parsing an error page
            # and silently storing nothing for this page number.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'lxml')

        # One cursor per page, closed by the context manager even on error.
        with conn.cursor() as cursor:
            for li_quick in soup.find_all('div', class_='material-item'):
                # Inner HTML of the code container (decode_contents() leaves
                # out the enclosing tag itself).
                item_code = li_quick.find('div', class_='item-code').decode_contents()
                item_code = re_comment.sub('', item_code)
                # NOTE(review): execute() below receives the values as query
                # parameters, which the driver quotes on its own; escaping
                # here as well probably stores the added backslashes
                # literally. Kept as-is -- confirm what readers of the table
                # expect before removing it.
                item_code = transferContent(item_code.strip())

                item_bottom = li_quick.find('div', class_='item-bottom')

                # Short description: text of the link inside "item-info".
                item_info = item_bottom.find('div', class_='item-info').a.get_text().strip()

                # Labels: the text of every <span>, each followed by ';'.
                item_label = item_bottom.find('div', class_='item-label')
                item_label_0 = ''.join(
                    kj.get_text() + ';' for kj in item_label.find_all('span')
                )

                cursor.execute(
                    'insert into sucaix (ye,rowcode,info,lablex) values (%s,%s,%s,%s)',
                    [str(x), item_code, item_info, item_label_0],
                )

        # Commit once per page so a printed page number always means that
        # every row of that page has been stored.
        conn.commit()
        print(x)
finally:
    # Release the connection even when a request, parse or insert fails.
    conn.close()
以上是关于“爬取素材库直接存入 MySQL 数据库”的主要内容,如果未能解决你的问题,请参考以下文章:
Scrapy爬取慕课网(imooc)所有课程数据并存入MySQL数据库