今日头条街拍图片爬取
Posted by lxh777
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了今日头条街拍图片爬取相关的知识,希望对你有一定的参考价值。
# Crawl Toutiao search results for the keyword "街拍" (street snap), open each
# article, pull the embedded image gallery out of the page source and record
# one row per image file name in the MySQL table `jiepai`.
#
# The actual image download is disabled (as in the original listing); only
# the derived file name is stored.
import json
import os
import re
from urllib import request  # only needed if the download below is re-enabled

import requests

from mysql_tu import mysql_conn

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Search endpoint; `{}` is the result offset (20 results per page).
SEARCH_URL = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'

# Seconds to wait for any single HTTP request before giving up, so one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# The parentheses of the JavaScript call must be escaped; unescaped, they
# become nested regex groups and group 1 captures `("...")`, which is not
# valid JSON.
GALLERY_RE = re.compile(r'gallery: JSON\.parse\((.*)\),')

# File names are interpolated into an SQL statement below, so only accept
# names made of word characters, dots and hyphens.
SAFE_NAME_RE = re.compile(r'[\w.\-]+')


def _extract_gallery(page_html):
    """Return the gallery dict embedded in an article page, or None if absent.

    Raises whatever ``json.loads`` raises if the captured text is malformed.
    """
    found = GALLERY_RE.search(page_html)
    if found is None:
        return None
    # The original decodes twice: the first pass yields a string, the second
    # parses that string into the dict (presumably because JSON.parse is
    # given a quoted JSON string literal -- confirm against a live page).
    return json.loads(json.loads(found.group(1)))


# Create the target directory once, up front.
os.makedirs('cccc', exist_ok=True)

# One connection for the whole run instead of one per image.
mc = mysql_conn()

for offset in range(0, 60, 20):
    url = SEARCH_URL.format(offset)
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.Timeout:
        print('下载超时')
        continue
    data_list = response.json().get('data') or []

    for data_item in data_list:
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']

        try:
            article_response = requests.get(
                article_url, headers=headers, timeout=REQUEST_TIMEOUT
            )
        except requests.Timeout:
            print('下载超时')
            continue

        gallery = _extract_gallery(article_response.text)
        if gallery is None:
            # NOTE(review): the original listing lost its indentation, so it
            # is not certain which `if` this message belonged to; attaching
            # it to "no gallery found" is the reading that makes sense.
            print('没有那个文件')
            continue

        for image in gallery.get('sub_images') or []:
            image_aa = image['url']
            print(image_aa)

            # filename = 'cccc/' + image_aa.split('/')[-1] + '.jpg'
            filename = image_aa.split('/')[-1] + '.jpg'
            if not SAFE_NAME_RE.fullmatch(filename):
                # Refuse names that could break out of the quoted SQL value.
                print('文件名不合法,已跳过: {!r}'.format(filename))
                continue

            # Download is disabled; to re-enable:
            # request.urlretrieve(image_aa, filename)

            sql = 'insert into jiepai(filename) values("{filename}")'.format(
                filename=filename
            )
            try:
                mc.execute_modify_mysql(sql)
            except TimeoutError:
                print('下载超时')
                continue
# File name: mysql_tu.py
import pymysql


class mysql_conn(object):
    """Thin wrapper around one pymysql connection and one cursor.

    The connection is opened in the constructor and released by ``close()``
    (also invoked from ``__del__``).
    """

    # Constructor: open the connection and create a cursor.
    def __init__(self):
        # Pre-set both attributes so close()/__del__ stay safe even when
        # pymysql.connect() raises before they are assigned.
        self.db = None
        self.cursor = None
        # NOTE(review): credentials are hard-coded; consider loading them
        # from configuration or the environment.
        self.db = pymysql.connect(host='127.0.0.1', user='root', password='lxh1122', port=3306, database='py11')
        self.cursor = self.db.cursor()

    # Run a data-modifying statement and commit it.
    def execute_modify_mysql(self, sql, params=None):
        """Execute ``sql`` and commit.

        Args:
            sql: The statement to run. May contain ``%s`` placeholders.
            params: Optional values bound to the placeholders by the driver.
                Prefer this over formatting values into ``sql`` by hand.
        """
        self.cursor.execute(sql, params)
        self.db.commit()

    def close(self):
        """Close the cursor and the connection; safe to call repeatedly."""
        cursor, self.cursor = self.cursor, None
        db, self.db = self.db, None
        if cursor is not None:
            cursor.close()
        if db is not None:
            db.close()

    # Destructor: release the cursor and the connection.
    def __del__(self):
        self.close()


if __name__ == '__main__':
    # Demo: insert four default rows through a single connection.
    sql = 'insert into jiepai values ()'
    mc = mysql_conn()
    for _ in range(4):
        mc.execute_modify_mysql(sql)
以上是关于今日头条街拍图片爬取的主要内容,如果未能解决你的问题,请参考以下文章: