python 百度图片爬虫
Posted Image Process
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 百度图片爬虫相关的知识,希望对你有一定的参考价值。
# -*- coding:utf-8 -*-
# Baidu image crawler.
#
# Reads celebrity names (one per line) from stars_list_clean.txt, converts
# each name to toneless pinyin, queries the Baidu image-search JSON API for
# each name, and saves the result thumbnails into one folder per person
# under ./stars_srcimg/.
#
# Adapted from: https://blog.csdn.net/qq_32166627/article/details/60882964

import os

import requests
import pinyin


def getManyPages(keyword, pages):
    """Fetch `pages` pages (30 results each) of Baidu image-search data.

    Args:
        keyword: the search term (typically a Chinese name).
        pages: number of 30-result pages to request.

    Returns:
        A list with one entry per page; each entry is the 'data' list taken
        from the JSON response (None if the response carried no 'data' key).
    """
    params = []
    # Baidu paginates via the 'pn' offset: 30, 60, ... 30*pages.
    for offset in range(30, 30 * pages + 30, 30):
        params.append({
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            '1488942260214': ''
        })
    url = 'https://image.baidu.com/search/acjson'
    pages_data = []
    for param in params:
        pages_data.append(requests.get(url, params=param).json().get('data'))
    return pages_data


def getImg(dataList, localPath, keyword):
    """Download every thumbnail found in dataList into localPath.

    Args:
        dataList: output of getManyPages() — a list of per-page result lists.
        localPath: directory to save into (created if missing).
        keyword: filename prefix; files are named '<keyword>_<index>.jpg'.
    """
    if not os.path.exists(localPath):  # create the target folder if needed
        os.mkdir(localPath)
    x = 0
    # 'page'/'item' replace the original loop variable 'list', which
    # shadowed the builtin of the same name.
    for page in dataList:
        for item in page:
            thumb = item.get('thumbURL')
            if thumb is not None:
                print("down " + keyword + str(x) + " image " + thumb)
                ir = requests.get(thumb)
                # 'with' closes the file handle (the original leaked one
                # open file descriptor per downloaded image).
                with open(localPath + "/" + keyword + '_%d.jpg' % x, 'wb') as f:
                    f.write(ir.content)
                x += 1
            else:
                print('image not exist')


if __name__ == '__main__':
    # Read the cleaned name list once; reused for both passes below.
    with open("stars_list_clean.txt", 'r') as face_file:
        stars_list = face_file.readlines()

    # Pass 1: record the toneless-pinyin form of every name.
    # (The original opened this file without ever closing it.)
    with open("stars_list_en.txt", 'w') as fp:
        for line in stars_list:
            line = line.replace('\r', '').replace('\n', '').replace('\t', '')
            keyword_english = pinyin.get(line, format="strip")
            fp.write('%s\n' % keyword_english)

    face_ID_index = 0
    dir = "./stars_srcimg/"
    if not os.path.exists(dir):
        os.mkdir(dir)

    pages = 2
    maxnum = pages * 30  # upper bound on images per person
    print(maxnum)

    # Pass 2: crawl images for each name into its own numbered folder.
    for line in stars_list:
        line = line.replace('\r', '').replace('\n', '').replace('\t', '')
        keyword = line
        print(keyword)  # was a Python-2 print statement in the original
        keyword_english = pinyin.get(keyword, format="strip")
        print(keyword_english)
        facesavepath = dir + str(face_ID_index) + "_" + keyword_english
        face_ID_index += 1
        print(facesavepath)
        if not os.path.exists(facesavepath):
            os.mkdir(facesavepath)
        else:
            # Folder already exists: assume this person was crawled before.
            print(keyword, " exist")
            continue
        dataList = getManyPages(keyword, pages)  # arg1: keyword, arg2: page count
        getImg(dataList, facesavepath, keyword_english)  # arg2: save directory
以上是关于python 百度图片爬虫的主要内容,如果未能解决你的问题,请参考以下文章