python爬贴吧数据存mysql完整代码案例
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python爬贴吧数据存mysql完整代码案例相关的知识,希望对你有一定的参考价值。
标签：<a href='http://www.mamicode.com/so/1/utf8' title='utf8'>utf8</a>
标签：python 爬取贴吧数据
最近写个简单的爬取贴吧数据的demo，分享给大家。
爬取内容包括：
"""Baidu Tieba scraper: crawls the "电脑" (computer) bar listing pages,
parses each thread's first page, and stores thread/floor data in MySQL.

Reconstructed from a mojibake-damaged source; runtime strings restored
from the recoverable GBK/UTF-8 text.
"""

import requests
import parsel  # pip install parsel
import urllib.request
import urllib.parse
import re
import json
import pymysql
from pymysql.cursors import DictCursor

# Desktop-Chrome User-Agent so Tieba serves the normal (non-mobile) markup.
header = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36"
}

# Module-level text dump target kept from the original design.
# NOTE(review): never explicitly closed in the original; closed implicitly
# at interpreter exit.
file = open("电脑吧数据.txt", "w", encoding="utf-8")


def spider(startpage, endpage, pagesize):
    """Crawl Tieba listing pages from startpage to endpage (step pagesize).

    Tieba paginates the listing with the ``pn`` query parameter in steps
    of 50 posts per page, hence the caller passes pagesize=50.
    """
    page_num = 0
    # range() is left-inclusive / right-exclusive, hence endpage + 1.
    for page in range(startpage, endpage + 1, pagesize):
        page_num += 1
        print("===================正在抓取贴吧的第{}页数据===================".format(page_num))
        url = "https://tieba.baidu.com/f?kw=%E7%94%B5%E8%84%91&ie=utf-8&pn={}".format(page)
        page_data(url)


def page_data(url):
    """Parse one listing page and dispatch every thread id found on it."""
    request = urllib.request.Request(url=url, headers=header)
    response = urllib.request.urlopen(request)
    html = response.read().decode("utf-8")
    # BUGFIX: original pattern was r'href="/p/(d+)"' (literal 'd'); thread
    # ids are digits, so the pattern must use \d+.
    thread_ids = re.findall(r'href="/p/(\d+)"', html)
    for thread_id in thread_ids:
        parser_thread(thread_id)


def parser_thread(thread_id):
    """Parse the first page of one thread and persist its data.

    Saves the thread header row, then every floor (post) on page one.
    """
    thread_url = "http://tieba.baidu.com/p/" + str(thread_id)
    print("thread_url", thread_url)
    response = requests.get(thread_url, headers=header).text
    response_data = parsel.Selector(response)
    # Thread title.
    thread_title = response_data.xpath("//h1/text()").extract()[0]
    # Per-floor metadata lives in each post div's data-field JSON attribute.
    content_field = response_data.xpath(
        '//div[contains(@class,"l_post j_l_post l_post_bright")]/@data-field'
    ).extract()
    content_field_json = json.loads(content_field[0])
    publish_date = content_field_json["content"]["date"]
    # Original poster's nickname. NOTE(review): names containing images or
    # special characters may come back incomplete — original author's caveat.
    thread_author = content_field_json["author"]["user_name"]
    # Avatar src is protocol-relative, so prepend the scheme.
    avatar_url = "https:" + response_data.xpath("//ul/li/div/a/img/@src").extract()[0]
    # Total reply count and total page count share the same selector;
    # they are the first and second span respectively.
    thread_reply_count = response_data.xpath(
        '//li[@class="l_reply_num"]/span/text()'
    ).extract()[0]
    thread_page_count = int(
        response_data.xpath('//li[@class="l_reply_num"]/span/text()').extract()[1]
    )
    # Persist the thread header row.
    save_thread(thread_id, thread_title, thread_author, publish_date, avatar_url)

    # All floor bodies on this page.
    thread_contents = response_data.xpath('.//div[contains(@id,"post_content_")]')
    index = 0  # floor counter
    while index < len(thread_contents):
        # Floor text; the first 12 characters are leading whitespace/padding
        # in Tieba's markup (original behaviour kept — TODO confirm offset).
        content_text = thread_contents.xpath("string(.)").extract()[index]
        content_text = content_text[12:]
        field_json = json.loads(content_field[index])
        detail_publish_date = field_json["content"]["date"]
        thread_detail_id = field_json["content"]["post_id"]
        # This floor's own selector, used to collect its images.
        content_sel = thread_contents[index]
        images = content_sel.xpath("img/@src").extract()
        index = index + 1
        print("第{}楼".format(index))
        save_thread_detail(
            thread_detail_id, thread_id, content_text, str(images), detail_publish_date
        )


def _connect():
    """Return a new pymysql connection with dict cursors.

    Credentials are placeholders — fill in your own. (The published source
    leaked real credentials here; redacted for safety.)
    """
    return pymysql.connect(
        host="xx.xxx.xxx.xxx",   # TODO: host
        port=3306,               # TODO: port
        user="xxx",              # TODO: user
        password="xxx!",         # TODO: password
        charset="utf8",          # must be "utf8": MySQL rejects "utf-8"
        database="xxx",          # TODO: database name
        cursorclass=DictCursor,
    )


def save_thread(thread_id, thread_title, nickname, publish_time, avatar_url):
    """Insert one thread-header row into thread_info (best-effort)."""
    sql = (
        "insert into thread_info(thread_id, thread_title, nickname, publish_time, avatar_url) "
        "value (%s, %s, %s, %s, %s )"
    )
    conn = None
    cursor = None
    try:
        conn = _connect()
        cursor = conn.cursor()
        # Parameterized execute — values are escaped by the driver.
        r = cursor.execute(sql, (thread_id, thread_title, nickname, publish_time, avatar_url))
        conn.commit()
        print("save success - ", r)
    except Exception as exc:
        # Best-effort by design: log and continue with the next thread.
        print("ERROR - ", thread_id, exc)
    finally:
        # BUGFIX: original closed outside any finally, leaking the
        # connection whenever connect/execute raised.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()


def save_thread_detail(thread_detail_id, thread_id, content, image, publish_date):
    """Insert one floor row into thread_detail_info (first page only)."""
    sql = (
        "insert into thread_detail_info(thread_detail_id, thread_id, content, image, publish_date) "
        "value (%s, %s, %s, %s, %s )"
    )
    conn = None
    cursor = None
    try:
        conn = _connect()
        cursor = conn.cursor()
        r = cursor.execute(sql, (thread_detail_id, thread_id, content, image, publish_date))
        conn.commit()
        print("save detail success - ", r)
    except Exception as exc:
        print("!!!!!!!save detail error:- ", thread_detail_id, exc)
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()


def savefile(data):
    """Append (title, author, url, images) tuples to the module-level txt file."""
    for item in data:
        file.write("----------------------------------------\n")
        file.write("title：" + str(item[0]) + "\n")
        file.write("author：" + str(item[1]) + "\n")
        file.write("url：" + str(item[2]) + "\n")
        file.write("images：" + str(item[3]) + "\n")


def saveImg(images):
    """Download every image URL in *images* into ./tieba/ (dir must exist)."""
    for img in images:
        # .content gives the raw bytes for binary data.
        img_data = requests.get(img, headers=header).content
        image_name = img.split("/")[-1]
        with open("./tieba/" + image_name, "wb") as f:
            f.write(img_data)
        print("%s download img..." % image_name)


if __name__ == "__main__":
    start = int(input("输入开始爬取贴吧的页码:"))
    end = int(input("输入结束爬取贴吧的页码（默认请输入0）："))
    # 0 means "crawl to the end": 3057000 was the bar's last pn offset
    # at the time of writing. range() is right-exclusive, hence +1.
    end = end + 1 if end != 0 else 3057000 + 1
    spider(start, end, 50)
结束语：简单案例，仅供参考，适合python初学者。代码还有很多可优化的空间。有需要的人或者有必要的话，后续可能会更新。
以上是关于python爬贴吧数据存mysql完整代码案例的主要内容,如果未能解决你的问题,请参考以下文章
python 爬取乌云所有厂商名字、url、漏洞总数 并存入数据库
mac：Go安装和配置 + GoLand安装和使用之完整教程