python鐖创鍚ф暟鎹瓨mysql瀹屾暣浠g爜妗堜緥

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python爬贴吧数据存mysql完整代码案例相关的知识,希望对你有一定的参考价值。

鏍囩锛?a href='http://www.mamicode.com/so/1/utf8' title='utf8'>utf8   ===   write   閿欒   efi   use   code   ict   鐢ㄦ埛   

python鐖彇璐村惂鏁版嵁

鏈€杩戝啓涓畝鍗曠殑鐖彇璐村惂鏁版嵁鐨刣emo锛屽垎浜粰澶у

鐖彇鍐呭鍖呮嫭锛?/p>

鎶€鏈浘鐗? src=

 

 

 

鎶€鏈浘鐗? src=

  

 

import requests
import parsel  # pip install parsel
import urllib.request
import urllib.parse
import re
import json
import pymysql
from pymysql.cursors import DictCursor

# Request headers: a desktop Chrome user-agent so Tieba serves the
# normal desktop HTML (the mobile layout has different selectors).
header = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36"
}

# Module-level output file for the optional plain-text dump (see savefile()).
# NOTE(review): never explicitly closed; relies on interpreter shutdown to flush.
file = open("电脑吧数据.txt", "w", encoding="utf-8")


# 鐖彇璐村惂鏁版嵁
def spider(startpage ,endpage, pagesize):
    page_num = 0
    # range 宸﹀寘鍙充笉鍖?    for page in range(startpage, endpage + 1, pagesize):
        page_num += 1
        print(鈥?/span>===================姝e湪鎶撳彇璐村惂鐨勭{}椤垫暟鎹?==================鈥?/span>.format(page_num))
        url = 鈥?/span>https://tieba.baidu.com/f?kw=%E7%94%B5%E8%84%91&ie=utf-8&pn={}鈥?/span>.format(page)

        page_data(url)


# Parse one forum list page.
def page_data(url):
    """Fetch a forum list page and extract the numeric thread ids.

    Each id found is passed to parser_thread() for detail parsing.
    """
    request = urllib.request.Request(url=url, headers=header)
    response = urllib.request.urlopen(request)
    html = response.read().decode("utf-8")
    # Thread links look like href="/p/123456789"; capture the numeric id.
    # (The original garbled source showed (d+); without the backslash the
    # pattern would match a literal 'd', so \d+ is the intended regex.)
    thread_ids = re.findall(r'href="/p/(\d+)"', html)

    for thread_id in thread_ids:
        parser_thread(thread_id)


# Parse the content of one thread (first page only).
def parser_thread(thread_id):
    """Parse page 1 of a single thread and persist it to MySQL.

    Saves the thread's main record via save_thread() and each floor (post)
    on the first page via save_thread_detail().
    """
    thread_url = "http://tieba.baidu.com/p/" + str(thread_id)
    print("thread_url", thread_url)
    # Fetch page 1 of the thread.
    response = requests.get(thread_url, headers=header).text
    response_data = parsel.Selector(response)
    # Thread title.
    thread_title = response_data.xpath("//h1/text()").extract()[0]
    # Per-floor metadata is embedded as JSON in each post's data-field attribute.
    content_field = response_data.xpath(
        '//div[contains(@class,"l_post j_l_post l_post_bright")]/@data-field').extract()
    content_field_json = json.loads(content_field[0])
    publish_date = content_field_json["content"]["date"]

    # Thread author nickname (may be incomplete if the name contains images/emoji).
    thread_author = content_field_json["author"]["user_name"]
    # Author avatar URL (site serves protocol-relative src).
    avatar_url = "https:" + response_data.xpath("//ul/li/div/a/img/@src").extract()[0]

    # Total reply count and total page count of the thread.
    thread_reply_count = response_data.xpath('//li[@class="l_reply_num"]/span/text()').extract()[0]
    thread_page_count = int(response_data.xpath('//li[@class="l_reply_num"]/span/text()').extract()[1])

    # Persist the thread's main record.
    save_thread(thread_id, thread_title, thread_author, publish_date, avatar_url)

    # All floor content nodes on this page.
    thread_contents = response_data.xpath('.//div[contains(@id,"post_content_")]')
    # index = floor number (0-based here, printed 1-based below).
    index = 0
    while index < len(thread_contents):
        # Floor text; Tieba pads the text with 12 leading whitespace chars.
        content_text = thread_contents.xpath("string(.)").extract()[index]
        content_text = content_text[12:]
        field_json = json.loads(content_field[index])
        detail_publish_date = field_json["content"]["date"]
        thread_detail_id = field_json["content"]["post_id"]

        # Selector scoped to this floor.
        content_sel = thread_contents[index]
        # Images inside this floor.
        images = content_sel.xpath("img/@src").extract()
        index = index + 1
        print("第{}楼".format(index))
        save_thread_detail(thread_detail_id, thread_id, content_text, str(images), detail_publish_date)


# Save the thread's main record.
def save_thread(thread_id, thread_title, nickname, publish_time, avatar_url):
    """Insert one row into thread_info.

    Errors are logged and swallowed (best-effort) so one bad row does not
    abort the whole crawl. Uses a parameterized query, so values are escaped.
    """
    # SQL insert statement (MySQL accepts VALUE as a synonym for VALUES).
    sql = ("insert into thread_info(thread_id, thread_title, nickname, publish_time, avatar_url) "
           "value (%s, %s, %s, %s, %s )")
    conn = None
    cursor = None
    try:
        conn = pymysql.connect(
            host="47.101.213.133",  # DB host; NOTE(review): credentials hard-coded — move to config
            port=3306,
            user="dreaming",
            password="30wish2003!",
            charset="utf8",  # must be "utf8" — "utf-8" makes MySQL error out
            database="x_player",
            cursorclass=DictCursor)
        cursor = conn.cursor()
        r = cursor.execute(sql, (thread_id, thread_title, nickname, publish_time, avatar_url))
        conn.commit()
        print("save success - ", r)
    except Exception:
        # Best-effort: report the failing id and keep crawling.
        print("ERROR - ", thread_id)
    finally:
        # Original closed outside try: a failed connect() left cursor/conn
        # unbound and raised NameError. Guarded finally fixes that.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()


# Save each floor's content (only floors from the thread's first page are crawled).
def save_thread_detail(thread_detail_id, thread_id, content, image, publish_date):
    """Insert one floor (post) row into thread_detail_info.

    Errors are logged and swallowed so one bad row does not abort the crawl.
    """
    # SQL insert statement (MySQL accepts VALUE as a synonym for VALUES).
    sql = ("insert into thread_detail_info(thread_detail_id, thread_id, content, image, publish_date) "
           "value (%s, %s, %s, %s, %s )")
    conn = None
    cursor = None
    try:
        conn = pymysql.connect(
            host="xx.xxx.xxx.xxx",  # TODO: DB host
            port=3306,  # TODO: port
            user="xxx",  # TODO: user
            password="xxx!",  # TODO: password
            charset="utf8",  # must be "utf8" — "utf-8" makes MySQL error out
            database="xxx",  # TODO: database name
            cursorclass=DictCursor)
        cursor = conn.cursor()
        r = cursor.execute(sql, (thread_detail_id, thread_id, content, image, publish_date))
        conn.commit()
        print("save detail success - ", r)
    except Exception:
        print("!!!!!!!save detail error:- ", thread_detail_id)
    finally:
        # Original closed outside try: a failed connect() left cursor/conn
        # unbound and raised NameError. Guarded finally fixes that.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()


# 灏嗘暟鎹繚瀛樺埌txt鏂囦欢
def savefile(data):
    for item in data:
        file.write(鈥?/span>----------------------------------------
鈥?/span>)
        file.write(鈥?/span>title锛?/span>鈥?/span> + str(item[0]) + 鈥?/span>
鈥?/span>)
        file.write(鈥?/span>author锛?/span>鈥?/span> + str(item[1]) + 鈥?/span>
鈥?/span>)
        file.write(鈥?/span>url锛?/span>鈥?/span> + str(item[2]) + 鈥?/span>
鈥?/span>)
        file.write(鈥?/span>images锛?/span>鈥?/span> + str(item[3]) + 鈥?/span>
鈥?/span>)


# Download images to local disk / server.
def saveImg(images):
    """Download each image URL into ./tieba/ using its URL basename as filename.

    NOTE(review): assumes the ./tieba/ directory already exists — otherwise
    open() raises FileNotFoundError.
    """
    for img in images:
        img_data = requests.get(img, headers=header).content  # .content = raw bytes
        image_name = img.split("/")[-1]
        with open("./tieba/" + image_name, "wb") as f:
            f.write(img_data)
        print("%s download img..." % image_name)


if __name__ == "__main__":
    start = int(input("输入开始爬取贴吧的页码:"))
    end = int(input("输入结束爬取贴吧的页码（默认请输入0）："))

    # 0 means "crawl to the end"; 3057000 is the forum's last pn offset
    # hard-coded at time of writing — TODO confirm it is still current.
    end = end + 1 if end != 0 else 3057000 + 1

    spider(start, end, 50)

 

缁撳眬璇細绠€鍗曟渚嬶紝浠呬緵鍙傝€冿紝閫傚悎python鍒濆鑰呫€備唬鐮佽繕鏈夊緢澶氬彲浼樺寲鐨勭┖闂淬€傛湁闇€瑕佺殑浜?鎴栬€呮湁蹇呰鐨勮瘽锛屽悗缁細鍙兘浼氭洿鏂般€?/p>

以上是关于python爬贴吧数据存mysql完整代码案例的主要内容,如果未能解决你的问题,请参考以下文章

澶ф暟鎹粍浠?瀛︿範鐭ヨ瘑鍥捐氨

python 鐖彇涔屼簯鎵€鏈夊巶鍟嗗悕瀛楋紝url锛屾紡娲炴€绘暟 骞跺瓨鍏ユ暟鎹簱

mac：Go安装和配置 + GoLand安装和使用之完整教程

完整案例——配置前端和后端API应用的安全认证——基于Azure实现

CentOS 7下最新版(6.2.4)ELK+Filebeat+Log4j日志集成环境搭建完整指南