python [Python] Steam 게임평 크롤링 코드

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 python [Python] Steam 게임평 크롤링 코드 相关的知识,希望对你有一定的参考价值。

import urllib
import urllib.request
import urllib.parse
import bs4
import re
import os
import time
from concurrent.futures import ThreadPoolExecutor
 
 
def deleteTag(x):
    """Return ``x`` with every ``<...>`` markup tag removed."""
    tag_pattern = re.compile("<[^>]*>")
    return tag_pattern.sub("", x)
 
 
def getComments(code):
    """Collect the rating comments Naver Movie lists for one movie code.

    Pages of the ``pointWriteFormList.nhn`` endpoint are requested one after
    another, starting at page 1, and every ``.score_result li`` item on a
    page is turned into one tuple.

    Args:
        code: Integer movie code. It is sent as the ``code`` query parameter
            and printed with ``%d``.

    Returns:
        A list of ``(comment_id, score, content)`` string tuples:

        * ``comment_id`` -- first run of digits found in the ``onclick``
          attribute of the item's ``.score_reple em a`` link, or ``''`` when
          the attribute holds no digits.
        * ``score`` -- stripped inner HTML of the first ``.star_score em``.
        * ``content`` -- stripped inner HTML of the first ``.score_reple p``
          with ``<span ...>...</span>`` elements removed.

        Collection stops when a request fails, when a page contains no
        items, or as soon as an already-seen ``comment_id`` shows up again.

    Raises:
        IndexError: If an item has no ``.score_reple em a`` link. The page
            number and the raw response are printed before re-raising.
    """

    def makeArgs(code, page):
        # Query string for one page of the comment listing.
        params = {
            'code': code,
            'type': 'after',
            'isActualPointWriteExecute': 'false',
            'isMileageSubscriptionAlready': 'false',
            'isMileageSubscriptionReject': 'false',
            'page': page
        }
        return urllib.parse.urlencode(params)

    def innerHTML(s, sl=0):
        # Serialise the children of tag ``s`` from index ``sl`` onward.
        # The earlier version tested ``i is str``, which is never true for a
        # child node, so its strip branch was dead and every child was
        # rendered with str(). That effective behaviour is kept here.
        return ''.join(str(child) for child in s.contents[sl:])

    def fText(s):
        # Inner HTML of the first match, or '' when nothing matched.
        if s:
            return innerHTML(s[0]).strip()
        return ''

    retList = []
    colSet = set()  # comment ids already collected
    print("Processing: %d" % code)
    page = 1
    while True:
        request_url = (
            "http://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?"
            + makeArgs(code, page))
        try:
            # The context manager closes the HTTP response even when
            # read() or decode() fails.
            with urllib.request.urlopen(request_url) as f:
                data = f.read().decode('utf-8')
        except Exception as err:
            # Best effort: keep what was collected so far, but let
            # KeyboardInterrupt / SystemExit propagate.
            print("Request failed for %s (page %s): %s" % (code, page, err))
            break
        # Drop "&#" sequences that are not followed by a digit before
        # parsing (presumably malformed character references -- TODO confirm).
        soup = bs4.BeautifulSoup(re.sub(r"&#(?![0-9])", "", data), "html.parser")
        cs = soup.select(".score_result li")
        if not cs:
            break
        for link in cs:
            try:
                onclick = link.select('.score_reple em a')[0].get('onclick')
            except IndexError:
                # Dump the context for debugging, then re-raise the real
                # error (the old ``raise ""`` produced a TypeError instead).
                print(page)
                print(data)
                raise
            m = re.search(r'[0-9]+', onclick)
            comment_id = m.group(0) if m else ''
            # A repeated id ends the crawl (presumably the server repeats
            # earlier items once the page number runs past the end -- TODO
            # confirm).
            if comment_id in colSet:
                return retList
            colSet.add(comment_id)
            cat = fText(link.select('.star_score em'))
            cont = fText(link.select('.score_reple p'))
            cont = re.sub(r'<span [^>]+>.+?</span>', '', cont)
            retList.append((comment_id, cat, cont))
        page += 1

    return retList
 
 
def fetch(i):
    """Write the comments of movie ``i`` to ``comments/<i>.txt`` as SQL.

    The file receives a single ``INSERT IGNORE INTO movie VALUES ...;``
    statement with one row per tuple returned by ``getComments``. Nothing is
    done when the output file already exists and is non-empty, or when
    ``getComments`` returns no rows. After a file has been written the
    function sleeps for one second.

    Args:
        i: Integer movie code; used in the file name and as the first column
            of every row.
    """
    outname = 'comments/%d.txt' % i
    try:
        if os.stat(outname).st_size > 0:
            return
    except OSError:
        # No usable earlier output (e.g. the file is missing): crawl it now.
        pass
    rs = getComments(i)
    if not rs:
        return
    # Create the output directory on demand so open() cannot fail because
    # it is missing; exist_ok tolerates other workers creating it first.
    os.makedirs(os.path.dirname(outname), exist_ok=True)
    # NOTE(review): r[0] and r[1] are written unquoted and unescaped; an
    # empty r[0] would yield invalid SQL -- confirm against the target schema.
    rows = [
        "(%d,%s,%s,'%s')" % (i, r[0], r[1], r[2].replace("'", "''").replace("\\", "\\\\"))
        for r in rs
    ]
    with open(outname, 'w', encoding='utf-8') as f:
        f.write('INSERT IGNORE INTO movie VALUES ')
        f.write(',\n'.join(rows))
        f.write(';\n')
    time.sleep(1)
 
 
# The real range of movie IDs is unknown, so this span is only a rough guess.
with ThreadPoolExecutor(max_workers=5) as pool:
    for movie_id in range(10000, 200000):
        pool.submit(fetch, movie_id)


출처: http://bab2min.tistory.com/556 [나의 큰 O는 logx야..]

출처: http://bab2min.tistory.com/556 [나의 큰 O는 logx야..]

以上是关于 python [Python] Steam 게임평 크롤링 코드 的主要内容,如果未能解决你的问题,请参考以下文章:

markdown 프라이빗 네임 맹글링 #python #tutorial

markdown 네임스페이스와 스코프 #python #tutorial

python爬虫 爬取steam热销游戏

利用Python白玩steam游戏,我是专业的

4-Python游戏编程-贪吃蛇(教程+源码)steam少儿编程课件

2-Python游戏编程-拼图游戏(教程+源码)steam少儿编程课件