Is this its limit?

Posted rsapaper

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Is this its limit?相关的知识,希望对你有一定的参考价值。

 

import sys
import os

# Absolute directory of this script; its parent directory is appended to
# sys.path so that modules located one level up can be imported.
curPath = os.path.dirname(os.path.abspath(__file__))
rootPath = os.path.dirname(curPath)
sys.path.append(rootPath)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
import urllib.parse
import xlrd
import sys
import os
import sqlite3

# Elapsed-time limit in seconds: chk_time() compares
# `time.time() - start_time` against this value.
MAX_TIME = 600


def py_stop_update_db():
    """Do nothing; the status-database update is currently disabled.

    The disabled implementation (kept below for reference) opened the SQLite
    file ``py_bdspider_status.db`` next to this script and set ``pystatus`` to
    2 for the row whose ``pyname`` equals this script's base name.

    Returns:
        None.
    """
    # Disabled implementation:
    # db = 'py_bdspider_status.db'
    # db = '%s\\%s' % (curPath, db)
    # conn = sqlite3.connect(db)
    # pyname = os.path.basename(__file__).split('.py')[0]
    # sql_ = '%s%s%s' % ('UPDATE pystatus_table SET pystatus =2 WHERE pyname="', pyname, '"')
    # print(sql_)
    # conn.execute(sql_)
    # conn.commit()
    # conn.close()
    return None


def chk_time(browser, start_time):
    """Shut the browser down once more than MAX_TIME seconds have elapsed.

    Args:
        browser: WebDriver-like object; only ``delete_all_cookies()`` and
            ``quit()`` are called on it.
        start_time: Reference timestamp in seconds, as returned by
            ``time.time()``.

    Returns:
        True if the limit was exceeded and the browser has been quit (the
        caller must stop using ``browser``), otherwise False.
    """
    if time.time() - start_time <= MAX_TIME:
        return False
    py_stop_update_db()
    try:
        browser.delete_all_cookies()
    finally:
        # Always end the session, even if clearing the cookies failed.
        browser.quit()
    return True


# Name of the folder (next to this script) that holds the saved result pages.
dir_html = 'baidu_map_html_firstpage_pc_not_shop'
filepath = os.path.join(curPath, dir_html)
# Search terms that already have a saved page. Each entry is a file name from
# the folder, cut at the first '&' and then at the first '.html'.
requested_file_list = []
pathDir = os.listdir(filepath)
for allDir in pathDir:
    # Work on the bare file name: splitting the full path on dir_html would
    # pick the wrong piece whenever the script's own path contains dir_html.
    requested_file = allDir.split('&')[0].split('.html')[0]
    requested_file_list.append(requested_file)

# Delimiters at which extract_name() cuts a name.
# NOTE(review): '(' is listed twice; one entry was presumably the full-width
# parenthesis before this text was normalised - confirm against the original.
tag_jmtool_list = ['(', '(', '-']


def extract_name(name_):
    """Return the part of ``name_`` that precedes every delimiter.

    The delimiters in ``tag_jmtool_list`` are applied in order; each one
    truncates the current value at its first occurrence.

    Args:
        name_: Name string to shorten.

    Returns:
        The shortened string; ``name_`` unchanged when no delimiter occurs.
    """
    for delimiter in tag_jmtool_list:
        name_ = name_.partition(delimiter)[0]
    return name_


# Build the city list from '省会城市.txt' (stored next to this script).
# Only every third line (3rd, 6th, 9th, ...) is used: spaces and the newline
# are removed and '市' is appended.
pcity_file = os.path.join(curPath, '省会城市.txt')
with open(pcity_file, 'r', encoding='utf-8') as pf:
    pcity_list = [
        line.replace(' ', '').replace('\n', '') + '市'
        for line_number, line in enumerate(pf, start=1)
        if line_number % 3 == 0
    ]
pcity_sorted_list = sorted(pcity_list)

# Category values kept when the spreadsheet is loaded.
# Selections tried earlier: ['住宅小区', '写字楼'], ['住宅小区'], ['专科医院'].
target_type_list = ['商场']
# Number of search terms skipped because a saved page already exists.
requested_type_counter = 0
# Figures noted by the original author (presumably row counts per category -
# TODO confirm): 商场 4705, 酒店 24915, 专科医院 2513, 商圈 334.
# Filled by the spreadsheet load: city -> district -> type -> reduced name.
target_dic = {}
# Cities handled by this run: one slice of the sorted city list.
# Selections tried earlier: explicit lists such as
# ['北京市', '上海市', '深圳市', '广州市'], ['深圳市', '广州市'], ['北京市'],
# ['杭州市'], the slice pcity_sorted_list[7:14], or the whole pcity_sorted_list.
target_city_list = pcity_sorted_list[21:28]

# Base name (without extension) of the workbook that is loaded.
file_name = 'JMTool任务_csv_py_wholeCSV'

# Load the first sheet of the workbook and group the rows of interest into
# target_dic as city -> district -> type -> reduced name, where each leaf is
# {'name_reduction_list': [full names], 'history_list': [raw rows]}.
# NOTE(review): xlrd 2.0+ no longer opens .xlsx files - confirm the installed
# xlrd version is older than 2.0.
FEXCEL = os.path.join(curPath, file_name + '.xlsx')
data = xlrd.open_workbook(FEXCEL)
table = data.sheets()[0]
nrows, ncols = table.nrows, table.ncols
res_dic, counter_ = {}, 0
for i in range(nrows):
    l = table.row_values(i)
    # The unpacking requires every row to have exactly these ten columns.
    dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, emp_, emp_1 = l
    if city not in target_city_list:
        continue
    type_ = ref_area_type_code
    if type_ not in target_type_list:
        continue
    name_ = name_.replace('?', '')
    name_reduction = extract_name(name_)
    # Fall back to the full name when the reduced one has fewer than 3 chars.
    if len(name_reduction) < 3:
        name_reduction = name_
    names_by_reduction = (
        target_dic.setdefault(city, {})
        .setdefault(district, {})
        .setdefault(type_, {})
    )
    # The group is looked up at the type level. The previous check looked at
    # the district level, never matched, and reset the group on every row, so
    # only the last row of each reduced name survived.
    entry = names_by_reduction.setdefault(
        name_reduction, {'name_reduction_list': [], 'history_list': []}
    )
    entry['name_reduction_list'].append(name_)
    entry['history_list'].append(l)

# Folder the result pages are written to; the trailing separator lets a file
# name be appended directly.
write_res_html_dir = os.path.join(curPath, dir_html, '')


def write_res_html(browser, dir_=write_res_html_dir):
    """Save the browser's current page as ``<dir_><search term>.html``.

    The search term is the text between ``&wd=`` and ``/?`` in the
    percent-decoded current URL. The saved file starts with an HTML comment
    holding the raw current URL, followed by the page source.

    Args:
        browser: WebDriver-like object providing ``current_url`` and
            ``page_source``.
        dir_: Output folder prefix, including its trailing path separator.

    Returns:
        None. When the URL contains no ``&wd=`` part a message is printed and
        nothing is written.
    """
    close_alert(browser)
    raw_url = browser.current_url
    decoded_url = urllib.parse.unquote(raw_url)
    try:
        input_ = decoded_url.split('&wd=')[1].split('/?')[0]
    except IndexError:
        print('Exception-', __file__, sys._getframe().f_lineno, decoded_url)
        return
    page_source = '%s%s%s%s' % ('<!--', raw_url, '-->', browser.page_source)
    out_path = '%s%s%s' % (dir_, input_, '.html')
    # The context manager closes the file; the former `fo.closed` only read an
    # attribute and left the handle open.
    with open(out_path, 'w', encoding='utf-8') as fo:
        fo.write(page_source)
    print(os.path.basename(__file__), 'OK-writed-', sys._getframe().f_lineno, '')


def gen_random_letter():
    """Return one random lowercase ASCII letter ('a' through 'z')."""
    code_point = random.randint(ord('a'), ord('z'))
    return chr(code_point)


def gen_random_num():
    """Return a random single decimal digit (0-9) as an int.

    ``random.randint`` includes both end points, so the upper bound has to be
    9; a bound of 10 occasionally produced the two-character value 10.
    """
    return random.randint(0, 9)


def gen_sougo_pid():
    """Build a random id string from 16 generated tokens.

    Tokens 1, 3, 4 and 15 (counting from 1) come from ``gen_random_letter()``;
    all other tokens come from ``gen_random_num()``.

    Returns:
        The concatenation of the 16 tokens as a string.
    """
    letter_positions = (1, 3, 4, 15)
    tokens = []
    for position in range(1, 17):
        if position in letter_positions:
            token = gen_random_letter()
        else:
            token = gen_random_num()
        tokens.append(str(token))
    return ''.join(tokens)


def close_alert(browser, attitude='accept'):
    """Placeholder for alert handling; currently does nothing.

    Args:
        browser: Unused.
        attitude: Unused; defaults to ``'accept'``.

    Returns:
        None.
    """
    return None


# executable_path_str = '%s\\%s' % (curPath, 'geckodriver.exe')
# browser = webdriver.Firefox(executable_path=executable_path_str)



def mobile_mobile_pages_html(browser, input_):
    """Open the Baidu Map search URL for ``input_`` and save the page.

    Args:
        browser: WebDriver-like object used to load the page.
        input_: Search term appended verbatim to the search URL.

    Returns:
        None.
    """
    start_time = time.time()
    # NOTE(review): start_time is taken immediately before the check, so the
    # MAX_TIME limit cannot realistically trip here - confirm the intended
    # reference time.
    if chk_time(browser, start_time):
        # A truthy result signals that the browser was shut down; stop here
        # instead of driving a closed session.
        return
    sleep(3)
    url_ = '%s%s' % ('http://map.baidu.com/?s=s%26wd%3D', input_)
    browser.get(url_)
    write_res_html(browser)


class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` when it runs.

    ``args`` is handed to ``func`` as one positional argument; it is not
    unpacked.
    """

    def __init__(self, func, args, name):
        super().__init__()
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        """Invoke the stored callable with the stored argument."""
        target, argument = self.func, self.args
        target(argument)


def thread_city(city):
    """Fetch and save the search page for each not-yet-saved name of a city.

    Walks ``target_dic[city]`` (district -> type -> reduced name) and, for
    every full name, builds the search term ``city + district + name``. Terms
    already present in ``requested_file_list`` are only counted and printed;
    for every other term a fresh Firefox session loads and saves the page.

    Args:
        city: Key of ``target_dic`` to process.

    Returns:
        None.
    """
    global requested_type_counter
    # Build the lookup set once instead of scanning the list for every term.
    already_requested = set(requested_file_list)
    executable_path_str = os.path.join(curPath, 'geckodriver.exe')
    for district, types in target_dic[city].items():
        for names in types.values():
            for entry in names.values():
                for name_ in entry['name_reduction_list']:
                    input_ = '%s%s%s' % (city, district, name_)
                    if input_ in already_requested:
                        # NOTE(review): this global is updated from several
                        # threads without a lock; in the visible code it is
                        # only printed.
                        requested_type_counter += 1
                        print('requested_type_counter=', requested_type_counter, input_)
                        continue
                    browser = webdriver.Firefox(executable_path=executable_path_str)
                    try:
                        mobile_mobile_pages_html(browser, input_)
                    finally:
                        # One browser is started per term; quit it so the
                        # processes do not pile up.
                        browser.quit()


# Start one worker thread per city in target_dic and wait for all of them.
# The city is passed as a single argument, which is how MyThread.run() hands
# it to thread_city.
threads_list = [
    MyThread(thread_city, city, thread_city.__name__) for city in target_dic
]
for t in threads_list:
    # Set the daemon flag through the attribute; assigning to `t.setDaemon`
    # only replaced the method with False and never touched the flag.
    t.daemon = False
    t.start()
for t in threads_list:
    t.join()

# browser.delete_all_cookies()
# browser.quit()

  

 

 

 

以上是关于Is this its limit?的主要内容,如果未能解决你的问题,请参考以下文章