Is this its limit?
Posted rsapaper
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Is this its limit?相关的知识,希望对你有一定的参考价值。
"""Crawl Baidu Map search result pages for names listed in a JMTool workbook.

The script reads rows from an .xlsx workbook, keeps the rows whose city and
type are selected below, and for every remaining name opens
``http://map.baidu.com/?s=s&wd=<city><district><name>`` in Firefox and stores
the resulting page source as an HTML file.  Names whose HTML file already
exists in the output directory are skipped.  One thread is started per city.
"""
import sys
import os

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
# The parent directory is put on sys.path before the remaining imports run.
sys.path.append(rootPath)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import threading
import urllib.parse
import xlrd
import sqlite3

# Maximum age in seconds that chk_time() tolerates before quitting a browser.
MAX_TIME = 600


def py_stop_update_db():
    """Placeholder for marking this script as stopped in a status database.

    The implementation is disabled; the function currently does nothing and
    returns ``None``.  The disabled code is kept below for reference.
    """
    # db = 'py_bdspider_status.db'
    # db = '%s\\%s' % (curPath, db)
    # conn = sqlite3.connect(db)
    # pyname = os.path.basename(__file__).split('.py')[0]
    # sql_ = '%s%s%s' % ('UPDATE pystatus_table SET pystatus =2 WHERE pyname="', pyname, '"')
    # print(sql_)
    # conn.execute(sql_)
    # conn.commit()
    # conn.close()
    return


def chk_time(browser, start_time):
    """Quit ``browser`` when more than ``MAX_TIME`` seconds have elapsed.

    Args:
        browser: Selenium WebDriver instance to shut down on timeout.
        start_time: ``time.time()`` value the elapsed time is measured from.
    """
    if time.time() - start_time > MAX_TIME:
        py_stop_update_db()
        browser.delete_all_cookies()
        browser.quit()
    return


# Directory (next to this script) holding the already downloaded HTML pages.
dir_html = 'baidu_map_html_firstpage_pc_not_shop'
filepath = '%s\\%s' % (curPath, dir_html)

# Search inputs that already have a stored page: the file name up to the
# first '&' and without the '.html' suffix.  The name is taken directly from
# the directory entry instead of being recovered by splitting a joined path.
requested_file_list = []
pathDir = os.listdir(filepath)
for allDir in pathDir:
    requested_file = allDir.split('&')[0].split('.html')[0]
    requested_file_list.append(requested_file)
# Set copy for O(1) membership tests inside the worker threads.
requested_file_set = set(requested_file_list)

# Separators after which the rest of a name is dropped by extract_name().
# NOTE(review): the first two entries are identical here; one of them was
# possibly a full-width parenthesis before the page was scraped - confirm.
tag_jmtool_list = ['(', '(', '-']


def extract_name(name_):
    """Return ``name_`` truncated at the first separator in ``tag_jmtool_list``.

    Args:
        name_: Name string as read from the workbook.

    Returns:
        The part of ``name_`` that precedes every listed separator.
    """
    for separator in tag_jmtool_list:
        name_ = name_.split(separator)[0]
    return name_


# Build the city list: every third line of the text file is used, stripped
# of spaces and the newline, with the suffix '市' appended.
pcity_list = []
pcity_file = '%s\\%s' % (curPath, '省会城市.txt')
with open(pcity_file, 'r', encoding='utf-8') as pf:
    c_ = 0
    for line in pf:
        c_ += 1
        if c_ == 3:
            c_ = 0
            pcity_list.append(line.replace(' ', '').replace('\n', '') + '市')
pcity_sorted_list = sorted(pcity_list)

# Types to crawl.  Earlier runs used ['住宅小区', '写字楼'] and ['专科医院'].
target_type_list = ['商场']
requested_type_counter = 0
# Figures noted by the author (presumably entry counts per type - confirm):
# shopping mall 4705, hotel 24915, specialty hospital 2513, business district 334
target_dic = {}

# Cities to crawl.  Earlier runs used explicit lists such as
# ['北京市', '上海市', '深圳市', '广州市'], the slice pcity_sorted_list[7:14],
# the full pcity_sorted_list, or ['杭州市'].
target_city_list = pcity_sorted_list[21:28]

file_name = 'JMTool任务_csv_py_wholeCSV'
FEXCEL = '%s\\%s%s' % (curPath, file_name, '.xlsx')
data = xlrd.open_workbook(FEXCEL)
table = data.sheets()[0]
nrows, ncols = table.nrows, table.ncols
res_dic, counter_ = {}, 0

# Fill target_dic as city -> district -> type -> reduced name ->
# {'name_reduction_list': [full names], 'history_list': [raw rows]}.
for i in range(0, nrows):
    l = table.row_values(i)
    # Each row must hold exactly these ten columns.
    dbid, area_code, ref_area_type_code, city, district, address, city_street, name_, emp_, emp_1 = l
    if city not in target_city_list:
        continue
    type_ = ref_area_type_code
    if type_ not in target_type_list:
        continue
    name_ = name_.replace('?', '')
    name_reduction = extract_name(name_)
    # A reduced name shorter than three characters is replaced by the full name.
    if len(name_reduction) < 3:
        name_reduction = name_
    names_of_type = (
        target_dic.setdefault(city, {})
        .setdefault(district, {})
        .setdefault(type_, {})
    )
    # The entry is created only when the reduced name is new *for this type*.
    # The previous check looked one level too high (in the district dict),
    # so the entry was reset on almost every row and earlier names were lost.
    if name_reduction not in names_of_type:
        names_of_type[name_reduction] = {
            'name_reduction_list': [],
            'history_list': [],
        }
    names_of_type[name_reduction]['name_reduction_list'].append(name_)
    names_of_type[name_reduction]['history_list'].append(l)

write_res_html_dir = '%s\\%s\\' % (curPath, dir_html)


def write_res_html(browser, dir_=write_res_html_dir):
    """Store the browser's current page as ``<dir_><search input>.html``.

    The search input is the part of the unquoted current URL between
    ``&wd=`` and ``/?``.  The stored file starts with the raw current URL
    wrapped in an HTML comment, followed by the page source.

    Args:
        browser: Selenium WebDriver instance showing the result page.
        dir_: Output directory prefix, including the trailing separator.
    """
    close_alert(browser)
    current_url_ = urllib.parse.unquote(browser.current_url)
    try:
        input_ = current_url_.split('&wd=')[1].split('/?')[0]
    except IndexError:
        # The URL carries no '&wd=' part, so no file name can be derived.
        print('Exception-', __file__, sys._getframe().f_lineno, current_url_)
        return
    current_url_ = '%s%s%s' % ('<!--', browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    # localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    # file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    file_name = '%s%s%s' % (dir_, input_, '.html')
    # The context manager closes the file; the previous code only read the
    # ``closed`` attribute and left the handle open.
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(page_source)
    print(os.path.basename(__file__), 'OK-writed-', sys._getframe().f_lineno, '')


def gen_random_letter():
    """Return one random lowercase ASCII letter."""
    return chr(random.randint(97, 122))


def gen_random_num():
    """Return one random decimal digit (0-9) as an ``int``.

    ``random.randint`` includes both bounds, so the upper bound is 9; with
    10 the result could be two characters long once formatted.
    """
    return random.randint(0, 9)


def gen_sougo_pid():
    """Return a 16-character random id made of letters and digits.

    Positions 1, 3, 4 and 15 (1-based) are lowercase letters, all other
    positions are digits.
    """
    letter_positions = (1, 3, 4, 15)
    parts = []
    for position in range(1, 17):
        if position in letter_positions:
            parts.append(gen_random_letter())
        else:
            parts.append(str(gen_random_num()))
    return ''.join(parts)


def close_alert(browser, attitude='accept'):
    """No-op hook called before a page is stored; does nothing and returns ``None``."""
    return


# executable_path_str = '%s\\%s' % (curPath, 'geckodriver.exe')
# browser = webdriver.Firefox(executable_path=executable_path_str)
def mobile_mobile_pages_html(browser, input_):
    """Open the Baidu Map search for ``input_`` and store the result page.

    Args:
        browser: Selenium WebDriver instance used for the request.
        input_: Search text appended to the query URL.
    """
    # NOTE: start_time is taken immediately before the check, so the timeout
    # branch of chk_time() is effectively never reached from here.
    start_time = time.time()
    chk_time(browser, start_time)
    sleep(3)
    url_ = '%s%s' % ('http://map.baidu.com/?s=s%26wd%3D', input_)
    browser.get(url_)
    write_res_html(browser)


class MyThread(threading.Thread):
    """Thread that calls ``func(args)`` with a single argument object."""

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.name, self.func, self.args = name, func, args

    def run(self):
        # ``args`` is passed as one positional argument, not unpacked.
        self.func(self.args)


def thread_city(city):
    """Fetch and store the result page for every selected name of ``city``.

    Names whose page is already stored only increase the global
    ``requested_type_counter``.  For every other name a new Firefox instance
    is started, used for one request and then shut down.

    Args:
        city: Key of ``target_dic`` to process.
    """
    global requested_type_counter
    for district in target_dic[city]:
        for type_ in target_dic[city][district]:
            for name_reduction in target_dic[city][district][type_]:
                for name_ in target_dic[city][district][type_][name_reduction]['name_reduction_list']:
                    input_ = '%s%s%s' % (city, district, name_)
                    if input_ in requested_file_set:
                        requested_type_counter += 1
                        print('requested_type_counter=', requested_type_counter, input_)
                        continue
                    # executable_path_str = '%s\\%s' % (curPath, 'chromedriver.exe')
                    # browser = webdriver.Chrome(executable_path=executable_path_str)
                    executable_path_str = '%s\\%s' % (curPath, 'geckodriver.exe')
                    browser = webdriver.Firefox(executable_path=executable_path_str)
                    try:
                        mobile_mobile_pages_html(browser, input_)
                    finally:
                        # Shut the browser down after its single request;
                        # previously every started instance was left running.
                        browser.quit()


threads_list = []
for city in target_dic:
    thread_instance = MyThread(thread_city, city, thread_city.__name__)
    threads_list.append(thread_instance)
for t in threads_list:
    # Set the daemon flag through the attribute; assigning to ``setDaemon``
    # only replaced the method and left the flag untouched.
    t.daemon = False
    t.start()
for t in threads_list:
    t.join()
以上是关于Is this its limit?的主要内容,如果未能解决你的问题,请参考以下文章