python3 windows下的几乎万能爬虫方法

Posted 那个雨季

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python3 windows下的几乎万能爬虫方法相关的知识,希望对你有一定的参考价值。

解决使用 selenium 时被网站反爬的情况

# -*- encoding:utf-8 -*-
import time

import requests
from selenium.webdriver.common.by import By


def get_data(cookies):
    """
    Query the iwencai stock-pick endpoint and return the raw result rows.

    :param cookies: mapping that provides the cookie values ``cid``,
        ``ComputerID``, ``ta_random_userid``, ``PHPSESSID`` and ``v``.
    :return: the value found at ``['data']['result']['result']`` in the JSON
        response, or ``None`` when the HTTP status code is not 200.
    :raises KeyError: if one of the required cookie values is missing from
        ``cookies`` or the JSON response lacks the expected keys.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36",
        # "Host": "www.iwencai.com",
        # 'Upgrade-Insecure-Requests': '1',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # 'referer': 'https://www.iwencai.com/',
        # The cookie values are interpolated from the caller's mapping; the
        # remaining entries are fixed strings.
        "Cookie": f"cid={cookies['cid']}; "
                  f"ComputerID={cookies['ComputerID']}; "
                  "ver_mark=c; "
                  "other_uid=Ths_iwencai_Xuangu_9fnth9iibkd8zrtya1yj4ll48wx683s8; "
                  f"ta_random_userid={cookies['ta_random_userid']}; "
                  "WafStatus=0; "
                  f"PHPSESSID={cookies['PHPSESSID']}; "
                  f"v={cookies['v']}",
        # The same ``v`` value is also sent as a dedicated header.
        "hexin-v": f"{cookies['v']}",
    }
    print("开始爬取数据".center(30,"*"))
    print("请求头为:",headers)
    # Adjacent string literals are joined by the parentheses; the second part
    # carries the screening query in the ``w`` parameter.
    url = (
        'http://www.iwencai.com/stockpick/load-data?typed=0&preParams=&ts=1&f=1&qs=result_original&selfsectsn=&querytype=stock&searchfilter=&tid=stockpick&'
        'w=股价下跌,超大单净流入大于1000万元,大单净流入大于1000万元,市值小于400亿,非ST,非创业板,非科创板,股价大于10元'
    )
    # A timeout keeps the script from hanging forever on a stalled connection.
    res = requests.get(url, headers=headers, timeout=10)
    print("res为:", res)
    if res.status_code != 200:
        # Show the error body and signal "no data" to the caller.
        print(res.text)
        return None
    result_data = res.json()
    print('爬取数据的结果为:',result_data)
    return result_data['data']['result']['result']


def handle_data(result_data):
    """
    Convert raw result rows into a list of dictionaries keyed by field name.

    :param result_data: iterable of indexable rows; positions 0-5 and 7 of
        each row are read (position 6 is not used).
    :return: list with one dictionary per input row, in input order.
    """
    result_list_data = [
        {
            # stock code
            'stock_code': one_data[0],
            # stock name
            'stock_name': one_data[1],
            # price change (rise/fall) percentage
            'stock_upanddown': one_data[2],
            # extra-large order inflow
            'stock_extea_large_flow': one_data[3],
            # large order inflow
            'stock_large_flow': one_data[4],
            # market value
            'stock_market_value': one_data[5],
            # stock price
            'stock_price': one_data[7],
        }
        for one_data in result_data
    ]
    print("处理后的结果数据为:",result_list_data)
    return result_list_data



def handle_business(cookies):
    """
    Fetch the stock rows, normalise them and print those passing the filter.

    :param cookies: cookie mapping forwarded unchanged to ``get_data``.
    :return: None
    """
    # Fetch the raw rows; stop when nothing came back.
    raw_rows = get_data(cookies)
    if not raw_rows:
        return

    # Turn the rows into dictionaries and screen them one by one.
    for stock in handle_data(raw_rows):
        # NOTE(review): the printed message talks about a 5% change, while the
        # check below keeps rows whose absolute change is above 2 - confirm
        # which threshold is intended.
        if not abs(float(stock['stock_upanddown'])) > 2:
            continue
        # Market value must be below 20,000,000,000.
        if not float(stock['stock_market_value']) < 20000000000:
            continue
        # Extra-large order inflow must be at least 20,000,000.
        if not float(stock['stock_extea_large_flow']) >= 20000000:
            continue
        print('满足跌幅在5%以下的,市值小于200亿,特大单净流入大于2000万的的股票为:',stock)
        # Placeholder: send an e-mail notification here.


from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os,subprocess,threading


def open_browser():
    """
    Launch Chrome with remote debugging enabled on port 9222.

    Runs the Chrome command line through ``subprocess.call`` with a
    15-second timeout and prints a message once that call returns.
    """
    # NOTE(review): the --user-data-dir value has no path separators and may
    # be garbled - confirm the intended profile directory.
    chrome_command = 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe  --remote-debugging-port=9222 --user-data-dir="C:selenumAutomationProfile"'
    # NOTE(review): per the subprocess documentation an expired timeout raises
    # TimeoutExpired, in which case the print below is skipped - confirm that
    # this is the intended way to stop waiting on the browser.
    subprocess.call(chrome_command, timeout=15)
    print("打开浏览器结束了")

# Start Chrome from a background thread so the blocking launcher call does not
# hold up the rest of the script.
th = threading.Thread(target=open_browser)
th.start()

# Attach to the Chrome instance listening on the remote-debugging port instead
# of letting the driver start a fresh browser.
options = webdriver.ChromeOptions()
options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# Optional: do not load images
# options.add_argument('blink-settings=imagesEnabled=false')

# Optional: run without a visible browser window
# options.add_argument('--headless')

driver = webdriver.Chrome(executable_path=r'D:\\Python\\Python38\\chromedriver.exe',options=options)

# Names of the browser cookies that are forwarded to handle_business().
_WANTED_COOKIE_NAMES = ('v', 'ta_random_userid', 'cid', 'ComputerID', 'PHPSESSID')

try:
    driver.get('https://www.iwencai.com')
    # Optional: the search could instead be driven through the page by waiting
    # for the element with id "auto", typing the query into it and clicking
    # the element with id "qs-enter".
    cookies = driver.get_cookies()
    print(cookies)
    # Keep only the cookies of interest, as a name -> value mapping.
    res_cookies = {
        one_cookie['name']: one_cookie['value']
        for one_cookie in cookies
        if one_cookie['name'] in _WANTED_COOKIE_NAMES
    }
    time.sleep(3)
    handle_business(res_cookies)
finally:
    # Close the tab even when fetching or processing raised.
    driver.close()
    print("关闭页签")

以上是关于python3 windows下的几乎万能爬虫方法的主要内容,如果未能解决你的问题,请参考以下文章

Python3在Windows、Linux、Mac三大平台的安装教程

在Python3下的multiprocessing的安装问题

Python3爬虫 数据存储之TXT、JSON、CSV

Python3爬虫入门到精通 | 环境安装

python爬虫 源码

python爬虫