python3 windows下的几乎万能爬虫方法
Posted 那个雨季
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python3 windows下的几乎万能爬虫方法相关的知识,希望对你有一定的参考价值。
解决 selenium 下被反爬的情况
# -*- encoding:utf-8 -*-
import time
import requests
from selenium.webdriver.common.by import By
def _build_headers(cookies):
    """Build the HTTP request headers from cookie values captured in the browser.

    :param cookies: mapping that must contain the keys ``cid``,
        ``ComputerID``, ``ta_random_userid``, ``PHPSESSID`` and ``v``.
    :return: dict of header name -> header value.
    :raises KeyError: if any of the required cookie keys is missing.
    """
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36",
        # "Host": "www.iwencai.com",
        # 'Upgrade-Insecure-Requests': '1',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # 'referer': 'https://www.iwencai.com/',
        # Cookie names are case-sensitive: PHPSESSID must be spelled exactly
        # like the cookie read from the browser.
        "Cookie": f"cid={cookies['cid']}; "
                  f"ComputerID={cookies['ComputerID']}; "
                  "ver_mark=c; "
                  "other_uid=Ths_iwencai_Xuangu_9fnth9iibkd8zrtya1yj4ll48wx683s8; "
                  f"ta_random_userid={cookies['ta_random_userid']}; "
                  "WafStatus=0; "
                  f"PHPSESSID={cookies['PHPSESSID']}; "
                  f"v={cookies['v']}",
        # The same ``v`` cookie value is also sent as a dedicated header.
        "hexin-v": f"{cookies['v']}",
    }


def get_data(cookies, timeout=10):
    """Request the stock-pick result rows from iwencai.

    :param cookies: mapping with the browser cookie values used to build the
        request headers (see ``_build_headers`` for the required keys).
    :param timeout: seconds to wait for the HTTP request before
        ``requests`` raises a timeout error instead of blocking forever.
    :return: the value at ``['data']['result']['result']`` of the JSON
        response, or ``None`` when the HTTP status is not 200.
    :raises KeyError: if a required cookie is missing, or the JSON response
        does not have the expected nesting.
    """
    headers = _build_headers(cookies)
    print("开始爬取数据".center(30,"*"))
    print("请求头为:",headers)
    # The ``w`` parameter carries the screening query as Chinese text:
    # price falling, extra-large-order net inflow > 10M yuan, large-order net
    # inflow > 10M yuan, market cap < 40B, non-ST, non-ChiNext, non-STAR,
    # price > 10 yuan.
    url = (
        'http://www.iwencai.com/stockpick/load-data?typed=0&preParams=&ts=1&f=1&qs=result_original&selfsectsn=&querytype=stock&searchfilter=&tid=stockpick&'
        'w=股价下跌,超大单净流入大于1000万元,大单净流入大于1000万元,市值小于400亿,非ST,非创业板,非科创板,股价大于10元'
    )
    res = requests.get(url, headers=headers, timeout=timeout)
    print("res为:", res)
    if res.status_code != 200:
        # Dump the body for diagnosis; the caller treats None as "no data".
        print(res.text)
        return
    result_data = res.json()
    print('爬取数据的结果为:',result_data)
    return result_data['data']['result']['result']
def handle_data(result_data):
    """Turn raw result rows into a list of dictionaries with named fields.

    :param result_data: iterable of rows; each row is indexed by position
        and must have at least 8 elements.
    :return: list with one dict per row, in the same order as the input.
    :raises IndexError: if a row has fewer than 8 elements.
    """
    # Output key -> column index in the raw row. Column 6 is not used.
    field_columns = (
        ('stock_code', 0),              # stock code
        ('stock_name', 1),              # stock name
        ('stock_upanddown', 2),         # price change (rise/fall)
        ('stock_extea_large_flow', 3),  # extra-large-order inflow
        ('stock_large_flow', 4),        # large-order inflow
        ('stock_market_value', 5),      # market value
        ('stock_price', 7),             # stock price
    )
    result_list_data = [
        {key: one_data[column] for key, column in field_columns}
        for one_data in result_data
    ]
    print("处理后的结果数据为:",result_list_data)
    return result_list_data
def handle_business(cookies):
    """Fetch the stock rows, convert them and print the ones passing the filter.

    :param cookies: mapping of browser cookie values, passed to ``get_data``.
    :return: None. Returns early when ``get_data`` yields nothing
        (``None`` or an empty result).
    :raises ValueError: if a filtered field cannot be converted by ``float``.
    """
    # Fetch the raw rows.
    result_data = get_data(cookies)
    if not result_data:
        return
    # Convert the rows into a list of dictionaries.
    result_list_data = handle_data(result_data)
    for one_data in result_list_data:
        # Filter: absolute price change above 2, market value below
        # 20 billion, extra-large-order inflow of at least 20 million.
        # NOTE(review): the printed message speaks of a fall "below 5%",
        # while the code tests abs(change) > 2 - confirm which is intended.
        if (
            abs(float(one_data['stock_upanddown'])) > 2
            and float(one_data['stock_market_value']) < 20000000000
            and float(one_data['stock_extea_large_flow']) >= 20000000
        ):
            print('满足跌幅在5%以下的,市值小于200亿,特大单净流入大于2000万的的股票为:',one_data)
            # An email notification was planned here; nothing is sent yet.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os,subprocess,threading
def open_browser():
    """Start Chrome with remote debugging on port 9222 and wait up to 15 s.

    Meant to run on a worker thread: ``subprocess.call`` blocks until Chrome
    exits or the timeout expires.

    :return: None.
    """
    # An argument list lets subprocess quote the path containing a space
    # ("Program Files") itself, so no manual quoting is needed.
    chrome_cmd = [
        r'C:\Program Files\Google\Chrome\Application\chrome.exe',
        '--remote-debugging-port=9222',
        # Dedicated profile directory for the automated session.
        r'--user-data-dir=C:\selenum\AutomationProfile',
    ]
    try:
        subprocess.call(chrome_cmd, timeout=15)
    except subprocess.TimeoutExpired:
        # Expected whenever Chrome is still running after 15 s;
        # subprocess.call kills the child before re-raising. Handling it
        # here keeps the worker thread from dying with a traceback.
        pass
    print("打开浏览器结束了")
# Launch Chrome on a background thread so this script can go on and attach
# to it while open_browser() is still blocking.
th = threading.Thread(target=open_browser)
th.start()

# Attach Selenium to the already running Chrome through its remote
# debugging address instead of letting chromedriver start a new browser.
options = webdriver.ChromeOptions()
options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
# Optional: do not load images.
# options.add_argument('blink-settings=imagesEnabled=false')
# Optional: run without a visible window.
# options.add_argument('--headless')
# NOTE(review): the ``executable_path`` keyword is not accepted by recent
# Selenium 4 releases - confirm the installed Selenium version.
driver = webdriver.Chrome(executable_path=r'D:\\Python\\Python38\\chromedriver.exe',options=options)
try:
    driver.get('https://www.iwencai.com')
    # Disabled alternative: run the query through the page's search box.
    # WebDriverWait(driver,timeout=10).until(EC.presence_of_element_located((By.ID,"auto")))
    # driver.find_element(By.ID,"auto").send_keys('跌幅在5%以下的,市值小于200亿,特大单净流入大于2000万')
    # driver.find_element(By.ID,'qs-enter').click()
    cookies = driver.get_cookies()
    print(cookies)
    # Keep only the cookies that get_data() puts into its request headers.
    wanted_names = ('v', 'ta_random_userid', 'cid', 'ComputerID', 'PHPSESSID')
    res_cookies = {
        one_cookie['name']: one_cookie['value']
        for one_cookie in cookies
        if one_cookie['name'] in wanted_names
    }
    # Pause retained from the original flow; presumably gives the site time
    # to settle before the direct HTTP request - TODO confirm it is needed.
    time.sleep(3)
    handle_business(res_cookies)
finally:
    # Close the tab even when scraping fails.
    driver.close()
    print("关闭页签")
以上是关于python3 windows下的几乎万能爬虫方法的主要内容,如果未能解决你的问题,请参考以下文章
Python3在WindowsLinuxMac三大平台的安装教程