如何使用requests.post获取网页?
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了如何使用requests.post获取网页?相关的知识,希望对你有一定的参考价值。
我想得到网页http://www3.hkexnews.hk/listedco/listconews/advancedsearch/search_active_main.aspx
的结果,股票代码输入为5。
问题是,按下搜索按钮后我不知道实际请求的网址是什么,因为该页面是通过JavaScript提交的。
此外,如何找到需要传递给requests.post的参数,例如data?是否需要headers(请求头)?
您有多种选择:
1)您可以使用Selenium。首先安装Selenium。
sudo pip3 install selenium
然后获取驱动程序https://sites.google.com/a/chromium.org/chromedriver/downloads(根据您的操作系统,您可能需要指定驱动程序的位置)
# Search HKEXnews for stock code 5 with a Selenium-driven Chrome browser and
# print the text of every element with class "news" on the result page.
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

browser = webdriver.Chrome()
url = "http://www3.hkexnews.hk/listedco/listconews/advancedsearch/search_active_main.aspx"
try:
    browser.get(url)
    # find_element_by_id() was removed in Selenium 4.3; find_element(By.ID, ...)
    # is available in both old and current releases.
    element = browser.find_element(By.ID, 'ctl00_txt_stock_code')  # find the text box
    time.sleep(2)
    element.send_keys('5')  # populate the text box
    time.sleep(2)
    element.submit()  # submit the form
    soup = BeautifulSoup(browser.page_source, 'html.parser')
finally:
    # Always close the browser, even if one of the steps above raised;
    # otherwise the Chrome/driver processes are left running.
    browser.quit()

for news in soup.find_all(class_='news'):
    print(news.text)
2)或者使用PyQt和QWebEngineView。
在Ubuntu上安装PyQt:
sudo apt-get install python3-pyqt5
sudo apt-get install python3-pyqt5.qtwebengine
或者在其他操作系统上(64位版本的Python)
pip3 install PyQt5
基本思路是:先加载包含表单的第一个页面,然后通过运行JavaScript填写并提交表单。loadFinished()信号会被触发两次:第一次是初始页面加载完成,第二次是提交表单后结果页面加载完成,因此可以用if语句来区分这两次调用。
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup
class Render(QWebEngineView):
    """Load a page, fill in and submit its search form, and keep the result HTML.

    The constructor blocks: it creates the QApplication, starts loading
    ``url`` and runs the Qt event loop until the page reached after the form
    submission has been captured. Afterwards ``html`` holds that page's
    source, or ``None`` if the initial page could not be loaded.
    """

    def __init__(self, url):
        """Start loading ``url`` and run the event loop until finished."""
        self.html = None
        # True until the first loadFinished emission has been handled.
        self.first_pass = True
        # The QApplication has to exist before the view itself is constructed.
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _load_finished(self, result):
        """Dispatch a loadFinished emission; ``result`` is Qt's success flag."""
        if not self.first_pass:
            self._second_finished()
            return
        self.first_pass = False
        if not result:
            # The form page failed to load, so there is no form to submit and
            # no second loadFinished would ever arrive; stop instead of
            # leaving exec_() blocked forever.
            self.app.quit()
            return
        self._first_finished()

    def _first_finished(self):
        """Fill in the search form through JavaScript and submit it."""
        self.page().runJavaScript("document.getElementById('ctl00_txt_stock_code').value = '5';")
        self.page().runJavaScript("document.getElementById('ctl00_sel_DateOfReleaseFrom_y').value='1999';")
        self.page().runJavaScript("preprocessMainForm();")
        self.page().runJavaScript("document.forms[0].submit();")

    def _second_finished(self):
        """Request the current page's HTML; it is delivered to ``callable``."""
        self.page().toHtml(self.callable)

    def callable(self, data):
        """Store the page source and stop the event loop."""
        self.html = data
        self.app.quit()
# Run the search through Render and print the text of every element with
# class "news" found in the captured page.
url = "http://www3.hkexnews.hk/listedco/listconews/advancedsearch/search_active_main.aspx"
web = Render(url)
# Render.html starts as None and is only filled once a page source has been
# captured; fall back to an empty document so parsing cannot fail on None.
soup = BeautifulSoup(web.html or "", 'html.parser')
for news in soup.find_all(class_='news'):
    print(news.text)
输出:
Voting Rights and Capital
Next Day Disclosure Return
NOTICE OF REDEMPTION AND CANCELLATION OF LISTING
THIRD INTERIM DIVIDEND FOR 2018
Notification of Transactions by Persons Discharging Managerial Responsibilities
Next Day Disclosure Return
THIRD INTERIM DIVIDEND FOR 2018
Monthly Return of Equity Issuer on Movements in Securities for the month ended 31 October 2018
Voting Rights and Capital
PUBLICATION OF BASE PROSPECTUS SUPPLEMENT
3Q 2018 EARNINGS RELEASE AUDIO WEBCAST AND CONFERENCE CALL
3Q EARNINGS RELEASE - HIGHLIGHTS
Scrip Dividend Circular
2018 Third Interim Dividend; Scrip Dividend
THIRD INTERIM DIVIDEND FOR 2018 SCRIP DIVIDEND ALTERNATIVE
NOTIFICATION OF MAJOR HOLDINGS
EARNINGS RELEASE FOR THIRD QUARTER 2018
NOTIFICATION OF MAJOR HOLDINGS
Monthly Return of Equity Issuer on Movements in Securities for the month ended 30 September 2018
THIRD INTERIM DIVIDEND FOR 2018; DIVIDEND ON PREFERENCE SHARES
或者你可以使用Scrapy splash https://github.com/scrapy-plugins/scrapy-splash
或Requests-HTML https://html.python-requests.org/。
但我不确定如何用这两种方法填写表单。
更新:如何读取后续页面(翻页):
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from bs4 import BeautifulSoup
class Render(QWebEngineView):
    """Submit the search form and print the headlines of several result pages.

    The constructor blocks: it creates the QApplication, starts loading
    ``url`` and runs the Qt event loop. After the form has been submitted,
    every page that finishes loading is parsed and its elements with class
    "news" are printed; the "next" button is then clicked. The loop stops
    after ``MAX_NEXT_CLICKS`` clicks, when there is no "next" button, or when
    the initial page fails to load.
    """

    # Number of times the "next" button is clicked before stopping, so at
    # most MAX_NEXT_CLICKS + 1 result pages are printed.
    MAX_NEXT_CLICKS = 5

    # Clicks the "next" button if it exists and reports whether it did.
    # A plain ``getElementById(...).click()`` on a missing button only raises
    # inside the page, which Python never sees, and no new page would load.
    _CLICK_NEXT_JS = (
        "(function () {"
        " var next = document.getElementById('ctl00_btnNext2');"
        " if (!next) { return false; }"
        " next.click();"
        " return true;"
        "})();"
    )

    def __init__(self, url):
        """Start loading ``url`` and run the event loop until finished."""
        self.html = None
        # Number of result pages handed to parse() so far.
        self.count = 0
        # True until the first loadFinished emission has been handled.
        self.first_pass = True
        # The QApplication has to exist before the view itself is constructed.
        self.app = QApplication(sys.argv)
        QWebEngineView.__init__(self)
        self.loadFinished.connect(self._load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _load_finished(self, result):
        """Dispatch a loadFinished emission; ``result`` is Qt's success flag."""
        if not self.first_pass:
            self._second_finished()
            return
        self.first_pass = False
        if not result:
            # The form page failed to load, so there is no form to submit and
            # no further loadFinished would arrive; stop instead of leaving
            # exec_() blocked forever.
            self.app.quit()
            return
        self._first_finished()

    def _first_finished(self):
        """Fill in the search form through JavaScript and submit it."""
        self.page().runJavaScript("document.getElementById('ctl00_txt_stock_code').value = '5';")
        self.page().runJavaScript("document.getElementById('ctl00_sel_DateOfReleaseFrom_y').value='1999';")
        self.page().runJavaScript("preprocessMainForm();")
        self.page().runJavaScript("document.forms[0].submit();")

    def _second_finished(self):
        """Print the current result page, then move on or finish."""
        try:
            self.page().toHtml(self.parse)
            self.count += 1
            if self.count > self.MAX_NEXT_CLICKS:
                # Finish through another toHtml request, as the callback that
                # quits is then queued behind the parse request issued above.
                self.page().toHtml(self.callable)
            else:
                self.page().runJavaScript(self._CLICK_NEXT_JS, self._next_clicked)
        except Exception:
            # Never leave the event loop running after an unexpected error.
            self.app.quit()

    def _next_clicked(self, clicked):
        """Finish when the page reported that it has no "next" button."""
        # Only an explicit False means "no button"; any other value (including
        # no usable result) leaves the loop waiting for the next page load.
        if clicked is False:
            self.page().toHtml(self.callable)

    def parse(self, data):
        """Print the text of every element with class "news" in ``data``."""
        soup = BeautifulSoup(data, 'html.parser')
        for news in soup.find_all(class_='news'):
            print(news.text)

    def callable(self, data):
        """Stop the event loop; the HTML passed in ``data`` is not used."""
        self.app.quit()
# Start the paginated search. Render prints each page's headlines while its
# event loop runs, and the constructor only returns once that loop has stopped.
url = "http://www3.hkexnews.hk/listedco/listconews/advancedsearch/search_active_main.aspx"
web = Render(url=url)
以上是关于如何使用requests.post获取网页?的主要内容,如果未能解决你的问题,请参考以下文章