使用PyQt5和BeautifulSoup将JavaScript变量提取到Python字典中
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用PyQt5和BeautifulSoup将JavaScript变量提取到Python字典中相关的知识,希望对你有一定的参考价值。
我想用BeautifulSoup提取HTML。由于页面内容需要在客户端执行JavaScript之后才会生成,我不得不把PyQt5与BeautifulSoup结合起来使用;在此基础上,我想把_Flourish_data变量转换为Python字典。
是否有一种简单的方法可以将Javascript变量_Flourish_data提取到Python字典中?这是我当前使用PyQt5和BeautifulSoup提取Javascript的Python:
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
    """Load ``url`` in a Qt WebEngine page, let its JavaScript run, and keep the rendered HTML.

    Constructing an instance blocks until the page has finished loading and its
    markup has been delivered; the result is then available as ``self.html``.

    Args:
        url: Address of the page to render, as a string.
    """

    def __init__(self, url):
        # A QApplication has to exist before the page is created. Reuse the
        # running one if there is any, so that more than one Page can be built
        # in the same process.
        self.app = QApplication.instance() or QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Runs the event loop; returns once Callable() calls quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Request the rendered markup once loading has finished."""
        # toHtml() is asynchronous and returns None: the markup is handed to the
        # callback, so its return value must not be stored in self.html.
        self.toHtml(self.Callable)

    def Callable(self, html_str):
        """Store the rendered HTML and stop the event loop started in __init__."""
        self.html = html_str
        self.app.quit()
# Render the Flourish embed (JavaScript included) and keep the resulting HTML.
page = Page('https://flo.uri.sh/visualisation/2451841/embed?auto=1')
# Parse the rendered markup and collect every <script> element.
soup = bs.BeautifulSoup(markup=page.html, features='html.parser')
js_test = soup.find_all(name='script')
# Bare expression: in an interactive session this displays the sixth <script> tag.
js_test[5]
现有代码的输出是
<script>
/**
 * Copy every property of `src` into `dest`, expanding dotted keys such as
 * "a.b.c" into nested objects (dest.a.b.c).
 *
 * @param {Object} [dest] - Target object; a new object is used when it is falsy.
 * @param {Object} src - Flat object whose keys may contain "." separators.
 * @returns {Object} The target object that received the properties.
 */
function _Flourish_unflattenInto(dest, src) {
    dest = dest || {};
    for (var k in src) {
        var t = dest;
        // Walk the key one "."-separated segment at a time; p marks the start
        // of the current segment and i the position of the next ".".
        for (var i = k.indexOf("."), p = 0; i >= 0; i = k.indexOf(".", p = i+1)) {
            var s = k.substring(p, i);
            if (!(s in t)) t[s] = {};
            t = t[s];
        }
        // The remainder after the last "." is the leaf property name.
        t[k.substring(p)] = src[k];
    }
    return dest;
}
// Flat settings map; dotted keys such as "layout.body_font" name nested paths.
var _Flourish_settings = {"cell_fill_1":"#ffffff","cell_fill_2":"#ebebeb","cell_fill_direction":"horizontal","cell_font_size":"1","cell_height":20,"cell_horizontal_alignment":"center","cell_link_color":"#2886b2","cell_padding_horizontal":16,"cell_padding_vertical":11,"column_width_mode":"auto","column_widths":"10%, 10%, 10%, 10%, 50%, 10%","header_fill":"#181f6c","header_font_color":"#ffffff","header_font_default":false,"header_font_size":1.1,"header_horizontal_alignment":"center","header_style_default":true,"layout.body_font":{"name":"Source Sans Pro","url":"https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,700"},"layout.layout_order":"stack-default","layout.space_between_sections":"0.5","mobile.view":true,"no_results_text":"Use the search bar to find your state","pagination_amount":41,"pagination_amount_search":"5","search_enabled":false,"search_hide_table":false,"search_placeholder":"Search to find your state","search_resize":true,"search_width":15};
// Pass the template state and the flat settings to the unflatten helper.
_Flourish_unflattenInto(window.template.state, _Flourish_settings);
// Column headers and row data, each keyed by dataset name ("rows").
var _Flourish_data_column_names = {"rows":{"columns":["State ","Earliest/Planned Start Date for 20/21 Academic Year ","","","",""]}},
    _Flourish_data = {"rows":[
        {"columns":["Alabama","Varies by district","","","",""]},
        {"columns":["Alaska","Varies by district","","","",""]},
        {"columns":["American Samoa","Unknown","","","",""]},
        {"columns":["Arizona","Varies by district","","","",""]},
        {"columns":["Arkansas","Varies by district","","","",""]},
        {"columns":["Bureau of Indian Education","Varies by district","","","",""]},
        {"columns":["California","Varies by district","","","",""]},
        {"columns":["Colorado","Varies by district","","","",""]},
        {"columns":["Connecticut","Not yet determined","","","",""]},
        {"columns":["Delaware","Varies by district","","","",""]},
        {"columns":["Department of Defense Education Activity\n ","Varies by district","","","",""]},
        {"columns":["District of Columbia","8/31/2020","","","",""]},
        {"columns":["Florida","Unknown","","","",""]},
        {"columns":["Georgia","Unknown","","","",""]},
        {"columns":["Guam","Unknown","","","",""]},
        {"columns":["Hawaii","Not yet determined","","","",""]},
        {"columns":["Idaho","Varies by District","","","",""]},
        {"columns":["Illinois","Varies by district","","","",""]},
        {"columns":["Indiana","Not yet determined","","","",""]},
        {"columns":["Iowa","Varies by district","","","",""]},
        {"columns":["Kansas","Not yet determined","","","",""]},
        {"columns":["Kentucky","Unknown","","","",""]},
        {"columns":["Louisiana","Varies by district","","","",""]},
        {"columns":["Maine","Varies by district","","","",""]},
        {"columns":["Maryland","Not yet determined","","","",""]},
        {"columns":["Massachusetts","Not yet determined","","","",""]},
        {"columns":["Michigan","Not yet determined","","","",""]},
        {"columns":["Minnesota","Not yet determined","","","",""]},
        {"columns":["Mississippi ","Varies by district","","","",""]},
        {"columns":["Missouri","Varies by district","","","",""]},
        {"columns":["Montana","Varies by district","","","",""]},
        {"columns":["Nebraska","Varies by district","","","",""]},
        {"columns":["Nevada","Varies by district","","","",""]},
        {"columns":["New Hampshire","Not yet determined","","","",""]},
        {"columns":["New Jersey","Varies by district","","","",""]},
        {"columns":["New Mexico","Unknown","","","",""]},
        {"columns":["New York","Not yet determined","","","",""]},
        {"columns":["North Carolina","8/17/2020","","","",""]},
        {"columns":["North Dakota","Varies by district","","","",""]},
        {"columns":["Northern Marianas","Unknown","","","",""]},
        {"columns":["Ohio","Not yet determined","","","",""]},
        {"columns":["Oklahoma","Varies by district","","","",""]},
        {"columns":["Oregon","Not yet determined","","","",""]},
        {"columns":["Pennsylvania","Varies by district","","","",""]},
        {"columns":["Puerto Rico","Unknown","","","",""]},
        {"columns":["Rhode Island","Not yet determined","","","",""]},
        {"columns":["South Carolina","Not yet determined","","","",""]},
        {"columns":["South Dakota","Varies by district","","","",""]},
        {"columns":["Tennessee","Varies by district","","","",""]},
        {"columns":["Texas","Varies by district","","","",""]},
        {"columns":["U.S. Virgin Islands\n ","Not yet determined","","","",""]},
        {"columns":["Utah","Varies by district","","","",""]},
        {"columns":["Vermont","Not yet determined","","","",""]},
        {"columns":["Virginia","Not yet determined","","","",""]},
        {"columns":["Washington","Varies by District","","","",""]},
        {"columns":["West Virginia","Not yet determined","","","",""]},
        {"columns":["Wisconsin","Varies by district","","","",""]},
        {"columns":["Wyoming","Not yet determined","","","",""]}
    ]};
// Copy each dataset onto the template and attach its column names. Both
// assignments belong to the loop body, so they run once per dataset.
for (var _Flourish_dataset in _Flourish_data) {
    window.template.data[_Flourish_dataset] = _Flourish_data[_Flourish_dataset];
    window.template.data[_Flourish_dataset].column_names = _Flourish_data_column_names[_Flourish_dataset];
}
window.template.draw();
</script>
我只想要这个<script>标签中的_Flourish_data变量,如下所示:
// The desired value: one object whose "rows" array holds one {"columns": [...]} entry per state.
_Flourish_data = {"rows":[
    {"columns":["Alabama","Varies by district","","","",""]},
    {"columns":["Alaska","Varies by district","","","",""]},
    {"columns":["American Samoa","Unknown","","","",""]},
    {"columns":["Arizona","Varies by district","","","",""]},
    {"columns":["Arkansas","Varies by district","","","",""]},
    {"columns":["Bureau of Indian Education","Varies by district","","","",""]},
    {"columns":["California","Varies by district","","","",""]},
    {"columns":["Colorado","Varies by district","","","",""]},
    {"columns":["Connecticut","Not yet determined","","","",""]},
    {"columns":["Delaware","Varies by district","","","",""]},
    {"columns":["Department of Defense Education Activity\n ","Varies by district","","","",""]},
    {"columns":["District of Columbia","8/31/2020","","","",""]},
    {"columns":["Florida","Unknown","","","",""]},
    {"columns":["Georgia","Unknown","","","",""]},
    {"columns":["Guam","Unknown","","","",""]},
    {"columns":["Hawaii","Not yet determined","","","",""]},
    {"columns":["Idaho","Varies by District","","","",""]},
    {"columns":["Illinois","Varies by district","","","",""]},
    {"columns":["Indiana","Not yet determined","","","",""]},
    {"columns":["Iowa","Varies by district","","","",""]},
    {"columns":["Kansas","Not yet determined","","","",""]},
    {"columns":["Kentucky","Unknown","","","",""]},
    {"columns":["Louisiana","Varies by district","","","",""]},
    {"columns":["Maine","Varies by district","","","",""]},
    {"columns":["Maryland","Not yet determined","","","",""]},
    {"columns":["Massachusetts","Not yet determined","","","",""]},
    {"columns":["Michigan","Not yet determined","","","",""]},
    {"columns":["Minnesota","Not yet determined","","","",""]},
    {"columns":["Mississippi ","Varies by district","","","",""]},
    {"columns":["Missouri","Varies by district","","","",""]},
    {"columns":["Montana","Varies by district","","","",""]},
    {"columns":["Nebraska","Varies by district","","","",""]},
    {"columns":["Nevada","Varies by district","","","",""]},
    {"columns":["New Hampshire","Not yet determined","","","",""]},
    {"columns":["New Jersey","Varies by district","","","",""]},
    {"columns":["New Mexico","Unknown","","","",""]},
    {"columns":["New York","Not yet determined","","","",""]},
    {"columns":["North Carolina","8/17/2020","","","",""]},
    {"columns":["North Dakota","Varies by district","","","",""]},
    {"columns":["Northern Marianas","Unknown","","","",""]},
    {"columns":["Ohio","Not yet determined","","","",""]},
    {"columns":["Oklahoma","Varies by district","","","",""]},
    {"columns":["Oregon","Not yet determined","","","",""]},
    {"columns":["Pennsylvania","Varies by district","","","",""]},
    {"columns":["Puerto Rico","Unknown","","","",""]},
    {"columns":["Rhode Island","Not yet determined","","","",""]},
    {"columns":["South Carolina","Not yet determined","","","",""]},
    {"columns":["South Dakota","Varies by district","","","",""]},
    {"columns":["Tennessee","Varies by district","","","",""]},
    {"columns":["Texas","Varies by district","","","",""]},
    {"columns":["U.S. Virgin Islands\n ","Not yet determined","","","",""]},
    {"columns":["Utah","Varies by district","","","",""]},
    {"columns":["Vermont","Not yet determined","","","",""]},
    {"columns":["Virginia","Not yet determined","","","",""]},
    {"columns":["Washington","Varies by District","","","",""]},
    {"columns":["West Virginia","Not yet determined","","","",""]},
    {"columns":["Wisconsin","Varies by district","","","",""]},
    {"columns":["Wyoming","Not yet determined","","","",""]}
]};
任何帮助将不胜感激!
答案
您不需要执行JavaScript,使用json和re模块就可以完成。
例如:
import re
import json
import requests
url = 'https://flo.uri.sh/visualisation/2451841/embed?auto=1'
# Fetch the raw page source over HTTP; no JavaScript is executed.
html_data = requests.get(url).text

# Capture the {...} literal assigned to _Flourish_data, up to the first "};".
data = re.search(r'_Flourish_data = (\{.*?\});', html_data).group(1)
data = json.loads(data)

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

for row in data['rows']:
    # Left-align the first column in a 55-character field, then print the second.
    print('{:<55}{}'.format(*map(str.strip, row['columns'][:2])))
打印:
Alabama Varies by district
Alaska Varies by district
American Samoa Unknown
Arizona Varies by district
Arkansas Varies by district
Bureau of Indian Education Varies by district
California Varies by district
Colorado Varies by district
Connecticut Not yet determined
Delaware Varies by district
Department of Defense Education Activity Varies by district
District of Columbia 8/31/2020
Florida Unknown
Georgia Unknown
Guam Unknown
Hawaii Not yet determined
Idaho Varies by District
Illinois Varies by district
Indiana Not yet determined
Iowa Varies by district
Kansas Not yet determined
Kentucky Unknown
Louisiana Varies by district
Maine Varies by district
Maryland Not yet determined
Massachusetts Not yet determined
Michigan Not yet determined
Minnesota Not yet determined
Mississippi Varies by district
Missouri Varies by district
Montana Varies by district
Nebraska Varies by district
Nevada Varies by district
New Hampshire Not yet determined
New Jersey Varies by district
New Mexico Unknown
New York Not yet determined
North Carolina 8/17/2020
North Dakota Varies by district
Northern Marianas Unknown
Ohio Not yet determined
Oklahoma Varies by district
Oregon Not yet determined
Pennsylvania Varies by district
Puerto Rico Unknown
Rhode Island Not yet determined
South Carolina Not yet determined
South Dakota Varies by district
Tennessee Varies by district
Texas Varies by district
U.S. Virgin Islands Not yet determined
Utah Varies by district
Vermont Not yet determined
Virginia Not yet determined
Washington Varies by District
West Virginia Not yet determined
Wisconsin Varies by district
Wyoming Not yet determined
以上是关于使用PyQt5和BeautifulSoup将JavaScript变量提取到Python字典中的主要内容,如果未能解决你的问题,请参考以下文章
使用 Python 和 BeautifulSoup(将网页源代码保存到本地文件中)
使用将保留 &nbsp; 和西里尔字符的格式化程序使用 BeautifulSoup 进行美化?
如何使用 Matplotblib 和 PyQt5 将数据游标添加到条形图中?