BeautifulSoup - 从 JS 中提取 json
Posted
技术标签:
【中文标题】BeautifulSoup - 从 JS 中提取 json【英文标题】:BeautifulSoup - extract specific JSON key-value from JS 【发布时间】:2017-10-06 17:22:42 【问题描述】:我正在尝试使用 BeautifulSoup,想找一种方法从 JS 元素中获取特定的 JSON 字符串。
这是 JS:
<script>window.pinball = window.pinball || [];
window.pinball.push(['add', "srp_cleanup":"inactive","book_visit":"inactive","my_visits":"inactive"]);
window.Rent = window.Rent || ;
window.Rent.zutron = "error_div":".js-generic-error","host":"rent","user_type":null,"zid":null,"origin":null,"provider":null;
window.Rent.book_visit = "book_visit_host":"http://bookavisit.prod.services.rentpath.com"
window.Rent.tagging = "tealium":"env":"prod","profile":"tealium.rent.com","account":"rentpath";
window.Rent.realm = "rent";
window.Rent.data = "floorplans":"1159255":"availability":"1 Unit Available","availability_class":"floorplan-available-now","unitstyle":"aa1- 1 Bed/1 Bath","deposit":"","floorplan_id":1159255,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"763 sqft","rent":"$1950 - $2322 /mo","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/52ad5930427b3e739676240c01b7d6cc/650-","fp3dfurnished":"http://image1.rent.com/imgr/07733fbd8c8a6a9134d5e0af77d52cb2/650-","floorplanimage":"http://image.rent.com/imgr/44c2395728fa733c2682506d96ec68f5/650-","1159257":"availability":"2 Units Available","availability_class":"floorplan-available-now","unitstyle":"aa3- 1 Bed/1 Bath","deposit":"","floorplan_id":1159257,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"893 sqft","rent":"$1995 - $2531 /mo","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/187753b2e7e6beb5aaf8602514361d89/650-","fp3dfurnished":"http://image.rent.com/imgr/55673aa4253387f0d06aa02495ccf2bc/650-","floorplanimage":"http://image.rent.com/imgr/389adb5ac1fa61c56aa04c88fe97c02f/650-","1159259":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"aa5- 1 Bed/1 Bath","deposit":"","floorplan_id":1159259,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"899 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","floorplanimage":"http://image.rent.com/imgr/24059a4611740bd58436236758d65e20/650-","1159256":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"aa2- 1 Bed/1 Bath","deposit":"","floorplan_id":1159256,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"880 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","floorplanimage":"http://image1.rent.com/imgr/0854a95e69c0b75ee0b13c41db2f31f1/650-","1159258":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"aa4- 1 Bed/1 
Bath","deposit":"","floorplan_id":1159258,"bed":"1 bed","listing_id":"571535","bath":"1 bath","sqft":"897 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","floorplanimage":"http://image1.rent.com/imgr/deb3efc9ee3933a0a1b4844d886b7a8a/650-","1159262":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc3- 2 Bed/2 Bath","deposit":"","floorplan_id":1159262,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1194 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","floorplanimage":"http://image1.rent.com/imgr/a1fff6050e86f98b7249b843cd6f0836/650-","1159263":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc4- 2 Bed/2 Bath","deposit":"","floorplan_id":1159263,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1201 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image1.rent.com/imgr/33e2bb30c9aa1fcdbbf8ce4882a18fcd/650-","fp3dfurnished":"http://image.rent.com/imgr/c4d4df83e18f2b12c8cae6dab523769b/650-","floorplanimage":"http://image.rent.com/imgr/11ac88f52ca904e7646e03b6791f8455/650-","1159266":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc7- 2 Bed/2 Bath","deposit":"","floorplan_id":1159266,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1461 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/0a3887c07a7bc05670a826cd5562c49d/650-","fp3dfurnished":"http://image.rent.com/imgr/efa94735904b40ba463cbd26bc5504cf/650-","floorplanimage":"http://image1.rent.com/imgr/36413f72b93f0b0ed2f4f89337ef719d/650-","1159264":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc5- 2 Bed/2 Bath","deposit":"","floorplan_id":1159264,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1325 sqft","rent":"Contact for 
Pricing","propertyname":"Reading Commons","floorplanimage":"http://image.rent.com/imgr/ce1627742dbca97cc44d726b1d906fc3/650-","1159267":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bcl1-2 Bed/2 Bath","deposit":"","floorplan_id":1159267,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1500 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/a5888b34db510f6932af116e5197ce0c/650-","fp3dfurnished":"http://image1.rent.com/imgr/68f33736e29613562d9a5618eec1a4c6/650-","floorplanimage":"http://image1.rent.com/imgr/d7a833b56639b121178ddc86ac074754/650-","1159261":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc2- 2 Bed/2 Bath","deposit":"","floorplan_id":1159261,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1187 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/33e2bb30c9aa1fcdbbf8ce4882a18fcd/650-","fp3dfurnished":"http://image.rent.com/imgr/c4d4df83e18f2b12c8cae6dab523769b/650-","floorplanimage":"http://image1.rent.com/imgr/11ac88f52ca904e7646e03b6791f8455/650-","1159265":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc6- 2 Bed/2 Bath","deposit":"","floorplan_id":1159265,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1400 sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image.rent.com/imgr/3f80d6e4386db5f450a6750c1a537b84/650-","fp3dfurnished":"http://image1.rent.com/imgr/f54aefd699a9ed3f1d8b6fb8e4ce1500/650-","floorplanimage":"http://image1.rent.com/imgr/b78bda34547615be4973da38dbd9a10f/650-","1159260":"availability":"UNAVAILABLE","availability_class":"floorplan-available-later","unitstyle":"bc1- 2 Bed/2 Bath","deposit":"","floorplan_id":1159260,"bed":"2 beds","listing_id":"571535","bath":"2 baths","sqft":"1121 
sqft","rent":"Contact for Pricing","propertyname":"Reading Commons","fp3dunfurnished":"http://image1.rent.com/imgr/3b4e4306d4cc2317bd271888532405a0/650-","fp3dfurnished":"http://image1.rent.com/imgr/8ca6a08b9c4eed76575520b4f1dcc03c/650-","floorplanimage":"http://image.rent.com/imgr/f25bcd28009d72a91f02d4e125340b65/650-","1159268":"availability":"1 Unit Available","availability_class":"floorplan-available-now","unitstyle":"cdta1- 3 Bed/3 Bath Office TH","deposit":"","floorplan_id":1159268,"bed":"3 beds","listing_id":"571535","bath":"3 baths","sqft":"2100 sqft","rent":"$3798 - $5073 /mo","propertyname":"Reading Commons","fp3dunfurnished":"http://image1.rent.com/imgr/82ba57c2f1be5071c3d5f48a79c9d45e/650-","fp3dfurnished":"http://image.rent.com/imgr/bc7908ca722b6f9407a247ebf7af49bd/650-","floorplanimage":"http://image.rent.com/imgr/3c881fbe1aba5ba7be68ca6399e7daa3/650-","1159269":"availability":"1 Unit Available","availability_class":"floorplan-available-now","unitstyle":"cdta2- 3 Bed/3 Bath Office TH","deposit":"","floorplan_id":1159269,"bed":"3 beds","listing_id":"571535","bath":"3 baths","sqft":"2310 sqft","rent":"$3908 - $4995 /mo","propertyname":"Reading Commons","fp3dunfurnished":"http://image1.rent.com/imgr/86b5248dfbaef2534218a8bdb724d93e/650-","fp3dfurnished":"http://image.rent.com/imgr/ee01414c664925a3463bad279f943363/650-","floorplanimage":"http://image.rent.com/imgr/ba58885223be2f4f8bfd1588d9ddca9e/650-","reviews":"startingrecordnumber":1,"totalnumberofmatchingrecords":18,"numberofrecordsreturned":10,"numberofpages":2,"endingrecordnumber":10,"pagenumber":1,"numberofrecordsperpage":10,"listing":"id":"571535","name":"Reading Commons","address_full":"7 Archstone Circle, Reading, MA 01867","phone_desktop":"(781) 205-2341","visits_enabled":true;
window.Rent.mapbox_api_key = "pk.eyJ1IjoibmhnbWFwYm94IiwiYSI6ImNpb2VrYW5uazAwbHp5OG0yYmp6bms5bjYifQ.4RylIPWDNDEie2NreUsbig";
window.Rent.asset_host = "rent.assets.rentpathcdn.com";
window.zutron_host = "http://zutron.primedia.com";
window.ONESEARCH_URL = "http://onesearch.svc.primedia.com";
window.Rent.pageType = "pdp";
// these two globals are used in onesearch.js, not sure where else
window.channel = "apartments";
window.APPLICATION = "rent";
window.googletag = window.googletag || ;
window.googletag.cmd = window.googletag.cmd || [];
// SID is used by the Moving Leads Service
window.Rent.MOVING_LEADS_SID = 96;</script>
我能够通过 BeautifulSoup 提取出这段 JS,现在想获取与 `window.Rent.data` 键对应的 JSON 字符串。
有没有办法不借助 `re`(正则表达式)就做到这一点?
【问题讨论】:
【参考方案1】:思路是使用带有捕获组的正则表达式模式:先用该正则表达式按文本内容定位 `script` 元素,再从脚本文本中提取匹配到的子字符串,最后用 `json.loads()` 将这个 JSON 字符串加载为 Python 对象:
import json
import re
from bs4 import BeautifulSoup
# Raw HTML of the page; replace the placeholder with the real document.
data = """
your html here"""
soup = BeautifulSoup(data, "html.parser")
# Capture the object literal assigned to window.Rent.data.
# - The dots in the property path are escaped so they match literally.
# - The lazy `.*?` stops at the first `};` that is followed by a newline,
#   i.e. the end of that assignment statement.
# - `.` does not match newlines by default, so the whole literal is expected
#   to sit on a single line of the script.
pattern = re.compile(r"window\.Rent\.data\s+=\s+(\{.*?\});\n")
# Passing the compiled pattern as `text` selects the <script> element whose
# text content matches it.
script = soup.find("script", text=pattern)
if script is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("no <script> element assigns window.Rent.data")
# Group 1 is the JSON text itself, without the trailing `;`.
data = pattern.search(script.text).group(1)
data = json.loads(data)
print(data)
还有另一种方法——使用 JavaScript 解析器:我曾在其他几个回答中多次尝试过 slimit,可以参考一下。
【讨论】:
以上是关于BeautifulSoup - 从 JS 中提取 json的主要内容,如果未能解决你的问题,请参考以下文章
使用 BeautifulSoup 从 img 标签中提取 src 属性