使用VBA宏遍历javascrape网页上的每个表
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用VBA宏遍历javascrape网页上的每个表相关的知识,希望对你有一定的参考价值。
我正在尝试从网站上抓取多个表格。到目前为止,我已经构建了一个excel VBA宏来执行此操作。我还想出了如何在网站的多个页面上获取所有数据。例如,如果我有1000个结果,但每页显示50个。问题是我在多个页面上有相同的5个表,因为每个表有1000个结果。
我的代码只能遍历1个表的每个页面。我也有编写代码来抓取每个表,但我无法弄清楚如何为50个搜索结果(每个页面)中的每一个执行此操作。
如何遍历多个表并单击流程中的下一页以捕获所有数据?
Sub ETFDat()
Dim IE As Object
Dim i As Long
Dim strText As String
Dim jj As Long
Dim hBody As Object
Dim hTR As Object
Dim hTD As Object
Dim tb As Object
Dim bb As Object
Dim Tr As Object
Dim Td As Object
Dim ii As Long
Dim doc As Object
Dim hTable As Object
Dim y As Long
Dim z As Long
Dim wb As Excel.Workbook
Dim ws As Excel.Worksheet
Set wb = Excel.ActiveWorkbook
Set ws = wb.ActiveSheet
Set IE = CreateObject("InternetExplorer.Application")
IE.Visible = True
y = 1 'Column A in Excel
z = 1 'Row 1 in Excel
Sheets("Fund Basics").Activate
Cells.Select
Selection.Clear
IE.navigate "http://www.etf.com/channels/smart-beta-etfs/channels/smart- beta-etfs?qt-tabs=0#qt-tabs" ', , , , "Content-Type: application/x-www-form-urlencoded" & vbCrLf
Do While IE.busy: DoEvents: Loop
Do While IE.ReadyState <> 4: DoEvents: Loop
Set doc = IE.document
Set hTable = doc.getElementsByTagName("table") '.GetElementByID("tablePerformance")
ii = 1
Do While ii <= 17
For Each tb In hTable
Set hBody = tb.getElementsByTagName("tbody")
For Each bb In hBody
Set hTR = bb.getElementsByTagName("tr")
For Each Tr In hTR
Set hTD = Tr.getElementsByTagName("td")
y = 1 ' Resets back to column A
For Each Td In hTD
ws.Cells(z, y).Value = Td.innerText
y = y + 1
Next Td
DoEvents
z = z + 1
Next Tr
Exit For
Next bb
Exit For
Next tb
With doc
Set elems = .getElementsByTagName("a")
For Each e In elems
If (e.getAttribute("id") = "nextPage") Then
e.Click
Exit For
End If
Next e
End With
ii = ii + 1
Application.Wait (Now + TimeValue("00:00:05"))
Loop
MsgBox "Done"
End Sub
答案
有一个示例显示如何使用XHR和JSON解析从网站检索数据,它包含几个步骤。
- 检索数据。
我使用Chrome开发者工具网络选项卡查看了XHR的一些内容。我找到的大多数相关数据是我单击下一页按钮后由http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/50/50/1从GET XHR返回的JSON字符串:
响应具有以下结构for single row item:
[
{
"productId": 576,
"fund": "iShares Russell 1000 Value ETF",
"ticker": "IWD",
"inceptionDate": "2000-05-22",
"launchDate": "2000-05-22",
"hasSegmentReport": "true",
"genericReport": "false",
"hasReport": "true",
"fundsInSegment": 20,
"economicDevelopment": "Developed Markets",
"totalRows": 803,
"fundBasics": {
"issuer": "<a href='/channels/blackrock-etfs' alt='BlackRock'>BlackRock</a>",
"expenseRatio": {
"value": 20
},
"aum": {
"value": 36957230250
},
"spreadPct": {
"value": 0.000094
},
"segment": "Equity: U.S. - Large Cap Value"
},
"performance": {
"priceTrAsOf": "2017-02-27",
"priceTr1Mo": {
"value": 0.031843
},
"priceTr3Mo": {
"value": 0.070156
},
"priceTr1Yr": {
"value": 0.281541
},
"priceTr3YrAnnualized": {
"value": 0.099171
},
"priceTr5YrAnnualized": {
"value": 0.13778
},
"priceTr10YrAnnualized": {
"value": 0.061687
}
},
"analysis": {
"analystPick": null,
"opportunitiesList": null,
"letterGrade": "A",
"efficiencyScore": 97.977103,
"tradabilityScore": 99.260541,
"fitScore": 84.915658,
"leveragedFactor": null,
"exposureReset": null,
"avgDailyDollarVolume": 243848188.037378,
"avgDailyShareVolume": 2148400.688889,
"spread": {
"value": 0.010636
},
"fundClosureRisk": "Low"
},
"fundamentals": {
"dividendYield": {
"value": 0.021543
},
"equity": {
"pe": 27.529645,
"pb": 1.964124
},
"fixedIncome": {
"duration": null,
"creditQuality": null,
"ytm": {
"value": null
}
}
},
"classification": {
"assetClass": "Equity",
"strategy": "Value",
"region": "North America",
"geography": "U.S.",
"category": "Size and Style",
"focus": "Large Cap",
"niche": "Value",
"inverse": "false",
"leveraged": "false",
"etn": "false",
"selectionCriteria": "Multi-Factor",
"weightingScheme": "Multi-Factor",
"activePerSec": "false",
"underlyingIndex": "Russell 1000 Value Index",
"indexProvider": "Russell",
"brand": "iShares"
},
"tax": {
"legalStructure": "Open-Ended Fund",
"maxLtCapitalGainsRate": 20,
"maxStCapitalGainsRate": 39.6,
"taxReporting": "1099"
}
}
]
- 属性
"totalRows": 803
指定总行数。因此,为了尽可能快地进行数据检索,最好使请求获得第一行。正如您在URL中看到的那样,有../-aum/50/50/..
尾部,它指向排序顺序,要开始的项目以及要返回的总项目。因此,要获得唯一的行应该是http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1
- 解析检索到的JSON,从
totalRows
属性中获取总行数。 - 另一个请求获取整个表。
- 解析整个表JSON,将其转换为2d数组并输出。您可以通过直接访问阵列来执行进一步处理。
对于下表:
结果表包含803行和带有列的标题,如下所示:
productId
fund
ticker
inceptionDate
launchDate
hasSegmentReport
genericReport
hasReport
fundsInSegment
economicDevelopment
totalRows
fundBasics_issuer
fundBasics_expenseRatio_value
fundBasics_aum_value
fundBasics_spreadPct_value
fundBasics_segment
performance_priceTrAsOf
performance_priceTr1Mo_value
performance_priceTr3Mo_value
performance_priceTr1Yr_value
performance_priceTr3YrAnnualized_value
performance_priceTr5YrAnnualized_value
performance_priceTr10YrAnnualized_value
analysis_analystPick
analysis_opportunitiesList
analysis_letterGrade
analysis_efficiencyScore
analysis_tradabilityScore
analysis_fitScore
analysis_leveragedFactor
analysis_exposureReset
analysis_avgDailyDollarVolume
analysis_avgDailyShareVolume
analysis_spread_value
analysis_fundClosureRisk
fundamentals_dividendYield_value
fundamentals_equity_pe
fundamentals_equity_pb
fundamentals_fixedIncome_duration
fundamentals_fixedIncome_creditQuality
fundamentals_fixedIncome_ytm_value
classification_assetClass
classification_strategy
classification_region
classification_geography
classification_category
classification_focus
classification_niche
classification_inverse
classification_leveraged
classification_etn
classification_selectionCriteria
classification_weightingScheme
classification_activePerSec
classification_underlyingIndex
classification_indexProvider
classification_brand
tax_legalStructure
tax_maxLtCapitalGainsRate
tax_maxStCapitalGainsRate
tax_taxReporting
将以下代码放入VBA Project标准模块:
Option Explicit
Sub GetData()
Dim sJSONString As String
Dim vJSON As Variant
Dim sState As String
Dim lRowsQty As Long
Dim aData()
Dim aHeader()
' Download and parse the only first row to get total rows qty
sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/1/1")
JSON.Parse sJSONString, vJSON, sState
lRowsQty = vJSON(0)("totalRows")
' Download and parse the entire data
sJSONString = GetXHR("http://www.etf.com/etf-finder-channel-tag/Smart-Beta%20ETFs/-aum/0/" & lRowsQty & "/1")
JSON.Parse sJSONString, vJSON, sState
' Convert JSON to 2d array
JSON.ToArray vJSON, aData, aHeader
' Output
With Sheets(1)
.Cells.Delete
OutputArray .Cells(1, 1), aHeader
Output2DArray .Cells(2, 1), aData
.Cells.Columns.AutoFit
End With
End Sub
Function GetXHR(sURL As String) As String
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sURL, False
.Send
GetXHR = .responseText
End With
End Function
Sub OutputArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
1, _
UBound(aCells) - LBound(aCells) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
再创建一个标准模块,将其命名为JSON
并将下面的代码放入其中,此代码提供JSON处理功能:
Option Explicit
Private sBuffer As String
Private oTokens As Object
Private oRegEx As Object
Private bMatch As Boolean
Private oChunks As Object
Private oHeader As Object
Private aData() As Variant
Private i As Long
Sub Parse(ByVal sSample As String, vJSON As Variant, sState As String)
' Backus–Naur form JSON parser implementation based on RegEx
' Input:
' sSample - source JSON string
' Output:
' vJson - created object or array to be returned as result
' sState - string Object|Array|Error depending on processing
sBuffer = sSample
Set oTokens = Cre以上是关于使用VBA宏遍历javascrape网页上的每个表的主要内容,如果未能解决你的问题,请参考以下文章