XHTML 网站抓取指南
Posted
技术标签:
【中文标题】XHTML 网站抓取指南【英文标题】:XHTML Website Scraping Guidance 【发布时间】:2016-10-14 07:41:12 【问题描述】:我对 VBA 和 html/XHTML 非常陌生,但是通过在线研究和这里其他优秀成员的帮助,我设法编写了一个代码来提取我想要的数据。我很难确定我想要的元素的 ID,因为它在 XHTML 中,所以我认为这是我最糟糕的地方。
网站:http://www.usbanklocations.com/banks.php?q=&ct=&ml=30&lc=
这是我希望代码执行的操作: 提取银行名称、地址、电话号码、总存款和总资产——给定我在 Excel 表中提供的银行名称和城市。
这是我的代码:
Private Declare Sub Sleep Lib "kernel32.dll" (ByVal dwMilliseconds As Long)
Sub CommunityBanks()
Dim IE As Object, TableResults As Object, webRow As Object, BankName As Variant, page As Long, pageTotal As Long, r As Long
Dim beginTime As Date, i As Long, myvalue As Variant
Set IE = CreateObject("internetexplorer.application")
IE.navigate "http://www.usbanklocations.com/banks.php?name=" & Range("A2").Value & "+Bank&ml=30&lc=" & Range("B2").Value & "%2C+TX"
IE.Visible = True
Do While IE.Busy Or IE.readystate <> 4 '4 = READYSTATE_COMPLETE
DoEvents
Loop
'input bank name into form
'myvalue = InputBox("Enter City. Press okay to begin search", "Bank Search")
'Range("F3").Value = myvalue
'IE.document.getelementbyid("MainContent_txtCity").Value = "LegacyTexas"
'click find button
'IE.document.getelementbyid("MainContent_btn").Click
'Sleep 5 * 1000
IE.document.getelementbytagname("table").getelementsbyclassname("btn").Click
Sleep 5 * 1000
'total pages
pageTotal = IE.document.getelementbyid("lsortby").innertext
page = 0
Do Until page = pageTotal
DoEvents
page = IE.document.getelementbyclassname("lsortby").innertext
With IE.document.getelementbyid("main")
For r = 1 To .Rows.Length - 1
If Not IsArray(BankName) Then
ReDim BankName(7, 0) As Variant
Else
ReDim Preserve BankName(7, UBound(BankName, 2) + 1) As Variant
End If
BankName(0, UBound(BankName, 2)) = .Rows(r).Cells(0).innertext
Next r
End With
If page < pageTotal Then
IE.document.getelementbyclassname("panelpn").Click
beginTime = Now
Application.Wait (Now + TimeValue("00:00:05"))
End If
Loop
For r = 0 To UBound(BankName, 2)
IE.navigate "http://www.usbanklocations.com/" & BankName(0, r)
Do While IE.Busy Or IE.readystate <> 4 '4 = READYSTATE_COMPLETE
DoEvents
Loop
'wait 5 sec. for screen refresh
Sleep 5 * 1000
With IE.document.getelementbytagname("table")
For i = 0 To .Rows.Length - 1
DoEvents
Select Case .Rows(i).Cells(0).innertext
Case "Name:"
BankName(1, r) = .Rows(i).Cells(1).innertext
Case "Location:"
BankName(2, r) = .Rows(i).Cells(1).innertext
Case "Phone:"
BankName(3, r) = .Rows(i).Cells(1).innertext
Case "Branch Deposit:"
BankName(4, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "")
Case "Total Assets:"
BankName(5, r) = Replace(Replace(.Rows(i).Cells(1).innertext, ",", ""), "$", "")
End Select
Next i
End With
Next r
IE.Quit
Set IE = Nothing
'post result on Excel cell
Worksheets(1).Range("A9").Resize(UBound(BankName, 2) + 1, UBound(BankName, 1) + 1).Value = Application.Transpose(BankName)
End Sub
提前感谢您!我将不胜感激任何帮助。
【问题讨论】:
ToS for usbanklocations.com 声明用户不能aggregate, copy or duplicate content on USBANKLOCATIONS.COM
- 所以我很确定你不应该抓取他们的网站......
通过“on”,他们指的是专门针对他们网站的操作。不是用户可以使用的内容。您可以复制/粘贴信息。
好的——我一般不会为了谨慎起见而参与抓取问题。我只是在你不知道的情况下指出,但如果你很高兴这很好,那就足够公平了。
谢谢@MacroMan!感谢您指出这一点。
.getelementbyvalue
、.getelementbyclass
、.getelementbytag
不是有效的方法。 .getElementsByClassName
, .getElementsByTagName
返回由类和标签名称选择的节点集合。没有本地函数可以通过节点的值来检索节点。
【参考方案1】:
考虑以下使用 XHR 而不是 IE 和基于拆分的 HTML 内容解析的示例:
Option Explicit
Sub Test_usbanklocations()
Dim oSource, oDestination, y, oSrcRow, sName, sCity, sDist, sUrl0, sUrl1, sUrl2, lPage, sResp1, sResp2, i, a1, a2, a3, a4, a5
Set oSource = Sheets(1)
Set oDestination = Sheets(2)
oDestination.Cells.Delete
DataOutput oDestination, 1, Array("Name", "Location", "Phone", "Total Assets", "Total Deposits")
y = 2
For Each oSrcRow In oSource.UsedRange.Rows
sName = oSrcRow.Cells(1, 1).Value
sCity = oSrcRow.Cells(1, 2).Value
sDist = oSrcRow.Cells(1, 3).Value
sUrl0 = "http://www.usbanklocations.com/banks.php?q=" & EncodeUriComponent(sName) & "&lc=" & EncodeUriComponent(sCity) & "&ml=" & sDist
sUrl1 = sUrl0
lPage = 1
Do
sResp1 = GetXHR(sUrl1)
If InStr(sResp1, "We can not find the address you provided. Please check.") > 0 Then Exit Do
a1 = Split(sResp1, "<div class=""pl")
For i = 1 To UBound(a1)
a2 = Split(a1(i), "</div>", 3)
a3 = Split(a2(1), "<a href=""", 2)
a4 = Split(a3(1), """>", 2)
sUrl2 = "http://www.usbanklocations.com" & a4(0)
sResp2 = GetXHR(sUrl2)
a5 = Array( _
GetFragment(sResp2, "<b>Name:</b></td><td>", "</td>"), _
Replace(GetFragment(sResp2, "<b>Location:</b></td><td>", "</td>"), "View Other Branches", ""), _
GetFragment(sResp2, "<b>Phone:</b></td>", "</td>"), _
GetFragment(sResp2, "<b>Total Assets:</b></td><td>", "</td>"), _
GetFragment(sResp2, "<b>Total Deposits:</b></td><td>", "</td>") _
)
DataOutput oDestination, y, a5
y = y + 1
DoEvents
Next
If InStr(sResp1, "Next Page >") = 0 Then Exit Do
lPage = lPage + 1
sUrl1 = sUrl0 & "&ps=" & lPage
DoEvents
Loop
Next
MsgBox "Completed"
End Sub
Function GetXHR(sUrl)
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sUrl, False
.Send
GetXHR = .ResponseText
End With
End Function
Sub DataOutput(oSht, y, aValues)
With oSht.Cells(y, 1).Resize(1, UBound(aValues) + 1)
.NumberFormat = "@"
.Value = aValues
End With
End Sub
Function GetFragment(sText, sPatt1, sPatt2)
Dim a1, a2
a1 = Split(sText, sPatt1, 2)
If UBound(a1) <> 1 Then Exit Function
a2 = Split(a1(1), sPatt2, 2)
If UBound(a2) <> 1 Then Exit Function
GetFragment = GetInnerText(a2(0))
End Function
Function EncodeUriComponent(sText)
Static objHtmlfile As Object
If objHtmlfile Is Nothing Then
Set objHtmlfile = CreateObject("htmlfile")
objHtmlfile.parentWindow.execScript "function encode(s) return encodeURIComponent(s)", "jscript"
End If
EncodeUriComponent = objHtmlfile.parentWindow.encode(sText)
End Function
Function GetInnerText(sText)
With CreateObject("htmlfile")
.Write ("<body>" & sText & "</body>")
GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText
End With
End Function
例如,第一个工作表包含要搜索的数据(银行名称、位置和距离):
那么第二张工作表上的结果如下:
【讨论】:
你太棒了@omegastripes!这种 XHR/api 方法是一个很好的基础。非常感谢。实际上,我只是熟悉 XHR,这将是我第一次以这种格式查看代码。我注意到它对于大量数据集要快得多。非常感谢。 @K.K.顺便说一句,使 XHR 异步可以实现更高的速度,但是代码应该可以处理事件。 @omegastripes,感谢您的代码。这对我来说是一个全新的技能。我从中学到了。 感谢@omegastripes 和pcw,我真的很感激。我现在正在搞乱代码。速度惊人 谢谢@pcw。你帮了大忙以上是关于XHTML 网站抓取指南的主要内容,如果未能解决你的问题,请参考以下文章