VBA web Scraping - 将 HTMLdoc 转换为 XML,但在单击按钮时遇到错误
Posted
技术标签:
【中文标题】VBA web Scraping - 将 HTMLdoc 转换为 XML,但在单击按钮时遇到错误【英文标题】:VBA web Scraping - Shifting HTMLdoc to XML but facing errors while clicking on a button 【发布时间】:2021-10-30 21:08:37 【问题描述】:我有一个MShtml.HTMLDocument
代码:
打开页面"https://www.ksestocks.com/HistoryHighLow"
填充输入,即786
然后点击按钮获取表格
我使用以下代码捕获了一行及其 4 个孩子
Sub KSE_GetHTMLDocument()
Dim IE As New SHDocVw.InternetExplorer
Dim HTMLDOC As MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLClasses As MSHTML.IHTMLElementCollection
Dim HTMLClass As MSHTML.IHTMLElement
Dim HTMLCel As MSHTML.IHTMLElement
Dim colNum, rowNum, RowN, C As Integer
Dim Cel As Range
IE.Visible = False
IE.Navigate "https://www.ksestocks.com/HistoryHighLow"
Do While IE.ReadyState <> READYSTATE_COMPLETE
Loop
For Each Cel In Sheets("Sheet1").Range("A3:A" & Cells(Rows.Count, 1).End(xlUp).Row)
If IsEmpty(Cel.Value) = False Then
Set HTMLDOC = IE.Document
Set HTMLInput = HTMLDOC.getElementById("selscrip")
HTMLInput.Value = Trim(Cel.Value)
Debug.Print Cel.Value
HTMLDOC.getElementsByTagName("input")(0).Click
While IE.Busy Or IE.readyState < 4: DoEvents: Wend
C = 0
For Each HTMLClass In HTMLDOC.getElementsByTagName("tr")
If InStr(HTMLClass.innerText, "Last 3 years (") > 0 Then
If Left(HTMLClass.innerText, 14) = "Last 3 years (" Then
For Each HTMLCel In HTMLClass.Children
Debug.Print HTMLCel.innerText
If C = 1 Then
Cel.Offset(0, 7).Value = HTMLCel.innerText
ElseIf C = 2 Then
Cel.Offset(0, 8).Value = HTMLCel.innerText
ElseIf C = 3 Then
Cel.Offset(0, 9).Value = HTMLCel.innerText
ElseIf C = 4 Then
Cel.Offset(0, 10).Value = HTMLCel.innerText
End If
C = C + 1
Next
End If
End If
Next
End If
Next
End Sub
上面的代码在从网站获取值时运行良好,但是当我将代码更改为 XML
时,它停止工作,并且 Internet Explorer 每次都弹出一个新窗口,但没有结果。
我哪里做错了?
有没有更健壮的网页抓取方式?
运行前请检查以下代码
Sub KSE_Get_XML()
Dim XMLp As New MSXML2.XMLHTTP60
Dim HTMLDOC As New MSHTML.HTMLDocument
Dim HTMLInput As MSHTML.IHTMLElement
Dim HTMLClasses As MSHTML.IHTMLElementCollection
Dim HTMLClass As MSHTML.IHTMLElement
Dim HTMLCel As MSHTML.IHTMLElement
Dim colNum, rowNum, RowN, C As Integer
XMLp.Open "GET", "https://www.ksestocks.com/HistoryHighLow", False
XMLp.send
HTMLDOC.body.innerHTML = XMLp.responseText
Dim Cel As Range
' Do While HTMLDOC.ReadyState <> READYSTATE_COMPLETE
' Loop
For Each Cel In Sheets("Sheet1").Range("A3:A" & Cells(Rows.Count, 1).End(xlUp).Row)
If IsEmpty(Cel.Value) = False Then
HTMLDOC.body.innerHTML = XMLp.responseText
Set HTMLInput = HTMLDOC.getElementById("selscrip")
HTMLInput.Value = Trim(Cel.Value)
Debug.Print Cel.Value
HTMLDOC.getElementsByTagName("input")(0).Click
'Application.Wait Now + TimeValue("00:00:01")
'' Do While HTMLDOC.ReadyState <> READYSTATE_COMPLETE
' DoEvents
' Loop
C = 0
For Each HTMLClass In HTMLDOC.getElementsByTagName("tr")
If InStr(HTMLClass.innerText, "Last 3 years (") > 0 Then
If Left(HTMLClass.innerText, 14) = "Last 3 years (" Then
For Each HTMLCel In HTMLClass.Children
Debug.Print HTMLCel.innerText
If C = 1 Then
Cel.Offset(0, 7).Value = HTMLCel.innerText
ElseIf C = 2 Then
Cel.Offset(0, 8).Value = HTMLCel.innerText
ElseIf C = 3 Then
Cel.Offset(0, 9).Value = HTMLCel.innerText
ElseIf C = 4 Then
Cel.Offset(0, 10).Value = HTMLCel.innerText
End If
C = C + 1
Next
End If
End If
Next
End If
Next
End Sub
【问题讨论】:
【参考方案1】:完全摆脱 IE 并切换到 xmlhttp 请求,这很健壮且不易出错。当您使用 xhr 时,您需要发出带有适当参数的 post http 请求。您可以执行此操作以从该表中获取位于 Last 3 years (1 Sep 2018 - 1 Sep 2021)
旁边的结果。
Public Sub GetContent()
Const Url = "https://www.ksestocks.com/HistoryHighLow"
Dim Http As Object, Html As HTMLDocument, Htmldoc As HTMLDocument
Dim params$, I&, R&, ws As Worksheet, searchKeyword$
Set Html = New HTMLDocument
Set Htmldoc = New HTMLDocument
Set Http = CreateObject("MSXML2.XMLHTTP")
Set ws = ThisWorkbook.Worksheets("Sheet1")
R = 2
searchKeyword = "786" 'you can use different search keywords here to get related results
params = "selscrip=" & searchKeyword
With Http
.Open "POST", Url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
.setRequestHeader "content-type", "application/x-www-form-urlencoded; charset=UTF-8"
.send (params)
Html.body.innerHTML = .responseText
End With
With Html.querySelectorAll("td.plain")
For I = 0 To .Length - 1
If InStr(.item(I).innerText, "Last 3 years") > 0 Then
Htmldoc.body.innerHTML = "<table>" & .item(I).ParentNode.outerHTML & "</table>"
ws.Cells(R, 1) = Htmldoc.querySelectorAll("td.plain")(1).innerText
ws.Cells(R, 2) = Htmldoc.querySelectorAll("td.plain")(2).innerText
ws.Cells(R, 3) = Htmldoc.querySelectorAll("td.plain")(3).innerText
ws.Cells(R, 4) = Htmldoc.querySelectorAll("td.plain")(4).innerText
End If
Next I
End With
End Sub
参考添加:
1. Microsoft XML, v6.0
2. Microsoft HTML Object Library
您的搜索关键字将是您在 this image 中看到的内容。
【讨论】:
如果由于某种原因脚本不适合您,那只是因为我们的 excel 版本的变化。顺便说一句,我使用的是 Microsoft Office 2013。以上是关于VBA web Scraping - 将 HTMLdoc 转换为 XML,但在单击按钮时遇到错误的主要内容,如果未能解决你的问题,请参考以下文章
Python Web-scraping多页表到csv和DF进行分析