VBA web Scraping - 将 HTMLdoc 转换为 XML,但在单击按钮时遇到错误

Posted

技术标签:

【中文标题】VBA web Scraping - 将 HTMLdoc 转换为 XML,但在单击按钮时遇到错误【英文标题】:VBA web Scraping - Shifting HTMLdoc to XML but facing errors while clicking on a button 【发布时间】:2021-10-30 21:08:37 【问题描述】:

我有一个MShtml.HTMLDocument 代码:

    打开页面"https://www.ksestocks.com/HistoryHighLow"

    填充输入,即786

    然后点击按钮获取表格

    我使用以下代码捕获了一行及其 4 个孩子

    Sub KSE_GetHTMLDocument()
    
     Dim IE As New SHDocVw.InternetExplorer
     Dim HTMLDOC As MSHTML.HTMLDocument
     Dim HTMLInput As MSHTML.IHTMLElement
     Dim HTMLClasses As MSHTML.IHTMLElementCollection
     Dim HTMLClass As MSHTML.IHTMLElement
     Dim HTMLCel As MSHTML.IHTMLElement
     Dim colNum, rowNum, RowN, C As Integer
    
     Dim Cel As Range
    
     IE.Visible = False
     IE.Navigate "https://www.ksestocks.com/HistoryHighLow"
    
     Do While IE.ReadyState <> READYSTATE_COMPLETE
     Loop
    
     For Each Cel In Sheets("Sheet1").Range("A3:A" & Cells(Rows.Count, 1).End(xlUp).Row)
     If IsEmpty(Cel.Value) = False Then
    
         Set HTMLDOC = IE.Document
         Set HTMLInput = HTMLDOC.getElementById("selscrip")
    
         HTMLInput.Value = Trim(Cel.Value)
         Debug.Print Cel.Value
         HTMLDOC.getElementsByTagName("input")(0).Click
    
         While IE.Busy Or IE.readyState < 4: DoEvents: Wend
    
         C = 0
         For Each HTMLClass In HTMLDOC.getElementsByTagName("tr")
             If InStr(HTMLClass.innerText, "Last 3 years (") > 0 Then
                 If Left(HTMLClass.innerText, 14) = "Last 3 years (" Then
                         For Each HTMLCel In HTMLClass.Children
                             Debug.Print HTMLCel.innerText
                             If C = 1 Then
                             Cel.Offset(0, 7).Value = HTMLCel.innerText
                             ElseIf C = 2 Then
                             Cel.Offset(0, 8).Value = HTMLCel.innerText
                             ElseIf C = 3 Then
                             Cel.Offset(0, 9).Value = HTMLCel.innerText
                             ElseIf C = 4 Then
                             Cel.Offset(0, 10).Value = HTMLCel.innerText
                             End If
                             C = C + 1
                         Next
                 End If
             End If
         Next
    End If
    Next    
    End Sub
    

上面的代码在从网站获取值时运行良好,但是当我将代码更改为 XML 时,它停止工作,并且 Internet Explorer 每次都弹出一个新窗口,但没有结果。

我哪里做错了?

有没有更健壮的网页抓取方式?

运行前请检查以下代码

Sub KSE_Get_XML()
    
    Dim XMLp As New MSXML2.XMLHTTP60
    Dim HTMLDOC As New MSHTML.HTMLDocument
    
    Dim HTMLInput As MSHTML.IHTMLElement
    
    Dim HTMLClasses As MSHTML.IHTMLElementCollection
    Dim HTMLClass As MSHTML.IHTMLElement
    
    Dim HTMLCel As MSHTML.IHTMLElement
    
    Dim colNum, rowNum, RowN, C As Integer
    
    XMLp.Open "GET", "https://www.ksestocks.com/HistoryHighLow", False
    XMLp.send
    
    HTMLDOC.body.innerHTML = XMLp.responseText
    
    Dim Cel As Range
    
 '   Do While HTMLDOC.ReadyState <> READYSTATE_COMPLETE
  '  Loop
    
    For Each Cel In Sheets("Sheet1").Range("A3:A" & Cells(Rows.Count, 1).End(xlUp).Row)
    If IsEmpty(Cel.Value) = False Then
        
        HTMLDOC.body.innerHTML = XMLp.responseText
        Set HTMLInput = HTMLDOC.getElementById("selscrip")

        HTMLInput.Value = Trim(Cel.Value)
        Debug.Print Cel.Value
        HTMLDOC.getElementsByTagName("input")(0).Click
        
        'Application.Wait Now + TimeValue("00:00:01")
       '' Do While HTMLDOC.ReadyState <> READYSTATE_COMPLETE
       '     DoEvents
      '  Loop

        C = 0
        For Each HTMLClass In HTMLDOC.getElementsByTagName("tr")
            If InStr(HTMLClass.innerText, "Last 3 years (") > 0 Then
                If Left(HTMLClass.innerText, 14) = "Last 3 years (" Then
                        For Each HTMLCel In HTMLClass.Children
                            Debug.Print HTMLCel.innerText
                            If C = 1 Then
                            Cel.Offset(0, 7).Value = HTMLCel.innerText
                            ElseIf C = 2 Then
                            Cel.Offset(0, 8).Value = HTMLCel.innerText
                            ElseIf C = 3 Then
                            Cel.Offset(0, 9).Value = HTMLCel.innerText
                            ElseIf C = 4 Then
                            Cel.Offset(0, 10).Value = HTMLCel.innerText
                            End If
                            C = C + 1
                        Next
                End If
            End If
            
            
        Next
   End If
   Next

End Sub

【问题讨论】:

【参考方案1】:

完全摆脱 IE 并切换到 xmlhttp 请求,这很健壮且不易出错。当您使用 xhr 时,您需要发出带有适当参数的 post http 请求。您可以执行此操作以从该表中获取位于 Last 3 years (1 Sep 2018 - 1 Sep 2021) 旁边的结果。

Public Sub GetContent()
    Const Url = "https://www.ksestocks.com/HistoryHighLow"
    Dim Http As Object, Html As HTMLDocument, Htmldoc As HTMLDocument
    Dim params$, I&, R&, ws As Worksheet, searchKeyword$
    
    Set Html = New HTMLDocument
    Set Htmldoc = New HTMLDocument
    Set Http = CreateObject("MSXML2.XMLHTTP")
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    R = 2
    
    searchKeyword = "786"   'you can use different search keywords here to get related results
    
    params = "selscrip=" & searchKeyword
    
    With Http
        .Open "POST", Url, False
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
        .setRequestHeader "content-type", "application/x-www-form-urlencoded; charset=UTF-8"
        .send (params)
        Html.body.innerHTML = .responseText
    End With

    With Html.querySelectorAll("td.plain")
        For I = 0 To .Length - 1
            If InStr(.item(I).innerText, "Last 3 years") > 0 Then
                Htmldoc.body.innerHTML = "<table>" & .item(I).ParentNode.outerHTML & "</table>"
                ws.Cells(R, 1) = Htmldoc.querySelectorAll("td.plain")(1).innerText
                ws.Cells(R, 2) = Htmldoc.querySelectorAll("td.plain")(2).innerText
                ws.Cells(R, 3) = Htmldoc.querySelectorAll("td.plain")(3).innerText
                ws.Cells(R, 4) = Htmldoc.querySelectorAll("td.plain")(4).innerText
            End If
        Next I
    End With
End Sub

参考添加:

1. Microsoft XML, v6.0
2. Microsoft HTML Object Library

您的搜索关键字将是您在 this image 中看到的内容。

【讨论】:

如果由于某种原因脚本不适合您,那只是因为我们的 excel 版本的变化。顺便说一句,我使用的是 Microsoft Office 2013。

以上是关于VBA web Scraping - 将 HTMLdoc 转换为 XML,但在单击按钮时遇到错误的主要内容,如果未能解决你的问题,请参考以下文章

Python中的Web Scraping

将 Web Scraping 的结果存储到数据库中

多处理在python web-scraping中不起作用

Python Web-scraping多页表到csv和DF进行分析

使用无头浏览器进行 Android Web Scraping [关闭]

阅读OReilly.Web.Scraping.with.Python.2015.6笔记---Crawl