爬去网页离线数据
Posted bignine
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬去网页离线数据相关的知识,希望对你有一定的参考价值。
重命名文件
On Error Resume Next
Dim dd As String
Dim k%
‘‘提取文件夹名称
dd = Dir(Sheets("Sheet1").Cells(1, 2) & "*", vbDirectory)
Do
dd = Dir
‘‘判断是否为文件夹
If dd <> "" And InStr(1, dd, ".") = 0 Then
Dim aa
Set aa = CreateObject("Scripting.FileSystemObject")
k = k + 1
‘‘文件夹重命名
aa.MoveFolder Sheets("Sheet1").Cells(1, 2) & dd, Sheets("Sheet1").Cells(1, 2) & "改名" & k
End If
Loop Until Len(dd) = 0
Set aa = Nothing
‘爬去数据
Dim arr, brr, i%, s$, html, Ta, n%, j%, str$, Url$, Db, tr, td
tempPath = Cells(1, 2)
If Mid(tempPath, Len(tempPath), 1) <> "" Then
tempPath = tempPath & ""
End If
If Dir(tempPath, vbDirectory) = "" Then
MsgBox "错误!需要处理的文件目录不存在 " & tempPath
Exit Sub
End If
Dim fn
fn = Dir(tempPath & "*.htm")
Do While fn <> ""
Set html = CreateObject("htmlfile")
dataTxt = GetCode("UTF-8", tempPath & fn) ‘tempPath & fn
html.body.innerhtml = dataTxt
If (InStr(dataTxt, "tooltip-title") = 0) Then
oneclick dataTxt, onenum
onenum = onenum + 1
Else
twoclick dataTxt, twonum
twonum = twonum + 1
End If
fn = Dir()
Loop
‘table数据
Set Db = html.all.tags("table")(3)
i = 0: j = 0
For Each tr In Db.Rows
m = 0
i = i + 1: j = 0
If i > 1 Then
For Each td In tr.Cells
m = m + 1
Sheets("Sheet3").Cells(m, pagenum) = Replace(td.innerText, Chr(10) & Chr(10), Chr(10))
Next
End If
pagenum = pagenum + 1
Next
‘页面编码
Public Function GetCode(CodeBase, Url) ‘第一个参数是设置编码方式(GB2312或UTF-8)第二个参数是地址.
Dim xmlHTTP1
Set xmlHTTP1 = CreateObject("Microsoft.XMLHTTP")
xmlHTTP1.Open "get", Url, True
xmlHTTP1.send
While xmlHTTP1.readyState <> 4
DoEvents
Wend
GetCode = xmlHTTP1.responseBody
If CStr(GetCode) <> "" Then GetCode = BytesToBstr(GetCode, CodeBase)
Set xmlHTTP1 = Nothing
End Function
Public Function BytesToBstr(strBody, CodeBase)
Dim ObjStream
Set ObjStream = CreateObject("Adodb.Stream")
With ObjStream
.Type = 1
.Mode = 3
.Open
.write strBody
.Position = 0
.Type = 2
.Charset = CodeBase
BytesToBstr = .ReadText
.Close
End With
Set ObjStream = Nothing
End Function
以上是关于爬去网页离线数据的主要内容,如果未能解决你的问题,请参考以下文章