本帖最后由 aiai 于 2020-5-14 11:49 编辑
需求来源:想用Word VBA来爬取百度文库json源数据,然后解析json还原成word源文档
测试链接:https://wenku.baidu.com/view/f6fc53e7bc64783e0912a21614791711cd79797d.html
首先第一步就是获取主页的源代码
[Visual Basic] 纯文本查看 复制代码 Dim http As Object
Set http = CreateObject("WinHttp.WinHttpRequest.5.1")
Dim htmlurl As String
htmlurl = "https://wenku.baidu.com/view/4ed46924031ca300a6c30c22590102020640f263.html"
With http
.Open "GET", htmlurl, False
.SetRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
.Send
Dim body() As Byte
body = .responseBody
Dim htmltext As String
htmltext = encoding(body, "gb2312")
End With
然后可以通过源代码获得总页数和各个页面的地址
[Visual Basic] 纯文本查看 复制代码 '获取总页数
Dim reg As Object
Dim page As Integer
Set reg = CreateObject("VbScript.regexp")
With reg
.Global = False
.Pattern = "totalPageNum.+,"
page = CInt(Split(.Execute(htmltext)(0), "'")(2))
End With
'获取获取链接
Dim urllist As String
With reg
.Global = False
.Pattern = "WkInfo.htmlUrls.+(?=')"
urllist = .Execute(htmltext)(0)
End With
urllist = Right(urllist, Len(urllist) - 19)
这里的页面地址还有ascii编码和转义的,写个自定义函数处理一下,就可以得到标准的json数据,再对这个json进行解析
[Visual Basic] 纯文本查看 复制代码 'ascii解码
urllist = asciidecode(urllist)
Dim json As Object
Set json = jsonpath(urllist, "")
Dim url
For Each url In json.json
With http
.Open "GET", url.pageLoadUrl, False
.Send
body = .responseBody
htmltext = encoding(body, "gb2312")
End With
htmltext = Mid(htmltext, InStr(1, htmltext, "(", vbBinaryCompare) + 1)
htmltext = Left(htmltext, Len(htmltext) - 1)
Dim pagejson As Object
Set pagejson = jsonpath(htmltext, "")
Next url
这时就已经获得所有json的源数据,但是只有前50页。百度文库每50页分割一次,后50页的地址为
https://wenku.baidu.com/view/f6fc53e7bc64783e0912a21614791711cd79797d.html?pn=51
通过一开始的总页数,可以确定循环的次数,至此就可以获得所有的json源数据
上面需要用到的自定义函数
[Visual Basic] 纯文本查看 复制代码 Public Function encoding(body() As Byte, CodeBase As String) As String
Dim ado As Object
Set ado = CreateObject("Adodb.Stream")
With ado
.Type = 1
.mode = 3
.Open
.Write body
.Position = 0
.Type = 2
.Charset = CodeBase
encoding = .readtext
End With
End Function
Public Function asciidecode(body As String) As String
'处理ascii
Dim index As Long
For index = 0 To 255 Step 1
Dim text As String
text = Hex(index)
If Len(text) = 1 Then
text = "\x0" & text
Else
text = "\x" & text
End If
body = Replace(body, text, Chr(index))
Next index
'处理转义
asciidecode = ""
For index = 1 To Len(body) Step 1
text = Mid(body, index, 1)
If text = "\" Then
asciidecode = asciidecode & Mid(body, index + 1, 1)
index = index + 1
Else
asciidecode = asciidecode & text
End If
Next index
End Function
Public Function jsonpath(js As String, path As String)
Dim jsonpa As Object
Set jsonpa = CreateObject("msscriptcontrol.scriptcontrol")
jsonpa.Language = "JavaScript"
jsonpa.AddCode ("var jsons = " & js)
If TypeName(jsonpa.eval("jsons" + path)) = "JScriptTypeInfo" Then
Set jsonpath = jsonpa.eval("jsons" + path)
Else
jsonpath = jsonpa.eval("jsons" + path)
End If
End Function
但是这个json源数据看的不太懂,不知道怎么还原为word源文档,请教给为坛友,如果有了解的可以分享一下
网络抓包的结果
VBA运行的结果
因为百度的json数据用的是css样式,对于word来说样式很难还原,那么就先还原一下简单的文字
[Visual Basic] 纯文本查看 复制代码 urllist = asciidecode(urllist)
Dim json As Object
Set json = jsonpath(urllist, "")
Dim jsonlen As Integer
jsonlen = jsonpath(urllist, ".json.length")
Dim jsonpng As Integer
jsonpng = jsonpath(urllist, ".png.length")
Dim index As Byte
For index = 0 To jsonlen - 1 Step 1
'下载图片,不一定存在
If jsonpng > 0 Then
Dim pngurl As String
pngurl = jsonpath(urllist, ".png[" & index & "].pageLoadUrl")
With http
.Open "GET", pngurl, False
.Send
body = .responseBody
'Call savebytefile(body, "D:/" & index + 1 & ".png")
End With
End If
'下载主体
Dim bodyurl As String
bodyurl = jsonpath(urllist, ".json[" & index & "].pageLoadUrl")
With http
.Open "GET", bodyurl, False
.Send
body = .responseBody
htmltext = encoding(body, "gb2312")
End With
htmltext = Mid(htmltext, InStr(1, htmltext, "(", vbBinaryCompare) + 1)
htmltext = Left(htmltext, Len(htmltext) - 1)
Dim pagejson As Object
Set pagejson = jsonpath(htmltext, "")
'仅还原文字
Dim pagelen As Integer
pagelen = jsonpath(htmltext, ".body.length")
Dim index2 As Long
For index2 = 0 To pagelen - 1 Step 1
Dim word
Set word = jsonpath(htmltext, ".body[" & index2 & "]")
If word.t = "word" Then
Dim worklen As Byte
worklen = Len(word.c)
Selection.TypeText text:=word.c
Selection.MoveLeft Unit:=wdCharacter, Count:=worklen, Extend:=wdExtend
Selection.Font.Size = word.p.h / 2
Selection.MoveRight Unit:=wdCharacter, Count:=1
If TypeName(word.ps) = "JScriptTypeInfo" Then
If jsonpath(htmltext, ".body[" & index2 & "].ps._enter") <> Empty Then
Selection.TypeParagraph
End If
End If
End If
Next index2
Selection.InsertBreak Type:=0
Next index
百度显示的内容
还原文字的内容
|