好友
阅读权限 10
听众
最后登录 1970-1-1
之前在评论区看到了这个福利网址,就想着自己爬一下练练手。可惜不会python,就只能使用golang了。
我用的是 colly + goquery 来爬取网站。用 F12 分析了一下网站的页面,发现下载地址是通过 “域名/wp-admin/admin-ajax.php” 这个链接获取的,但返回来的值是加密过的。我又爬了一下 js,找到了 js 解密的地方,可惜 js 本身也是加密过的,而我对前端不太了解。不过仔细研究了一下下载链接,发现下载链接是有规律的,而且这个规律十分简单:拼接规则就是 “域名+日期+id.zip”。日期和 id 都可以从网页中获取,域名又是固定的,后台直接拼接一下就 ok 了。因为是自学,代码可能会有一些问题,请见谅。下面上代码:
package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly"
"github.com/gocolly/colly/extensions"
"io"
"log"
"net/http"
"os"
"strconv"
"strings"
"sync"
"time"
)
// Package-level configuration and shared state.
var (
	// downLoadPath is the prefix prepended to every saved file name.
	downLoadPath = " 本地保存路径 "
	// tUrl is the start page handed to the crawler.
	tUrl = "网站地址"
	// downUrl is the admin-ajax endpoint; nothing in the visible code reads it.
	downUrl = " 网站地址/wp-admin/admin-ajax.php"
	// wg tracks the running download goroutines.
	wg sync.WaitGroup
)
// main collects the download links reachable from tUrl and fetches each
// non-empty one in its own goroutine, naming the saved file after the link's
// index in the result list. It returns once every download has finished.
func main() {
	links := getPicInfo(tUrl)
	for idx, link := range links {
		if link == "" {
			continue
		}
		wg.Add(1)
		go DownloadFileProgress(strings.TrimSpace(link), strconv.Itoa(idx))
	}
	wg.Wait()
}
// addZero prefixes s with a single "0" when it is shorter than two bytes and
// returns it unchanged otherwise, so one-digit month/day strings become
// two-digit path segments.
func addZero(s string) string {
	if len(s) >= 2 {
		return s
	}
	return "0" + s
}
// getPicInfo crawls the page at u and returns the download URLs of the zip
// archives it discovers.
//
// The collector runs asynchronously. Every "li a" link whose href contains
// "xyz" is followed, and on each visited page the "div.main_left" container is
// searched for an archive name ("div.down_meta_dec") and a date
// ("span.image-info-time"); zipURL combines the two into a download URL.
// Request failures are printed and otherwise ignored, so the result may be
// partial, and the order of the returned URLs may vary between runs.
func getPicInfo(u string) []string {
	var (
		mu   sync.Mutex // guards urls: async callbacks may run concurrently
		urls []string
	)
	start := time.Now()
	c := colly.NewCollector(func(collector *colly.Collector) {
		extensions.RandomUserAgent(collector) // send a random User-Agent header
		collector.Async = true
	})
	// Follow the links that lead to detail pages.
	c.OnHTML("li a", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if strings.Contains(href, "xyz") {
			// The collector rejects links it has already visited; that is
			// expected while crawling, so the returned error is ignored.
			_ = c.Visit(e.Request.AbsoluteURL(href))
		}
	})
	// Extract the archive name and date from each detail page.
	c.OnHTML("div.main_left", func(e *colly.HTMLElement) {
		e.DOM.Each(func(_ int, sel *goquery.Selection) {
			name := sel.Find("div.down_meta_dec").Text()
			dateText := sel.Find("span.image-info-time").Text()
			if !strings.Contains(name, "zip") {
				return
			}
			link, ok := zipURL(name, dateText)
			if !ok {
				// Skip entries whose date cannot be parsed instead of
				// panicking on an out-of-range index.
				fmt.Printf(" 无法解析日期: %q\n", dateText)
				return
			}
			mu.Lock()
			urls = append(urls, link)
			mu.Unlock()
		})
	})
	c.OnError(func(response *colly.Response, err error) {
		fmt.Println(" 错误原因: ", err)
	})
	time.Sleep(1 * time.Second)
	if err := c.Visit(u); err != nil {
		fmt.Println(" 错误原因: ", err)
	}
	c.Wait()
	fmt.Printf(" 花费时间 : %s\n ", time.Since(start))
	return urls
}

// zipURL builds the archive URL from the file name found on the page and the
// raw text of its date element. The first three bytes of dateText are skipped
// (presumably a label in front of the date — confirm against the site) and
// the remainder is split on "." into year, month and day; month and day are
// zero-padded with addZero. It reports false when dateText is too short or
// does not contain three dot-separated parts.
func zipURL(name, dateText string) (string, bool) {
	if len(dateText) < 3 {
		return "", false
	}
	parts := strings.Split(dateText[3:], ".")
	if len(parts) < 3 {
		return "", false
	}
	// Trim each component so stray whitespace around the date neither ends up
	// in the URL nor defeats the zero padding.
	year := strings.TrimSpace(parts[0])
	month := addZero(strings.TrimSpace(parts[1]))
	day := addZero(strings.TrimSpace(parts[2]))
	return "下载地址/zip/" + year + "/" + month + "/" + day + "/" + name, true
}
// Reader wraps an io.Reader and prints progress to standard output every time
// it is read from.
type Reader struct {
	io.Reader
	Total   int64 // expected total number of bytes; <= 0 when unknown
	Current int64 // number of bytes read so far
}

// Read reads from the wrapped reader into p, adds the number of bytes read to
// Current and prints the progress. It returns the wrapped reader's results
// unchanged.
//
// When Total is positive the progress is a percentage truncated to two
// decimals. Otherwise only the byte count is printed: a percentage cannot be
// computed, and dividing by a zero Total would panic.
func (r *Reader) Read(p []byte) (n int, err error) {
	n, err = r.Reader.Read(p)
	r.Current += int64(n)
	if r.Total > 0 {
		fmt.Printf(" \r 进度 %.2f%% ", float64(r.Current*10000/r.Total)/100)
	} else {
		fmt.Printf(" \r 进度 %d B ", r.Current)
	}
	return n, err
}
func DownloadFileProgress (url , filename string ) {
r , err := http . Get (url)
if err != nil {
log . Fatal (err)
}
defer func () { _ = r.Body. Close () }()
f , err := os . Create (downLoadPath + filename)
if err != nil {
log . Fatal (err)
}
defer func () { _ = f. Close () }()
reader := & Reader {
Reader: r.Body ,
Total: r.ContentLength ,
}
_ , _ = io . Copy (f , reader)
wg. Done ()
}
// downInfo is a placeholder for requesting the encrypted download link from
// sUrl. The value returned by the site is encrypted and the page's JavaScript
// is encrypted as well, so the request was never enabled. The function
// currently does nothing; the body below is only a commented-out sketch of the
// intended POST request.
func downInfo (sUrl string ) {
// Form fields of the request (the "name" value is kept verbatim as the
// literal the sketch would send):
//var action string = "Post_down_ajax"
//var name string = " 资源一下载 "
//var down string = "ntXXpthwZZGmoV7Yo17JmdBfqKWhX+CbqZFnY2dlYZFpkJNnlGdmlWlnXt2coQ=="
//params := url.Values{}
//params.Set("action", action)
//params.Set("pid", pid)
//params.Set("down", down)
//params.Set("name", name)
//requestData := map[string][]string{
// "action": action,
// "pid": pid,
// "down": down,
// "name": name,
//}
// Sketch of posting the form with a second collector:
//c2 := colly.NewCollector()
//c2.OnRequest(func(request *colly.Request) {
// request.Headers.Set("Content-Type","application/x-www-form-urlencoded")
//})
//err := c2.Post(sUrl, requestData)
//if err!=nil{
// log.Fatal(err)
//}
}
发帖前要善用【论坛搜索】功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。