golang爬虫
```package main
import (
"fmt"
query "github.com/antchfx/xquery/html"
"golang.org/x/net/html"
"io/ioutil"
"math/rand"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"time"
)
var UA = []string{
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
}
func fetch(url string) string {
//fmt.Println("Fetch Url", url)
client := &http.Client{}
//method参数 GET、POST要大写
req, _ := http.NewRequest("GET", url, nil)
index := rand.Intn(len(UA))
req.Header.Set("User-Agent", UA)
req.Header.Add("Referer", "https://www.mzitu.com/")
rsp, err := client.Do(req)
if err != nil {
fmt.Println("Http get err: ", err)
return ""
}
if rsp.StatusCode != 200 {
fmt.Println("Http status code:", rsp.StatusCode)
return ""
}
defer rsp.Body.Close()
body, err := ioutil.ReadAll(rsp.Body)
if err != nil {
fmt.Println("Read error:", err)
return ""
}
return string(body)
}
// downloadImg fetches the image at url and saves it into dir, using
// the last path segment of the URL as the file name.
func downloadImg(url string, dir string) {
	_, name := filepath.Split(url)
	fileName := filepath.Join(dir, name)
	data := fetch(url)
	// fetch returns "" on failure; skip the write so the directory
	// is not littered with empty image files.
	if data == "" {
		fmt.Println("downloadImg: empty body, skip", url)
		return
	}
	if err := ioutil.WriteFile(fileName, []byte(data), 0644); err != nil {
		fmt.Println("ioutil.WriteFile err:", err)
	}
}
// getMaxPage fetches url and returns the integer text of the node
// matched by xpath (used to read the largest page number from the
// pagination links). It returns 0 when the page cannot be fetched or
// parsed, when no node matches, or when the node text is not a number
// — instead of panicking as the original did.
func getMaxPage(url string, xpath string) int {
	result := fetch(url)
	root, err := query.Parse(strings.NewReader(result))
	if err != nil {
		fmt.Printf("query.Parse err:%s\n", err)
		return 0
	}
	maxPageNode := query.FindOne(root, xpath)
	if maxPageNode == nil {
		// Blocked request or changed page layout: no pagination node.
		return 0
	}
	length, err := strconv.Atoi(query.InnerText(maxPageNode))
	if err != nil {
		fmt.Printf("strconv.Atoi err:%s\n", err)
		return 0
	}
	return length
}
// getHtmlNode fetches url, parses it as HTML, and returns the first
// node matching xpath, or nil if the page cannot be fetched/parsed or
// nothing matches. Returning nil early on a parse error avoids
// calling query.FindOne with a nil root as the original did.
func getHtmlNode(url string, xpath string) *html.Node {
	result := fetch(url)
	root, err := query.Parse(strings.NewReader(result))
	if err != nil {
		fmt.Printf("query.Parse err:%s\n", err)
		return nil
	}
	return query.FindOne(root, xpath)
}
// getHtmlNodes fetches url, parses it as HTML, and returns all nodes
// matching xpath. It returns nil when the page cannot be fetched or
// parsed (a nil slice is safe to range over), instead of passing a
// nil root to query.Find as the original did.
func getHtmlNodes(url string, xpath string) []*html.Node {
	result := fetch(url)
	root, err := query.Parse(strings.NewReader(result))
	if err != nil {
		fmt.Printf("query.Parse err:%s\n", err)
		return nil
	}
	return query.Find(root, xpath)
}
// main walks every list page of the gallery index, then every image
// page of each gallery, saving images under dir/<gallery name>/.
func main() {
	url := "https://www.mzitu.com/mm/"
	dir := "D:\\新建文件夹111"
	// Number of list pages, read from the pagination links.
	length := getMaxPage(url, "//div[@class='nav-links']/a")
	listPageUrl := url + "page/%d"
	for i := 1; i <= length; i++ {
		url = fmt.Sprintf(listPageUrl, i)
		imgListNodes := getHtmlNodes(url, "//ul[@id='pins']/li/a")
		for _, img := range imgListNodes {
			href := query.SelectAttr(img, "href")
			nameNode := query.FindOne(img, "//img")
			if nameNode == nil {
				// Unexpected markup: link without a thumbnail — skip
				// instead of dereferencing nil as the original did.
				continue
			}
			name := query.SelectAttr(nameNode, "alt")
			fmt.Printf("%s----%s\n", name, href)
			filePath := filepath.Join(dir, name)
			if err := os.MkdirAll(filePath, 0777); err != nil {
				fmt.Printf("os.MkdirAll err:%s\n", err)
				// No directory to save into; skip this gallery.
				continue
			}
			// Enter the gallery detail pages (one image per page).
			maxImagePageInt := getMaxPage(href, "//div[@class='pagenavi']/a/span")
			// Pages run 1..maxImagePageInt inclusive; the original
			// `j < maxImagePageInt` skipped the last image page.
			for j := 1; j <= maxImagePageInt; j++ {
				if j == 1 {
					url = href
				} else {
					url = href + "/" + strconv.Itoa(j)
				}
				imgUrlNode := getHtmlNode(url, "//div[@class='main-image']/p/a/img")
				if imgUrlNode == nil {
					// Missing page or changed layout — skip safely.
					continue
				}
				imgUrl := query.SelectAttr(imgUrlNode, "src")
				fmt.Printf("download:%s\n", imgUrl)
				downloadImg(imgUrl, filePath)
				// Throttle requests to avoid hammering the site.
				time.Sleep(500 * time.Millisecond)
			}
			time.Sleep(500 * time.Millisecond)
		}
	}
}
```
> 不多说,直接上代码,代码有点丑,莫笑。 Go版的妹子图爬虫 很好啊,能想到用golang写爬虫,但没用到golang最优势的协程,是不是可以再改改?或者直接用colly吧。 不错,感谢分享! 学习了,谢谢。
thepoy 发表于 2020-10-29 00:02
很好啊,能想到用golang写爬虫,但没用到golang最优势的协程,是不是可以再改改?或者直接用colly吧。
刚学,还不会,正在研究 保存图片时应该开启一个goroutine,不应该在主goroutine中保存图片,这样不能发挥go的优势,开启子goroutine后下载会更快 可以可以 学习了,谢谢分享
页:
[1]
2