go爬取小说
go语言使用colly+goquery爬取小说章节,因为懒所以没有去爬取单独小说的前缀路径,需要手动修改。会在novel文件夹下生成每个章节的txt文件。

package main
import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
)
// indexUrl is the site root; relative chapter links from the index
// page are joined onto it to build absolute chapter URLs.
var indexUrl = "http://www.xbiquge.la"

// novelIndex is the chapter-index page of the novel to crawl.
// There is no per-novel discovery; edit this URL by hand to crawl
// a different novel.
var novelIndex = "http://www.xbiquge.la/61/61403/"
// main crawls the chapter index at novelIndex, then asynchronously visits
// every chapter page and writes each chapter's text into the novel/
// directory, one numbered .txt file per chapter.
func main() {
	start := time.Now()
	number := 1

	// The collector runs its callbacks concurrently (Async = true below),
	// so a mutex guards the shared counters and the urls slice.
	var mu sync.Mutex

	// Create the output directory up front; os.Create does not make
	// intermediate directories, so without this every save would fail.
	if err := os.MkdirAll("novel", 0755); err != nil {
		fmt.Println(err.Error())
		return
	}

	// Initialize the collector: random User-Agent per request, async visits.
	c := colly.NewCollector(func(c *colly.Collector) {
		extensions.RandomUserAgent(c)
		c.Async = true
	})

	urls := make([]string, 0, 100)

	// Index page: each <dd> holds one chapter link. Record it, log it,
	// then visit the chapter page.
	c.OnHTML("dd", func(e *colly.HTMLElement) {
		e.DOM.Each(func(i int, selection *goquery.Selection) {
			title := selection.Find("dd a").Text()
			link, ok := selection.Find("dd a").Attr("href")
			if !ok || link == "" {
				return // this dd carries no chapter anchor; nothing to visit
			}
			link = indexUrl + link

			mu.Lock()
			urls = append(urls, link)
			fmt.Printf("%d --> %s-->%s\n", number, title, link)
			number++
			mu.Unlock()

			time.Sleep(2 * time.Second) // throttle so the site is not hammered
			c.Visit(e.Request.AbsoluteURL(link))
		})
	})

	dd := 1
	// Chapter page: extract the title and body text and save one txt file.
	c.OnHTML("div.box_con", func(e *colly.HTMLElement) {
		e.DOM.Each(func(i int, selection *goquery.Selection) {
			fileName := selection.Find("div.bookname h1").First().Text()
			text := selection.Find("div#content").Text()

			// Take a sequence number under the lock; used as a filename
			// prefix so files sort in download order.
			mu.Lock()
			seq := dd
			dd++
			mu.Unlock()

			// Titles look like "第X章 标题"; keep only the 标题 part and
			// prepend the sequence number.
			if split := strings.SplitN(fileName, " ", 2); len(split) == 2 {
				fileName = strconv.Itoa(seq) + split[1]
			}

			// os.WriteFile creates, writes, and closes in one call — no
			// leaked handle, and the write error is actually checked
			// (the original ignored WriteString's error). filepath.Join
			// keeps the path portable instead of hard-coding "\\".
			path := filepath.Join("novel", fileName+".txt")
			if err := os.WriteFile(path, []byte(text+"\n"), 0644); err != nil {
				fmt.Println(err.Error())
				return
			}
		})
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})
	c.OnError(func(response *colly.Response, err error) {
		fmt.Println(err)
	})

	c.Visit(novelIndex)
	c.Wait() // block until every async request has finished

	fmt.Printf("花费时间:%s\n", time.Since(start))
	fmt.Println("获取的url总数量:", number)
	fmt.Println("获取的url长度:", len(urls))
	fmt.Println("获取的url容量:", cap(urls))
}
页:
[1]