go爬取小说
go语言使用colly+goquery爬取小说章节,因为懒所以没有去爬取单独小说的前缀路径,需要手动修改。会在novel文件夹下生成每个章节的txt文件。

package main
import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
)
// indexUrl is the site root; relative chapter links from the index
// page are joined onto it to build absolute chapter URLs.
var indexUrl = "http://www.xbiquge.la"

// novelIndex is the chapter-index page of the novel to crawl.
// There is no per-novel discovery; edit this URL by hand to crawl
// a different novel.
var novelIndex = "http://www.xbiquge.la/61/61403/"
// main crawls the chapter index at novelIndex, then asynchronously visits
// every chapter page and writes each chapter's text into the novel/
// directory, one numbered .txt file per chapter.
func main() {
	start := time.Now()
	number := 1

	// The collector runs its callbacks concurrently (Async = true below),
	// so a mutex guards the shared counters and the urls slice.
	var mu sync.Mutex

	// Create the output directory up front; os.Create does not make
	// intermediate directories, so without this every save would fail.
	if err := os.MkdirAll("novel", 0755); err != nil {
		fmt.Println(err.Error())
		return
	}

	// Initialize the collector: random User-Agent per request, async visits.
	c := colly.NewCollector(func(c *colly.Collector) {
		extensions.RandomUserAgent(c)
		c.Async = true
	})

	urls := make([]string, 0, 100)

	// Index page: each <dd> holds one chapter link. Record it, log it,
	// then visit the chapter page.
	c.OnHTML("dd", func(e *colly.HTMLElement) {
		e.DOM.Each(func(i int, selection *goquery.Selection) {
			title := selection.Find("dd a").Text()
			link, ok := selection.Find("dd a").Attr("href")
			if !ok || link == "" {
				return // this dd carries no chapter anchor; nothing to visit
			}
			link = indexUrl + link

			mu.Lock()
			urls = append(urls, link)
			fmt.Printf("%d --> %s-->%s\n", number, title, link)
			number++
			mu.Unlock()

			time.Sleep(2 * time.Second) // throttle so the site is not hammered
			c.Visit(e.Request.AbsoluteURL(link))
		})
	})

	dd := 1
	// Chapter page: extract the title and body text and save one txt file.
	c.OnHTML("div.box_con", func(e *colly.HTMLElement) {
		e.DOM.Each(func(i int, selection *goquery.Selection) {
			fileName := selection.Find("div.bookname h1").First().Text()
			text := selection.Find("div#content").Text()

			// Take a sequence number under the lock; used as a filename
			// prefix so files sort in download order.
			mu.Lock()
			seq := dd
			dd++
			mu.Unlock()

			// Titles look like "第X章 标题"; keep only the 标题 part and
			// prepend the sequence number.
			if split := strings.SplitN(fileName, " ", 2); len(split) == 2 {
				fileName = strconv.Itoa(seq) + split[1]
			}

			// os.WriteFile creates, writes, and closes in one call — no
			// leaked handle, and the write error is actually checked
			// (the original ignored WriteString's error). filepath.Join
			// keeps the path portable instead of hard-coding "\\".
			path := filepath.Join("novel", fileName+".txt")
			if err := os.WriteFile(path, []byte(text+"\n"), 0644); err != nil {
				fmt.Println(err.Error())
				return
			}
		})
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})
	c.OnError(func(response *colly.Response, err error) {
		fmt.Println(err)
	})

	c.Visit(novelIndex)
	c.Wait() // block until every async request has finished

	fmt.Printf("花费时间:%s\n", time.Since(start))
	fmt.Println("获取的url总数量:", number)
	fmt.Println("获取的url长度:", len(urls))
	fmt.Println("获取的url容量:", cap(urls))
}
页:
[1]