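// [Go] View as plain text / Copy code (listing header, apparently left over from the page this code was copied from; the original label said "Python").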
package main
package main
import (
"bytes"
"fmt"
"io/ioutil"
"log"
"net/http"
"net/url"
"os"
"strconv"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly"
)
// PathExists reports whether path exists on disk (shared helper).
//
// It returns (true, nil) when os.Stat succeeds, (false, nil) when the failure
// is a "does not exist" error, and (false, err) for any other Stat failure so
// the caller can tell "missing" apart from "could not check".
func PathExists(path string) (bool, error) {
	switch _, statErr := os.Stat(path); {
	case statErr == nil:
		return true, nil
	case os.IsNotExist(statErr):
		return false, nil
	default:
		return false, statErr
	}
}
// IsDir reports whether path names a directory.
// Any os.Stat failure (including a missing path) yields false.
func IsDir(path string) bool {
	info, statErr := os.Stat(path)
	// The short-circuit keeps info from being dereferenced when Stat failed.
	return statErr == nil && info.IsDir()
}
// IsFile reports whether path names an existing entry that is not a directory.
//
// A path that does not exist, or that cannot be stat'ed for any other reason,
// is not a file, so false is returned in that case. (Simply negating IsDir
// would report true for such paths.)
func IsFile(path string) bool {
	info, err := os.Stat(path)
	if err != nil {
		return false
	}
	return !info.IsDir()
}
// Exists reports whether the file or directory at path exists.
//
// It is true when os.Stat succeeds, or when Stat fails with an error that
// os.IsExist recognises; every other Stat failure yields false.
func Exists(path string) bool {
	_, statErr := os.Stat(path) // os.Stat fetches the file information
	return statErr == nil || os.IsExist(statErr)
}
// waitGroup counts the page, image and download goroutines still running;
// main blocks on it before reporting the total time.
var waitGroup = &sync.WaitGroup{}
//下载图片(存在则跳过)
func download(name string, imgurl string, path string, imgtype string) {
isExi := Exists(path + "//" + name + "." + imgtype)
// fmt.Printf(strconv.FormatBool(isExi))
if isExi {
fmt.Printf("开始下载:文件已存在!\n")
waitGroup.Done()
return
}
fmt.Printf("开始下载:%s\n", imgurl)
res, err := http.Get(imgurl)
if err != nil || res.StatusCode != 200 {
fmt.Printf("下载失败:%s", res.Request.URL)
}
fmt.Printf("开始读取文件内容,url=%s\n", imgurl)
data, err2 := ioutil.ReadAll(res.Body)
if err2 != nil {
fmt.Printf("读取数据失败")
}
ioutil.WriteFile(fmt.Sprintf(path+"//%s."+imgtype, name), data, 0644)
//if failed, sudo chmod 777 pic2016/
//计数器-1
waitGroup.Done()
}
// OpenUrl parses urlstr and returns a colly collector configured for requests
// related to that URL.
//
// The collector gets a 100-second request timeout and a Firefox User-Agent.
// Every request it sends carries Host, Origin and Referer headers derived from
// urlstr, plus fixed Connection, Accept, Accept-Encoding and Accept-Language
// headers. If urlstr cannot be parsed the process exits through log.Fatal.
func OpenUrl(urlstr string) *colly.Collector {
	u, err := url.Parse(urlstr)
	if err != nil {
		log.Fatal(err)
	}

	// An origin is serialised as "scheme://host[:port]"; a bare host name is
	// not a valid Origin value. Fall back to the host alone only when the URL
	// carries no scheme to build one from.
	origin := u.Host
	if u.Scheme != "" && u.Host != "" {
		origin = u.Scheme + "://" + u.Host
	}

	c := colly.NewCollector()
	// Request timeout.
	c.SetRequestTimeout(100 * time.Second)
	// User-Agent sent with every request.
	c.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20110101 Firefox/71.0"
	c.OnRequest(func(r *colly.Request) {
		// Request headers.
		r.Headers.Set("Host", u.Host)
		r.Headers.Set("Connection", "keep-alive")
		r.Headers.Set("Accept", "*/*")
		r.Headers.Set("Origin", origin)
		r.Headers.Set("Referer", urlstr)
		r.Headers.Set("Accept-Encoding", "gzip, deflate")
		r.Headers.Set("Accept-Language", "zh-CN, zh;q=0.9")
	})
	return c
}
// UrlDow visits the image URL band and, once a response arrives, starts a
// download goroutine that saves the image as "<urlpath>.png" inside _dir.
//
// Parameters:
//   - band:    image URL to visit and download.
//   - prefix:  unused; kept for signature compatibility.
//   - urlpath: file name (without extension) for the saved image.
//   - s, i:    selection the URL came from and its index; used for logging only.
//   - _dir:    destination directory.
//   - err:     unused; kept for signature compatibility (it is passed by value,
//     so nothing assigned to it could reach the caller).
//
// The caller must have called waitGroup.Add(1) beforehand. UrlDow releases that
// slot when it returns, whether or not the request succeeded.
//
// NOTE(review): the extension is always "png" regardless of the URL's own
// extension — confirm whether that is intended.
func UrlDow(band string, prefix string, urlpath string, s *goquery.Selection, i int, _dir string, err error) {
	// Release the slot with defer rather than from an OnScraped callback:
	// colly does not run OnScraped when the request fails, which would leave
	// the counter stuck and block waitGroup.Wait forever. The collector from
	// OpenUrl is not asynchronous, so Visit has finished by the time we return.
	defer waitGroup.Done()

	c := OpenUrl(band)

	c.OnResponse(func(resp *colly.Response) {
		fmt.Println("response received 1", resp.StatusCode)
		// Println, not Printf: URLs contain '%' escapes that must not be
		// interpreted as format verbs.
		fmt.Println(band)
		title := s.Text()
		fmt.Printf("链接 %d: %s - %s\n", i, title, band)
		// Registered before this function's own slot is released, so the
		// counter cannot reach zero between the two.
		waitGroup.Add(1)
		go download(urlpath, band, _dir, "png")
	})

	var reqErr error
	c.OnError(func(resp *colly.Response, errHttp error) {
		reqErr = errHttp
	})

	if visitErr := c.Visit(band); visitErr != nil {
		reqErr = visitErr
	}
	if reqErr != nil {
		fmt.Printf("访问失败:%s (%v)\n", band, reqErr)
	}
}
// 根据页码地址下载图片
func PageUrlDow(urlstr string, _dir string, err error, sec int) {
// urlstr := "https://wallhaven.cc/toplist?page=1"
c := OpenUrl(urlstr)
c.OnHTML("title", func(e *colly.HTMLElement) {
fmt.Println("title:", e.Text)
})
c.OnScraped(func(r *colly.Response) {
waitGroup.Done()
})
//获取图片列表
c.OnResponse(func(resp *colly.Response) {
fmt.Println("response received", resp.StatusCode)
// goquery直接读取resp.Body的内容
htmlDoc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
// fmt.Printf(htmlDoc.Html()) //查看页面内容
if err != nil {
log.Fatal(err)
}
// 找到抓取项 <div class="hotnews" alog-group="focustop-hotnews"> 下所有的a解析
htmlDoc.Find("div[class=boxgrid] img").Each(func(i int, s *goquery.Selection) {
temp, bl := s.Attr("data-src")
if bl != true {
return
// log.Fatal(err)
}
imgurl := strings.Replace(temp, "thumb-350-", "", -1)
fmt.Printf(imgurl + "\n") //查看链接
waitGroup.Add(1)
urlpath := strings.Split(imgurl, "/")[len(strings.Split(imgurl, "/"))-1]
urlpath = strings.Split(urlpath, ".")[0]
prefix := urlpath
fmt.Println("\n文件名:" + urlpath)
fmt.Println("\n暂无:" + prefix)
go UrlDow(imgurl, prefix, urlpath, s, i, _dir, err)
// go ImgUrlDow(imgurl, _dir, err, 2)
})
})
c.OnError(func(resp *colly.Response, errHttp error) {
err = errHttp
})
err = c.Visit(urlstr)
}
// parsePageRange turns the page selection typed by the user into an inclusive
// page range. A single number "n" selects pages 1..n; "n,m" selects pages n..m
// (fields after the second are ignored). The returned error is the
// strconv.Atoi failure for the first field that is not an integer.
func parsePageRange(selection string) (first int, last int, err error) {
	fields := strings.Split(selection, ",")
	if len(fields) == 1 {
		last, err = strconv.Atoi(fields[0])
		if err != nil {
			return 0, 0, err
		}
		return 1, last, nil
	}
	if first, err = strconv.Atoi(fields[0]); err != nil {
		return 0, 0, err
	}
	if last, err = strconv.Atoi(fields[1]); err != nil {
		return 0, 0, err
	}
	return first, last, nil
}

// main is the program entry point. It makes sure the output directory exists,
// asks for a page selection and a listing URL, starts one PageUrlDow goroutine
// per selected page, waits for every goroutine tracked by waitGroup and prints
// the elapsed download time.
func main() {
	// Create the output directory if needed.
	saveDir := "./Wnacg"
	exist, err := PathExists(saveDir)
	if err != nil {
		fmt.Printf("get dir error![%v]\n", err)
		return
	}
	if exist {
		fmt.Printf("has dir![%v]\n", saveDir)
	} else {
		fmt.Printf("no dir![%v]\n", saveDir)
		if err := os.Mkdir(saveDir, os.ModePerm); err != nil {
			// Without the directory no image can be written.
			fmt.Printf("mkdir failed![%v]\n", err)
			return
		}
		fmt.Printf("mkdir success!\n")
	}

	fmt.Printf("多页模式:下载0-n页\n")
	fmt.Printf("中断模式:下载n-m页\n")
	isFirst := "10"
	fmt.Printf("请输入n或n,m选择-多页模式或者中断模式(默认为多页模式n=10): ")
	// A scan error (e.g. an empty line) is ignored on purpose: it leaves the
	// default value in place.
	_, _ = fmt.Scanln(&isFirst)
	urlstr := "https://wall.alphacoders.com/by_sub_category.php?id=239594&name=Fate%2FGrand+Order+Wallpapers"
	fmt.Printf("请输入页面地址: ")
	_, _ = fmt.Scanln(&urlstr)

	first, last, err := parsePageRange(isFirst)
	if err != nil {
		fmt.Printf("parse page number error![%v]\n", err)
		return
	}

	// Start timing only now, so the reported time excludes the time the user
	// spent typing at the prompts.
	start := time.Now()
	for page := first; page <= last; page++ {
		// Page 1 is the bare listing URL; later pages add a page parameter.
		pageURL := urlstr
		if page != 1 {
			pageURL = urlstr + "&page=" + strconv.Itoa(page)
		}
		waitGroup.Add(1)
		go PageUrlDow(pageURL, saveDir, nil, 2)
	}

	// Wait for all goroutines to finish.
	waitGroup.Wait()
	fmt.Printf("下载总时间:%v\n", time.Since(start))
	// NOTE(review): presumably keeps a console window open long enough to read
	// the output — confirm before removing.
	time.Sleep(60 * time.Second)
}