My first attempt at a scraper for a certain adult-image site, advice welcome
Since this is my first attempt, the variable names and such are fairly arbitrary... please bear with me.
import requests, os, time
from lxml import etree


class Spider():
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'Referer': 'https://www.mzitu.com/xinggan/'
        }

    def start(self):
        # Adjust the range according to how many list pages you want to crawl
        for i in range(1, 241):
            print('Opening list page %s' % i)
            response = requests.get('https://www.mzitu.com/page/' + str(i) + '/', headers=self.headers)
            html = etree.HTML(response.content.decode())
            self.jiexi(html)

    def jiexi(self, html):
        # Parse the list page: collect the album links and their titles
        addr_list = html.xpath('//*[@id="pins"]/li/a/@href')
        name_list = html.xpath('//*[@id="pins"]/li/a/img/@alt')
        for addr, name in zip(addr_list, name_list):
            print(addr)
            print(name)
            self.file_name = name.replace('/', '_')
            response = requests.get(addr, headers=self.headers)
            html1 = etree.HTML(response.content.decode())
            m = 1
            # Total number of pages in this album
            all_ye = html1.xpath("string(/html/body/div/div/div/a/span)")
            print(all_ye)
            for i in range(1, int(all_ye)):
                print('Downloading page %s' % i)
                response2 = requests.get(addr + '/' + str(m) + '/', headers=self.headers)
                # Slow down a little to avoid getting the IP banned
                time.sleep(0.5)
                html_ye = etree.HTML(response2.content.decode())
                if m == int(all_ye):
                    break
                else:
                    self.down(html_ye)
                    m += 1

    def down(self, html):
        file_path = '/Users/neo/Documents/mmm/' + self.file_name
        if not os.path.exists(file_path):
            os.mkdir(file_path)
            print(file_path + ' created')
        os.chdir(file_path)
        p_addr = html.xpath("string(/html/body/div/div/div/p/a/img/@src)")
        response = requests.get(p_addr, headers=self.headers)
        # Use the last segment of the image URL as the file name
        p_name = p_addr.split('/')[-1]
        print(p_addr)
        try:
            with open(p_name, 'wb') as f:
                f.write(response.content)
        except OSError:
            print('Bad file name, skipping this image')


if __name__ == '__main__':
    sp = Spider()
    sp.start()
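The only thing standing between this script and an IP ban is the fixed time.sleep(0.5) above, so a small retry helper with back-off might be worth adding. This is just a sketch, not part of the original script; polite_get and its retries/delay defaults are made-up names and values, and the caller still has to handle the None returned after repeated failures.

import time
import requests


def polite_get(url, headers, retries=3, delay=1.0):
    # Try the request a few times, waiting longer after each failure,
    # so a temporary block or timeout does not kill the whole crawl.
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp
            print('Got status %s for %s (attempt %s)' % (resp.status_code, url, attempt))
        except requests.RequestException as e:
            print('Request failed: %s (attempt %s)' % (e, attempt))
        # back off: wait delay, 2*delay, 3*delay, ... between attempts
        time.sleep(delay * attempt)
    return None

It could stand in for the bare requests.get calls in start(), jiexi() and down(), e.g. response = polite_get(addr, self.headers).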
I've also been looking things up as I write, but the site I scraped is this one: meitulu. I wrote one too, except I didn't add a sleep at first and got my IP banned; we'll see when the ban lifts.
package main

import (
	"bufio"
	"fmt"
	"io"
	"net/http"
	"os"
	"path"
	"regexp"
	"strconv"
	"strings"
	"time"
)
// HttpGet fetches the HTML of the given url and returns it as a string.
func HttpGet(url string) (result string, err error) {
	req, _ := http.NewRequest("GET", url, nil)
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36")
	req.Header.Set("Referer", "https://www.mzitu.com/all/")
	resp, err1 := (&http.Client{}).Do(req)
	if err1 != nil {
		err = err1
		return
	}
	defer resp.Body.Close()
	// Read the response body in chunks
	buf := make([]byte, 4*1024)
	for {
		n, _ := resp.Body.Read(buf)
		if n == 0 {
			break
		}
		result += string(buf[:n])
	}
	return
}
// SplitHomePage extracts the album addresses from the index page
// using a regular expression.
func SplitHomePage(htmlString string) (urls []string) {
	// The href part of this pattern was lost when the code was posted;
	// the capture group below assumes the album id follows the site root.
	re := regexp.MustCompile(`日: <a href="https://www.mzitu.com/(?s:(.*?))" target="_blank">`)
	if re == nil {
		fmt.Println("regexp.MustCompile err")
	}
	girlUrls := re.FindAllStringSubmatch(htmlString, -1)
	// Pull the captured id out of each match and build the album URL
	for _, data := range girlUrls {
		urls = append(urls, "https://www.mzitu.com/"+data[1])
	}
	return
}
// SpiderOneGirl downloads every image in one album.
func SpiderOneGirl(url, picPath string) bool {
	// Fetch the album page to read its title
	fmt.Println("Album URL:", url)
	result, err := HttpGet(url)
	if err != nil {
		fmt.Printf("url= %s fetch failed\n", url)
		return false
	}
	re := regexp.MustCompile(`<title>(?s:(.*?))</title>`)
	if re == nil {
		fmt.Println("regexp.MustCompile err")
	}
	title := re.FindStringSubmatch(result)
	if len(title) < 2 {
		fmt.Println("title not found")
		return false
	}
	fmt.Println("Title:", title[1])
	// Walk the album's pages; keep going while an image address can still be parsed
	i := 0
	for {
		if !SpiderOneGirlPicture(url, picPath+title[1]+"/", i) {
			break
		}
		time.Sleep(2 * time.Second)
		i++
	}
	return false
}
// SpiderOneGirlPicture resolves the real image address on one album page
// and saves it under the album title. Returning false ends the caller's loop.
func SpiderOneGirlPicture(url, picPath string, i int) bool {
	requestUrl := url + "/" + strconv.Itoa(i)
	picResult, err := HttpGet(requestUrl)
	if err != nil {
		// On a fetch error, skip matching and let the caller try the next page
		return true
	}
	re := regexp.MustCompile(`><img src="(?s:(.*?))"`)
	if re == nil {
		fmt.Println("regexp.MustCompile err")
	}
	picUrl := re.FindStringSubmatch(picResult)
	if len(picUrl) > 1 {
		fmt.Println("Image address to save:", picUrl[1])
		// Download the image
		saveGirlPicture(picUrl[1]+"?width=700&height=1050", picPath, requestUrl)
		time.Sleep(2 * time.Second)
		return true
	}
	return false
}
// PathExists reports whether the given path exists.
func PathExists(path string) (bool, error) {
	_, err := os.Stat(path)
	if err == nil {
		return true, nil
	}
	if os.IsNotExist(err) {
		return false, nil
	}
	return false, err
}
// saveGirlPicture downloads one image from the network and writes it to disk.
func saveGirlPicture(imgUrl, imgPath, referer string) {
	// Create the album folder if it does not exist yet
	exist, _ := PathExists(imgPath)
	if !exist {
		_ = os.Mkdir(imgPath, os.ModePerm)
	}
	// Use the last path segment as the file name, dropping the query string
	fileName := strings.Split(path.Base(imgUrl), "?")[0]
	req, _ := http.NewRequest("GET", imgUrl, nil)
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36")
	req.Header.Set("Referer", referer)
	res, err := (&http.Client{}).Do(req)
	if err != nil {
		fmt.Println("An error occurred!")
		return
	}
	defer res.Body.Close()
	// Buffered reader over the response body
	reader := bufio.NewReaderSize(res.Body, 32*1024)
	file, err := os.Create(imgPath + fileName)
	if err != nil {
		panic(err)
	}
	defer file.Close()
	// Copy the body to disk through a buffered writer, then flush
	writer := bufio.NewWriter(file)
	written, _ := io.Copy(writer, reader)
	_ = writer.Flush()
	fmt.Printf("Total length: %d\n", written)
}
// DoWork drives the whole crawl: fetch the index page, then each album.
func DoWork() bool {
	fmt.Printf("Getting ready to fetch images\n")
	// Index page listing every album
	url := "https://www.mzitu.com/all/"
	// Local folder to save into
	picPath := "D:/downloads/pic/mzitu/"
	// Fetch the full index page content
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet Error = ", err)
		return false
	}
	// Split out the slice of album URLs
	urls := SplitHomePage(result)
	for _, url := range urls {
		if !SpiderOneGirl(url, picPath) {
			continue
		}
	}
	return true
}
func main() {
	// Run the worker function
	if DoWork() {
		fmt.Println("Finished fetching images")
	}
}
Not bad at all for a first attempt. Learning from this. I like that site, thanks for sharing. Still learning. Very good for a first try.
知意执意 posted on 2020-2-25 20:26: Not bad at all for a first attempt.
I was looking things up as I wrote. Nice work, OP. Learning from the expert. I've written this countless times and mine still isn't as good as yours.