Sharing 3 scrapers (wallpapers and anime)
Haven't studied in a while, so I wrote a few super-simple scrapers for fun. The code isn't necessarily the most concise and isn't optimized at all.. I'll add multithreading once I learn it. The wallpaper sites were all found via 书签地球.
First one: 4K wallpaper scraping. The images on this site seem to come from 360, but whatever.
# -*- coding: utf-8 -*-
import os
import requests  # required modules

def get():
    for start in range(1, 100, 12):  # page offset, 12 images per page
        url = "https://bird.ioliu.cn/v2?url=http://wallpaper.apc.360.cn/index.php?c=WallPaper&start={}&count=12&from=360chrome&a=getAppsByCategory&cid=36".format(start)
        req = requests.get(url)  # request the list
        text = req.json()  # parse the JSON response
        for i in text["data"]:  # iterate over the image entries
            pic_id = i["id"]  # avoid shadowing the id() builtin
            res = i["resolution"]  # resolution
            name = str(pic_id) + "---" + res  # file name to save as
            root = "./壁纸/"  # download directory
            try:
                url_m = i["url"]
                pic_url = url_m.replace("__85", "__100")  # the __100 variant is 4K; the API only returns __85, so rewrite it by hand
                path = root + name + ".jpg"
                if not os.path.exists(root):
                    os.mkdir(root)  # create the folder
                if not os.path.exists(path):
                    r = requests.get(pic_url)  # fetch the binary data
                    with open(path, 'wb') as f:  # write the image
                        f.write(r.content)
                    print('{} image saved'.format(name))
            except Exception as e:
                print(name, "error:", e)

get()
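On the multithreading mentioned at the top: the download loop is the natural part to parallelize. Here is a minimal sketch of that idea with concurrent.futures, not the script above as written; the save_one helper, the placeholder jobs list, and the worker count are made up for illustration.

# Sketch only: parallel downloads with a thread pool.
# save_one is a hypothetical helper standing in for the inner-loop body above.
from concurrent.futures import ThreadPoolExecutor
import os
import requests

def save_one(name, pic_url, root="./壁纸/"):
    os.makedirs(root, exist_ok=True)
    path = root + name + ".jpg"
    if not os.path.exists(path):
        r = requests.get(pic_url)
        with open(path, 'wb') as f:
            f.write(r.content)
        print('{} image saved'.format(name))

# jobs would be the (name, url) pairs collected from the JSON pages
jobs = [("demo---1920x1080", "http://example.com/1.jpg")]  # placeholder data
with ThreadPoolExecutor(max_workers=8) as pool:  # 8 workers is an arbitrary choice
    for name, pic_url in jobs:
        pool.submit(save_one, name, pic_url)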
The second one looks like an overseas wallpaper site.
# -*- coding: utf-8 -*-
import os
import requests
from lxml import etree

url = "https://wallpaperscraft.com"
n = 0
for page in range(1, 101):
    req = requests.get(url + "/catalog/anime/page{}".format(page))
    req_html = etree.HTML(req.text)
    links = req_html.xpath('//*[@class="wallpapers__link"]//@href')  # wallpaper detail pages
    for i in links:
        req = requests.get(url + i)
        req_html = etree.HTML(req.text)
        sizes = req_html.xpath('//*[@class="wallpaper-table__cell"]//@href')  # resolution links; xpath returns a list
        req = requests.get(url + sizes[0])  # follow the first resolution
        req_html = etree.HTML(req.text)
        download = req_html.xpath('//*[@class="gui-button gui-button_full-height"]//@href')  # direct image link
        root = "./壁纸/"
        path = root + str(n) + ".jpg"
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                r = requests.get(download[0])  # the download button href is an absolute URL
                with open(path, 'wb') as f:
                    f.write(r.content)
                print('{} image saved'.format(n))
        except Exception as e:
            print(n, "error:", e)
        n += 1
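This one makes three requests per wallpaper (catalog page, detail page, download page), so reusing a single connection with requests.Session should cut the overhead noticeably. A minimal sketch of the pattern, under the assumption that the requests.get calls above are swapped for session.get:

# Sketch only: a Session reuses the underlying TCP connection between requests
import requests

session = requests.Session()
session.headers.update({"user-agent": "Mozilla/5.0"})  # one place for shared headers

# the chained calls above would then become, e.g.:
# req = session.get(url + "/catalog/anime/page{}".format(page))
# req = session.get(url + i)
# r = session.get(download[0])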
Third one: an anime site, with some spicy ones~~~~
# -*- coding: utf-8 -*-
import os
import requests
from lxml import etree

url = "https://anime-pictures.net"
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "content-type": "text/html; charset=UTF-8",
    "cookie": "sitelang=zh_CN; __cfduid=d35094b211350abbf73c74accaf4e60a41618399143; cookieconsent_status=dismiss",
    "if-none-match": "W/\"85073814374d01fb84f26b772f5f26ff4fee05a8\"",
    "referer": "https://anime-pictures.net/pictures/view_posts/0?lang=zh_CN",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}

def get():
    j = 1
    for page in range(0, 5000):
        req = requests.get(url + "/pictures/view_posts/{}?lang=zh_CN".format(page), headers=headers)  # request the list page
        req_html = etree.HTML(req.text)  # parse into an HTML tree
        posts = req_html.xpath('//*[@class="posts_block"]/span/a//@href')  # links to individual posts
        for i in posts:
            req = requests.get(url + i, headers=headers)  # fetch the post page to find the download link
            wc_html = etree.HTML(req.text)
            b = wc_html.xpath('//*[@class="download_icon"]//@href')  # xpath returns a list of hrefs
            root = "./美图/"
            path = root + str(j) + ".jpg"
            if not os.path.exists(root):
                os.mkdir(root)  # create the folder
            if not os.path.exists(path):
                r = requests.get(url + b[0], headers=headers)  # fetch the binary data
                with open(path, 'wb') as f:  # write the image
                    f.write(r.content)
                print('image {} saved'.format(j))
            else:
                print('image already exists')
            j += 1

get()
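The full-size images here can be several MB each, and r.content holds the whole file in memory. requests can stream the body to disk in chunks instead; a sketch of that variant of the write step (save_streamed is a name I made up):

# Sketch only: stream the download in chunks instead of loading r.content at once
import requests

def save_streamed(img_url, path, headers=None):
    # stream=True defers the body until iter_content is consumed
    with requests.get(img_url, headers=headers, stream=True) as r:
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)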
Welcome any pointers from the experts on what could be optimized.

shszss posted on 2021-4-16 20:59:
I'd also like to learn scraping. Where should I start? Zero knowledge, total beginner.

Just dive straight into scraping; you pick up the basics along the way. Using scrapers as a way into Python feels pretty easy; try finding an easy-to-scrape site first (see the minimal sketch at the end of the thread)..

Nice, nice, learning this.
Really like the third one, heh heh.
All good stuff, thanks for the effort.
Awesome, let me study this a bit.
Thanks for sharing.
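Picking up on the "where to start" question above: all three scripts boil down to the same fetch, parse, extract skeleton, so a beginner really only needs this much to try a first site. A minimal sketch; the URL and XPath here are placeholders, not a real target.

# The fetch -> parse -> extract skeleton shared by all three scripts above.
# URL and XPath are placeholders.
import requests
from lxml import etree

resp = requests.get("https://example.com/page1")
tree = etree.HTML(resp.text)      # parse the HTML
links = tree.xpath('//a/@href')   # pull out whatever attribute/text you need
print(links[:10])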