本帖最后由 jinyi666 于 2022-7-24 16:25 编辑
用 Python 把所有图片保存下来,然后慢慢挑。
我记得在彼岸网上直接下载的话,好像只能下载一张。
库:lxml、requests、os
软件:PyCharm Python3.9
Python 代码:
import os
from urllib.parse import urljoin

import requests
from lxml import etree
# Create the img2 download folder if it does not exist yet.
filename = 'img2\\'
# exist_ok=True replaces the exists()/mkdir() pair, which could fail if the
# folder appeared between the check and the creation.
os.makedirs(filename, exist_ok=True)
class ImageSpider(object):
    """Download 1920x1080 wallpapers from www.netbian.com.

    The crawl walks three levels of pages: a listing page links to detail
    pages, each detail page links to a download page, and the download page
    embeds the full-size image together with its title.
    """

    # Site root used to resolve the relative links found in the pages.
    base_url = "http://www.netbian.com"
    # Folder the images are written to.
    save_dir = "img2"
    # Seconds to wait for the server before giving up on a request.
    timeout = 10
    # Characters that Windows does not allow in file names, mapped to "_".
    _invalid_chars = str.maketrans(dict.fromkeys('\\/:*?"<>|', "_"))

    def __init__(self):
        self.index_url = "http://www.netbian.com/1920x1080/index.htm"
        self.url = "http://www.netbian.com/1920x1080/index_{}.htm"
        self.headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0'
        }

    def get_page(self, url):
        """Send a GET request and return the response body as text.

        Args:
            url: Address of the page to fetch.

        Returns:
            The body decoded as GBK; bytes that are not valid GBK are
            replaced instead of raising UnicodeDecodeError.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        res = requests.get(url=url, headers=self.headers, timeout=self.timeout)
        return res.content.decode("gbk", errors="replace")

    def parse_page(self, html):
        """Download every wallpaper linked from one listing page.

        Args:
            html: Text of a listing page as returned by ``get_page``.

        A wallpaper whose pages or image cannot be fetched is reported and
        skipped so that the remaining wallpapers are still downloaded.
        """
        site_prefix = self.base_url + "/"
        listing = etree.HTML(html)
        for href in listing.xpath('//div[@class="list"]/ul/li/a//@href'):
            detail_url = urljoin(site_prefix, href)
            # Listing entries may point off-site (or be javascript: links);
            # those are not wallpaper detail pages.
            if not detail_url.startswith(site_prefix):
                continue
            try:
                self._download_from_detail(detail_url)
            except requests.RequestException as exc:
                print("%s下载失败: %s" % (detail_url, exc))

    def _download_from_detail(self, detail_url):
        """Follow one detail page to its download page and save the images."""
        site_prefix = self.base_url + "/"
        detail = etree.HTML(self.get_page(detail_url))
        for down_href in detail.xpath('//div[@class="pic-down"]/a/@href'):
            download_page = etree.HTML(
                self.get_page(urljoin(site_prefix, down_href)))
            sources = download_page.xpath(
                '//table[@id="endimg"]//tr//td//a/img/@src')
            titles = download_page.xpath(
                '//table[@id="endimg"]//tr//td//a/@title')
            # Pair each image with its own title; the pairing assumes the
            # two XPath results come back in matching order.
            for src, title in zip(sources, titles):
                self._save_image(urljoin(site_prefix, src), title)

    def _save_image(self, image_url, title):
        """Fetch one image and write it to ``save_dir`` as ``<title>.jpg``."""
        res = requests.get(url=image_url, headers=self.headers,
                           timeout=self.timeout)
        # Do not store an HTML error page under a .jpg name.
        res.raise_for_status()
        os.makedirs(self.save_dir, exist_ok=True)
        path = os.path.join(self.save_dir, self._safe_name(title) + '.jpg')
        with open(path, 'wb') as f:
            f.write(res.content)
        print("%s下载成功" % title)

    @staticmethod
    def _safe_name(title):
        """Return *title* with characters unsafe in file names replaced."""
        cleaned = title.translate(ImageSpider._invalid_chars).strip()
        return cleaned or "untitled"

    def main(self):
        """Ask for a page range on stdin and download every page in it.

        Raises:
            ValueError: If an entered page number is not an integer.
            requests.RequestException: If a listing page cannot be fetched.
        """
        print('此软件用于爬取彼岸壁纸网(http://www.netbian.com/1920x1080/index.htm)的壁纸,下面请输入你要爬取的页数:\n')
        start_page = int(input("开始页:"))
        end_page = int(input("结束页:"))
        for page in range(start_page, end_page + 1):
            # The first listing page has no numeric suffix in its URL.
            url = self.index_url if page == 1 else self.url.format(page)
            html = self.get_page(url)
            print("第%s页壁纸开始下载!!!!" % page)
            self.parse_page(html)
if __name__ == '__main__':
    # Start the interactive crawl only when the file is run as a script.
    ImageSpider().main()
不建议一次抓取太多页,否则容易给服务器造成过大压力。
下面是用pyinstaller模块打包成的EXE
https://wws.lanzouv.com/i0Ujr088q9tg
密码:433g
本人刚学爬虫没多久,大佬勿喷。