Scraping wallpapers from a certain site
I once scraped images from a certain NSFW site; that code is still around, though I don't know if it still works. This one scrapes wallpapers. I'm a Python beginner and the code is rough, so if anything isn't up to standard, please point it out. Thanks.
# -*- coding: utf-8 -*-
import requests
import os
from lxml import etree
'''
Open http://www.win4000.com/zt/index.html, click the topic you want, then copy the URL of that topic page.
'''
picUrl = input("Enter the wallpaper category URL (e.g. http://www.win4000.com/zt/junshi.html): ")
if picUrl:
    url = picUrl.strip()
else:
    url = 'http://www.win4000.com/zt/junshi.html'
# print(url)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
s = requests.session()
s.keep_alive = False
page_text = s.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# print(tree.xpath('/html/body/div/div/div/div/div/div/div/div/ul/li/a/@href'))
pic_urls = tree.xpath('/html/body/div/div/div/div/div/div/div/div/ul/li/a/@href')
pic_titles = tree.xpath('/html/body/div/div/div/div/div/div/div/div/ul/li/a/@title')
# Pair each album URL with its title
pic_items = list(zip(pic_urls, pic_titles))
print(pic_items)


def down_pic(k, v):
    # k: album URL, v: album title
    pic_url_detail = s.get(url=k, headers=headers).text
    tree = etree.HTML(pic_url_detail)
    pic_amounts = int(tree.xpath('/html/body/div/div/div/div/div/div/em/text()')[0])
    print('%s has %s wallpapers' % (v, pic_amounts))
    # Build the URL of every image page in the album
    for i in range(pic_amounts):
        # print(i)
        pic_url_html = k.split('.html')[0] + '_%s' + '.html'
        # print("Image page:", pic_url_html % str(i + 1))
        pic_url_html = pic_url_html % str(i + 1)
        # Fetch the page of this image
        pic_url_html_detail = s.get(url=pic_url_html, headers=headers).text
        # Parse the image page to locate the actual image source
        tree = etree.HTML(pic_url_html_detail)
        pic_down_path = tree.xpath('/html/body/div/div/div/div/div/div/div/a/img/@src')[0]
        # print('Image download URL:', pic_down_path)
        # Image file name
        pic_name = pic_down_path.split('/')[-1]
        # print(pic_name)
        # Skip the image if it has already been downloaded
        if os.path.exists('imgs/' + v + '/' + pic_name):
            print("Image already exists, skipping!")
        else:
            pic_data = s.get(url=pic_down_path, headers=headers).content
            with open('imgs/' + v + '/' + pic_name, 'wb') as fp:
                fp.write(pic_data)
            print('Image %s downloaded' % pic_name)


for k, v in pic_items:
    print(k, v)
    if not os.path.exists('imgs/' + v):
        # makedirs also creates the parent 'imgs' directory on the first run
        os.makedirs('imgs/' + v)
        down_pic(k, v)
    else:
        print('Directory already exists')
        down_pic(k, v)
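As a side note, here is a minimal sketch of a safer per-page fetch (my own addition, not part of the original script; it assumes the same absolute XPath and a requests session like the one above). It returns None instead of raising when the request fails or the XPath finds nothing, so one bad page does not abort the whole album:

import requests
from lxml import etree


def safe_get_img_src(session, page_url, headers):
    """Return the image src found on a wallpaper detail page, or None if the
    request fails or the XPath matches nothing (layout change, 404 page, etc.)."""
    try:
        resp = session.get(url=page_url, headers=headers, timeout=10)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print('Request for %s failed: %s' % (page_url, exc))
        return None
    nodes = etree.HTML(resp.text).xpath('/html/body/div/div/div/div/div/div/div/a/img/@src')
    return nodes[0] if nodes else None

Inside down_pic() one could then call src = safe_get_img_src(s, pic_url_html, headers) and simply continue when it returns None, instead of letting an IndexError stop the download loop.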
Any site should work, right? After changing the URL, do you just save the text as a .bat file?
Can it scrape 汽车之家 (Autohome)? I don't get any of this, and I don't dare ask :lol I really can't make sense of crawlers... this kind of program with no interface, I honestly can't follow it.