感觉逻辑还是有点乱,希望有大神们多指点,该网站没有什么反爬措施,因此没有加headers,因为主要是练手用,我特意把爬取的页数改小了,感觉做人还是要厚道一些。
爬取过程中有一个提示,大概就是这个if else需要改成别的(这种if else 写法是我前几天刚刚学到的,感觉很实用),但是我不知道怎么改,期待大神指教。
FutureWarning: The behavior of this method will change in future versions. Use specific 'len(elem)' or 'elem is not None' test instead.
img_url_ = tr.xpath('//li/*[@target="_blank"]/@href') if tr else None
全部代码如下:
import requests
from lxml import etree
import os
class MeiNV(object):
    """Small crawler that saves the images linked from the 4kmeinv listing pages.

    Workflow (see ``go``): build the listing-page URLs, collect the detail-page
    links found on every listing page, then fetch each detail page, locate its
    image and write it into a per-page directory under the current working
    directory.
    """

    # Site root that the relative hrefs/srcs scraped from the pages are appended to.
    BASE_URL = 'http://pic.netbian.com'

    def __init__(self):
        # Listing-page URLs, filled in by url_link().
        self.urls = []
        # One list of detail-page URLs per listing page, filled in by parse().
        self.imgs = []

    def url_link(self):
        """Append the URLs of listing pages 1-4 to ``self.urls``.

        Page 1 is the bare listing URL; later pages use ``index_<n>.html``.
        """
        start_url = 'http://pic.netbian.com/4kmeinv/'
        for x in range(1, 5):
            if x == 1:
                self.urls.append(start_url)
            else:
                self.urls.append(f"{start_url}index_{x}.html")

    def download(self, url, timeout=10):
        """Fetch *url* with a GET request and return the raw response body.

        Args:
            url: Absolute URL to fetch.
            timeout: Seconds passed to ``requests.get``; without a timeout a
                stalled connection would block the crawl indefinitely.

        Returns:
            The response body as bytes (``Response.content``).
        """
        r = requests.get(url, timeout=timeout)
        return r.content

    def go(self):
        """Run the whole crawl: build URLs, collect links, download images."""
        self.url_link()
        self.parse()
        self.get_img()

    def parse(self):
        """Collect the detail-page links of every listing page into ``self.imgs``.

        One list is appended per listing page, even when it is empty, so that
        the position in ``self.imgs`` keeps matching the page number used by
        ``get_img``.
        """
        for url in self.urls:
            tr = etree.HTML(self.download(url))
            # Compare against None explicitly: truth-testing an lxml element
            # checks whether it has children and emits a FutureWarning.
            if tr is None:
                hrefs = []
            else:
                hrefs = tr.xpath('//li/*[@target="_blank"]/@href')
            self.imgs.append([f"{self.BASE_URL}{href}" for href in hrefs])

    def get_img(self):
        """Download the image of every collected detail page.

        Prints the number of collected pages, then saves each image as
        ``第<page>页图片/<file name>``. Detail pages on which no image source can
        be found are reported and skipped instead of aborting the crawl.
        """
        imgs = self.imgs
        print(len(imgs))
        for cont, links in enumerate(imgs, start=1):
            path = f'第{cont}页图片'
            for link in links:
                img_tr = etree.HTML(self.download(link))
                if img_tr is None:
                    img_link1 = []
                else:
                    img_link1 = img_tr.xpath('//*[@id="img"]//@src')
                if not img_link1:
                    # Nothing to download; building a URL from None would crash.
                    print(f"第{cont}页未找到图片地址,已跳过", link)
                    continue
                img_link = f"{self.BASE_URL}{img_link1[0]}"
                img = self.download(img_link)
                img_name = img_link.split('/')[-1]
                # exist_ok avoids the separate exists() check before creating.
                os.makedirs(path, exist_ok=True)
                with open(os.path.join(path, img_name), 'wb') as f:
                    f.write(img)
                print(f"正在下载第{cont}页图片", img_link)
if __name__ == '__main__':
    # Script entry point: build the crawler and run the full crawl once.
    crawler = MeiNV()
    crawler.go()
|