Teaching Myself Python ---- Scraping Beautyleg Images
1. You need to install the libraries the code imports yourself, e.g. the requests and lxml libraries.
2. My Python version: Python 3.7.2
3. This is my first post, so please forgive anything that breaks the rules~~!
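If requests and lxml are not on your machine yet, a plain pip install requests lxml normally takes care of both before running the script.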
import requests
import os
import time
from lxml import etree
'''
Each element object returned by an xpath query exposes .tag (the tag name), .attrib (a dict of the tag's attributes) and .text (the tag's text content).
'''
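# A minimal illustration of those attributes on a made-up snippet (kept as a
# comment so the script itself is unchanged):
#   node = etree.HTML('<div class="p"><a href="/photo/1.html">cover</a></div>').xpath('//a')[0]
#   node.tag             # -> 'a'
#   node.attrib['href']  # -> '/photo/1.html'
#   node.text            # -> 'cover'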
def get_xpath_text(url, headers):
    # Fetch a page and parse it into an lxml HTML tree for xpath queries
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    xpath_html = etree.HTML(response.text)
    return xpath_html
def xpath_get_urls(html):
    # Pick out the <a> elements that link to each photo set on a listing page
    xpath_html = etree.HTML(html)
    urls = xpath_html.xpath("//div[@class='item']/div[@class='p']/a")
    return urls
def get_group_pageinfo(page_url):
    # Return every page URL of a photo set plus the set's title
    grouppage_urls = []
    xpath_html = get_xpath_text(page_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'})
    # xpath() returns a list, so take the first match; the pager text reads "共N页:",
    # strip the Chinese characters to keep only N
    pagenumstr = xpath_html.xpath("//div[@class='page']/li/a")[0].text
    group_pic_name = xpath_html.xpath("//h1")[0].text
    pagenumstr = pagenumstr.replace('共', '').replace('页:', '')
    for i in range(1, int(pagenumstr) + 1):
        if i == 1:
            grouppage_urls.append(page_url)
        else:
            # Pages after the first follow the pattern xxx_2.html, xxx_3.html, ...
            grouppage_urls.append(page_url.replace('.html', '_%d.html' % i))
    return grouppage_urls, group_pic_name
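# For example, a three-page set at http://www.beautyleg7.com/xxx/12345.html (a made-up URL)
# would yield 12345.html, 12345_2.html and 12345_3.html, which is the pagination
# pattern this code assumes the site uses.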
def get_picurls(page_url):
    # Collect the src attribute of every image on one page of a photo set
    picurls = []
    xpath_html = get_xpath_text(page_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'})
    picurl_elements = xpath_html.xpath('//div[@class="contents"]/a/img')
    for pic_url_element in picurl_elements:
        picurls.append(pic_url_element.attrib['src'])
    return picurls
if __name__ == '__main__':
    # URLs of the photo sets to download
    group_pic_urls = []
    # pick 10 photo sets (only the first listing page here)
    for num in range(1, 2):
        page_url = 'http://www.beautyleg7.com/siwameitui/list_3_%d.html' % num
        response = requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'})
        response.encoding = 'utf-8'
        for ul in xpath_get_urls(response.text):
            group_pic_urls.append(ul.attrib['href'])
    # loop over every photo set URL; xpath for the images on a page: //div[@class="contents"]/a
    # xpath for the total page count: //div[@class="page"]/li/a
    for group_pic_url in group_pic_urls:
        group_pageurls, group_pic_name = get_group_pageinfo(group_pic_url)
        num = 1
        # the storage path can be changed to whatever suits you
        filepath = "F:\\crawler\\leg\\" + group_pic_name.replace('/', '-')
        print(filepath)
        if not os.path.exists(filepath):
            os.makedirs(filepath.strip())
        os.chdir(filepath)
        for group_pageurl in group_pageurls:
            picurls = get_picurls(group_pageurl)
            for pic_url in picurls:
                print(pic_url)
                response = requests.get(pic_url)
                img = response.content
                with open(filepath + '\\%d.jpg' % num, 'wb') as f:
                    f.write(img)
                num = num + 1
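One small addition worth considering: time is imported but never used, and the inner loop fires requests back to back. A sketch of a polite pause, dropped in right after each image is written (the one-second delay is just an assumption, not anything the site asks for):

    with open(filepath + '\\%d.jpg' % num, 'wb') as f:
        f.write(img)
    num = num + 1
    time.sleep(1)  # breathe for a second between downloads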