python爬虫古装汉服图片
本帖最后由 qq58452077 于 2019-8-11 14:10 编辑```python
#!/usr/bin/env python
# encoding: utf-8
"""
@version: 1.0
@author: CJ
@software: PyCharm
@file: 52guzhuang.py
@time: 2017/7/13 23:01
"""
import os
import random
import re
import time
import urllib.error
import urllib.request

import lxml.html
def serchIndex(url='http://www.52guzhuang.com/'):
    """Fetch one page from 52guzhuang.com and return its HTML source.

    :param url: page URL to fetch (defaults to the site root)
    :return: page body decoded from GBK (the site's declared encoding)
    """
    request = urllib.request.Request(url)
    # Spoof a desktop browser so the server serves the normal page.
    request.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
    response = urllib.request.urlopen(request)
    return response.read().decode('GBK')
def findPageTotal(html, isNext):
    """Extract image attachments (and optionally the page count) from a thread page.

    :param html: page source, as returned by serchIndex()
    :param isNext: when True, also read the total page count from the pager bar
    :return: tuple (dicts, page) — dicts maps attachment id ('aid') to the
             absolute image URL; page is the last page number as a string,
             or '' when unknown / isNext is False
    """
    tree = lxml.html.fromstring(html)
    eList = tree.cssselect('div#postlist > div')
    dicts = {}
    page = ''
    for i in range(0, len(eList) - 1):
        links = []
        if i == 0:
            # The first post appears under several different markups;
            # try each selector in turn until one matches.
            # BUG FIX: the original called .cssselect() on the *list* eList
            # (an AttributeError) instead of on the element eList[i].
            links = eList[i].cssselect('#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f >div:nth-last-of-type(1) > font > font > font > ignore_js_op')
            if not links:
                links = eList[i].cssselect('#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f > ignore_js_op')
            if not links:
                links = eList[i].cssselect('table.plhin > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tr:nth-child(1) >td.t_f > div > ignore_js_op')
        else:
            links = eList[i].cssselect('table.plhin > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tr:nth-child(1) >td.t_f > div > ignore_js_op')
        for index, link in enumerate(links):
            src = link.get('zoomfile')
            name = link.get('aid')
            # BUG FIX: the original rebound `dicts` to a plain string,
            # which broke the caller's dicts.update(...) / .items().
            # Store the URL keyed by the attachment id instead.
            dicts[name] = "http://www.52guzhuang.com/" + src
    if isNext:
        # Pager link holding the last page number (e.g. "... 12").
        ele = tree.cssselect("div#ct > div.pgs.mtm.mbm.cl > div.pg > a:nth-last-child(3)")
        if ele and len(ele) != 0:
            # BUG FIX: cssselect returns a list — read .text of the first hit.
            page = ele[0].text
    return dicts, page
def dowmloadImage(image_url, filename):
    """Download one image URL to a local file, retrying transient failures.

    :param image_url: absolute URL of the image
    :param filename: destination path on disk
    """
    # NOTE(review): the original retry budget is one attempt per character
    # of the URL — kept for compatibility, though a small constant would do.
    # BUG FIX: original read `for i inrange(...)` — a SyntaxError.
    for i in range(len(image_url)):
        try:
            req = urllib.request.Request(image_url)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
            image_data = urllib.request.urlopen(req).read()
        except (urllib.error.HTTPError, urllib.error.URLError):
            # BUG FIX: Python 3 exposes these exceptions under urllib.error,
            # not urllib.HTTPError / urllib.URLError.
            time.sleep(0.1)
            continue
        # Context manager guarantees the file handle is closed (the
        # original left it dangling).
        with open(filename, 'wb') as f:
            f.write(image_data)
        break
def mkdirByGallery(path):
    """Create a gallery folder under the fixed photo root and return its path.

    :param path: gallery name; surrounding whitespace is stripped
    :return: full path of the (possibly newly created) directory
    """
    # Strip leading/trailing whitespace from the gallery name, then anchor
    # it under the hard-coded photo root.
    folder = 'E:\\py\\photo\\' + path.strip()
    # os.makedirs (unlike os.mkdir) also creates any missing parent
    # directories along the way.
    if not os.path.exists(folder):
        os.makedirs(folder)
    return folder
if __name__ == '__main__':
    # Ask for the thread URL to crawl and prepare the output folder.
    url = input("请输入爬取网站:")
    path = mkdirByGallery("52guzhuang")
    # First page: collect its images and discover the total page count.
    html = serchIndex(url)
    dicts = {}
    image_dict, page = findPageTotal(html, True)
    dicts.update(image_dict)
    if page:
        # Remaining pages: the site paginates as <base>-<page>-1.html.
        for i in range(2, int(page) + 1):
            html = serchIndex(url + "-" + str(i) + "-1.html")
            image_dict, page = findPageTotal(html, False)
            dicts.update(image_dict)
    # Save every collected attachment as <aid>.jpg inside the gallery.
    for k, v in dicts.items():
        dowmloadImage(v, path + "/" + str(k) + ".jpg")
```
使用教程:
1.需要安装lxml第三方库
2.数据来源:52古装网
喜欢汉服和汉元素的朋友欢迎来评分(喜欢汉服小姐姐也欢迎来评分)!!
##此代码仅供学习和参考
好多好看的小姐姐 好厉害呀 学习下,谢谢楼主的分享 好看的小姐姐{:301_974:} 学习一下,感谢楼主分享
好厉害,牛。 没成品吗 你主函数那边已经没必要再用户输入网址了,你上面都写死了url了,那还要输入什么网址呢,我一开始看见你这个我以为你什么网站都能爬 吓死我了。我说呢就这么点代码怎么可能做到。
页:
[1]