"""多线程爬虫下载英雄联盟皮肤 — multi-threaded crawler that downloads League of Legends skin art."""
import json
import os
import re
import threading
from queue import Empty, Queue
from urllib import request
class Procuder(threading.Thread):
    """Producer thread: fetch each hero's detail page and enqueue
    (image URL, local save path) pairs for the consumer threads.

    NOTE: the class name keeps the original (misspelled) "Procuder"
    so existing references stay valid.
    """

    def __init__(self, hero_queue, img_queue, *args, **kwargs):
        super(Procuder, self).__init__(*args, **kwargs)
        self.hero_queue = hero_queue  # input: hero detail-page URLs
        self.img_queue = img_queue    # output: (img_url, save_name) tuples

    def run(self):
        # Drain the hero queue; exit once no hero pages remain.
        while True:
            if self.hero_queue.empty():
                break
            url = self.hero_queue.get()
            self.parser_page(url)

    def parser_page(self, url):
        """Download one hero's JS page, extract every skin's name and id,
        and enqueue the big-art image URL with its save path."""
        response = request.urlopen(url)
        data = response.read().decode('utf-8')
        # BUG FIX: re.findall returns a LIST; the original concatenated the
        # list itself to a string (TypeError). Use the first match instead.
        matches = re.findall(r'{"data":(.*?);', data)
        if not matches:
            # Page layout changed or empty response; skip quietly.
            return
        skin_jsons = '{"data":' + matches[0]
        skin = json.loads(skin_jsons)
        default = skin['data']['name']
        skins = skin['data']['skins']
        for key in skins:
            if key['name'] == 'default':
                # The "default" skin entry carries no display name;
                # use the hero's own name instead.
                imgname = default
            else:
                imgname = key['name']
            imgid = key['id']
            # Strip '/' so names like "K/DA ..." are valid file names.
            imgname = re.sub(r'/', '', imgname)
            save_name = "D://lolskin//" + imgname + '.jpg'
            # str() guards against numeric ids in the JSON.
            img_url = ("http://ossweb-img.qq.com/images/lol/web201310/skin/big"
                       + str(imgid) + ".jpg")
            self.img_queue.put((img_url, save_name))
class Consumer(threading.Thread):
    """Consumer thread: pull (img_url, save_name) pairs from img_queue
    and download each image to disk, skipping files that already exist."""

    def __init__(self, hero_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.hero_queue = hero_queue  # watched only to decide when to stop
        self.img_queue = img_queue    # input: (img_url, save_name) tuples

    def run(self):
        while True:
            # Stop only when BOTH queues are empty: a producer may still
            # be parsing (and filling img_queue) after hero_queue drains.
            if self.hero_queue.empty() and self.img_queue.empty():
                break
            # BUG FIX: a plain get() could block forever if another consumer
            # grabbed the last item between the empty() check and get().
            # A timeout lets us re-check the exit condition instead.
            try:
                img_url, save_name = self.img_queue.get(timeout=3)
            except Empty:
                continue
            try:
                if not os.path.exists(save_name):
                    request.urlretrieve(img_url, save_name)
                    print("下载完成")
            except Exception:
                # Best-effort download: report and move on to the next image.
                print("下载失败")
def main():
    """Fetch the champion list, queue every hero's detail-page URL, and
    start producer/consumer threads to download all skin images."""
    hero_queue = Queue(150)
    res = request.urlopen('http://lol.qq.com/biz/hero/champion.js')
    text = res.read().decode('utf-8')
    # BUG FIX: findall returns a list; parse the first (only) match
    # instead of passing the list itself to json.loads (TypeError).
    hero_lists = re.findall(r'LOLherojs.champion=(.+?);', text)
    hero_dic = json.loads(hero_lists[0])['keys']
    # Build the hero detail-page URL list.
    # BUG FIX: the original concatenated the whole dict (hero_dic) into the
    # URL; each detail page is keyed by the hero NAME, i.e. hero_dic[key].
    for key in hero_dic:
        url_hero = "http://lol.qq.com/biz/hero/" + hero_dic[key] + ".js"
        hero_queue.put(url_hero)
    img_queue = Queue(1000)
    save_dir = "D://lolskin//"
    if not os.path.exists(save_dir):
        # makedirs creates intermediate directories too.
        os.makedirs(save_dir)
    # Two producer threads are plenty for parsing.
    for i in range(2):
        t = Procuder(hero_queue, img_queue)
        t.start()
    # Raise 5 to use more download threads (faster downloads).
    for i in range(5):
        t = Consumer(hero_queue, img_queue)
        t.start()
# BUG FIX: forum-comment residue after main() made this line a syntax error;
# only the entry-point guard remains.
if __name__ == '__main__':
    main()