Multithreaded Tieba Image Crawler
I've been tinkering with Python crawlers lately and wrote this one as a learning exercise.
In my test, downloading 200 images plus renaming them took about 8 seconds.
The formatting isn't very tidy (bear with me), and I hit a few problems along the way:
when the multithreaded futures download named the images 1, 2, 3..., files overwrote each other.
I ended up dodging the collisions with a double-random file name built from time.time() + random.random(),
but that's clearly a clumsy fix; if anyone has a better approach, please share (one idea is sketched after the script).

import re
import urllib.request
import time
from os import listdir, getcwd, rename, makedirs
from concurrent import futures
import random
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read()
    return html
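## If Tieba starts rejecting requests, the bare urllib User-Agent is the
## usual suspect; a variant with a browser-like header (the header value
## here is just an example) would look like:
## def getHtml(url):
##     req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
##     return urllib.request.urlopen(req).read()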
def getImg(a):
    # non-greedy match of the src attribute up to ".jpg"; the original
    # character class [.*\S]* was greedy and could swallow text across
    # attribute boundaries
    reg = r'src="([^"]*?\.jpg)" size="'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, a)
    return imglist
def getPage(a):
    # "共N页" is the "N pages in total" marker in the thread header
    reg = r'共<span class="red">([\w]*)</span>页'
    pagere = re.compile(reg)
    pagelist = re.findall(pagere, a)
    return pagelist
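## Quick sanity check for the two patterns against made-up snippets
## (the attribute layout is assumed from the regexes, not from real markup):
##   getImg('<img src="https://imgsa.baidu.com/forum/abc.jpg" size="98765">')
##     -> ['https://imgsa.baidu.com/forum/abc.jpg']
##   getPage('共<span class="red">3</span>页') -> ['3']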
start = time.time()
url = "https://tieba.baidu.com/p/6279034183"
html = getHtml(url)
html = html.decode('UTF-8')
page = getPage(html)[0]  # findall returns a list; take the single match
print("{} pages in total".format(page))
imgList = []
for i in range(1, int(page)+1):
    # Tieba paginates with ?pn=N; the original "?%0d" dropped the
    # parameter name and refetched page 1 every time
    html = getHtml(url + "?pn=%d" % i)
    html = html.decode('UTF-8')
    imgList += getImg(html)
##imgName = 0
def saveOneImg(imgurl):
    ## global imgName
    ## print("downloading image %0d" % imgName)
    # unique name: timestamp + random suffix, to stop the counter-based
    # names from colliding across threads
    name = "pic/" + str(time.time()) + str(random.random()) + ".jpg"
    print("download:" + name + "\n")
    with open(name, "wb") as f:
        f.write(urllib.request.urlopen(imgurl).read())
    ## imgName += 1
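## Why the commented-out counter overwrote files: each thread reads imgName,
## builds its file name, and only then increments, so two threads that read
## the same value both write to the same .jpg. That read-then-increment is
## not atomic without a lock.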
def saveManyImg(imgList):
    with futures.ThreadPoolExecutor(10) as executor:
        res = executor.map(saveOneImg, imgList)
##print(imgList)
makedirs("pic", exist_ok=True)  # make sure the output directory exists
saveManyImg(imgList)
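## Note: exceptions raised inside the workers are only re-raised when the
## res iterator is consumed; since nothing iterates it, a failed download
## vanishes silently. Adding "for _ in res: pass" inside saveManyImg would
## surface them.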
print("重命名ing......")
j=0
for i in listdir(getcwd()+"//pic"):
rename("pic//"+i,"pic//"+str(j)+".jpg")
j+=1
print("All Done!")
stop = time.time()
print(stop-start)
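On the overwrite question above, here is one way to drop the double-random names entirely: the whole URL list exists before any thread starts, so each URL can be paired with its index on the main thread. Every worker then writes straight to its final name and the rename pass disappears. A rough sketch, not tested against Tieba (saveIndexedImg and saveManyImgIndexed are names I made up for illustration):

import urllib.request
from concurrent import futures

def saveIndexedImg(task):
    # the index travels with the URL, so no two threads
    # can ever compute the same file name
    idx, imgurl = task
    with open("pic/%d.jpg" % idx, "wb") as f:
        f.write(urllib.request.urlopen(imgurl).read())

def saveManyImgIndexed(imgList):
    with futures.ThreadPoolExecutor(10) as executor:
        # enumerate assigns indices on the main thread before any
        # worker runs, so there is no shared counter to race on
        for _ in executor.map(saveIndexedImg, enumerate(imgList)):
            pass  # consuming the iterator also re-raises worker errors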