直接使用原帖的正则无法匹配出图片网址，原因是 <a> 标签中漏掉了部分内容，所以借用你的代码改进了一下，同时开启了多线程进行爬图下载操作。本人初学 Python，如有不妥，请指正。
import urllib.request
import re
import os
import time
import threading

# Regex for the full-size image URL on a single photo page.
# NOTE: the original used [a-zA-z], whose range Z..a also matches
# the characters [ \ ] ^ _ ` — fixed to [a-zA-Z].
img_url_partern = r'<div class="content" id="content"><a href="[a-zA-Z]+://[^\s]*"><img src="(.*?)" alt'
# Regex for the total page count shown in the gallery pager.
max_page_partern = r'</i><a href="/mm/\w+/\w+">(.*?)</a><em'
# Regex for the gallery title, used as the download folder name.
img_name_partern = r'<h2>(.*?)</h2>'
# Shared gallery counter advanced by the worker threads.
flag = 0
headers = {
'Connection':'keep-alive',
# The Host header must be a bare host name — no "http://" scheme
# (RFC 7230); the original value was invalid.
'Host':'www.mmjpg.com',
'Referer':'http://www.mmjpg.com/mm/1017/4',
'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'
}
def getsize(file):
    """Return the size of *file* in KiB, rounded to the nearest whole number."""
    kib = os.path.getsize(file) / 1024.0
    return round(kib, 0)
def down_img(url_1):
    """Download every photo of the gallery at *url_1* into a folder named
    after the gallery title.

    Each photo is fetched with up to 10 attempts; files smaller than
    10 KiB are treated as anti-leech placeholder images, deleted and
    re-downloaded.  Raises (IndexError / URLError) when the page cannot
    be fetched or the regexes find nothing — the caller handles that.
    """
    html_data = urllib.request.urlopen(url_1).read().decode()
    max_page = re.compile(max_page_partern).findall(html_data)
    img_name = re.compile(img_name_partern).findall(html_data)
    print('当前采集的Url:' + url_1 + '共有' + max_page[0] + '张照片')
    # exist_ok avoids the check-then-create race when several threads
    # process galleries concurrently (original used exists()+mkdir()).
    os.makedirs(img_name[0], exist_ok=True)
    for c in range(1, int(max_page[0]) + 1):
        url_2 = url_1 + '/' + str(c)
        html_data_2 = urllib.request.urlopen(url_2).read().decode()
        img_url = re.compile(img_url_partern).findall(html_data_2)
        # os.path.join instead of the hard-coded '\\' separator, which
        # produced a single mangled filename on non-Windows systems.
        save_name = os.path.join(img_name[0], str(c) + '.jpg')
        for err_num in range(1, 11):
            try:
                urllib.request.urlretrieve(url=img_url[0], filename=save_name)
            except Exception:
                # Transient network failure: back off briefly, then retry.
                # (Original bare except also swallowed KeyboardInterrupt.)
                time.sleep(0.5)
                continue
            # Under 10 KiB — presumably an anti-leech placeholder image;
            # discard and retry.
            if getsize(save_name) < 10:
                os.remove(save_name)
                continue
            break
# Guards the shared gallery counter `flag` against concurrent updates.
_flag_lock = threading.Lock()

def down_mul(basic_url):
    """Worker loop: repeatedly claim the next gallery number from the
    shared counter ``flag`` and download it via :func:`down_img`.

    The original read-modify-write of ``flag`` was unsynchronized, so
    20 threads could claim duplicate or skipped gallery numbers; the
    lock makes each claim atomic.  Stops once 1264 galleries are claimed.
    """
    global flag
    while True:
        with _flag_lock:
            if flag >= 1264:
                break
            flag += 1
            # Snapshot the claimed number — `flag` itself may be advanced
            # by another thread before we finish this gallery.
            current = flag
        url = basic_url + str(current)
        try:
            down_img(url)
        except Exception:
            print('[-]The Url:' + url + ' Was No Found!')
        print('The No.' + str(current) + ' is OK')
def main():
    """Start 20 worker threads that all pull gallery numbers from the
    shared counter and download them."""
    basic_url = 'http://www.mmjpg.com/mm/'
    for worker_id in range(1, 21):
        worker = threading.Thread(target=down_mul, args=(basic_url,))
        worker.start()
        print('[' + str(worker_id) + ']Runing Star')
if __name__ == '__main__':
    # Install a global opener so every urllib request in the worker
    # threads carries our custom headers.  (The original built the
    # header list with an explicit loop and ended with a stray forum
    # artifact '|' that was a SyntaxError.)
    opener = urllib.request.build_opener()
    opener.addheaders = list(headers.items())
    urllib.request.install_opener(opener)
    main()