最近在看论坛大佬 @wushaominkk 写的 Python 入门教程:https://www.52pojie.cn/thread-739688-1-1.html
跟着教程写了一个爬虫,用来爬取一个图片站并下载图片。但因为是单线程,效率比较低,恳请大神指点一下如何优化,以及如何改成多线程,感谢!
代码如下:
[Python] 纯文本查看 复制代码
from urllib import request
import re
import os
# --- Scraping patterns (all applied with re.S by regular()) ---------------
# Captures the href of each <a class="img"> anchor on a listing page.
link_zhengze = r'<a class="img" href="(.+?)" title=".+?" target="_blank">.+?</a>'
# Captures the text of the "page-numbers" anchor titled "最后页" (last page).
total_page_reg = r'<a class="page-numbers" href=".+?" title="最后页">(.+?)</a>'
# Captures the src of the <img> nested in each <a class="image_cx_cont">.
img_link_reg = r'<a class="image_cx_cont" .+?><img src="(.+?)".+? /></a>'
# Captures the part of <title> that precedes a '-' separator.
img_title_reg = r'<title>(.+?)-.+?</title>'

# --- Mutable module state --------------------------------------------------
# Links found on the listing page currently being processed.
list_link = []
# Base URL and extension of a gallery page; filled in by page_link_format().
view_page_link = ''
view_page_link_ext = ''
# Every gallery gets its own sub-directory below <cwd>/images.
root_path = '/'.join((os.getcwd(), 'images'))
# Directory of the gallery being downloaded; set by get_view_links().
img_path = ''
def http_request(url, coding='utf-8'):
    """Fetch ``url`` and return the response body decoded as text.

    Args:
        url: Address passed to ``urllib.request.urlopen``.
        coding: Character encoding used to decode the body.

    Returns:
        The decoded body as ``str``, or ``False`` when the page could not
        be opened or read.

    Raises:
        UnicodeDecodeError: If the body is not valid in ``coding``.
    """
    try:
        # The context manager closes the connection even if read() fails.
        with request.urlopen(url) as response:
            body = response.read()
    except Exception:
        # Deliberately not BaseException: Ctrl-C (KeyboardInterrupt) and
        # SystemExit must still be able to stop the crawler.
        return False
    return body.decode(coding)
def regular(reg, data):
    """Return every match of the pattern ``reg`` found in ``data``.

    The pattern is applied in DOTALL mode (``re.S``), so ``.`` also matches
    newlines. The result has the same shape as ``re.findall``: a list of
    strings, or a list of tuples when the pattern has several groups.
    """
    pattern = re.compile(reg, re.S)
    return pattern.findall(data)
def content_link(url):
    """Return the gallery links found on the listing page at ``url``.

    Returns ``False`` when ``http_request`` reports a failed fetch;
    otherwise the list of hrefs captured by ``link_zhengze``.
    """
    page = http_request(url)
    # Equality (not truthiness) on purpose: an empty page is still scanned.
    return False if page == False else regular(link_zhengze, page)  # noqa: E712
def page_link_format(url):
    """Split a gallery URL into its base and its file extension.

    The two parts are stored in the module globals ``view_page_link``
    (everything before the last ``.``) and ``view_page_link_ext``
    (everything after it), which are later recombined as
    ``<base>_<page>.<ext>`` to address the following pages of a gallery.

    Args:
        url: Gallery URL such as ``http://host/dir/1234.html``.

    Raises:
        IndexError: If ``url`` contains no ``.`` at all.
    """
    global view_page_link, view_page_link_ext
    # Split on the LAST dot only, so dots in the host name (for example a
    # "www." prefix) no longer truncate the base URL.
    parts = url.rsplit('.', 1)
    # Resolve both pieces before assigning so a failure leaves the
    # globals untouched.
    base, extension = parts[0], parts[1]
    view_page_link = base
    view_page_link_ext = extension
def mkdir_path(root_path, name=''):
    """Create the directory ``root_path + '/' + name`` if it is missing.

    Intermediate directories are created as needed, and calling this for a
    directory that already exists is a no-op.

    Args:
        root_path: Parent directory.
        name: Sub-directory to create below ``root_path``; may be empty.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    path = root_path + '/' + name
    # exist_ok avoids the check-then-create race of exists() + makedirs(),
    # which matters as soon as several downloads run concurrently.
    os.makedirs(path, exist_ok=True)
def set_img_file(file, data):
    """Write the bytes ``data`` to the path ``file``.

    An existing file is overwritten.

    Args:
        file: Destination path.
        data: Raw bytes to store.
    """
    # "with" guarantees the handle is closed even when write() raises.
    with open(file, 'wb') as handle:
        handle.write(data)
def get_down_img(imgs):
    """Download every image URL in ``imgs`` into the ``img_path`` directory.

    Each image is saved under the last path segment of its URL. A URL that
    cannot be fetched is reported and skipped, so one broken image does not
    abort the rest of the crawl.

    Args:
        imgs: Iterable of image URLs; ``None`` entries are ignored.
    """
    for img_url in imgs:
        if img_url is None:
            continue
        print('download now: ' + img_url)
        try:
            # Close the connection as soon as the bytes have been read.
            with request.urlopen(img_url) as response:
                data = response.read()
        except Exception as err:
            # Same policy as http_request: a failed fetch is not fatal.
            print('download failed: ' + img_url + ' (' + str(err) + ')')
            continue
        set_img_file(img_path + '/' + img_url.split('/')[-1], data)
def get_view_links(url):
    """Download all images of the gallery whose first page is ``url``.

    The first page provides the gallery title (used as the directory name
    below ``root_path``), the number of pages and the first batch of
    images. The remaining pages are addressed as ``<base>_<n>.<ext>``,
    derived from ``url`` itself.

    Args:
        url: Address of the gallery's first page.

    Returns:
        ``False`` when the first page cannot be fetched or contains no
        images or no title; ``None`` otherwise.
    """
    global img_path
    html = http_request(url)
    if html is False:
        # The fetch failed; there is nothing to scan.
        return False

    view_img = regular(img_link_reg, html)
    if not view_img:
        # re.findall returns an empty list (never None) when nothing matches.
        return False

    titles = regular(img_title_reg, html)
    if not titles:
        return False
    title = titles[0]

    # When no last-page link is found, assume the gallery has one page.
    last_page = regular(total_page_reg, html)
    try:
        total_page = int(last_page[0]) if last_page else 1
    except ValueError:
        total_page = 1

    # Build the pagination URLs from THIS gallery's URL, not from the first
    # entry of the global list_link.
    page_link_format(url)

    img_path = root_path + '/' + title
    mkdir_path(img_path)
    print('Download file:' + title)
    get_down_img(view_img)

    # Page 1 is already done; the following pages are numbered from 2.
    for page in range(2, total_page + 1):
        page_html = http_request(
            view_page_link + '_' + str(page) + '.' + view_page_link_ext)
        if page_html is False:
            continue
        get_down_img(regular(img_link_reg, page_html))
def get_list_link(list_link):
    """Process every gallery URL in ``list_link`` with ``get_view_links``.

    A gallery for which ``get_view_links`` returns ``False`` is simply
    skipped; the remaining galleries are still processed.
    """
    for gallery_url in list_link:
        # The result is ignored: success and failure both move on.
        get_view_links(gallery_url)
# Category listing URLs; the page number is appended to each of them.
web_link = [
    'http://mzsock.com/sw/page/',
    'http://mzsock.com/cy/page/',
    'http://mzsock.com/mv/page/',
    'http://mzsock.com/lz/page/',
    'http://mzsock.com/fbx/page/',
    'http://mzsock.com/ydx/page/',
    'http://mzsock.com/rzt/page/',
    'http://mzsock.com/cwzp/page/',
]
error_number = 0
list_page_number = 1
for index in web_link:
    # Every category starts again at page 1 with a fresh error budget.
    # Without this reset, categories after the first would begin at a high
    # page number and stop at their first failure.
    error_number = 0
    list_page_number = 1
    error_status = True
    while error_status:
        list_link = content_link(index + str(list_page_number))
        list_page_number = list_page_number + 1
        if not list_link:
            # A failed fetch (False) and a page without links ([]) both
            # count as errors; treating [] as success would loop forever.
            error_number = error_number + 1
            if error_number >= 3:
                # The third failed page ends this category.
                error_status = False
        else:
            get_list_link(list_link)
print('Success')
|