Exception in thread Thread-2:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/urllib/request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/usr/local/lib/python3.7/http/client.py", line 1229, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/local/lib/python3.7/http/client.py", line 1275, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/local/lib/python3.7/http/client.py", line 1224, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/local/lib/python3.7/http/client.py", line 1016, in _send_output
self.send(msg)
File "/usr/local/lib/python3.7/http/client.py", line 956, in send
self.connect()
File "/usr/local/lib/python3.7/http/client.py", line 928, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/usr/local/lib/python3.7/socket.py", line 727, in create_connection
raise err
File "/usr/local/lib/python3.7/socket.py", line 716, in create_connection
sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out
爬虫的代码如下:
[Python] 纯文本查看复制代码
from urllib import request,error
import re
import os
import threading
# Listing-page URL prefixes, one per site category. main() appends
# page_number to each prefix to build the address it requests.
web_link = [
    'http://mzsock.com/sw/page/',
    'http://mzsock.com/cy/page/',
    'http://mzsock.com/mv/page/',
    'http://mzsock.com/lz/page/',
    'http://mzsock.com/fbx/page/',
    'http://mzsock.com/ydx/page/',
    'http://mzsock.com/rzt/page/',
    'http://mzsock.com/cwzp/page/',
]

# Running count of failed listing-page requests; updated by main().
error_number = 0

# Listing page number appended to every category prefix.
page_number = 1

# Directory under the current working directory that receives one
# sub-directory per downloaded gallery.
root_path = os.getcwd() + '/images/'

# Number of download threads download_img() treats as its limit.
download_thred_line = 10
def http_request(url, timeout=30):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the socket before giving up, so an
            unresponsive server cannot stall the crawler indefinitely.

    Returns:
        The page text on success. On failure an ``int`` is returned instead:
        the HTTP status code when the server answered with an HTTP error, or
        ``0`` when the request failed below the HTTP layer (unreachable host,
        refused connection, timeout). Callers distinguish the two outcomes
        by checking whether the result is an ``int``.
    """
    try:
        # The context manager closes the connection even if read() fails.
        with request.urlopen(url, timeout=timeout) as response:
            body = response.read()
    except error.HTTPError as e:
        return e.code
    except OSError:
        # URLError and socket timeouts are OSError subclasses. Without this
        # branch a single connection timeout aborts the whole crawl.
        return 0
    return body.decode('utf-8')
def regular(html, reg):
    """Return all matches of the regular expression ``reg`` in ``html``.

    The pattern is applied with ``re.S`` so that ``.`` also matches newlines.
    The result has the shape produced by ``findall``: a list of the captured
    group when the pattern has one group, or of whole matches when it has
    none.
    """
    pattern = re.compile(reg, re.S)
    return pattern.findall(html)
def view_link(html):
    """Extract gallery links from a listing page.

    Returns the ``href`` value of every ``<a class="img" ...>`` anchor in
    ``html`` that also carries a ``title`` and ``target="_blank"``, in
    document order.
    """
    return regular(
        html,
        r'<a class="img" href="(.+?)" title=".+?" target="_blank">.+?</a>',
    )
def view_img_page_link_format(url):
    """Split a gallery page URL into ``[stem, extension]``.

    The two parts let a caller build the addresses of follow-up pages as
    ``stem + '_' + page_number + '.' + extension``.

    The split is made at the last dot only, so dots in the host name
    (for example a ``www.`` prefix) no longer shift the result.

    Args:
        url: Gallery page address ending in a file extension, e.g.
            ``http://host.com/dir/123.html``.

    Returns:
        A two-element list: everything before the last dot, and the
        extension after it.

    Raises:
        ValueError: If ``url`` does not end in a file extension.
    """
    stem, dot, extension = url.rpartition('.')
    # A '/' after the last dot means that dot belongs to the host or a
    # directory name, not to a file extension.
    if not dot or '/' in extension:
        raise ValueError('URL has no file extension: ' + url)
    return [stem, extension]
def get_title(html):
    """Return the leading part of the page's ``<title>`` text.

    The pattern captures the text that follows ``<title>`` up to a ``-``
    separator; the remainder of the title is discarded.

    Raises:
        IndexError: If ``html`` contains no title of that form.
    """
    titles = regular(html, r'<title>(.+?)-.+?</title>')
    return titles[0]
def mkdir_path(path):
    """Create directory ``path``, including missing parents, if needed.

    An already existing directory is left untouched. Creation is done in a
    single call instead of a separate existence check, so there is no window
    in which the directory can appear between the check and the creation.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)
def get_img_links(html, page_link_format):
    """Collect the image URLs of a gallery across all of its pages.

    Args:
        html: HTML of the gallery's first page.
        page_link_format: ``[stem, extension]`` pair; page ``n`` of the
            gallery is requested from ``stem + '_' + n + '.' + extension``.

    Returns:
        A list of image ``src`` values: those of the first page followed by
        those of every further page that could be fetched. Pages whose
        request fails or returns an empty body are skipped.
    """
    total_page_reg = r'<a class="page-numbers" href=".+?" title="最后页">(.+?)</a>'
    img_link_reg = r'<a class="image_cx_cont" .+?><img src="(.+?)".+? /></a>'

    img_link = regular(html, img_link_reg)

    try:
        page_total = int(regular(html, total_page_reg)[0])
    except (IndexError, ValueError):
        # No usable "last page" link: treat the gallery as a single page
        # rather than aborting the crawl.
        return img_link

    # Page 1 is already in `html`; fetch pages 2..page_total.
    for page in range(2, page_total + 1):
        page_html = http_request(
            page_link_format[0] + '_' + str(page) + '.' + page_link_format[1]
        )
        # http_request signals a failed request by returning an int.
        if isinstance(page_html, int) or len(page_html) == 0:
            continue
        img_link.extend(regular(page_html, img_link_reg))
    return img_link
def set_img_file(file, link, timeout=30):
    """Download ``link`` and store the bytes in ``file``.

    Args:
        file: Destination path; opened in binary write mode.
        link: Address of the resource to download.
        timeout: Seconds to wait on the socket before giving up.

    Returns:
        ``False`` when the download failed (HTTP error, unreachable host,
        timeout); nothing is written in that case. ``None`` after the file
        has been written.
    """
    try:
        with request.urlopen(link, timeout=timeout) as response:
            data = response.read()
    except OSError:
        # HTTPError, URLError and socket timeouts all derive from OSError.
        # This function runs in worker threads, where an uncaught
        # "Connection timed out" would kill the thread with a traceback.
        return False

    # Open the destination only after a successful download so a failed
    # request never leaves an empty file behind.
    with open(file, 'wb') as f:
        f.write(data)
    return None
def download_img(links, path):
    """Download every image in ``links`` into directory ``path``.

    Each image is fetched by ``set_img_file`` in its own thread and saved
    under the last path segment of its URL. At most ``download_thred_line``
    downloads run at the same time, and the function returns only after all
    of them have finished.

    Args:
        links: Iterable of image URLs; ``None`` entries are skipped.
        path: Existing directory that receives the files.
    """
    active = []
    for link in links:
        if link is None:
            continue
        print('Download link:' + link)
        # Make room before starting a new worker: wait for the oldest one
        # and drop it from the window. Without the removal the list only
        # grows and the limit is never enforced.
        if len(active) >= download_thred_line:
            active.pop(0).join()
        worker = threading.Thread(
            target=set_img_file,
            args=(path + '/' + link.split('/')[-1], link),
        )
        worker.start()
        active.append(worker)

    # Wait for the remaining workers so downloads of successive galleries
    # cannot pile up beyond the limit.
    for worker in active:
        worker.join()
def get_view_links(html, item=None):
    """Download all images of a single gallery whose first page is fetched.

    Args:
        html: HTML of the gallery's first page.
        item: URL of that page. It is needed to derive the addresses of the
            gallery's further pages. The parameter is optional only to keep
            the one-argument signature importable; the function previously
            read an undefined name ``item`` and could not run at all.

    Returns:
        ``False`` when the gallery contains no image links, otherwise
        ``None`` after the downloads have been started.

    Raises:
        ValueError: If ``item`` is not supplied.
    """
    if item is None:
        raise ValueError('get_view_links() requires the gallery page URL as `item`')
    title = get_title(html)
    view_img_page_format = view_img_page_link_format(item)
    img_link = get_img_links(html, view_img_page_format)
    if len(img_link) == 0:
        return False
    img_path = root_path + title
    mkdir_path(img_path)
    print('Create file:' + title)
    download_img(img_link, img_path)
def get_view(links):
    """Download the images of every gallery in ``links``.

    For each gallery URL the first page is fetched, its title becomes the
    name of a directory below ``root_path``, and all images found by
    ``get_img_links`` are downloaded into that directory.

    Galleries whose page cannot be fetched, or which contain no image
    links, are skipped; the remaining galleries are still processed.

    Args:
        links: Iterable of gallery page URLs.
    """
    for item in links:
        html = http_request(item)
        # http_request signals a failed request by returning an int.
        if isinstance(html, int):
            continue
        title = get_title(html)
        view_img_page_format = view_img_page_link_format(item)
        img_link = get_img_links(html, view_img_page_format)
        if not img_link:
            # Skip only this gallery; returning here would silently drop
            # every gallery that follows it in `links`.
            continue
        img_path = root_path + title
        mkdir_path(img_path)
        print('Create file:' + title)
        download_img(img_link, img_path)
def main():
    """Crawl one listing page of every category in ``web_link``.

    For each category prefix the page numbered ``page_number`` is requested,
    the gallery links on it are extracted and handed to ``get_view``. The
    process exits when all categories have been visited.
    """
    global page_number, error_number
    for url in web_link:
        list_html = http_request(url + str(page_number))
        # Once two failures have been counted, the counter is reset and the
        # current category is skipped.
        # NOTE(review): this discards the response just fetched, even when
        # it succeeded - confirm that this is intended.
        if error_number >= 2:
            error_number = 0
            continue
        # http_request signals a failed request by returning an int.
        if isinstance(list_html, int):
            error_number = error_number + 1
            # Do not fall through: passing the int status to view_link()
            # would raise TypeError inside the regex search.
            continue
        view_list_link = view_link(list_html)
        if len(view_list_link) == 0:
            continue
        get_view(view_list_link)
    # NOTE(review): page_number is never advanced, so only one listing page
    # per category is crawled - confirm whether pagination was intended.
    exit()


if __name__ == '__main__':
    main()