请教各位大神一个多线程爬虫的问题

xiaobaisky · 发表于 2019-12-9 20:56

我在昨天的单线程图片下载的基础上开了一个多线程下载，但是在实际下载的时候会报错误：
错误如下：

[Bash shell] 纯文本查看 复制代码

Exception in thread Thread-2:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/urllib/request.py", line 1317, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/usr/local/lib/python3.7/http/client.py", line 1229, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/usr/local/lib/python3.7/http/client.py", line 1275, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/usr/local/lib/python3.7/http/client.py", line 1224, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/usr/local/lib/python3.7/http/client.py", line 1016, in _send_output
    self.send(msg)
  File "/usr/local/lib/python3.7/http/client.py", line 956, in send
    self.connect()
  File "/usr/local/lib/python3.7/http/client.py", line 928, in connect
    (self.host,self.port), self.timeout, self.source_address)
  File "/usr/local/lib/python3.7/socket.py", line 727, in create_connection
    raise err
  File "/usr/local/lib/python3.7/socket.py", line 716, in create_connection
    sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out

爬虫的代码如下：

[Python] 纯文本查看 复制代码

from urllib import request,error
import re
import os
import threading


web_link = ['http://mzsock.com/sw/page/','http://mzsock.com/cy/page/','http://mzsock.com/mv/page/','http://mzsock.com/lz/page/','http://mzsock.com/fbx/page/','http://mzsock.com/ydx/page/','http://mzsock.com/rzt/page/','http://mzsock.com/cwzp/page/']

error_number = 0
page_number = 1
root_path = os.getcwd() + '/images/'
#下载线程
download_thred_line = 10


def http_request(url):
	try:
		result = request.urlopen(url)
	except error.HTTPError as e:
		return e.code
	else:
		return result.read().decode('utf-8')

def regular(html,reg):
	return re.findall(reg,html,re.S)

def view_link(html):
	link_reg=r'<a class="img" href="(.+?)" title=".+?" target="_blank">.+?</a>'
	return regular(html,link_reg)

def view_img_page_link_format(url):
	format = url.split('.')
	return [format[0] + '.' + format[1],format[2]]

def get_title(html):
	img_title_reg = r'<title>(.+?)-.+?</title>'
	return regular(html,img_title_reg)[0]

def mkdir_path(path):
	if (os.path.exists(path) == False) : os.makedirs(path)


def get_img_links(html,page_link_format):
	
	img_link = [];
	total_page_reg = r'<a class="page-numbers" href=".+?" title="最后页">(.+?)</a>'
	img_link_reg = r'<a class="image_cx_cont" .+?><img src="(.+?)".+? /></a>'
	page_total = int(regular(html,total_page_reg)[0])
	img_link = regular(html,img_link_reg)
	
	for index in range(1,page_total):
		
		page_html = http_request(page_link_format[0] + '_' + str(index + 1) + '.' + page_link_format[1])
		if(type(page_html) == int or len(page_html) == 0) : continue
		img_link = img_link + regular(page_html,img_link_reg)

	return img_link

def set_img_file(file,link):
	try:
		html = request.urlopen(link).read()
	except error.HTTPError as e:
		return False
	else:
		f = open(file,'wb')
		f.write(html)
		f.close()

def download_img(links,path):
	ths = []
	for link in links:
		if(link is None) : continue

		print('Download link:' + link)
		th = threading.Thread(target=set_img_file,args=(path + '/' + link.split('/')[-1],link))
		th.start()
		ths.append(th)

		if len(ths) >= download_thred_line:
			ths[0].join()


def get_view_links(html):
	title = get_title(html)
	view_img_page_format = view_img_page_link_format(item)
	img_link = get_img_links(html,view_img_page_format)
	if(len(img_link) == 0) : return False
	img_path = root_path + title
	mkdir_path(img_path)
	print('Create file:' + title)
	download_img(img_link,img_path)


def get_view(links):
	ths = []
	for item in links:
		html = http_request(item)
		if(type(html) == int) : continue
		title = get_title(html)
		view_img_page_format = view_img_page_link_format(item)
		img_link = get_img_links(html,view_img_page_format)
		if(len(img_link) == 0) : return False
		img_path = root_path + title
		mkdir_path(img_path)
		print('Create file:' + title)
		download_img(img_link,img_path)
		



def main():
	global page_number,error_number

	for url in web_link:
		list_html = http_request(url + str(page_number))
		if error_number >= 2:
			error_number = 0
			continue

		if type(list_html) == int:
			error_number = error_number + 1
       
		view_list_link = view_link(list_html)
		if len(view_list_link) == 0:
			continue

		get_view(view_list_link)
		exit()



if __name__ == '__main__':
	main()

请教一下各位大神，这个问题是出在哪，感激！

1170 · 发表于 2019-12-9 21:49

链接超时，这个网站好像打不开

NvidiaChina · 发表于 2019-12-9 22:08

虽不明但觉厉

bobowxc · 发表于 2019-12-9 23:01

界面未找到

xiaobaisky · 发表于 2019-12-10 11:13

1170 发表于 2019-12-9 21:49
链接超时，这个网站好像打不开

单线程的时候可以正常执行不会出现这个错误。

帐号		自动登录	找回密码
密码			注册[Register]

[求助] 请教各位大神一个多线程爬虫的问题