本帖最后由 D.A. 于 2020-9-1 21:30 编辑
最近想实现通过公司网站爬取网页内置链接并汇总文字信息,折腾了一通本来可以实现了,但是要爬取对象多内置链接也多,速度太慢,遂学习了一下多进程在python爬取中的实现方式,和大家分享一下。
补充了多线程爬取效果对比,由于有网页响应时间影响,无法直接判断哪个方式更快,但整体而言两种方式速度都比普通模式快很多。
| 模式 | 普通模式 | 多线程模式 | 多进程模式 |
| ---- | -------- | ---------- | ---------- |
| 时间 | 257.9s   | 200.2s     | 196.7s     |
爬取思路:
1、根据公司网站内容获取内置链接:
[Python] 纯文本查看 复制代码
# Fetch the raw text of a web page.
def getpage(url):
    """Return the decoded body text of *url*, or '' on any request failure.

    Uses the module-level ``headers`` dict and ``apparent_encoding`` so that
    Chinese pages decode correctly.
    """
    try:
        # BUG FIX: the original passed `headers` positionally, which binds it
        # to requests.get's `params` argument -- the User-Agent header was
        # never actually sent. It must be the `headers=` keyword.
        res = requests.get(url, headers=headers, timeout=30)
        res.encoding = res.apparent_encoding  # let requests guess from content
        text = res.text
    except requests.RequestException:
        # Narrowed from bare `except:` (which also swallowed KeyboardInterrupt);
        # a failed fetch deliberately degrades to an empty page.
        text = ''
    return text
# Seed company home pages to crawl.
links = ['http://www.fourd.cn', 'http://www.tfsea.com.cn', 'http://www.csscwshi.com' ,'http://www.marina-zh.com/', 'http://www.gdmoko.cn']
for rawlink in links:
    # Download the home page, then pull out every single- or double-quoted
    # href value and drop duplicates.
    rawtext = getpage(rawlink)
    href_pattern = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
    link_list = list(set(re.findall(href_pattern, rawtext)))
2、获取网页内可访问链接
[Python] 纯文本查看 复制代码 #获取网页内可访问链接
# Keep only the href candidates that actually answer HTTP 200.
csurls = []
for link in link_list:
    # Try the href as written, then with explicit schemes, then relative to
    # the site root. BUG FIX: the original probed `rawlink + '//' + link` but
    # stored `rawlink + link`, saving a URL that was never verified (and
    # possibly missing its '/' separator). Build one normalized candidate
    # and use it for both the probe and the stored value.
    site_relative = rawlink.rstrip('/') + '/' + link.lstrip('/')
    csurl = ""
    for candidate in (link, 'http://' + link, 'https://' + link, site_relative):
        if getcode(candidate) == 200:
            csurl = candidate
            break
    csurls.append(csurl)
csurls = [x for x in csurls if x != '']  # drop unreachable entries
3、根据关联链接获取中文文本信息
[Python] 纯文本查看 复制代码
# Extract the distinct Chinese text fragments of a web page.
def getchtext(url):
    """Return the unique Chinese character runs found at *url*, space-joined.

    Returns '' when the page cannot be fetched (getpage yields '').
    """
    text = getpage(url)
    chinese_pattern = '[\u4e00-\u9fa5]+'  # common CJK unified-ideograph range
    chtexts = re.findall(chinese_pattern, text)
    # BUG FIX: `list(set(...))` reshuffled the fragments on every run, making
    # the output nondeterministic. dict.fromkeys de-duplicates while keeping
    # first-seen document order.
    return " ".join(dict.fromkeys(chtexts))
# Sequential baseline: fetch the Chinese text of each reachable link in turn.
res = [getchtext(csurl) for csurl in csurls]
4、设置进程池实现多进程爬取
以下内容替代第三步中的最后三行[Python] 纯文本查看 复制代码 #设置多核进程池
pool = mp.Pool(processes=2) # worker-process count; optional (defaults to cpu_count())
es = pool.map(getchtext, csurls)# fetch each embedded link's Chinese text in parallel, results in input order
pool.close()  # no further tasks will be submitted
pool.join()   # wait for all workers to finish
5、设置多线程爬取
getchtext函数需同步修改,因多线程的线程函数无法直接return返回值,需引入queue(队列)取数。后半部分替换第三步中最后三行
[Python] 纯文本查看 复制代码 # 获取网页中文文本
# Thread worker: extract a page's unique Chinese text and hand it back via *q*.
def getchtext(url,q):
    """Fetch *url*, collect its unique Chinese character runs, and put the
    space-joined string on queue *q* (thread targets cannot return values)."""
    text = getpage(url)
    chinese_pattern = '[\u4e00-\u9fa5]+'  # common CJK unified-ideograph range
    chtexts = re.findall(chinese_pattern, text)
    # BUG FIX: `list(set(...))` made the joined output order random per run;
    # dict.fromkeys de-duplicates while preserving first-seen order.
    q.put(" ".join(dict.fromkeys(chtexts)))
# Threaded mode: one worker thread per URL, results gathered via a shared queue.
q = Queue()
threads = []
for url in csurls:
    worker = threading.Thread(target=getchtext, args=(url, q))
    worker.start()
    threads.append(worker)
for worker in threads:
    worker.join()  # block until every fetch has finished
# Each thread put exactly one entry on the queue (arrival order).
res = [q.get() for _ in csurls]
多进程完整代码如下:
[Python] 纯文本查看 复制代码 import requests
import time
import re
from fake_useragent import UserAgent
import multiprocessing as mp
# Build request headers with a random User-Agent string.
ua = UserAgent(verify_ssl=False)
headers = {
"User-Agent": ua.random,
}
# Fetch the raw text of a web page.
def getpage(url):
    """Return the decoded body text of *url*, or '' on any request failure.

    Uses the module-level ``headers`` dict and ``apparent_encoding`` so that
    Chinese pages decode correctly.
    """
    try:
        # BUG FIX: the original passed `headers` positionally, which binds it
        # to requests.get's `params` argument -- the User-Agent header was
        # never actually sent. It must be the `headers=` keyword.
        res = requests.get(url, headers=headers, timeout=30)
        res.encoding = res.apparent_encoding  # let requests guess from content
        text = res.text
    except requests.RequestException:
        # Narrowed from bare `except:` (which also swallowed KeyboardInterrupt);
        # a failed fetch deliberately degrades to an empty page.
        text = ''
    return text
# Probe a URL and report its HTTP status code.
def getcode(url):
    """Return the HTTP status code of *url*, or 0 on any request error."""
    try:
        # BUG FIX: `headers` was passed positionally, binding to `params`
        # instead of the headers keyword -- the User-Agent was never sent.
        res = requests.get(url, headers=headers, timeout=30)
        code = res.status_code
    except requests.RequestException:
        # Narrowed from bare `except:`; unreachable URLs report 0.
        code = 0
    return code
# Extract the distinct Chinese text fragments of a web page.
def getchtext(url):
    """Return the unique Chinese character runs found at *url*, space-joined.

    Returns '' when the page cannot be fetched (getpage yields '').
    """
    text = getpage(url)
    chinese_pattern = '[\u4e00-\u9fa5]+'  # common CJK unified-ideograph range
    chtexts = re.findall(chinese_pattern, text)
    # BUG FIX: `list(set(...))` reshuffled the fragments on every run, making
    # the output nondeterministic. dict.fromkeys de-duplicates while keeping
    # first-seen document order.
    return " ".join(dict.fromkeys(chtexts))
# Crawl each company site: collect its reachable embedded links, then pull
# the Chinese text of every link with a small process pool.
if __name__ == '__main__':
    start = time.time()
    links = ['http://www.fourd.cn', 'http://www.tfsea.com.cn', 'http://www.csscwshi.com' ,'http://www.marina-zh.com/', 'http://www.gdmoko.cn']
    for rawlink in links:
        # NOTE(review): the forum paste lost all indentation; the per-company
        # processing and summary print are reconstructed as the loop body
        # (res/corpchtext are per-company values) -- confirm against intent.
        rawtext = getpage(rawlink)        # home-page HTML
        corpchtext = getchtext(rawlink)   # Chinese text of the home page itself
        # Pull every single- or double-quoted href value out of the HTML.
        link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", rawtext)
        link_list = list(set(link_list))  # de-duplicate
        # Keep only the link variants that actually answer HTTP 200.
        csurls = []
        for link in link_list:
            # BUG FIX: the original probed `rawlink + '//' + link` but stored
            # `rawlink + link` -- an unverified, possibly malformed URL. Build
            # one normalized site-relative candidate and use it for both.
            site_relative = rawlink.rstrip('/') + '/' + link.lstrip('/')
            csurl = ""
            for candidate in (link, 'http://' + link, 'https://' + link, site_relative):
                if getcode(candidate) == 200:
                    csurl = candidate
                    break
            csurls.append(csurl)
        csurls = [x for x in csurls if x != '']  # drop unreachable entries
        print('关联网址为:\n',csurls)
        # Fetch the Chinese text of every reachable link in parallel.
        pool = mp.Pool(processes=2)  # worker-process count
        res = pool.map(getchtext, csurls)
        pool.close()
        pool.join()
        # Combine the linked pages' text with the home page's own text.
        corptext = " ".join(res) + corpchtext
        print('公司网页信息汇总:\n',corptext)
    end = time.time()
    cost = end - start
    print('共花费{}s'.format(cost))
    # Pop a desktop notification once the whole crawl is done.
    from tkinter import messagebox
    messagebox.showinfo("提示","信息采集完成!")
多线程完整代码如下:
[Python] 纯文本查看 复制代码 import requests
import time
import re
from fake_useragent import UserAgent
import multiprocessing as mp
import threading
from queue import Queue
# Build request headers with a random User-Agent string.
ua = UserAgent(verify_ssl=False)
headers = {
"User-Agent": ua.random,
}
# Fetch the raw text of a web page.
def getpage(url):
    """Return the decoded body text of *url*, or '' on any request failure.

    Uses the module-level ``headers`` dict and ``apparent_encoding`` so that
    Chinese pages decode correctly.
    """
    try:
        # BUG FIX: the original passed `headers` positionally, which binds it
        # to requests.get's `params` argument -- the User-Agent header was
        # never actually sent. It must be the `headers=` keyword.
        res = requests.get(url, headers=headers, timeout=30)
        res.encoding = res.apparent_encoding  # let requests guess from content
        text = res.text
    except requests.RequestException:
        # Narrowed from bare `except:` (which also swallowed KeyboardInterrupt);
        # a failed fetch deliberately degrades to an empty page.
        text = ''
    return text
# Probe a URL and report its HTTP status code.
def getcode(url):
    """Return the HTTP status code of *url*, or 0 on any request error."""
    try:
        # BUG FIX: `headers` was passed positionally, binding to `params`
        # instead of the headers keyword -- the User-Agent was never sent.
        res = requests.get(url, headers=headers, timeout=30)
        code = res.status_code
    except requests.RequestException:
        # Narrowed from bare `except:`; unreachable URLs report 0.
        code = 0
    return code
# Thread worker: extract a page's unique Chinese text and hand it back via *q*.
def getchtext(url,q):
    """Fetch *url*, collect its unique Chinese character runs, and put the
    space-joined string on queue *q* (thread targets cannot return values)."""
    text = getpage(url)
    chinese_pattern = '[\u4e00-\u9fa5]+'  # common CJK unified-ideograph range
    chtexts = re.findall(chinese_pattern, text)
    # BUG FIX: `list(set(...))` made the joined output order random per run;
    # dict.fromkeys de-duplicates while preserving first-seen order.
    q.put(" ".join(dict.fromkeys(chtexts)))
# Crawl each company site: collect its reachable embedded links, then pull
# the Chinese text of every link with one thread per URL.
if __name__ == '__main__':
    start = time.time()
    links = ['http://www.fourd.cn', 'http://www.tfsea.com.cn', 'http://www.csscwshi.com' ,'http://www.marina-zh.com/', 'http://www.gdmoko.cn']
    for rawlink in links:
        # NOTE(review): the forum paste lost all indentation; the per-company
        # processing and summary print are reconstructed as the loop body
        # (csurls/res are per-company values) -- confirm against intent.
        rawtext = getpage(rawlink)  # home-page HTML
        # Pull every single- or double-quoted href value out of the HTML.
        link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", rawtext)
        link_list = list(set(link_list))  # de-duplicate
        # Keep only the link variants that actually answer HTTP 200.
        csurls = []
        for link in link_list:
            # BUG FIX: the original probed `rawlink + '//' + link` but stored
            # `rawlink + link` -- an unverified, possibly malformed URL. Build
            # one normalized site-relative candidate and use it for both.
            site_relative = rawlink.rstrip('/') + '/' + link.lstrip('/')
            csurl = ""
            for candidate in (link, 'http://' + link, 'https://' + link, site_relative):
                if getcode(candidate) == 200:
                    csurl = candidate
                    break
            csurls.append(csurl)
        csurls = [x for x in csurls if x != '']  # drop unreachable entries
        print('关联网址为:\n',csurls)
        # One thread per URL; results come back through a shared queue.
        q = Queue()
        threads = []
        for url in csurls:
            t = threading.Thread(target=getchtext, args=(url, q))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()  # wait for every fetch to finish
        res = [q.get() for _ in csurls]  # one queue entry per thread
        # Home-page text was deliberately excluded in the original (commented out).
        corptext = " ".join(res)
        print('公司网页信息汇总:\n',corptext)
    end = time.time()
    cost = end - start
    print('共花费{}s'.format(cost))
    # Pop a desktop notification once the whole crawl is done.
    from tkinter import messagebox
    messagebox.showinfo("提示","信息采集完成!")
|