## Strip leftover page markup from the fetched HTML
def fre(html):
    """Return ``str(html)`` with known markup and boilerplate removed.

    Args:
        html: Any object; it is converted with ``str()`` before cleaning.
            The bracketed patterns (``[<div id="BookText">`` and
            ``</div>]``) match the string form of a one-element list, so
            this is presumably fed a list of parsed tags -- verify against
            the caller.

    Returns:
        The cleaned text as a ``str``.
    """
    # A tuple (not a set) keeps the removal order identical on every run;
    # set iteration order for strings varies with hash randomisation.
    fragments = (
        r'<br/>',
        r'[<div id="BookText">',
        r'一秒记住【风雨小说网 www.44pq.cc】,精彩小说无弹窗免费阅读!',
        r'</div>]',
        r'<h1>',
        r'</h1>',
    )
    text = str(html)  # convert once instead of on every pass
    for fragment in fragments:
        text = text.replace(fragment, '')
    return text
## Extract elements with BeautifulSoup
def htmlse(url, sel, namesel=''):
    """Fetch a page and select elements from it with CSS selectors.

    Args:
        url: Passed to ``gethtml`` (defined elsewhere; presumably returns
            the page's HTML -- verify against its definition).
        sel: CSS selector for the main content.
        namesel: Optional CSS selector for a second lookup; the empty
            string (the default) disables it.

    Returns:
        ``soup.select(sel)`` when ``namesel`` is ``''``; otherwise the
        tuple ``(soup.select(namesel), soup.select(sel))``.
    """
    soup = BeautifulSoup(gethtml(url), 'lxml')
    data = soup.select(sel)
    if namesel != '':
        return soup.select(namesel), data
    return data
## Extract values with a regular expression
def htmlre(html, res):
    """Collect the first regex match from each item of ``html``.

    Args:
        html: Iterable of items; each is converted with ``str()`` before
            matching.
        res: Regular expression (pattern string or compiled pattern).

    Returns:
        A list holding, for every item with at least one match, the first
        element of ``findall`` for that item. Items without a match are
        skipped, so the result may be shorter than ``html``.
    """
    pattern = re.compile(res)  # hoisted: the pattern is loop-invariant
    per_item = (pattern.findall(str(item)) for item in html)
    return [found[0] for found in per_item if found]
if __name__ == "__main__":
    # Scrape the first chapters listed on one index page and append their
    # cleaned title and text to a local file, then report elapsed seconds.
    start = time.time()  # start time
    url = 'https://www.44pq.cc/kan/151682/'
    sel = r'body > div.container > div.main > div > dl > dd'
    namesel = r'#BookCon > h1'
    selz = r'#BookText'
    res = r'href="(.*.html)'
    htmllist = htmlre(htmlse(url, sel), res)
    # Only the first 10 links are processed. Slicing (rather than indexing
    # over range(0, 10)) also copes with an index that yields fewer than 10.
    for href in htmllist[:10]:
        # Fix: the original concatenated `url` with the whole list, which
        # raises TypeError. Use this chapter's own href instead.
        # NOTE(review): assumes each href is relative to `url` -- confirm.
        text = htmlse(url + href, selz, namesel)
        title = fre(text[0])
        body = fre(text[1])
        print(title)
        with open('万古第一神.txt', 'a', encoding='utf-8') as f:
            f.write(title)
            f.write(body)
            f.write('\n' * 5)  # blank lines between chapters
        #time.sleep(0.2)
    print('%.1f' % (float(time.time() - start)))  # elapsed time
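# With multithreading you need to set up a thread lock; it is also extremely fast: more than a thousand chapters finish in under ten minutes.
# This is how I create the threads: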
# Create 24 worker threads, start them all, then wait for each to finish.
# `a.get_chapter` and `wifi` come from code outside this snippet.
ts = [
    threading.Thread(target=a.get_chapter, args=(wifi,))
    for _ in range(24)
]
# Start every thread before joining any, so the workers run concurrently.
for t in ts:
    t.start()
for t in ts:
    t.join()