python写的小说抓取源码【原创】
闲来无事,不想学 C++ 了。听说 Python 最近很火,就研究了一下,顺手把小说抓取脚本完善了一下。源码列在下面,需要的请自行整理;代码很简单,就不另作说明了。
觉得有用的,给个免费的热心值支持一下吧。
"""Download every chapter linked from a novel's index page into one text file.

NOTE(review): the listing this script was taken from had lost its indentation
and its slice expressions (``t = smsg``, ``smsg = smsg`` ...), so the original
loop could never advance. The link and body patterns below are a
reconstruction -- confirm them against the live page markup before relying on
the output.
"""
import re
from urllib.parse import urljoin
from urllib.request import urlopen

myurl = 'https://www.zwdu.com/book/31855/'
myhost = myurl

# Assumes each chapter is listed as <dd><a href="URL">TITLE</a></dd> -- TODO confirm.
CHAPTER_LINK = re.compile(
    r'<dd>\s*<a\s+href=["\']([^"\']+)["\'][^>]*>(.*?)</a>', re.I | re.S)
# Assumes the chapter text lives in <div id="content">...</div> -- TODO confirm.
CHAPTER_BODY = re.compile(
    r'<div\s+id=["\']content["\'][^>]*>(.*?)</div>', re.I | re.S)


def iter_chapters(index_html):
    """Yield ``(href, title)`` for every ``<dd>`` chapter link in *index_html*.

    Uses a regex scan instead of ``str.find(...) > 0`` so a link at offset 0
    is not skipped and the scan always terminates.
    """
    for match in CHAPTER_LINK.finditer(index_html):
        yield match.group(1), match.group(2).strip()


def clean_content(page_html):
    """Return the chapter text of *page_html* with tabs removed and
    ``<br />`` turned into newlines, terminated by a newline.

    Falls back to the whole page when the assumed content container is not
    found, which is what the published listing literally did.
    """
    match = CHAPTER_BODY.search(page_html)
    body = (match.group(1) if match else page_html) + "\n"
    return body.replace("\t", "").replace("<br />", "\n")


def fetch_novel():
    """Fetch the index at ``myurl`` and write each chapter's title and text
    to ``d:/text.txt`` (GBK encoded), in index order."""
    with urlopen(myurl) as resp:
        index_html = resp.read().decode('gbk')
    # ``with`` guarantees the output file is closed even if a download fails.
    with open('d:/text.txt', 'w+', encoding='gbk') as f:
        for href, title in iter_chapters(index_html):
            # urljoin copes with both relative and root-relative hrefs,
            # unlike plain string concatenation with myhost.
            with urlopen(urljoin(myhost, href)) as resp:
                page = resp.read().decode('gbk')
            f.write(title + "\n")
            f.write(clean_content(page))


if __name__ == "__main__":
    fetch_novel()
谢谢分享,学习了。遇到解码问题,搜索后找到了解决方法。修改后的源码如下:
"""Download every chapter linked from the index page at ``myurl`` into one
GB18030 text file, re-joining hard-wrapped lines into paragraphs.

NOTE(review): the listing this script was taken from had lost its indentation
and every bracketed expression (slices, ``temp[i]`` subscripts, the charset
regex, which survived only as ``b'*'``). The charset pattern, the link
pattern and the title/body extraction below are a reconstruction -- confirm
them against the live page markup.
"""
import re
from urllib.parse import urljoin
from urllib.request import urlopen

myurl = 'http://www.purepen.com/hlm/'
myhost = myurl

# Declared charset, e.g. ``charset=gb2312`` inside a <meta> tag.
CHARSET_DECL = re.compile(rb'charset\s*=\s*["\']?\s*([\w-]+)', re.I)
# Declared name -> codec used for decoding (GB2312 is decoded with its
# superset gb18030, as in the original branches).
CHARSET_CODECS = {'gb2312': 'gb18030', 'gbk': 'gbk', 'utf-8': 'utf-8'}
# Chapter links; the original searched for the literal text "<A HREF".
CHAPTER_HREF = re.compile(r'<A\s+HREF\s*=\s*["\']?([^"\'\s>]+)', re.I)
# Stand-ins for the lost title/body slices -- TODO confirm against the site.
PAGE_TITLE = re.compile(r'<title[^>]*>(.*?)</title>', re.I | re.S)
PAGE_BODY = re.compile(r'<body[^>]*>(.*)</body>', re.I | re.S)
# The chapter list starts at this cell on the index page.
FIRST_CHAPTER_MARK = "<TD>第 一 回"


def detect_charset(raw, default='gb18030'):
    """Return the codec to decode *raw* (bytes) with.

    Recognises GB2312 (mapped to gb18030), GBK and UTF-8 in any letter case.
    A missing or unrecognised declaration yields *default* instead of the
    ``AttributeError``/``NameError`` the original branches would hit.
    """
    match = CHARSET_DECL.search(raw)
    if match is None:
        return default
    return CHARSET_CODECS.get(match.group(1).decode('ascii').lower(), default)


def iter_chapter_hrefs(index_html):
    """Yield the href of every ``<A HREF=...>`` link at or after the first
    chapter marker in *index_html*.

    If the marker is absent the whole page is scanned (the original's
    behaviour for that case cannot be recovered from the listing).
    """
    start = index_html.find(FIRST_CHAPTER_MARK)
    for match in CHAPTER_HREF.finditer(index_html, max(start, 0)):
        yield match.group(1)


def chapter_title(page_html):
    """Return the text of the page's ``<title>`` element, or ``''``."""
    match = PAGE_TITLE.search(page_html)
    return match.group(1).strip() if match else ''


def clean_chapter(page_html):
    """Return the page body with tabs, ``<br />`` tags and spaces removed,
    terminated by a newline. Uses the whole page when no ``<body>`` is found."""
    match = PAGE_BODY.search(page_html)
    content = (match.group(1) if match else page_html) + "\n"
    content = content.replace("\t", "")
    content = content.replace("<br />", "")
    return content.replace(" ", "")


def reflow_lines(content, short_line=33):
    """Join hard-wrapped lines of *content* into paragraphs.

    A line shorter than *short_line* characters is treated as the end of a
    paragraph and keeps a trailing newline; longer lines are concatenated
    with the line that follows.
    """
    return "".join(
        line + '\n' if len(line) < short_line else line
        for line in content.split('\n')
    )


def fetch_hongloumeng():
    """Fetch the index at ``myurl`` and write every chapter's title and
    reflowed text to ``d:/mytemp/红楼梦.txt``."""
    with urlopen(myurl) as resp:
        raw = resp.read()
    index_html = raw.decode(detect_charset(raw))  # pick the decoding charset
    # ``with`` guarantees the output file is closed even if a download fails.
    with open('d:/mytemp/红楼梦.txt', 'w+', encoding='gb18030') as f:
        for href in iter_chapter_hrefs(index_html):
            with urlopen(urljoin(myhost, href)) as resp:
                page = resp.read().decode('gb18030')
            f.write(chapter_title(page) + '\n')
            f.write(reflow_lines(clean_chapter(page)))


if __name__ == "__main__":
    fetch_hongloumeng()
页:
[1]