[Python]
from lxml import etree
from os import makedirs, path
import requests
from time import sleep
from urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  # suppress the "insecure request" warning
proxies = None  # {"http": "http://127.0.0.1:10809"}  # set this to route requests through a proxy
mysession = requests.Session()  # session object reused across all requests
ser_num = 0  # running counter for progress output
allow_redirects = False
rooturl = cata_url = title_xpath = url_xpath = filename_xpath = content_xpath = ""
def isconnect_enable(_url):  # quick connectivity check for the target site
    try:
        requests.get(_url, proxies=proxies)
        print("Site is reachable")
        return True
    except requests.exceptions.RequestException:
        print("Site is unreachable")
        return False
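# Optional pre-flight check; the main block below does not call it, but one could
# run, for example: isconnect_enable("http://book.zongheng.com")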
def gethtml(_url):  # fetch the raw HTML of a page, retrying on failure
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
    try_count = 5  # number of retries before giving up
    while True:
        try:
            print("Fetching HTML, URL:", _url, end="")
            html = mysession.get(_url, headers=header, proxies=proxies, allow_redirects=allow_redirects)
            html.encoding = html.apparent_encoding  # let requests detect the correct encoding
            print()
            return html.text
        except requests.exceptions.RequestException:
            if try_count <= 0:
                print("\nFailed to fetch HTML, exiting!")
                exit(1)  # non-zero exit code on failure
            sleep(0.1)
            print(".", end="")
            try_count -= 1
def get_title_titleurl(_url, title_xpath, url_xpath, filename_xpath):  # parse the catalog page into {chapter title: url} plus the novel title (file name)
    '''_url: catalog page URL
    title_xpath: XPath for the chapter titles (remember to end it with text())
    url_xpath: XPath for the chapter page URLs (remember to end it with @href)
    filename_xpath: XPath for the novel title
    '''
    html = gethtml(_url)  # fetch the catalog page source
    tree = etree.HTML(html, etree.HTMLParser())
    filename = str(tree.xpath(filename_xpath)[0]).strip()
    title_dict = {}  # the dictionary to return
    title_list = tree.xpath(title_xpath)
    url_list = tree.xpath(url_xpath)  # note: URLs are assumed absolute; for relative paths, prepend the root URL
    for i in range(len(title_list)):
        title_dict[str(title_list[i]).strip()] = url_list[i]
    return title_dict, filename  # return the dict and the novel title (file name)
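# A hypothetical call (the book ID is made up for illustration):
#   chapters, name = get_title_titleurl(
#       "http://book.zongheng.com/showchapter/123456.html",
#       title_xpath, url_xpath, filename_xpath)
# would return something like ({"Chapter 1": "http://...html", ...}, "Novel Title").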
def get_file_content(url, content_xpath):  # fetch the text of one chapter page
    userAgent = r"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
    header = {'User-Agent': userAgent}
    global ser_num
    print(ser_num, "\tFetching page content...", end="")
    ser_num += 1
    responseRes = mysession.get(url, proxies=proxies, headers=header, verify=False)
    responseRes.encoding = 'utf-8'
    if responseRes.status_code == 200:
        print("Success!", end="\t")
        tree = etree.HTML(responseRes.text)
        str_content = tree.xpath(content_xpath)
        str_content = "".join(str_content).replace("\r\n", "\n")  # normalize line endings / drop redundant blank lines
        return str_content
    print("Failed, status code:", responseRes.status_code)
    return ""  # return an empty chapter rather than None so the caller can still write the file
def write_to_file(_dirs, _filename, _content, chapter="", _extfn=".txt", mode="w", _prefix="", _fileurl=""):  # write content to the novel file
    '''
    _dirs: target directory
    _prefix: file name prefix
    _filename: file name
    _content: file content
    _extfn: file extension
    mode: file open mode
    _fileurl: URL of this chapter
    '''
    if chapter != "":
        print("Writing -> folder: {0}\tfile: {1}".format(_dirs, _prefix + _filename), "\t" + chapter)
    else:
        print("Writing -> folder: {0}\tfile: {1}".format(_dirs, _prefix + _filename))
    if not path.exists(_dirs):
        makedirs(_dirs)
    with open(path.join(_dirs, _prefix + _filename + _extfn), mode, encoding="utf-8") as myfile:
        if _fileurl != "":
            myfile.write("Source URL: " + str(_fileurl) + "\n")
        myfile.write(_content)
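# Appending a single chapter might look like this (all values hypothetical):
#   write_to_file("mytemp", "Novel Title", "chapter text...",
#                 chapter="Chapter 1", mode="a", _fileurl="http://example.com/chapter/1")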
def config(site_choice, bookid):  # fill in the per-site URLs and XPaths
    global rooturl, cata_url, title_xpath, url_xpath, filename_xpath, content_xpath, allow_redirects
    if site_choice == "纵横":
        rooturl = r''  # prepend this root if chapter URLs are relative
        cata_url = f'http://book.zongheng.com/showchapter/{bookid}.html'  # catalog page URL
        title_xpath = '/html/body/div[3]/div[2]/div[2]/div/ul/li/a/text()'  # XPath for chapter titles
        url_xpath = '/html/body/div[3]/div[2]/div[2]/div/ul/li/a/@href'  # XPath for chapter URLs
        filename_xpath = '/html/body/div[3]/div[1]/h1/text()'  # XPath for the novel title, i.e. the file name
        content_xpath = '//*[@id="readerFt"]/div/div[5]//text()'  # XPath for the chapter body text
    if site_choice == "17k":
        rooturl = r'http://www.17k.com'  # prepend this root if chapter URLs are relative
        cata_url = f'http://www.17k.com/list/{bookid}.html'  # catalog page URL
        title_xpath = '/html/body/div[5]/dl/dd/a/span/text()'  # XPath for chapter titles
        url_xpath = '/html/body/div[5]/dl/dd/a/@href'  # XPath for chapter URLs
        filename_xpath = '/html/body/div[5]/h1/text()'  # XPath for the novel title, i.e. the file name
        content_xpath = '/html/body/div[4]/div[2]/div[2]/div[1]/div[2]/p//text()'  # XPath for the chapter body text
    if site_choice == "起点":
        allow_redirects = True
        rooturl = r'http:'  # prepend this root if chapter URLs are relative
        cata_url = f'http://book.qidian.com/info/{bookid}/#Catalog'  # catalog page URL
        title_xpath = '//*[@id="j-catalogWrap"]/div[2]/div[1]/ul/li/a/text()'  # XPath for chapter titles
        url_xpath = '//*[@id="j-catalogWrap"]/div[2]/div[1]/ul/li/a/@href'  # XPath for chapter URLs
        filename_xpath = '/html/body/div[1]/div[6]/div[1]/div[2]/h1/em/text()'  # XPath for the novel title, i.e. the file name
        content_xpath = '//div[1]/div/div/div[2]/p/text()'  # XPath for the chapter body text
if __name__ == "__main__":
    site_list = ["纵横", "17k", "起点"]
    for i in range(len(site_list)):
        print("\t", i, "\t", site_list[i])
    site = input("Select a site (number): ")
    site = site_list[int(site)]
    bookid = input("Enter the book ID: ")
    config(site, bookid)
    path_dir = r".\mytemp"  # directory for the downloaded novel files
    title_url_dict, filename = get_title_titleurl(cata_url, title_xpath, url_xpath, filename_xpath)
    print("\tNovel title: 《" + filename + "》")
    for title in title_url_dict:
        target_url = rooturl + title_url_dict[title]  # chapter page URL
        content = get_file_content(target_url, content_xpath)
        write_to_file(path_dir, filename, "*" * 100 + "\n" + title + "\n" + content + "\n", chapter=title, mode="a", _fileurl=target_url)
        sleep(1)  # throttle requests a little
    mysession.close()  # release the session once all chapters are written
    print("All done!")