# The code is below.
import re
import requests
from lxml import etree
# Host prepended to root-relative image paths (src="/...").
# Replace it with the origin that actually serves the article's images,
# e.g. https://dreamtea.top -- only the scheme + host part is needed.
# Verify the resulting links against the real site.
IMAGE_BASE_URL = 'https://cdn.com'

# Seconds to wait for the article server before giving up.
REQUEST_TIMEOUT = 30

# A whole class="..." / id="..." attribute. The value is matched with
# [^"]* so the match stops at the attribute's own closing quote, and the
# lookbehind keeps attributes such as data-id="..." intact.
_CLASS_ID_ATTR = re.compile(r'(?<![\w-])(?:class|id)="[^"]*"')

# A root-relative path inside src="...": starts with exactly one "/".
# Protocol-relative URLs ("//host/...") are already absolute and are skipped.
_RELATIVE_SRC = re.compile(r'(?<=src=")/(?!/)[^"]*')


def strip_class_and_id(html):
    """Return *html* with every class="..." and id="..." attribute removed.

    Only the attribute itself is removed; surrounding whitespace is kept.
    """
    return _CLASS_ID_ATTR.sub('', html)


def absolutize_src(html, base_url=IMAGE_BASE_URL):
    """Return *html* with root-relative src paths prefixed by *base_url*.

    Each match is rewritten with its own path. A callable replacement is
    used so backslashes in a path are never read as regex escapes.
    """
    base = base_url.rstrip('/')  # the matched path already starts with "/"
    return _RELATIVE_SRC.sub(lambda match: base + match.group(), html)


def main():
    """Export the children of a user-chosen page element to resualt.txt.

    Prompts for the article URL and for the XPath of the parent element,
    e.g. //*[@id="article-container"] (copy it from the browser dev tools).
    Each direct child is serialized, cleaned with strip_class_and_id() and
    absolutize_src(), and written to the output file.

    Raises:
        requests.RequestException: the page could not be fetched, the
            server answered with an error status, or the request timed out.
    """
    post_url = input('请输入文章地址:')
    # Fetch the article page.
    res = requests.get(post_url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    page = etree.HTML(res.content.decode('utf-8'))

    xpath = input('请输入xpath路径,可打开控制台查看:')
    # Direct children only: a serialized child already contains all of its
    # descendants, so nothing is written twice and no sibling is dropped.
    children = page.xpath(xpath + '/*')

    # NOTE: the output file name (including its spelling) is kept as-is.
    with open('resualt.txt', 'w', encoding='utf-8') as file:
        for element in children:
            html = etree.tostring(element, encoding='utf-8').decode('utf-8').strip()
            file.write(absolutize_src(strip_class_and_id(html)))
    print('导出完成!')


if __name__ == '__main__':
    main()