from re import sub
from re import compile
from time import sleep
from parsel import Selector
import tomd
from requests import get
def spider_csdn(url, ccs_head, css_text):
    """Download one article and save it as a Markdown file.

    The page at ``url`` is fetched, the title and the article body are
    looked up with the given selectors, the body HTML is lightly cleaned and
    converted with ``tomd``, and the result is written to ``<title>.md`` in
    the current working directory. Finally an Explorer window is opened on
    that directory.

    Args:
        url: Address of the article page. A falsy value prints an error,
            waits five seconds and returns.
        ccs_head: CSS selector of the element whose text is the title.
        css_text: Selector of the element holding the article body. A value
            starting with ``/`` is evaluated as XPath (a CSS selector can
            never start with a slash); anything else is evaluated as CSS.

    Returns:
        None. Errors from the HTTP request, from writing the file, or from
        launching ``explorer`` are not caught and propagate to the caller.
    """
    import subprocess
    from os import getcwd

    title_url = url
    if not title_url:
        print('错误', '请输入网址')
        sleep(5)
        return None

    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.52'}
    # Without a timeout a stalled server would block the caller indefinitely.
    html = get(url=title_url, headers=head, timeout=30).text
    page = Selector(html)

    title = page.css('%s::text' % ccs_head).get()
    if css_text.startswith('/'):
        content = page.xpath(css_text).get()
    else:
        content = page.css('%s' % css_text).get()

    # .get() yields None when nothing matched; bail out with a message
    # instead of failing later with a TypeError inside sub()/replace().
    if title is None or content is None:
        print('错误', '未找到文章标题或正文')
        sleep(5)
        return None

    # \b and the explicit </a> keep the pattern from swallowing other tags
    # whose name merely starts with "a" (<article>, <aside>, ...).
    content = sub(r'<a\b.*?</a>', '', content)
    content = sub('<br>', '', content)
    content = sub('</?li>', '', content)
    content = sub('^#', '', content)
    # Wrap every image in its own paragraph before the Markdown conversion.
    content = sub('(<img.*?>)', r'<p>\1</p>', content)
    content = sub('loading="lazy"', '', content)
    texts = tomd.Tomd(content).markdown

    title = title.replace(' ', '').replace('\n', '')
    # Build the file name from the title with everything dropped except CJK
    # ideographs in the 一-龥 range, ASCII letters, digits and '^'. The
    # original computed this substitution but threw the result away.
    file_stem = compile('[^一-龥^a-z^A-Z^0-9]').sub('', title) or 'untitled'

    with open(file_stem + '.md', mode='w', encoding='utf-8') as f:
        # An ATX heading needs a space after '#' and must end its own line.
        f.write('# ' + title + '\n')
        f.write(texts)
    print('获取文章完成')

    # Passing a list lets subprocess quote the path, so directories whose
    # names contain spaces still open correctly.
    subprocess.Popen(['explorer', getcwd()])
if __name__ == '__main__':
    # Interactive driver: read a URL from the clipboard, pick the selectors
    # for the site it belongs to, save the article, wait for the user, then
    # read the clipboard again. The loop never terminates on its own.
    from pyperclip import paste

    # (URL fragment, title selector, body selector). Entries are tried in
    # order and the first fragment contained in the clipboard text wins.
    site_rules = (
        ('csdn.net/', '.title-article', 'article'),
        ('bbsmax.com/', '.title', '.post-content'),
        ('cnblogs.com/', '#cb_post_title_url > span', '.postBody'),
        ('zhuanlan.zhihu.com/',
         '#root > div > main > div > article > header > h1',
         '#root > div > main > div > article > div.Post-RichTextContainer'),
        # NOTE(review): the body entry below is an XPath expression rather
        # than a CSS selector -- confirm spider_csdn accepts that form.
        ('weixin.qq.com/', '.rich_media_title', '/html/body/div[1]/div/div[1]/div[2]'),
        ('juejin.cn/', '.article-title', '.article-content'),
    )

    url = paste()
    while True:
        for fragment, title_selector, body_selector in site_rules:
            if fragment in url:
                spider_csdn(url, title_selector, body_selector)
                break
        else:
            # No known site matched: show the user what the clipboard held.
            print('你的剪切板不是文章的url,目前支持csdn,bbsmax,博客园,:下面是你的剪切板的内容\n%s' % url)
        input('按任意键继续 ')
        url = paste()