# NOTE(review): forum-paste artifact line ("[Python] plain-text view / copy code")
# converted to a comment so the file is valid Python.
import os
import re
import sys  # needed by url_pdf's last-resort exception handler (sys.exc_info)
import time

import pdfkit
import requests
from bs4 import BeautifulSoup
urls = []
def txt_get_url(txt_file):
f = open(txt_file, 'r')
f_content = f.readlines()
f.close()
return f_content
def url_pdf(url):
headers = {
#'content-type':'application/json',
#'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
#'sec-fetch-dest': 'document',
#'sec-fetch-mode': 'navigate',
#'sec-fetch-site': 'none',
#'upgrade-insecure-requests': 1,
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
}
res = requests.get(url, headers=headers)
# data-src替换为src, 有时候返回的正文被隐藏了,将hidden去掉
html = res.text.replace("data-src", "src").replace('style="visibility: hidden;"',"")
soup = BeautifulSoup(html, "lxml")
# 选择正文(去除javascript等)
html = soup.select('div#img-content')[0]
title = soup.find('h2').string.replace('\n','').replace(' ', '').replace('\u200b', '').replace('\ufeff', '').replace('\ue601','')
rstr = r"[\/\\\:\*\?\"\<\>\|]" ## cannot save file
new_title = re.sub(rstr, '_', title)
author_name = soup.find('a', id='js_name').string.replace('\n','').replace(' ', '').replace('\u200b', '').replace('\ufeff', '').replace('\ue601','')
author_name = re.sub(rstr, '_', author_name)
title_name = author_name + '-' + new_title + '.pdf'
#title_file_name = "".join(re.findall(r'[\u4e00-\u9fa5]+', title_name))
# 可以修改字体
font = '''
<style type="text/css">
@font-face{font-family: "微软雅黑";src:url("‪C:\\Windows\\Fonts\\msyh.ttc")
</style>
<style type = "text/css">
p { font-family: '微软雅黑', cursive; }
</style>
'''
html = font + str(html)
#headers = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'),]
# 选项
options = {
'page-size': 'A4',
# 'margin-top': '0',
'margin-right': '3mm',
# 'margin-bottom': '0',
'margin-left': '3mm',
# 'disable-smart-shrinking': [''],
'zoom': '0.9', # too long code, to zoom
#'image-dpi': '1200',
#'dpi': '1200',
# 'margin-top': '0.75in',
# 'margin-right': '0.75in',
# 'margin-bottom': '0.75in',
# 'margin-left': '0.75in',
'encoding': "UTF-8",
# 'custom-header': headers,
# 'debug-javascript': [''],
# 'javascript-delay': 10000,
# 'no-stop-slow-scripts': "",
# 'load-media-error-handling': 'abort',
}
config = pdfkit.configuration(wkhtmltopdf=r'xxxxx\wkhtmltopdf\bin\wkhtmltopdf.exe')
try:
pdfkit.from_string(str(html), title_name, configuration=config, options=options)
print('Complete print: \n', url, ' : ', title_name, '\n=================================================')
except IOError as err:
print("IO error: {0}".format(err))
except OSError as err:
print("OS error: {0}".format(err))
except ValueError:
print("Could not convert data to an integer.")
except:
print("Unexpected error:", sys.exc_info()[0])
raise
else:
print('Sleep 2s')
time.sleep(2)
if __name__ == "__main__":
path = r'C:\Users\Administrator\Desktop'
os.chdir(path)
txt_file = 'vx.txt'
urls = txt_get_url(txt_file)
folder = 'pdf'
folder_temp = os.path.join(path, folder)
if not os.path.isdir(folder_temp):
os.mkdir(folder_temp)
os.chdir(folder_temp)
urls_ = [re.findall(r'[a-zA-z]+://[^s].*', x) for x in urls]
for i in urls_:
if i == []: continue
#url = i.decode('utf-8').replace('\r', '').replace('\n', '').replace(' ', '')
print('=================================================\n Printing url: ', i[0])
url_pdf(i[0])
time.sleep(1)
print('Complete print!!')