Save WeChat (vx) articles as PDF files
Purpose:
Save a handful of scattered official-account articles as PDFs. This is not for scraping every article of a given account (for that, use 夜泉's WeChatDownload, downloadable on the forum).
This little program is nothing special; if you already know how to do this, you won't need it.
How it works:
1. Copy the article links (after a few tries, I found it easiest to send them to QQ, log in to QQ on a PC, and copy them all from there) into a designated txt file. The txt file needs no particular formatting; links are extracted from it automatically (see the sketch after this list).
2. Run the program; the PDFs are saved to the specified directory.
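As a side note, the link extraction is just a regex scan over each line of the txt file, which is why no formatting is needed. A minimal sketch of the idea (the sample text and URLs below are made up):

import re

raw = '''chat noise https://mp.weixin.qq.com/s/AAAA more noise
[链接] https://mp.weixin.qq.com/s/BBBB'''
# Grab every http(s) link and ignore everything else on the line
links = re.findall(r'https?://[^\s]+', raw)
print(links)  # ['https://mp.weixin.qq.com/s/AAAA', 'https://mp.weixin.qq.com/s/BBBB']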
What to modify:
1. The directory containing the txt file, set by the path variable (defaults to the desktop).
2. PDFs are saved in a pdf subdirectory under path; it is created if it does not exist, and you can change this yourself.
3. The options dict: page-size sets the paper size, and zoom sets the scaling (articles with long code blocks sometimes print incompletely, so try zooming out). Tweak these until the printed PDF looks right; see the sketch below.
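If a PDF comes out clipped or oddly scaled, the two knobs worth trying first are page-size and zoom. A minimal sketch of passing them to pdfkit, assuming wkhtmltopdf is on your PATH (otherwise pass configuration= as in the full script below); the HTML string and output file name here are just placeholders:

import pdfkit

html = '<pre>' + 'x' * 300 + '</pre>'  # stand-in for a wide code block
options = {
    'page-size': 'A4',   # paper size
    'zoom': '0.8',       # zoom out so wide content is not cut off
    'encoding': 'UTF-8',
}
pdfkit.from_string(html, 'test.pdf', options=options)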
Thanks to the forum experts whose code this is largely adapted (copied) from; I have too many bookmarks to find the original post.
import os
import re
import sys
import time

import pdfkit
import requests
from bs4 import BeautifulSoup


def txt_get_url(txt_file):
    # Read the raw lines; links are extracted from them later with a regex
    with open(txt_file, 'r') as f:
        return f.readlines()
def url_pdf(url):
    headers = {
        #'content-type': 'application/json',
        #'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        #'sec-fetch-dest': 'document',
        #'sec-fetch-mode': 'navigate',
        #'sec-fetch-site': 'none',
        #'upgrade-insecure-requests': 1,
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    # Replace data-src with src (images are lazy-loaded); drop the inline
    # style that sometimes hides the article body
    html = res.text.replace("data-src", "src").replace('style="visibility: hidden;"', "")
    soup = BeautifulSoup(html, "lxml")
    # Keep only the article body (drops javascript etc.)
    html = soup.select('div#img-content')
    title = soup.find('h2').string.replace('\n', '').replace(' ', '').replace('\u200b', '').replace('\ufeff', '').replace('\ue601', '')
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # characters not allowed in file names
    new_title = re.sub(rstr, '_', title)
    author_name = soup.find('a', id='js_name').string.replace('\n', '').replace(' ', '').replace('\u200b', '').replace('\ufeff', '').replace('\ue601', '')
    author_name = re.sub(rstr, '_', author_name)
    title_name = author_name + '-' + new_title + '.pdf'
    #title_file_name = "".join(re.findall(r'[\u4e00-\u9fa5]+', title_name))
    # The font can be changed here
    font = '''
    <style type="text/css">
    @font-face{font-family: "微软雅黑"; src: url("C:\\Windows\\Fonts\\msyh.ttc")}
    </style>
    <style type="text/css">
    p { font-family: '微软雅黑', cursive; }
    </style>
    '''
    html = font + str(html)
    #headers = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'),]
    # wkhtmltopdf options
    options = {
        'page-size': 'A4',
        # 'margin-top': '0',
        'margin-right': '3mm',
        # 'margin-bottom': '0',
        'margin-left': '3mm',
        # 'disable-smart-shrinking': [''],
        'zoom': '0.9',  # zoom out if long code blocks get cut off
        #'image-dpi': '1200',
        #'dpi': '1200',
        # 'margin-top': '0.75in',
        # 'margin-right': '0.75in',
        # 'margin-bottom': '0.75in',
        # 'margin-left': '0.75in',
        'encoding': "UTF-8",
        # 'custom-header': headers,
        # 'debug-javascript': [''],
        # 'javascript-delay': 10000,
        # 'no-stop-slow-scripts': "",
        # 'load-media-error-handling': 'abort',
    }
    config = pdfkit.configuration(wkhtmltopdf=r'xxxxx\wkhtmltopdf\bin\wkhtmltopdf.exe')
    try:
        pdfkit.from_string(str(html), title_name, configuration=config, options=options)
        print('Complete print: \n', url, ' : ', title_name, '\n=================================================')
    except OSError as err:
        # IOError is an alias of OSError in Python 3, so one handler covers both
        print("OS error: {0}".format(err))
    except ValueError as err:
        print("Value error: {0}".format(err))
    except:
        print("Unexpected error:", sys.exc_info())
        raise
    else:
        print('Sleep 2s')
        time.sleep(2)
if __name__ == "__main__":
    path = r'C:\Users\Administrator\Desktop'
    os.chdir(path)
    txt_file = 'vx.txt'
    urls = txt_get_url(txt_file)
    folder = 'pdf'
    folder_temp = os.path.join(path, folder)
    if not os.path.isdir(folder_temp):
        os.mkdir(folder_temp)
    os.chdir(folder_temp)
    # Pull every http(s) link out of each line of the txt file
    urls_ = [re.findall(r'https?://[^\s]+', x) for x in urls]
    for i in urls_:
        if not i:
            continue
        for url in i:
            print('=================================================\n Printing url: ', url)
            url_pdf(url)
            time.sleep(1)
    print('Complete print!!')