Python 采集网页内容并通过 pdfkit 打印输出为 PDF 文档
python采集网页内容打印pdf输出文档
confg = pdfkit.configuration(wkhtmltopdf=r'C:\Users\Administrator\AppData\Local\Programs\Python\Python37\wkhtmltox\bin\wkhtmltopdf.exe')
请将上面的路径替换成你本机 wkhtmltopdf.exe 的实际安装路径。
具体配置看我上一篇帖子:https://www.52pojie.cn/thread-1002135-1-1.html
[Asm] 纯文本查看 复制代码 #追梦人物博客采集打印pdf
# -*- coding: UTF-8 -*-
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import re
import pdfkit
# pdfkit configuration pointing at the local wkhtmltopdf executable.
# The path below is machine-specific: replace it with your own install location.
confg = pdfkit.configuration(wkhtmltopdf=r'C:\Users\Administrator\AppData\Local\Programs\Python\Python37\wkhtmltox\bin\wkhtmltopdf.exe')
class ZmrwSpider(object):
    """Collect the articles linked from a zmrenwu.com index page into one PDF.

    Intended call order: ``get_listurl()`` -> ``get_content()`` -> ``dypdf()``.
    """

    # Site root that the hrefs found in the table of contents are appended to.
    BASE_URL = "https://www.zmrenwu.com"
    # This page is matched with a different <article> class than the others.
    INDEX_URL = "https://www.zmrenwu.com/courses/hellodjango-blog-tutorial/"
    # Seconds to wait per HTTP request; requests never times out by default.
    TIMEOUT = 10

    def __init__(self, url):
        """Store the index URL and build the request headers.

        Args:
            url: Page whose ``<div class="toc">`` lists the article links.
        """
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = ""
        self.url = url
        self.list_urls = []

    def get_listurl(self):
        """Fetch ``self.url`` and append the article URLs to ``self.list_urls``.

        Links containing ``#`` are skipped. If the response status is not 200
        or the page has no ``div.toc``, the list is left unchanged.

        Returns:
            ``self.list_urls`` (the same list object that is stored on self).
        """
        response = requests.get(self.url, headers=self.headers, timeout=self.TIMEOUT)
        if response.status_code != 200:
            return self.list_urls

        soup = BeautifulSoup(response.text, 'lxml')
        toc = soup.find('div', class_="toc")
        if toc is None:
            # No table of contents on the page: nothing to collect.
            return self.list_urls

        # href=True skips anchors without an href instead of raising KeyError.
        for anchor in toc.find_all('a', href=True):
            list_url = f"{self.BASE_URL}{anchor['href']}"
            if "#" not in list_url:
                self.list_urls.append(list_url)
        return self.list_urls

    @staticmethod
    def _extract_article(html, is_index=False):
        """Return the inner HTML of the first matching <article>, or None.

        Args:
            html: Page source to search.
            is_index: When true, match ``<article class="material">``;
                otherwise match ``<article class="">``.
        """
        if is_index:
            pattern = r'<article class="material">(.+?)</article>'
        else:
            pattern = r'<article class="">(.+?)</article>'
        match = re.search(pattern, html, re.S)
        return match.group(1) if match else None

    def get_content(self):
        """Download every URL in ``self.list_urls`` and append its article HTML.

        Pages that do not answer with status 200, or that contain no matching
        ``<article>`` element, are skipped rather than aborting the crawl.

        Returns:
            The accumulated HTML, also stored in ``self.data``.
        """
        # Start from the existing data so repeated calls keep appending.
        parts = [self.data]
        for list_url in self.list_urls:
            response = requests.get(list_url, headers=self.headers, timeout=self.TIMEOUT)
            if response.status_code != 200:
                continue
            article = self._extract_article(
                response.text, is_index=(list_url == self.INDEX_URL)
            )
            if article is None:
                print(f"Skipping {list_url}: no <article> element found")
                continue
            parts.append(article)
        # Join once instead of rebuilding the string on every iteration.
        self.data = "".join(parts)
        return self.data

    def _build_html(self):
        """Wrap ``self.data`` in a minimal UTF-8 HTML document."""
        return f'<html><head><meta charset="UTF-8"></head><body>{self.data}</body></html>'

    def dypdf(self, output_path=r'out.pdf'):
        """Render the collected HTML to a PDF file with pdfkit.

        Uses the module-level ``confg`` pdfkit configuration.

        Args:
            output_path: Destination file; defaults to ``out.pdf``.
        """
        datas = self._build_html()
        print("开始打印内容!")
        pdfkit.from_string(datas, output_path, configuration=confg)
        print("打印保存成功!")
if __name__ == '__main__':
    # Build the spider for the tutorial index, collect the article links,
    # download the articles, then print everything to a PDF.
    spider = ZmrwSpider("https://www.zmrenwu.com/courses/hellodjango-blog-tutorial/")
    spider.get_listurl()
    spider.get_content()
    spider.dypdf()
附pdf文档:
HelloDjango - Django博客教程(第二版)-追梦人物的博客
HelloDjango - Django博客教程(第二版)-追梦人物的博客.zip
(767.17 KB, 下载次数: 105)
|