本帖最后由 冰雨的风暴 于 2020-4-22 20:53 编辑
使用 cookie 模拟登录,需要 VIP 功能(新用户默认赠送 7 天)
样式部分有些走样,转换后带书签功能
import pdfkit
import requests
import json
import os
from PyPDF2 import PdfFileMerger
from io import BytesIO
from PIL import Image
# pdfkit configuration pointing at the wkhtmltopdf executable; it is passed to
# pdfkit.from_string() in convertToPdf().
# NOTE(review): the literal is a raw string that *also* doubles its
# backslashes, so the resulting path contains "\\" pairs -- confirm intended.
confg = pdfkit.configuration(
wkhtmltopdf=r'C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe')
# HTTP headers sent with every pubcloud.ptpress.cn API request in this file.
# NOTE(review): the Cookie value is a hard-coded session; presumably it has to
# be replaced with a cookie from a logged-in account before running -- confirm.
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
'Cookie': 'acw_tc=2760820815872829446754492e641aa66e53c8b10f66ee4835639aeea617c0; SESSION=17c1f230-fcac-4c02-84d9-6b59f7f631f1'
}
# HTML page skeleton handed to wkhtmltopdf; convertToPdf() replaces the
# literal "{{content}}" placeholder with the generated book markup.
template = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><link rel=stylesheet type=text/css ><link rel=stylesheet type=text/css ><style>#onlineEbookReader .rds-box,#onlineEbookReader .rds-box .content_box{width:auto !important;}#onlineEbookReader .rds-box{margin-top:0;}#onlineEbookReader{height:100%;}</style></head><body style=" overflow: hidden; padding-right: 17px !important;"><div id="app"style=""><div id="onlineEbookReader"style="overflow-y: auto;"><div id="onlineEbookReaderFalls"class="rds-box"><div class="content_box"style="background: rgb(246, 248, 250); font-size: 16px; line-height: 28px;">{{content}}</div></div></div></div></body></html>'
# Base output directory: whole-book PDFs are written directly here and
# per-chapter folders (named after the book) are created underneath it.
path = 'E:\\scripts\\python\\'
def getById(projectId):
    """Look up the title of an online book project.

    Calls the ``onlineProject/getById`` endpoint with the module-level
    ``headers``.

    Args:
        projectId: Project identifier, sent as the ``id`` query parameter.

    Returns:
        The ``ubookName`` field of the response payload, or ``''`` when the
        response is not flagged as successful or carries no title. The result
        is always a string so callers can safely concatenate it.
    """
    url = 'https://pubcloud.ptpress.cn/pubcloud/content/onlineProject/getById'
    # ``params`` URL-encodes the id; the timeout keeps a stalled connection
    # from hanging the whole script.
    response = requests.get(
        url, params={'id': projectId}, headers=headers, timeout=30)
    response.encoding = response.apparent_encoding
    obj = json.loads(response.text)
    if not obj.get('success'):
        return ''
    # ``data`` may be null/missing even on a "successful" reply.
    data = obj.get('data') or {}
    return data.get('ubookName') or ''
def ebookFolderTree(projectId, chapter=True):
    """Download a book's folder tree and convert it to PDF.

    The book title is resolved with ``getById`` and the top-level folders are
    fetched from the ``front/ebookFolderTree`` endpoint. Each folder's HTML is
    produced by ``getContentsByFolders`` and rendered with ``convertToPdf``.

    Args:
        projectId: Project identifier of the book.
        chapter: When truthy, write one PDF per top-level folder into a
            sub-folder of ``path`` named after the book. When falsy, write a
            single ``<book name>.pdf`` directly into ``path``.

    Returns:
        None. Progress is reported with ``print``.
    """
    bookName = getById(projectId)
    if not bookName:
        # Without a title there is no usable folder/file name; bail out
        # instead of crashing on concatenation or writing a file named ".pdf".
        print('未获取到书名,跳过: ' + str(projectId))
        return
    print('开始----- '+ bookName +' -----')
    savepath = os.path.join(path, bookName)
    if chapter:
        # makedirs also creates missing parents and tolerates an existing dir.
        os.makedirs(savepath, exist_ok=True)
    url = 'https://pubcloud.ptpress.cn/pubcloud/content/front/ebookFolderTree'
    response = requests.get(
        url, params={'projectId': projectId}, headers=headers, timeout=30)
    response.encoding = response.apparent_encoding
    obj = json.loads(response.text)
    if obj.get('success'):
        parts = []
        for item in obj.get('data') or []:
            name = item.get('name')
            print(name)
            html = getContentsByFolders(item)
            if chapter:
                # One PDF per top-level folder, prefixed with its index.
                pdfname = '{} {}.pdf'.format(item.get('index'), name)
                convertToPdf(html, savepath, pdfname)
            else:
                parts.append(html)
        if not chapter:
            convertToPdf(''.join(parts), path, bookName+'.pdf')
    print('结束----- '+ bookName +' -----')
    # NOTE: per-chapter PDFs are not merged afterwards; call with
    # chapter=False to obtain a single PDF for the whole book.
def convertToPdf(result, path, pdfname):
    """Render an HTML fragment to a PDF file with pdfkit.

    The fragment is substituted for the ``{{content}}`` placeholder of the
    module-level ``template`` and converted using the module-level ``confg``.

    Args:
        result: HTML fragment to embed; nothing is written when it is falsy.
        path: Directory the PDF is written to.
        pdfname: File name of the PDF inside ``path``.
    """
    if not result:
        return
    page = template.replace('{{content}}', result)
    target = os.path.join(path, pdfname)
    pdfkit.from_string(page, target, configuration=confg)
def getContentsByFolders(folder):
    """Collect the HTML of a folder and, recursively, of all its children.

    Args:
        folder: Folder node (mapping with ``id``, ``projectId``, ``name`` and
            optionally ``children``); a falsy value yields ``''``.

    Returns:
        The folder's own HTML followed by the HTML of each child, in order.
    """
    if not folder:
        return ''
    node_id = folder.get('id')
    owner_id = folder.get('projectId')
    subfolders = folder.get('children')
    print(folder.get('name'))
    pieces = [getContentsByFolderId(node_id, owner_id, folder)]
    # A missing or empty ``children`` entry simply contributes nothing.
    for sub in subfolders or ():
        pieces.append(getContentsByFolders(sub))
    return ''.join(pieces)
def getContentsByFolderId(folderId, projectId, folder):
    """Fetch the content items of one folder and render them as HTML.

    Calls the ``front/getContentsByFolderId`` endpoint with the module-level
    ``headers``. Items of type ``singleImg`` become an ``<img>`` tag (capped
    at 800px wide when the remote image is wider); every other item type is
    emitted as its ``content`` wrapped in a text ``<div>``.

    Args:
        folderId: Identifier of the folder to fetch.
        projectId: Identifier of the project the folder belongs to.
        folder: The folder node itself; currently unused, kept for callers.

    Returns:
        The concatenated HTML of all items, or ``''`` when the response is
        not flagged as successful or has no contents.
    """
    url = 'https://pubcloud.ptpress.cn/pubcloud/content/front/getContentsByFolderId'
    response = requests.get(
        url,
        params={'folderId': folderId, 'projectId': projectId},
        headers=headers,
        timeout=30,
    )
    response.encoding = response.apparent_encoding
    obj = json.loads(response.text)
    if not obj.get('success'):
        return ''
    # Either level may be null/missing in the payload.
    contents = (obj.get('data') or {}).get('contents') or []
    parts = []
    for item in contents:
        if item.get('type') == 'singleImg':
            show_urls = item.get('showUrls') or []
            imgurl = show_urls[0].get('showUrl') if show_urls else None
            if not imgurl:
                # Image item without a URL: nothing to embed.
                continue
            try:
                size = getRemoteImageSize(imgurl)
            except (requests.RequestException, OSError):
                # The size is only a layout hint; a failed probe must not
                # abort the whole book, so fall back to the natural width.
                print('获取图片尺寸失败,使用原始宽度: ' + imgurl)
                size = None
            style = 'style="width:800px"' if size and size[0] > 800 else ''
            parts.append(
                '<div class="text"><img ' + style + ' src="' + imgurl + '" /></div>')
        else:
            # 'editor' and any unknown type are rendered the same way.
            parts.append(
                '<div class="text"><div>' + (item.get('content') or '') + '</div></div>')
    return ''.join(parts)
def getRemoteImageSize(path):
    """Download an image and return its pixel dimensions.

    Args:
        path: URL of the image; fetched without the module-level ``headers``.

    Returns:
        The ``(width, height)`` tuple reported by Pillow.

    Raises:
        requests.RequestException: If the download fails or times out.
        OSError: If the response body cannot be opened as an image.
    """
    # The timeout keeps one unreachable image from stalling the whole export.
    response = requests.get(path, timeout=30)
    # Context manager releases the image object once the size has been read.
    with Image.open(BytesIO(response.content)) as image:
        return image.size
if __name__ == '__main__':
    print('***** 开始执行 *****')
    # Project ids of the books to export; each one is written as a single
    # whole-book PDF (chapter=False) rather than one PDF per chapter.
    project_ids = [
        '350f6b8a-c65d-4186-9ea4-53ab6ae1a604',
        'ba0acab9-48db-4ac8-8b25-9833d1008507',
        '51b9fb96-6012-4acc-acfd-8d119fc03988',
    ]
    for project_id in project_ids:
        ebookFolderTree(project_id, chapter=False)
    print('***** 执行结束 *****')
|