# Python script (copied from a web code listing; original header read "view as plain text / copy code").
import json
import os
import random
import re
import requests
import time
from docx import Document
from docx.shared import Inches
# Word document that the script later fills with paragraphs, pictures and page
# breaks and finally saves to disk.
doc = Document()
def sort_by_bottom_left(item):
    """Return the sort key that puts page fragments into reading order.

    Fragments are ordered top-to-bottom (a larger ``bottom_value`` sorts
    first, hence the negation) and then left-to-right (a smaller
    ``left_value`` sorts first).

    Args:
        item: Mapping with ``'bottom_value'`` and ``'left_value'`` entries,
            each either a number or a numeric string.

    Returns:
        A ``(-bottom, left)`` tuple suitable as a ``sorted``/``list.sort`` key.

    Raises:
        ValueError: If a coordinate string is not numeric at all.
    """
    def _to_number(value):
        # Integral coordinates stay ints; fractional strings such as "12.5"
        # are rejected by int(), so fall back to float for those.
        try:
            return int(value)
        except ValueError:
            return float(value)

    return -_to_number(item['bottom_value']), _to_number(item['left_value'])
# Module-level accumulators (three independent lists).
book, book_base, snippet_base = [], [], []

# Book selection. Alternative ID kept for reference:
#   "Hospital Medical Quality Standardization: Construction and Management" (convert again)
#   epubID = '1901295227'
# Currently selected:
#   "Modern Hospital Informatization: Construction Strategy and Practice" (e-book)
epubID = '1901218068'
token = 'pc_1876eb17825a8a3ee0de0563c51d00645dd7230f5c4a089663b89a**********'

# Ask the media API for the book's page index.
url = "https://e.dangdang.com/media/api.go?action=getPcMediaInfo&epubID=" + epubID + "&token=" + token + "&wordSize=2&style=2"
response = requests.request("GET", url, timeout=5)
payload = response.text
print(payload)
data = json.loads(payload)['data']
mediaPageInfo = data['mediaPageInfo']

# Keys look like "pagenum<N>"; drop the prefix, then order the pages by the
# remaining integer so they are visited in numeric order.
stripped_pages = {
    name.replace('pagenum', ''): info
    for name, info in mediaPageInfo.items()
}
mediaPageInfo_sort = dict(
    sorted(stripped_pages.items(), key=lambda pair: int(pair[0]))
)
# Patterns used for every page; compiled once instead of per fragment.
# Position of the <div> that wraps an image: captures left and top in px.
DIV_POSITION_RE = re.compile(
    r'left:\s*(\d+)px;\s*top:\s*(\d+)px;\s*width:\s*\d+px;\s*height:\s*\d+px;')
# <img> tag: captures the source URL plus width and height in px.
IMG_TAG_RE = re.compile(
    r'src="([^"]+)".*?width:\s*(\d+)px;\s*height:\s*(\d+)px;')
# Positioned text <span>: captures class, left, bottom and the text itself.
SPAN_RE = re.compile(
    r'<span\s+class="([^"]*)"\s+style="left:([^"]*)px;\s+bottom:([^"]*)px;\s*">([^<]*)<\/span>')

text = ''
# Fetch every page listed in mediaPageInfo_sort, parse its HTML snippet into
# positioned text/image fragments, and append one record per page to book_base.
for k, v in mediaPageInfo_sort.items():
    book_txt_base = []
    book_img_base = []
    url = "https://e.dangdang.com/media/api.go?action=getPcChapterInfo&epubID=" + epubID + "&token=" + token + \
          "&chapterID=" + str(v['chapterID']) + "&pageIndex=" + str(v['pageIndex']) + \
          "&locationIndex=" + k + "&wordSize=2&style=2"
    print('chapterID=', v['chapterID'], 'pageIndex=', v['pageIndex'], 'locationIndex=', k)
    # Random pause between page requests.
    time.sleep(random.uniform(0, 2))
    while True:
        try:
            response = requests.request("GET", url, timeout=5)
            break
        except requests.RequestException as e:
            # Retry network failures only; catching BaseException here would
            # also swallow Ctrl-C and make the loop impossible to interrupt.
            print(e)
            time.sleep(random.uniform(0, 2))
    data = json.loads(response.text)
    if data['status']['code'] != 0:
        # The API reported an error for this page: skip it.
        continue
    snippet_data = json.loads(data['data']['chapterInfo'])['snippet']

    # Most recent <div style=...> line; it carries the position of the <img>
    # line that follows it.
    div_style = ''
    for s in snippet_data.split('\n'):
        if s.startswith('<div style') or s.startswith('</div><div style='):
            div_style = s
        elif s.startswith('<img src'):
            if div_style == '':
                print('error')
            position = DIV_POSITION_RE.search(div_style)
            image = IMG_TAG_RE.search(s)
            div_style = ''
            if position is None or image is None:
                # Without both matches there are no coordinates/size for this
                # image; skip it rather than reuse values left over from an
                # earlier image (or fail on names that were never assigned).
                print('skip image with unparsed position or size:', s)
                continue
            left_value, top_value = position.groups()
            image_url, width, height = image.groups()
            # Images are positioned by "top"; convert to a bottom offset
            # (880 - top) so they sort together with the text spans.
            book_img_base.append(
                {'image_url': image_url, 'left_value': left_value, 'bottom_value': (880 - int(top_value)),
                 'top_value': top_value, 'width': width, 'height': height, 'text_content': image_url + '\n'})
        elif s.startswith('<span class'):
            span = SPAN_RE.match(s)
            if span:
                class_name, left_value, bottom_value, text_content = span.groups()
                book_txt_base.append({'class_name': class_name, 'left_value': left_value,
                                      'bottom_value': bottom_value, 'text_content': text_content})

    book_txt_base += book_img_base
    book_txt_base = sorted(book_txt_base, key=sort_by_bottom_left)

    # Rebuild the page text: fragments sharing a bottom_value are joined on one
    # line, a change of bottom_value starts a new line. Image fragments carry
    # an int bottom_value while text fragments carry a str, so an image never
    # compares equal to a neighbouring text fragment and always starts a line.
    parts = ['\n********************']
    bottom_value = '0'
    for s in book_txt_base:
        if bottom_value != s['bottom_value']:
            bottom_value = s['bottom_value']
            parts.append('\n' + s['text_content'])
        else:
            parts.append(s['text_content'])
    text = ''.join(parts)
    print(text)
    book_base.append({'locationIndex': k, 'book_txt_base': book_txt_base, 'text': text})
# Persist the scraped pages as JSON, then rebuild the book as a Word document.
book_base_path = r'C:\Users\Lenovo\Desktop\\' + epubID + 'book_base.txt'
# "w" rather than "a": appending a second JSON document on a re-run would make
# the file unparseable. UTF-8 is explicit because ensure_ascii=False writes
# non-ASCII characters as-is and the platform default codec may differ.
with open(book_base_path, "w", encoding="utf-8") as file:
    file.write(json.dumps(book_base, ensure_ascii=False))

if os.path.exists(book_base_path):
    with open(book_base_path, "r", encoding="utf-8") as file:
        data = json.loads(file.read())

    # Downloaded images are cached here; create the folder up front so that
    # writing an image cannot fail just because the folder is missing.
    img_dir = r'C:\Users\Lenovo\Desktop\img\\'
    os.makedirs(os.path.normpath(img_dir), exist_ok=True)

    for d in data:
        if not d['book_txt_base']:
            # Page without any fragments: nothing to add.
            continue

        # Picture size per image file name, scaled from pixels by 9.16 / 880
        # and passed to Inches() below.
        # NOTE(review): presumably maps an 880 px page to 9.16 inches - confirm.
        img_size = {}
        for b in d['book_txt_base']:
            if 'image_url' in b:
                img_name = b['image_url'].split('/')[-1]
                img_size[img_name] = {'width': (int(b['width']) * 9.16 / 880),
                                      'height': (int(b['height']) * 9.16 / 880)}

        text = d['text'].replace('\n********************\n', '')
        txt = ''
        for t in text.split('\n'):
            img_name = t.split('/')[-1]
            # Only lines that match a known image of this page are pictures; an
            # ordinary text line that merely starts with "http" stays text.
            if t.startswith('http') and img_name in img_size:
                if txt != '':
                    # Flush the text collected so far before the picture.
                    doc.add_paragraph(txt)
                    txt = ''
                img_path = img_dir + img_name
                if not os.path.exists(img_path):
                    while True:
                        try:
                            response = requests.request("GET", t, timeout=5)
                        except requests.RequestException as e:
                            # Retry network failures only, with a short pause,
                            # so Ctrl-C still works and the server is not
                            # hammered in a tight loop.
                            print(e)
                            time.sleep(random.uniform(0, 2))
                            continue
                        with open(img_path, 'wb') as f:
                            f.write(response.content)
                        break
                doc.add_picture(img_path, width=Inches(img_size[img_name]['width']),
                                height=Inches(img_size[img_name]['height']))
            else:
                txt += t + '\n'
        if txt != '':
            doc.add_paragraph(txt)
            txt = ''
        # One scraped page per document page.
        doc.add_page_break()

    doc.save(r'C:\Users\Lenovo\Desktop\\' + epubID + 'book.docx')
    print('ok')