【待打补丁】文泉学堂的高清png下载（修正版，交互、可下整本书）

mikeee 发表于 2020-2-3 15:13

本帖最后由 mikeee 于 2020-2-4 15:55 编辑

'''
python 3.6, 3.7

pip install httpx loguru pyjwt tqdm

# 文泉学堂
# 3208943 ('Python+TensorFlow机器学习实战', '248') 第1页
python fetch_png.py

# 第10页（注意：只有旧的单页版本读取命令行参数；修正版为交互式，会忽略这些参数）
python fetch_png.py 3208943 10

'''

# from typing import Union, Tuple
from pathlib import Path
from time import time
import json
import httpx
import jwt
from tqdm import trange
from loguru import logger

# HS256 secret that gen_jwt_token passes to jwt.encode when signing the
# per-page image token.
JWT_SECRET = 'g0NnWdSE8qEjdMD8a1aq12qEYphwErKctvfd3IktWHWiOBpVsgkecur38aBRPn2w'
# Shared HTTP client; gen_jwt_key, bookinfo and fetch_png all issue their
# requests through it.
SESS = httpx.Client()
# Base URL from which every endpoint in this file is built.
URL = 'https://lib-nuanxin.wqxuetang.com'
# Import-time side effect: requests the landing page once through the shared
# client. NOTE(review): presumably to pick up session cookies -- confirm.
SESS.get(URL)

# def gen_jwt_key(self):
def gen_jwt_key(bookid):
    ''' Fetch the per-book key that gen_jwt_token embeds in the token.

    Requests ``{URL}/v1/read/k?bid=<bookid>`` through the shared client
    and returns the ``data`` member of the JSON reply.

    Raises:
        Exception: when the request fails (the original error is logged
            and re-raised), or when the reply has no ``data`` member.
    '''
    url = f'{URL}/v1/read/k?bid={bookid}'

    try:
        resp = SESS.get(url)
        resp.raise_for_status()
    except Exception as exc:
        logger.warning(exc)
        # Returning str(exc) here (as before) made the error text end up
        # inside the signed token as if it were a valid key; fail loudly
        # so callers can retry instead.
        raise

    try:
        jdata = resp.json()
    except Exception as exc:
        logger.warning(exc)
        jdata = {}

    # A non-dict JSON body (list, string, ...) has no ``data`` member.
    res = jdata.get('data') if isinstance(jdata, dict) else None
    if res is None:
        raise Exception('returned None, something is not right...')

    return res

# def gen_jwt_token(self, page):
def gen_jwt_token(bookid, page=1):
    ''' Build the signed token sent as ``k=`` when requesting a page image.

    Args:
        bookid: book number; stored in the payload as a string.
        page: page number the token is issued for.

    Returns:
        The HS256-signed token as ``str``.

    Raises:
        Exception: propagated from gen_jwt_key.
    '''
    cur_time = time()
    jwtkey = gen_jwt_key(bookid)
    payload = {
        "p": page,
        "t": int(cur_time) * 1000,  # whole seconds expressed in ms
        "b": str(bookid),
        "w": 1000,
        "k": json.dumps(jwtkey),
        "iat": int(cur_time),
    }
    jwttoken = jwt.encode(payload, JWT_SECRET, algorithm='HS256')

    # PyJWT 1.x returns bytes, PyJWT >= 2.0 returns str; an unconditional
    # .decode() raises AttributeError on the newer versions.
    if isinstance(jwttoken, bytes):
        jwttoken = jwttoken.decode('ascii')
    return jwttoken

# def bookinfo(self):
def bookinfo(bookid):
    ''' Look up a book and return ``(name, canreadpages)``.

    Requests ``{URL}/v1/read/initread?bid=<bookid>`` through the shared
    client. On success the full JSON reply is also kept on
    ``bookinfo.jdata`` for callers that want to inspect it.

    Args:
        bookid: book number; anything ``int()`` accepts, must be >= 1.

    Returns:
        Tuple of the ``name`` and ``canreadpages`` members of the reply's
        ``data`` object (either may be None if the member is absent).

    Raises:
        Exception: when bookid is not a positive integer, or when the
            request fails / the reply carries no ``data`` member.
    '''
    try:
        bookid = int(bookid)
    except Exception as exc:
        # The old message claimed the id was reset to 1, but the error is
        # re-raised; say what actually happens.
        logger.warning(f'error: {exc}, invalid bookid')
        raise
    if bookid < 1:
        raise Exception(' bookid must be bigger than zero')

    # Built from the validated integer so stray whitespace in the caller's
    # value cannot leak into the query string.
    url = f'{URL}/v1/read/initread?bid={bookid}'

    # Any failure (network, HTTP status, invalid JSON) is logged and turned
    # into an empty reply, which is reported by the check below. This
    # replaces a placeholder Response built via httpx.models, a module that
    # is not part of httpx's public API and is absent in newer releases.
    try:
        resp = SESS.get(url)
        resp.raise_for_status()
        jdata = resp.json()
    except Exception as exc:
        logger.warning(exc)
        jdata = {}

    data = jdata.get('data') if isinstance(jdata, dict) else None

    if data is None:
        raise Exception(
            'returned None, something is not right...可能无此书号，也有可能是网络有问题或IP被限制……'# noqa
        )

    bookinfo.jdata = jdata

    return data.get('name'), data.get('canreadpages')

# async def download_img(self, page, task_id):
def fetch_png(bookid, page=1):
    ''' Download one page image of a book and return its raw bytes.

    The request is attempted up to 4 times. The last successful response
    is kept on ``fetch_png.resp`` for inspection.

    Args:
        bookid: book number.
        page: page number to download.

    Returns:
        The response body as ``bytes``.

    Raises:
        Exception: when all 4 attempts fail, or propagated from
            gen_jwt_token.
    '''
    token = gen_jwt_token(bookid, page)
    url = f'{URL}/page/img/{bookid}/{page}?k={token}'
    headers = {
        'referer': f'{URL}/read/pdf/{bookid}',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',# noqa
    }

    # The previous ``while 1: ... else:`` never reached its else-branch
    # (the loop only ever ended via ``break``), so after the last failure
    # the error text of a placeholder response was returned as if it were
    # PNG data. ``for ... else`` runs the else-branch exactly when no
    # attempt succeeded.
    last_exc = None
    for _ in range(4):
        try:
            resp = SESS.get(url, headers=headers)
            resp.raise_for_status()
            break
        except Exception as exc:
            logger.warning(exc)
            last_exc = exc  # ``exc`` is unbound once the except block ends
    else:
        logger.warning(' We tried hard (4 times), giving up')
        raise Exception('Failed... ') from last_exc

    fetch_png.resp = resp

    return resp.content

def fetch_book(bookid, page1=1, page2=None):
    ''' Download pages page1..page2 (inclusive) of a book as PNG files.

    Each page is written to ``<bookid>-<page:03d>.png`` in the current
    directory. Files that already exist are left alone, so an interrupted
    run can simply be repeated.

    Args:
        bookid: book number.
        page1: first page to download.
        page2: last page to download; defaults to the page count reported
            by bookinfo.

    Raises:
        Exception: when the book information cannot be obtained or its
            page count is not an integer.
    '''
    try:
        # bookinfo returns (name, canreadpages); calling int() on the whole
        # tuple, as before, always raised TypeError.
        _, pages = bookinfo(bookid)
        last_page = int(pages)
    except Exception as exc:
        logger.error(exc)
        raise

    if page2 is None:
        page2 = last_page

    for elm in trange(page1, page2 + 1):
        filename = f'{bookid}-{elm:03d}.png'
        if Path(filename).exists():
            logger.info(f'{filename} already exists, sipping....')
            continue

        logger.info(f' Fetching {filename}...')

        # Up to 3 tries per page. ``for ... else`` reports the page only
        # when every try failed; the old ``while 1 ... else`` could never
        # reach its else-branch, and its message was not an f-string.
        for _ in range(3):
            try:
                png = fetch_png(bookid, elm)
                Path(filename).write_bytes(png)
                break
            except Exception as exc:
                logger.warning(exc)
        else:
            logger.warning(f' Page {elm} probably missing')

def main():# pylint: disable=too-many-branches
    ''' Interactive loop: prompt for a bookid, confirm, download the book.

    Keeps asking for book numbers until the input starts with ``q`` or
    ``x``. For each valid number the book information is looked up (with
    up to 4 attempts, asking before each retry), shown to the user, and
    the whole book is downloaded once the user confirms.
    '''
    import os# pylint: disable=unused-import
    import sys# pylint: disable=unused-import

    affirmative = ('是', 'y', 'qui', '对', '好')

    def _confirmed(reply):
        ''' True when reply starts, ignoring case, with an affirmative prefix '''
        return reply.lower().startswith(affirmative)

    logger.info(
        r'''

   访问 https://lib-nuanxin.wqxuetang.com/
   搜索点击感兴趣的书，例如《计算机网络基础》
   https://lib-nuanxin.wqxuetang.com/#/Book/2175744

   网址尾部的数字2175744 即为书号 bookid。将bookid拷至
   系统剪贴板或记住该数字。
'''
    )

    while True:
        bookid_str = input('输入书号bookid：（例如 123, 退出输入 q 或 x） ')

        if bookid_str.lower().startswith(('q', 'x')):
            break

        if not bookid_str.strip():
            continue

        # Non-numeric and non-positive input get the same warning.
        try:
            bookid = int(bookid_str)
        except Exception:
            bookid = None
        if bookid is None or bookid < 1:
            logger.warning(f'\n\t 无效书号：{bookid_str}, 重新输入')
            continue

        # Look the book up; after each failure ask whether to try again.
        failures = 0
        found = False
        while not found:
            try:
                logger.info('\n\t\t diggin...')
                info = bookinfo(bookid)
            except Exception as exc:
                logger.error(exc)
            else:
                found = True
                continue

            failures += 1
            if failures > 3:
                logger.info('\n\t 事不过三，还是算了吧')
                break

            answer = input('再试一次？确认输入 y，输入其他重输书号 ')
            # An empty answer or an affirmative one retries; anything else
            # goes back to the bookid prompt.
            if answer.strip() and not _confirmed(answer):
                break

        if not found:
            continue

        logger.info(bookinfo.jdata)
        logger.info(f'\n\t\t 下载 {info}? ')

        if not _confirmed(input('确认输入 y，输入其他重输书号 ')):
            continue

        logger.info(
            f'''
         开始下载 {info}...
         一般情况下（例如网络正常、服务器不太忙，IP未被限制）
         平均每100页约需15-20分钟)。

         终断（ctrl-C或ctrl-brea）程序后，已下载的页不会被覆盖。
         因此，如发现下载的页有问题或有些页未成功下载，可以删掉
         再用相同的 bookid 运行一次程序 python fetch_png.py

         '''
        )
        fetch_book(bookid)

'''
bookid = 3208943
if not sys.argv:
   logger.info(' Provide at least a bookid.')
   logger.info(' Using %s to test ' % bookid)
else:
   try:
         bookid = sys.argv
   except Exception as exc:
         logger.warning(exc)

page = 1
if not sys.argv:
   logger.info(' Provide a page number.')
   logger.info(' Using %s to test ' % page)
else:
   try:
         bookid = sys.argv
   except Exception as exc:
         logger.warning(exc)
logger.info(f' Fetchiing {bookid} {bookinfo(bookid)} page: {page}')

res = fetch_png(bookid, page)

filename = f'{bookid}-{page:03d}.png'

count = 0
while Path(filename).exists():
   count += 1
   filename = f'{bookid}-{page:03d}-{count}.png'
   if count > 4:
         break
else:
   logger.warning(f' Possibly overwriting {filename}')

Path(filename).write_bytes(res)
logger.info(f'{filename} saved.')

if sys.platform in ['win32']:
   os.startfile(f'{filename}')# type: ignore
# '''# pylint: disable=pointless-string-statement

# Run the interactive downloader only when executed as a script, not when
# the module is imported. The call must be indented under the ``if``; left
# at column 0 it is an IndentationError.
if __name__ == '__main__':
    main()

Happy downloading, reading and learning！
搜到网上高人分享的资源，做了点简化。希望对网友有点用。（友好提示：可以免费评分。）

修正版更新：交互输入 bookid。未引入人为延迟，一般情况下每100页需时15-20分钟。可能有些页显示“下载中”（由于未人为引入延迟，估计是服务器有时忙不过来，或是服务器有限流机制），可删去这些有问题的页，再用相同的 bookid 运行一次，程序会自动跳过已存在的页。

mikeee 发表于 2020-2-3 17:11

本帖最后由 mikeee 于 2020-2-3 17:14 编辑

zhucebuyi 发表于 2020-2-3 16:56
可否把循环页码加上 -0- 代码只能下载一页 -0-
单纯加个循环并不能真正解决问题，因为牵涉到下载不成功（下得多可能会比较经常出现）或下的是"正在载入请稍等"图片时需重试、重重试等等。

我要想想要不要再花时间整得使用友好一些…… 有一百个坛友评分的话我应该就有了足够的理由整一个只需给个bookid就可以拿到整本书的pdf的程序 :)

mikeee 发表于 2020-2-3 16:23

本帖最后由 mikeee 于 2020-2-3 16:28 编辑

wtujcf123 发表于 2020-2-3 16:11
你好，能告诉我怎么用吗
先找到要下的书号 bookid，比如 https://lib-nuanxin.wqxuetang.com/read/pdf/3208943 里的 3208943就是bookid。调用 fetch_png(3208943, 25) 返回第 25 页的 png 的二元数据，可以写入(用open 或 pathlib.Path)例如 25.png. 下完所有的页后可以转换合成pdf。

命令行调用参看第008、011行

python程序里调用参看 168行、181行。

是给会一点python的坛友用的。

miqi1314 发表于 2020-2-3 15:15

感谢楼主的无私分享！

oktyx 发表于 2020-2-3 15:31

这个怎么用呀？菜鸡求问~

ampie1994 发表于 2020-2-3 15:32

一年又一年 发表于 2020-2-3 16:11

python的吗？

wtujcf123 发表于 2020-2-3 16:11

你好，能告诉我怎么用吗

袁煜914 发表于 2020-2-3 16:23

枫叶荻花 发表于 2020-2-3 16:30

论坛小白还是很多的，最好还是打包成软件分享出来，提高帖子阅读权限

lntuer 发表于 2020-2-3 16:36

os.startfile(f'{filename}')# type: ignore
OSError: 找不到应用程序: '3209078-001.png'

页: [1] 2 3 4

吾爱破解 - 52pojie.cn's Archiver

【待打补丁】文泉学堂的高清png下载（修正版，交互、可下整本书）