吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 7108|回复: 38
收起左侧

[Python 转载] 【待打补丁】文泉学堂的高清png下载(修正版,交互、可下整本书)

[复制链接]
mikeee 发表于 2020-2-3 15:13
本帖最后由 mikeee 于 2020-2-4 15:55 编辑

[Asm] 纯文本查看 复制代码
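'''
Download the page images (PNG) of a book from 文泉学堂 (wqxuetang.com).

python 3.6, 3.7

pip install httpx loguru pyjwt tqdm

Usage (interactive: the script prompts for a bookid, shows the book
info and, once confirmed, saves each page as <bookid>-<page>.png;
command-line arguments are not read):

    python fetch_png.py

Example bookid: 3208943 ('Python+TensorFlow机器学习实战', '248').

To fetch a single page from Python code call fetch_png(bookid, page),
e.g. fetch_png(3208943, 10) for page 10.

'''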

# from typing import Union, Tuple
from pathlib import Path
from time import time
import json
import httpx
import jwt
from tqdm import trange
from loguru import logger

# Secret used by gen_jwt_token to sign the page tokens (HS256).
JWT_SECRET = 'g0NnWdSE8qEjdMD8a1aq12qEYphwErKctvfd3IktWHWiOBpVsgkecur38aBRPn2w'
# Single HTTP client shared by every request made in this module.
SESS = httpx.Client()
# Base URL of the site; all endpoint URLs are built from it.
URL = 'https://lib-nuanxin.wqxuetang.com'
# Request the landing page once at import time
# (presumably to pick up session cookies -- TODO confirm).
SESS.get(URL)


# def gen_jwt_key(self):
def gen_jwt_key(bookid):
    '''Fetch the per-book key that gets embedded in the page token.

    Args:
        bookid: book identifier, interpolated into the ``bid`` query
            parameter of ``{URL}/v1/read/k``.

    Returns:
        The value stored under ``'data'`` in the JSON reply.

    Raises:
        Exception: if the request fails, or if the reply is not a JSON
            object carrying a ``'data'`` entry.
    '''
    url = f'{URL}/v1/read/k?bid={bookid}'

    try:
        resp = SESS.get(url)
        resp.raise_for_status()
    except Exception as exc:
        logger.warning(exc)
        # Do not hand the error text back as if it were the key: the caller
        # would sign a token around it and request a page with a bogus key.
        raise Exception(f'unable to fetch jwt key: {exc}') from exc

    try:
        jdata = resp.json()
    except Exception as exc:
        logger.warning(exc)
        jdata = {}

    # A non-object JSON reply (list, string, ...) has no 'data' entry either.
    res = jdata.get('data') if isinstance(jdata, dict) else None
    if res is None:
        raise Exception('returned None, something is not right...')

    return res


# def gen_jwt_token(self, page):
def gen_jwt_token(bookid, page=1):
    '''Build the signed token sent as ``k`` when requesting a page image.

    The payload carries the page number, the current time (in
    milliseconds as ``t`` and in seconds as ``iat``), the bookid as a
    string, a fixed ``w`` of 1000 and the JSON-encoded key returned by
    gen_jwt_key. It is signed with JWT_SECRET using HS256.

    Args:
        bookid: book identifier.
        page: page number placed in the payload (default 1).

    Returns:
        The encoded token as a ``str``.

    Raises:
        Exception: whatever gen_jwt_key raises when the key cannot be
            obtained.
    '''
    cur_time = int(time())
    jwtkey = gen_jwt_key(bookid)
    payload = {
        "p": page,
        "t": cur_time * 1000,
        "b": str(bookid),
        "w": 1000,
        "k": json.dumps(jwtkey),
        "iat": cur_time,
    }
    token = jwt.encode(payload, JWT_SECRET, algorithm='HS256')
    # PyJWT 1.x returns bytes while PyJWT 2.x returns str; an unconditional
    # .decode() raises AttributeError on 2.x, so only decode when needed.
    if isinstance(token, bytes):
        token = token.decode('ascii')
    return token


# def bookinfo(self):
def bookinfo(bookid):
    '''Look up the title and the number of readable pages of a book.

    The complete JSON reply of the last successful call is kept in the
    function attribute ``bookinfo.jdata``.

    Args:
        bookid: book identifier; must be convertible to an int >= 1.

    Returns:
        Tuple ``(name, canreadpages)`` read from the reply's ``'data'``
        entry.

    Raises:
        ValueError, TypeError: if ``bookid`` cannot be converted to int
            (re-raised after logging).
        Exception: if ``bookid`` is smaller than 1, or if no ``'data'``
            entry could be obtained (request failure, invalid JSON or an
            unknown bookid).
    '''
    try:
        bookid = int(bookid)
    except Exception as exc:
        logger.warning(f'error: {exc}, invalid bookid')
        raise
    if bookid < 1:
        raise Exception(' bookid must be bigger than zero')

    # Build the URL only after validation so it carries the normalised id.
    url = f'{URL}/v1/read/initread?bid={bookid}'

    # Any failure (network, HTTP status, invalid JSON) ends in the same
    # "no data" error below, so no placeholder response object is needed.
    try:
        resp = SESS.get(url)
        resp.raise_for_status()
        jdata = resp.json()
    except Exception as exc:
        logger.warning(exc)
        jdata = {}

    data = jdata.get('data') if isinstance(jdata, dict) else None

    if data is None:
        raise Exception(
            'returned None, something is not right...可能无此书号,也有可能是网络有问题或IP被限制……'  # noqa
        )

    bookinfo.jdata = jdata

    return data.get('name'), data.get('canreadpages')


# async def download_img(self, page, task_id):
def fetch_png(bookid, page=1):
    '''Download the image of one page of a book.

    The request is attempted up to 4 times. The response of the
    successful attempt is kept in the function attribute
    ``fetch_png.resp``.

    Args:
        bookid: book identifier.
        page: page number to download (default 1).

    Returns:
        The response body as ``bytes``.

    Raises:
        Exception: if all 4 attempts fail, or whatever gen_jwt_token
            raises while building the token.
    '''
    token = gen_jwt_token(bookid, page)
    url = f'{URL}/page/img/{bookid}/{page}?k={token}'
    headers = {
        'referer': f'{URL}/read/pdf/{bookid}',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',  # noqa
    }

    # for/else: the else branch runs only when no attempt reached `break`,
    # i.e. when every attempt failed.
    for _ in range(4):
        try:
            resp = SESS.get(url, headers=headers)
            resp.raise_for_status()
            break
        except Exception as exc:
            logger.warning(exc)
    else:
        logger.warning(' We tried hard (4 times), giving up')
        raise Exception('Failed... ')

    fetch_png.resp = resp

    return resp.content


def fetch_book(bookid, page1=1, page2=None):
    '''Download pages ``page1`` .. ``page2`` of a book as PNG files.

    Each page is written to ``<bookid>-<page:03d>.png`` in the current
    directory. Pages whose file already exists are skipped. Every
    missing page is attempted up to 3 times; a page that still fails is
    logged and the loop moves on to the next page.

    Args:
        bookid: book identifier.
        page1: first page to download (default 1).
        page2: last page to download, inclusive; ``None`` (default)
            means the ``canreadpages`` value reported by bookinfo.

    Raises:
        Exception: whatever bookinfo raises, or the int() conversion
            error if the reported page count is not a number (logged,
            then re-raised).
    '''
    try:
        last_page = int(bookinfo(bookid)[1])
    except Exception as exc:
        logger.error(exc)
        raise

    if page2 is None:
        page2 = last_page

    for elm in trange(page1, page2 + 1):
        filename = f'{bookid}-{elm:03d}.png'
        if Path(filename).exists():
            logger.info(f'{filename} already exists, sipping....')
            continue

        logger.info(f' Fetching {filename}...')

        # for/else: the else branch runs only when none of the attempts
        # reached `break`, i.e. the page could not be saved.
        for _ in range(3):
            try:
                png = fetch_png(bookid, elm)
                Path(filename).write_bytes(png)
                break
            except Exception as exc:
                logger.warning(exc)
        else:
            logger.warning(f' Page {elm} probably missing')


def main():  # pylint: disable=too-many-branches
    '''Interactively ask for bookids and download each confirmed book.

    Loop:
      1. prompt for a bookid; input starting with ``q`` or ``x`` quits,
         empty or invalid input prompts again;
      2. look the book up with bookinfo, offering a retry after each
         failure (at most 4 lookups per bookid);
      3. show the book info and, if the user confirms, download the
         whole book with fetch_book.
    '''
    # Answers starting with any of these prefixes count as "yes".
    # NOTE(review): 'qui' may be a typo for 'oui' -- confirm.
    yes_prefixes = ('是', 'y', 'qui', '对', '好')

    logger.info(
        r'''

        访问 https://lib-nuanxin.wqxuetang.com/
        搜索点击感兴趣的书,例如 《计算机网络基础》
        https://lib-nuanxin.wqxuetang.com/#/Book/2175744

        网址尾部的数字2175744 即为书号 bookid。将bookid拷至
        系统剪贴板或记住该数字。
    '''
    )

    while 1:
        bookid_str = input('输入书号bookid:(例如 123, 退出输入 q 或 x) ')

        if bookid_str.lower().startswith(('q', 'x')):
            break

        if not bookid_str.strip():
            continue

        try:
            bookid = int(bookid_str)
        except ValueError:
            logger.warning(f'\n\t 无效书号:{bookid_str}, 重新输入')
            continue

        if bookid < 1:
            logger.warning(f'\n\t 无效书号:{bookid_str}, 重新输入')
            continue

        info = None  # stays None until a lookup succeeds
        count = 0
        while 1:
            try:
                logger.info('\n\t\t diggin...')
                info = bookinfo(bookid)
                break
            except Exception as exc:
                logger.error(exc)

            count += 1
            if count > 3:
                logger.info('\n\t 事不过三,还是算了吧')
                break

            cont_or_not = input('再试一次?确认输入 y, 输入其他重输书号 ')

            # Empty input or an affirmative answer retries the lookup;
            # anything else goes back to the bookid prompt.
            if cont_or_not.strip() and not cont_or_not.lower().startswith(
                yes_prefixes
            ):
                break

        if info is None:
            continue

        logger.info(bookinfo.jdata)
        logger.info(f'\n\t\t 下载 {info}? ')

        yes_no = input('确认输入 y, 输入其他重输书号 ')

        if yes_no.lower().startswith(yes_prefixes):
            logger.info(
                f'''
            开始下载 {info}...
            一般情况下(例如网络正常、服务器不太忙,IP未被限制)
            平均每100页约需15-20分钟)。

            终断(ctrl-C或ctrl-brea)程序后,已下载的页不会被覆盖。
            因此,如发现下载的页有问题或有些页未成功下载,可以删掉
            再用相同的 bookid 运行一次程序 python fetch_png.py

            '''
            )
            fetch_book(bookid)


# Start the interactive downloader only when this file is run as a script.
if __name__ == '__main__':
    main()

Happy downloading, reading and learning!
搜到网上高人分享的资源,做了点简化。希望对网友有点用。(友好提示:可以免费评分。)

修正版更新:交互输入bookid。未引入人为延迟,一般情况下每100页需时15-20分钟。可能有些页显示“下载中”(由于未人为引入延迟,估计服务器有时忙不过来或是服务器有限流机制),可删去这些有问题的页再用相同的bookid运行一次,程序会自动跳过已存在的页。

免费评分

参与人数 9吾爱币 +12 热心值 +9 收起 理由
san4san + 1 + 1 谢谢@Thanks!
huguo002 + 1 + 1 用心讨论,共获提升!
时空之外 + 1 + 1 我很赞同!
错的是世界 + 1 + 1 感谢楼主,研究下
whyu + 1 谢谢@Thanks!
bmcys2010 + 1 + 1 用心讨论,共获提升!
苏紫方璇 + 5 + 1 感谢发布原创作品,吾爱破解论坛因你更精彩!
first + 1 + 1 谢谢@Thanks!
wtujcf123 + 1 + 1 &amp;lt;font style=&amp;quot;vertical-align: inherit;&amp;quot;&amp;gt;&amp;lt;font style=

查看全部评分

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

 楼主| mikeee 发表于 2020-2-3 17:11
本帖最后由 mikeee 于 2020-2-3 17:14 编辑
zhucebuyi 发表于 2020-2-3 16:56
可否把循环页码加上 -0-   代码只能下载一页 -0-

单纯加个循环并不能真正解决问题,因为牵涉到下载不成功(下得多可能会比较经常出现)或下的是"正在载入请稍等"图片时需重试、重重试等等。

我要想想要不要再花时间整得使用友好一些…… 有一百个坛友评分的话我应该就有了足够的理由整一个只需给个bookid就可以拿到整本书的pdf的程序 :)
 楼主| mikeee 发表于 2020-2-3 16:23
本帖最后由 mikeee 于 2020-2-3 16:28 编辑
wtujcf123 发表于 2020-2-3 16:11
你好,能告诉我怎么用吗

先找到要下的书号 bookid,比如 https://lib-nuanxin.wqxuetang.com/read/pdf/3208943 里的 3208943 就是 bookid。调用 fetch_png(3208943, 25) 返回第 25 页 png 的二进制数据,可以写入文件(用 open 或 pathlib.Path),例如 25.png。下完所有的页后可以转换合成 pdf。

命令行调用参看第008、011行

python程序里调用参看 168行、181行。

是给会一点python的坛友用的。
miqi1314 发表于 2020-2-3 15:15
oktyx 发表于 2020-2-3 15:31
这个怎么用呀?菜鸡求问~
头像被屏蔽
ampie1994 发表于 2020-2-3 15:32
提示: 作者被禁止或删除 内容自动屏蔽
一年又一年 发表于 2020-2-3 16:11
python的吗?
wtujcf123 发表于 2020-2-3 16:11
你好,能告诉我怎么用吗
头像被屏蔽
袁煜914 发表于 2020-2-3 16:23
提示: 作者被禁止或删除 内容自动屏蔽
枫叶荻花 发表于 2020-2-3 16:30
论坛小白还是很多的,最好还是打包成软件分享出来,提高帖子阅读权限
lntuer 发表于 2020-2-3 16:36
    os.startfile(f'{filename}')  # type: ignore
OSError: [WinError -2147221003] 找不到应用程序: '3209078-001.png'
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-11-25 11:29

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表