# [Python] plain-text view / copy code  (forum listing header, kept as a comment so the file parses)
# -*- coding: utf-8 -*-
import urllib.parse
import string
import requests
import json
import re
import time
import os
import sys
# doc文档模块
from docx import Document
from docx.shared import Inches
from docx.oxml.ns import qn
# Session cookie copied from a logged-in browser session. Replace this value
# with a fresh one when the site starts rejecting requests.
Cookie = 'PHPSESSID=5s6umsm3ic7hmp9djhkia854s9; _gid=398749400617; _gidv=9e54db532452f68408485a0f3f20a1b1'

# Ask for the search keyword until a non-blank value is entered.
keyword = ''
while not keyword:
    # Surrounding whitespace is stripped because the keyword is reused later
    # as a directory name and as a file name; a blank-only value would pass a
    # plain truthiness check and produce unusable paths.
    keyword = input('请输入搜索的关键字:').strip()
    if not keyword:
        print('关键字为空,请重新输入!')

# Search endpoint that get_response() posts to.
wqxuetang_url = r'https://lib-nuanxin.wqxuetang.com/v1/search/initsearch'
# Percent-encode anything outside the printable ASCII set; for the literal
# URL above this leaves the string unchanged.
new_wqxuetang_url = urllib.parse.quote(wqxuetang_url, safe=string.printable)
# 发送请求
def get_response(page=1, timeout=30):
    """Request one page of search results for the module-level ``keyword``.

    Args:
        page: Result page number, sent as the ``pn`` form field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The decoded JSON body of the search response.

    Raises:
        requests.RequestException: If the request fails or times out.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Browser-like request headers. The original also sent a header named
    # 'Accept - Encoding' (with spaces), which is not a valid HTTP field name;
    # it is omitted so that requests advertises, via its own Accept-Encoding
    # header, only the encodings it is able to decode.
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Host': 'lib-nuanxin.wqxuetang.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
        'Cookie': Cookie,
    }
    # Form fields of the search request; only the keyword and the page number
    # vary between calls.
    data = {
        'kw': keyword,
        'type': 1,
        'begindate': '',
        'enddate': '',
        'tid': '',
        'libclass': '',
        'cnclass': '',
        'tag': '',
        'author': '',
        'pn': page,
        'size': 10,
        'isfquery': 'undefined',
        'sort': 1,
    }
    response = requests.post(
        url=new_wqxuetang_url, data=data, headers=headers, timeout=timeout
    )
    # No HTTP status check here: the caller inspects 'errmsg' in the decoded
    # body to detect an expired cookie.
    return json.loads(response.text)
# 保存数据
def data_download(dict_str):
    """Save every book of one result page as text, cover image and Word entry.

    Relies on module-level state prepared by the script: ``f1`` (open text
    file), ``pt`` (directory for cover images), ``myDocument`` (python-docx
    document), ``Cookie`` and the running image counter ``count``, which is
    incremented once per book.

    Args:
        dict_str: Decoded search response. ``dict_str['data']['list']`` is
            iterated and each item is read for the keys ``numid``, ``name``,
            ``author``, ``pubdate``, ``pub``, ``descript`` and ``coverurl``.
    """
    global count
    # Matches simple markup tags such as <em>, </em>, <br/> or <h1>.
    tag_pattern = re.compile(r'</?[a-zA-Z]+\s?\d?/?>')
    # Headers for the cover download; identical for every book, so built once.
    # The malformed 'Accept - Encoding' name (spaces are not valid in an HTTP
    # field name) is omitted and left to requests' own default.
    # NOTE(review): the site session cookie is also sent to the image host;
    # confirm that the cover server actually needs it.
    img_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Host': 'bookask-cover.oss-cn-beijing.aliyuncs.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
        'Cookie': Cookie,
    }
    separator = '{0}【华丽的分割线】{0}'.format('*' * 30)

    for item in dict_str['data']['list']:
        book_id = item['numid']
        # Title and description can contain markup tags; remove them.
        book_name = tag_pattern.sub('', item['name'])
        description = tag_pattern.sub('', item['descript'])

        # One entry per output line, shared by the text file and the document.
        # The address line previously carried forum [url=...] markup with two
        # placeholders for a single value, which raised TypeError.
        lines = [
            '书的ID号:%s' % book_id,
            '书名:《%s》' % book_name,
            '作者:%s' % item['author'],
            '出版时间:%s' % item['pubdate'],
            '出版社:%s' % item['pub'],
            '图书简介 :%s' % description,
            '书籍地址:https://lib-nuanxin.wqxuetang.com/#/Book/%s' % book_id,
        ]

        # Console progress: id, title and publication date on one line.
        print('BID:%s\t' % book_id, end='')
        print('书名:《%s》\t' % book_name, end='')
        print('出版时间:%s' % item['pubdate'])

        # Text file: separator, one field per line, then a blank line.
        f1.write(separator + '\n')
        f1.write(''.join(line + '\n' for line in lines))
        f1.write('\n')
        # Flush per book so finished records survive a later failure.
        f1.flush()

        # Download the cover, then pause before the next request.
        img_data = requests.get(url=item['coverurl'], headers=img_headers, timeout=30)
        time.sleep(2)
        # os.path.join keeps the image inside ``pt`` on every platform.
        img_path = os.path.join(pt, '%s.jpg' % count)
        with open(img_path, 'wb') as fp:
            fp.write(img_data.content)

        # Word document: separator, cover picture, the fields, an empty line.
        myDocument.add_paragraph(separator)
        myDocument.add_picture(img_path, width=Inches(2), height=Inches(3))
        for line in lines:
            # Paragraphs are single-line: drop newlines embedded in a field.
            myDocument.add_paragraph(line.replace('\n', ''))
        myDocument.add_paragraph()

        count += 1
# Running image index; data_download() numbers the cover files with it.
count = 1
# First request: used to validate the cookie and to learn the page count.
dict_str = get_response()
# The script treats this exact message as "cookie expired".
login = dict_str['errmsg']
if login == '请先登录':
    input('cookie已失效,请重新获取登录后的cookie进行替换,本程序自动结束!')
    sys.exit()
# Number of result pages reported by the server.
page_count = dict_str['data']['pageinfo']['pagecount']
print('约%s页' % page_count)


def _ask_page_number(prompt, default):
    """Prompt until a non-negative integer or an empty line is entered.

    Returns ``default`` for an empty answer, otherwise the entered integer.
    """
    while True:
        answer = input(prompt)
        if answer == '':
            return default
        # isdecimal() accepts exactly the digit characters int() can parse;
        # isdigit() also accepts characters such as '²' that make int() raise.
        if answer.isdecimal():
            return int(answer)
        print('非法参数!请重新输入')


# First page to fetch (defaults to page 1).
mincount = _ask_page_number('请输入要开始获取的页数,默认值为第1页:', 1)
# Last page to fetch (defaults to the server-reported page count).
# NOTE(review): the prompt asks for a total number of pages, but the value is
# used as the last page number of the range below - confirm which is intended.
page_count = _ask_page_number('请输入要获取的总页数,默认值为最大页数:', page_count)

# Cover images go into a folder named after the keyword in the working
# directory. makedirs(exist_ok=True) creates exactly ``pt`` on any platform,
# where the previous hard-coded backslash path only matched it on Windows.
root = os.getcwd()
pt = os.path.join(root, keyword)
os.makedirs(pt, exist_ok=True)

# Empty Word document using the 微软雅黑 font for Latin and East Asian text.
myDocument = Document()
myDocument.styles['Normal'].font.name = u'微软雅黑'
myDocument.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'微软雅黑')

# Text output file; data_download() writes to it through the global ``f1``.
f1 = open('%s.txt' % keyword, 'w', encoding='utf-8')
try:
    for page in range(mincount, page_count + 1):
        print('第%s页书籍信息:' % page)
        dict_str = get_response(page)
        # An empty result list means there is nothing further to fetch.
        if not dict_str['data']['list']:
            break
        data_download(dict_str)
        print('【第%s页数据写入文件中...】' % page)
        # Pause between page requests.
        time.sleep(5)
finally:
    # Runs on failure too, so already-fetched books are not lost.
    f1.close()
    # NOTE(review): python-docx writes the .docx format; confirm that the
    # '.doc' extension is intended.
    myDocument.save('%s.doc' % keyword)
print('数据写入完毕!本程序自动结束!')