报纸下载
本帖最后由 FeiyuYip 于 2023-12-16 19:09 编辑
勾选报纸种类、选择日期,然后点击“开始下载”即可。
目前仅添加了三种报纸,理论上还可以继续扩展。
一、主界面
主界面是用 Qt Designer 画的,再用 pyuic5 转成 py 代码。
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'UI.ui'
#
# Created by: PyQt5 UI code generator 5.15.9
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.
from PyQt5 import QtCore, QtGui, QtWidgets
class Ui_MainWindow(object):
    """Widget tree of the newspaper-downloader main window.

    The layout consists of a column of newspaper check boxes next to a
    calendar, a row with the two action buttons, and a text box underneath.
    """

    def setupUi(self, MainWindow):
        """Create every widget and layout and attach them to ``MainWindow``.

        Args:
            MainWindow: The ``QMainWindow`` instance to populate.
        """
        # Size the window relative to the current screen geometry.
        screen = QtWidgets.QDesktopWidget().screenGeometry()
        width = int(screen.width() / 4)
        height = int(screen.height() / 2.5)
        MainWindow.setObjectName("MainWindow")
        MainWindow.setEnabled(True)
        MainWindow.resize(width, height)
        MainWindow.setMinimumSize(QtCore.QSize(415, 515))
        MainWindow.setAcceptDrops(True)
        icon = QtGui.QIcon()
        icon.addPixmap(QtGui.QPixmap(resource_path('paper.ico')), QtGui.QIcon.Normal, QtGui.QIcon.Off)
        MainWindow.setWindowIcon(icon)
        MainWindow.setLayoutDirection(QtCore.Qt.LeftToRight)
        MainWindow.setLocale(QtCore.QLocale(QtCore.QLocale.Chinese, QtCore.QLocale.China))

        self.centralwidget = QtWidgets.QWidget(MainWindow)
        # NOTE(review): this minimum height (550) is larger than the window's
        # own minimum height (515) set above -- confirm which one is intended.
        self.centralwidget.setMinimumSize(QtCore.QSize(415, 550))
        self.centralwidget.setObjectName("centralwidget")
        self.verticalLayout = QtWidgets.QVBoxLayout(self.centralwidget)
        self.verticalLayout.setObjectName("verticalLayout")
        self.horizontalLayout = QtWidgets.QHBoxLayout()
        self.horizontalLayout.setObjectName("horizontalLayout")

        # Left column: caption plus one check box per newspaper, each
        # followed by an expanding vertical spacer.
        self.verticalLayout_2 = QtWidgets.QVBoxLayout()
        self.verticalLayout_2.setObjectName("verticalLayout_2")
        self.label_2 = QtWidgets.QLabel(self.centralwidget)
        self.label_2.setAlignment(QtCore.Qt.AlignCenter)
        self.label_2.setObjectName("label_2")
        self.verticalLayout_2.addWidget(self.label_2, 0, QtCore.Qt.AlignHCenter | QtCore.Qt.AlignVCenter)
        spacerItem = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem)
        self.checkBox = QtWidgets.QCheckBox(self.centralwidget)
        self.checkBox.setChecked(True)
        self.checkBox.setTristate(False)
        self.checkBox.setObjectName("checkBox")
        self.verticalLayout_2.addWidget(self.checkBox)
        spacerItem1 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem1)
        self.checkBox_3 = QtWidgets.QCheckBox(self.centralwidget)
        self.checkBox_3.setChecked(True)
        self.checkBox_3.setObjectName("checkBox_3")
        self.verticalLayout_2.addWidget(self.checkBox_3)
        spacerItem2 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem2)
        self.checkBox_2 = QtWidgets.QCheckBox(self.centralwidget)
        self.checkBox_2.setChecked(True)
        self.checkBox_2.setObjectName("checkBox_2")
        self.verticalLayout_2.addWidget(self.checkBox_2)
        spacerItem3 = QtWidgets.QSpacerItem(20, 40, QtWidgets.QSizePolicy.Minimum, QtWidgets.QSizePolicy.Expanding)
        self.verticalLayout_2.addItem(spacerItem3)
        self.horizontalLayout.addLayout(self.verticalLayout_2)

        # Right column: caption plus the single-selection calendar.
        self.verticalLayout_4 = QtWidgets.QVBoxLayout()
        self.verticalLayout_4.setObjectName("verticalLayout_4")
        self.label = QtWidgets.QLabel(self.centralwidget)
        self.label.setObjectName("label")
        self.verticalLayout_4.addWidget(self.label, 0, QtCore.Qt.AlignHCenter | QtCore.Qt.AlignVCenter)
        self.calendarWidget = QtWidgets.QCalendarWidget(self.centralwidget)
        self.calendarWidget.setMinimumSize(QtCore.QSize(314, 244))
        self.calendarWidget.setGridVisible(True)
        self.calendarWidget.setSelectionMode(QtWidgets.QCalendarWidget.SingleSelection)
        self.calendarWidget.setHorizontalHeaderFormat(QtWidgets.QCalendarWidget.ShortDayNames)
        self.calendarWidget.setVerticalHeaderFormat(QtWidgets.QCalendarWidget.NoVerticalHeader)
        self.calendarWidget.setDateEditEnabled(False)
        self.calendarWidget.setObjectName("calendarWidget")
        self.verticalLayout_4.addWidget(self.calendarWidget)
        self.horizontalLayout.addLayout(self.verticalLayout_4)
        self.verticalLayout.addLayout(self.horizontalLayout)

        # Button row: the two buttons are pushed apart by an expanding spacer.
        self.horizontalLayout_2 = QtWidgets.QHBoxLayout()
        self.horizontalLayout_2.setObjectName("horizontalLayout_2")
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setObjectName("pushButton")
        self.horizontalLayout_2.addWidget(self.pushButton)
        spacerItem4 = QtWidgets.QSpacerItem(40, 20, QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Minimum)
        self.horizontalLayout_2.addItem(spacerItem4)
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setObjectName("pushButton_2")
        self.horizontalLayout_2.addWidget(self.pushButton_2)
        self.verticalLayout.addLayout(self.horizontalLayout_2)

        # Bottom row: the text box.
        self.horizontalLayout_5 = QtWidgets.QHBoxLayout()
        self.horizontalLayout_5.setObjectName("horizontalLayout_5")
        self.textEdit = QtWidgets.QTextEdit(self.centralwidget)
        self.textEdit.setEnabled(True)
        self.textEdit.setMinimumSize(QtCore.QSize(395, 230))
        self.textEdit.setObjectName("textEdit")
        self.horizontalLayout_5.addWidget(self.textEdit)
        self.verticalLayout.addLayout(self.horizontalLayout_5)

        MainWindow.setCentralWidget(self.centralwidget)
        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the visible text of every widget.

        Args:
            MainWindow: The window whose title is set.
        """
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "报纸下载器"))
        self.label_2.setText(_translate("MainWindow", "报纸种类"))
        self.checkBox.setText(_translate("MainWindow", "人民日报"))
        self.checkBox_3.setText(_translate("MainWindow", "江西日报"))
        self.checkBox_2.setText(_translate("MainWindow", "赣南日报"))
        self.label.setText(_translate("MainWindow", "选择日期"))
        self.pushButton.setText(_translate("MainWindow", "开始下载"))
        self.pushButton_2.setText(_translate("MainWindow", "查看下载"))
def resource_path(relative):
    """Return the path under which the resource ``relative`` can be opened.

    Args:
        relative: Resource path relative to the application directory.

    Returns:
        ``relative`` joined onto ``sys._MEIPASS`` when that attribute exists
        (PyInstaller sets it to the unpacked bundle directory); otherwise
        ``relative`` unchanged.
    """
    import os
    import sys

    if hasattr(sys, "_MEIPASS"):
        return os.path.join(sys._MEIPASS, relative)
    # Not running from a bundle: the resource sits next to the script.
    return os.path.join(relative)
二、逻辑部分
1.RMRB下载
from get_page import get_page, make_dir, header, merge_pdf, handle_string
from lxml import etree
import os
import requests
import time
def download(url, date, year, month, day, date_path, paper_name):
    """Download every page PDF of one issue, then merge them into one file.

    Args:
        url: URL of the issue page whose navigation lists all pages.
        date: Issue date as ``YYYY-MM-DD``; used in messages and file names.
        year: Year component of ``date``.
        month: Zero-padded month component of ``date``.
        day: Zero-padded day component of ``date``.
        date_path: Directory that receives one PDF per page.
        paper_name: Display name of the newspaper.
    """
    from urllib.parse import urljoin

    index_html = get_page(url)
    if not index_html:
        # Nothing to parse (e.g. no issue published for that date).
        print(f'{paper_name}({date}) 版面列表获取失败')
        return

    selector = etree.HTML(index_html)
    chapter_urls = selector.xpath('//div[@class="swiper-slide"]/a/@href')
    chapter_names = selector.xpath('//div[@class="swiper-slide"]/a/text()')
    print(f'{paper_name}({date}) 共有 {len(chapter_urls)} 个版面')

    # Pages are addressed by their 1-based position in the navigation list.
    for page_no, (chapter_name, _) in enumerate(zip(chapter_names, chapter_urls), start=1):
        chapter_name = handle_string(chapter_name)
        pdf_path = f'{date_path}/{chapter_name}.pdf'
        if os.path.exists(pdf_path):
            # Already downloaded by an earlier run.
            continue

        chapter_url = (
            f'http://paper.people.com.cn/rmrb/html/{year}-{month}/{day}/'
            f'nbs.D110000renmrb_{str(page_no).zfill(2)}.htm'
        )
        chapter_html = get_page(chapter_url)
        pdf_hrefs = etree.HTML(chapter_html).xpath('//p[@class="right btn"]/a/@href') if chapter_html else []
        if not pdf_hrefs:
            print(f'{paper_name}({date}) {chapter_name}未找到PDF链接,已跳过')
            continue

        # NOTE(review): the original expression that turned the href into URL
        # segments was lost in the posted source. Resolving the href against
        # the page URL is a reconstruction; it should yield e.g.
        # http://paper.people.com.cn/rmrb/images/2023-11/13/01/rmrb2023111301.pdf
        # -- verify against the live site.
        pdf_url = urljoin(chapter_url, pdf_hrefs[0])

        pdf_response = requests.get(pdf_url, headers=header, timeout=30)
        # Only store real PDFs; error pages come back with another type.
        if pdf_response.headers.get('Content-Type') == 'application/pdf':
            with open(pdf_path, 'wb') as f:
                f.write(pdf_response.content)
            print(f'{paper_name}({date}) {chapter_name}保存完毕!!!')
        # Pause between pages to avoid hammering the server.
        time.sleep(1)

    merge_pdf(date, date_path, paper_name)
def main(paper_name, date):
    """Download one issue of the paper for ``date``.

    Args:
        paper_name: Display name of the newspaper; also the output directory.
        date: Issue date formatted as ``YYYY-MM-DD``.
    """
    # Each URL component needs its own piece of the date, not the whole list.
    year, month, day = date.split('-')
    date_path = f'{paper_name}/{date}'
    make_dir(date_path)
    url = f'http://paper.people.com.cn/rmrb/html/{year}-{month}/{day}/nbs.D110000renmrb_01.htm'
    download(url, date, year, month, day, date_path, paper_name)
# Manual run: download a fixed sample issue.
if __name__ == '__main__':
    paper_name = '人民日报'
    date = '2023-12-12'
    main(paper_name, date)
2.JXRB下载
import requests
import os
import time
from lxml import etree
from get_page import get_page, make_dir, header, merge_pdf,handle_string
def download(url, date, year, month, day, date_path, paper_name):
    """Download every page PDF of one issue, then merge them into one file.

    Args:
        url: URL of the issue index page that lists all pages.
        date: Issue date as ``YYYY-MM-DD``; used in messages and file names.
        year: Year component of ``date``.
        month: Zero-padded month component of ``date``.
        day: Zero-padded day component of ``date``.
        date_path: Directory that receives one PDF per page.
        paper_name: Display name of the newspaper.
    """
    from urllib.parse import urljoin

    index_html = get_page(url)
    if not index_html:
        # Nothing to parse (e.g. no issue published for that date).
        print(f'{paper_name}({date}) 版面列表获取失败')
        return

    selector = etree.HTML(index_html)
    chapter_urls = selector.xpath('//td[@class="szb_text_color"]/a/@href')
    chapter_names = selector.xpath('//td[@class="szb_text_color"]/a/text()')
    print(f'{paper_name}({date}) 共有 {len(chapter_urls)} 个版面')

    for chapter_name, chapter_href in zip(chapter_names, chapter_urls):
        chapter_name = handle_string(chapter_name)
        pdf_path = f'{date_path}/{chapter_name}.pdf'
        if os.path.exists(pdf_path):
            # Already downloaded by an earlier run.
            continue

        chapter_url = f'http://epaper.jxxw.com.cn/html/{year}-{month}/{day}/{chapter_href}'
        chapter_html = get_page(chapter_url)
        pdf_hrefs = etree.HTML(chapter_html).xpath('//a[@id="bigbmshowpdf"]/@href') if chapter_html else []
        if not pdf_hrefs:
            print(f'{paper_name}({date}) {chapter_name}未找到PDF链接,已跳过')
            continue

        # NOTE(review): the original expression that turned the href into URL
        # segments below http://epaper.jxxw.com.cn/resfile/ was lost in the
        # posted source. Resolving the href against the page URL is a
        # reconstruction -- verify against the live site.
        pdf_url = urljoin(chapter_url, pdf_hrefs[0])

        pdf_response = requests.get(pdf_url, headers=header, timeout=30)
        # Only store real PDFs; error pages come back with another type.
        if pdf_response.headers.get('Content-Type') == 'application/pdf':
            with open(pdf_path, 'wb') as f:
                f.write(pdf_response.content)
            print(f'{paper_name}({date}) {chapter_name}保存完毕!!!')
        # Pause between pages to avoid hammering the server.
        time.sleep(1)

    merge_pdf(date, date_path, paper_name)
def main(paper_name, date):
    """Download one issue of the paper for ``date``.

    Args:
        paper_name: Display name of the newspaper; also the output directory.
        date: Issue date formatted as ``YYYY-MM-DD``.
    """
    # Each URL component needs its own piece of the date, not the whole list.
    year, month, day = date.split('-')
    date_path = f'{paper_name}/{date}'
    make_dir(date_path)
    url = f'http://epaper.jxxw.com.cn/html/{year}-{month}/{day}/index_{date}.htm'
    download(url, date, year, month, day, date_path, paper_name)
# Manual run: download a fixed sample issue.
if __name__ == '__main__':
    paper_name = '江西日报'
    date = '2023-12-12'
    main(paper_name, date)
3.GNRB下载
import requests
import os
import time
from lxml import etree
from get_page import get_page, make_dir, header, merge_pdf,handle_string
def download(url, date, year, month, day, date_path, paper_name):
    """Download every page PDF of one issue, then merge them into one file.

    Args:
        url: URL of the issue index page that lists all pages.
        date: Issue date as ``YYYY-MM-DD``; used in messages and file names.
        year: Year component of ``date``.
        month: Zero-padded month component of ``date``.
        day: Zero-padded day component of ``date``.
        date_path: Directory that receives one PDF per page.
        paper_name: Display name of the newspaper.
    """
    from urllib.parse import urljoin

    index_html = get_page(url)
    if not index_html:
        # Nothing to parse (e.g. no issue published for that date).
        print(f'{paper_name}({date}) 版面列表获取失败')
        return

    selector = etree.HTML(index_html)
    chapter_urls = selector.xpath('//td[@class="szb_text_color"]/a/@href')
    chapter_names = selector.xpath('//td[@class="szb_text_color"]/a/text()')
    print(f'{paper_name}({date}) 共有 {len(chapter_urls)} 个版面')

    for chapter_name, chapter_href in zip(chapter_names, chapter_urls):
        chapter_name = handle_string(chapter_name)
        pdf_path = f'{date_path}/{chapter_name}.pdf'
        if os.path.exists(pdf_path):
            # Already downloaded by an earlier run.
            print(f'{paper_name}({date}) {chapter_name}保存完毕~~~')
            continue

        # e.g. https://szb.gnrbs.cn/html/2023-12/16/node_95762.htm
        chapter_url = f'https://szb.gnrbs.cn/html/{year}-{month}/{day}/{chapter_href}'
        chapter_html = get_page(chapter_url)
        pdf_hrefs = etree.HTML(chapter_html).xpath('//a[@id="bigbmshowpdf"]/@href') if chapter_html else []
        if not pdf_hrefs:
            print(f'{paper_name}({date}) {chapter_name}未找到PDF链接,已跳过')
            continue

        # NOTE(review): the original expression that turned the href into URL
        # segments was lost in the posted source. Resolving the href against
        # the page URL is a reconstruction; it should yield e.g.
        # https://szb.gnrbs.cn/resfile/2023-12-16/01/gnrb-20231216-001.pdf
        # -- verify against the live site.
        pdf_url = urljoin(chapter_url, pdf_hrefs[0])

        pdf_response = requests.get(pdf_url, headers=header, timeout=30)
        # Only store real PDFs; error pages come back with another type.
        if pdf_response.headers.get('Content-Type') == 'application/pdf':
            with open(pdf_path, 'wb') as f:
                f.write(pdf_response.content)
            print(f'{paper_name}({date}) {chapter_name}保存完毕!!!')
        # Pause between pages to avoid hammering the server.
        time.sleep(1)

    print(f'{paper_name}({date}) 版面下载完毕,开始合并,请稍候……')
    merge_pdf(date, date_path, paper_name)
def main(paper_name, date):
    """Download one issue of the paper for ``date``.

    Args:
        paper_name: Display name of the newspaper; also the output directory.
        date: Issue date formatted as ``YYYY-MM-DD``.
    """
    # Each URL component needs its own piece of the date, not the whole list.
    year, month, day = date.split('-')
    date_path = f'{paper_name}/{date}'
    make_dir(date_path)
    url = f'https://szb.gnrbs.cn/html/{year}-{month}/{day}/index_{year}-{month}-{day}.htm'
    download(url, date, year, month, day, date_path, paper_name)
# Manual run: main() needs both arguments, so download a fixed sample issue.
if __name__ == '__main__':
    paper_name = '赣南日报'
    date = '2023-12-16'
    main(paper_name, date)
4.调用逻辑
from PyQt5 import QtWidgets, QtCore, QtGui
from UI import Ui_MainWindow
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from PyQt5.QtGui import *
import rmrb_downloader
import jxrb_downloader
import gnrb_downloader
import os
# Emit written text as a Qt signal
class Stream(QObject):
    """Minimal file-like object that forwards written text via a Qt signal.

    Instances are installed as ``sys.stdout`` / ``sys.stderr`` so that
    ``print`` output reaches whatever slot is connected to ``newText``.
    """

    # Emitted once per write() call with the written text.
    newText = pyqtSignal(str)

    def write(self, text):
        """Emit ``text`` through ``newText`` and let Qt process pending events."""
        self.newText.emit(str(text))
        QApplication.processEvents()

    def flush(self):
        """Do nothing; write() forwards text immediately.

        Present because code that treats this object as a stream (for
        example ``print(..., flush=True)``) calls ``flush()``.
        """
class Thread(QThread):
    """Worker thread that downloads the selected newspapers for one date."""

    def __init__(self, paper_selector_list=None, select_date=None, parent=None):
        """Store the download request.

        Args:
            paper_selector_list: Newspaper display names to download.
            select_date: Issue date formatted as ``YYYY-MM-DD``.
            parent: Optional Qt parent object.
        """
        super(Thread, self).__init__(parent)
        self.paper_selector_list = paper_selector_list
        self.select_date = select_date

    def __del__(self):
        # Wait for the thread to finish before the object goes away.
        self.wait()

    def run(self):
        """Run the matching downloader for each selected paper, in order."""
        separator = '--------分--------隔--------线--------'
        # Display name -> entry point of the downloader module for that paper.
        downloaders = {
            '人民日报': rmrb_downloader.main,
            '江西日报': jxrb_downloader.main,
            '赣南日报': gnrb_downloader.main,
        }

        print(separator)
        print(f'选择的报纸种类有:{self.paper_selector_list}')
        print(f'选择的下载日期是:{self.select_date}')
        try:
            # ``or []`` keeps the default of None from raising while iterating.
            for paper_name in self.paper_selector_list or []:
                self.paper_name = paper_name
                downloader = downloaders.get(paper_name)
                if downloader is None:
                    # Unknown names are skipped.
                    continue
                print(separator)
                print(f'开始下载 {self.select_date} {paper_name}')
                downloader(paper_name, self.select_date)
            print(separator)
        except Exception as e:
            # Thread boundary: report the failure instead of letting the
            # exception escape run().
            print(e)
class MainWindow(QtWidgets.QMainWindow, Ui_MainWindow):
    """Main window: wires the generated UI to the download worker thread."""

    def __init__(self, parent=None):
        """Build the UI, redirect stdout/stderr to the text box, connect buttons."""
        # Imported here because the module only imports ``sys`` inside its
        # ``__main__`` guard, which is not executed when this file is imported.
        import sys

        super(MainWindow, self).__init__(parent)
        self.setupUi(self)
        self.center()
        # Route everything printed anywhere in the process to the text box.
        sys.stdout = Stream(newText=self.onUpdateText)
        sys.stderr = Stream(newText=self.onUpdateText)
        # "Start download" button.
        self.pushButton.clicked.connect(self.start_download)
        # "Show downloads" button.
        self.pushButton_2.clicked.connect(self.show_download)

    def start_download(self):
        """Start a worker thread for the currently selected papers and date."""
        # Stored as ``download_thread`` rather than ``thread`` so that the
        # inherited ``QObject.thread()`` method is not shadowed.
        running = getattr(self, 'download_thread', None)
        if running is not None and running.isRunning():
            # Do not replace a worker that is still downloading.
            print('正在下载中,请稍候……')
            return
        # Selected date.
        self.select_date = self.get_date()
        # Selected newspapers.
        self.paper_selector_list = self.get_paper_name()
        self.download_thread = Thread(self.paper_selector_list, self.select_date)
        self.download_thread.start()

    def show_download(self):
        """Open the current working directory in the system file browser."""
        try:
            os.startfile(f'{os.getcwd()}')
        except Exception as e:
            print(e)

    def get_paper_name(self):
        """Return the labels of all ticked newspaper check boxes."""
        # Order follows the on-screen order: checkBox (人民日报),
        # checkBox_3 (江西日报), checkBox_2 (赣南日报).
        boxes = (self.checkBox, self.checkBox_3, self.checkBox_2)
        return [box.text() for box in boxes if box.isChecked()]

    def get_date(self):
        """Return the calendar's selected date as ``YYYY-MM-DD``."""
        date = QtCore.QDate(self.calendarWidget.selectedDate())
        year = date.year()
        month = str(date.month()).zfill(2)  # pad to two digits
        day = str(date.day()).zfill(2)  # pad to two digits
        return f'{year}-{month}-{day}'

    def onUpdateText(self, text):
        """Append ``text`` to the text box and keep the end visible."""
        cursor = self.textEdit.textCursor()
        cursor.movePosition(QTextCursor.End)
        cursor.insertText(text)
        self.textEdit.setTextCursor(cursor)
        self.textEdit.ensureCursorVisible()

    def center(self):
        """Move the window to the centre of the available desktop area."""
        qr = self.frameGeometry()
        cp = QDesktopWidget().availableGeometry().center()
        qr.moveCenter(cp)
        self.move(qr.topLeft())

    def closeEvent(self, event):
        """Ask for confirmation before the window is closed."""
        reply = QMessageBox.question(
            self, '退出提示', "您确定要退出吗?",
            QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
        if reply == QMessageBox.Yes:
            event.accept()
        elif reply == QMessageBox.No:
            event.ignore()
def resource_path(relative):
    """Return the path under which the resource ``relative`` can be opened.

    Args:
        relative: Resource path relative to the application directory.

    Returns:
        ``relative`` joined onto ``sys._MEIPASS`` when that attribute exists
        (PyInstaller sets it to the unpacked bundle directory); otherwise
        ``relative`` unchanged.
    """
    import os
    import sys

    if hasattr(sys, "_MEIPASS"):
        return os.path.join(sys._MEIPASS, relative)
    # Not running from a bundle: the resource sits next to the script.
    return os.path.join(relative)
# Application entry point: create the Qt application and show the main window.
if __name__ == '__main__':
    import sys

    app = QtWidgets.QApplication(sys.argv)
    mainWindow = MainWindow()
    mainWindow.show()
    # Hand the event loop's exit status back to the shell.
    sys.exit(app.exec_())
5.其他代码,放到一个py文件里了
import requests
import os
from pypdf import PdfMerger, PdfReader
import shutil
import time
import re
# Browser-like request headers passed to requests.get() by the downloaders.
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/116.0.0.0 Safari/537.36'
    ),
}
def handle_string(string):
    """Return ``string`` as text with each CRLF and the spaces after it removed.

    Args:
        string: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned text.
    """
    return re.sub(r'\r\n *', '', str(string))
def merge_pdf(date, date_path, paper_name):
    """Merge the PDFs in ``date_path`` into one file with an outline entry each.

    The merged file is written to ``{paper_name}/{paper_name}({date}).pdf``
    and ``date_path`` is deleted afterwards.

    Args:
        date: Issue date; used in the output file name.
        date_path: Directory holding the per-page PDFs.
        paper_name: Newspaper name; output directory and file-name prefix.
    """
    # Only merge PDF files, in file-name order.
    pdf_names = sorted(
        name for name in os.listdir(date_path) if name.lower().endswith('.pdf')
    )

    merger = PdfMerger()
    try:
        # Page index in the merged document where the next file starts.
        first_page = 0
        for pdf_name in pdf_names:
            pdf_path = os.path.join(date_path, pdf_name)
            reader = PdfReader(pdf_path, strict=False)
            # The outline title is the file name without its extension; it
            # must be a string, not the list produced by str.split().
            title = os.path.splitext(pdf_name)[0]
            merger.append(pdf_path)
            merger.add_outline_item(title, first_page, None)
            first_page += len(reader.pages)
        merger.write(f'{paper_name}/{paper_name}({date}).pdf')
        print(f'{paper_name}({date}).pdf下载完成')
    finally:
        # Close the merger even when reading or writing fails.
        merger.close()

    # NOTE(review): presumably gives the OS time to release the source files
    # before they are deleted -- confirm whether this pause is still needed.
    time.sleep(1)
    # Remove the per-page directory.
    shutil.rmtree(date_path)
def make_dir(path):
    """Create directory ``path`` (with missing parents) unless it exists.

    Args:
        path: Directory path to create.
    """
    if not os.path.exists(path):
        # exist_ok covers the case where the directory appears between the
        # check above and this call.
        os.makedirs(path, exist_ok=True)
def get_page(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text when the server answers with status 200, else ``None``.
    """
    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(url, headers=header, timeout=30)
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    return response.text
三、界面展示
四、不足之处
1.三个下载逻辑相似,可以集成到一个函数中调用
2.使用pypdf库合并pdf,会出现“Multiple definitions in dictionary at byte 0x13866c for key /Ascent”的信息,不过不影响使用
有知道怎样解决的大佬还望帮助解决
五、声明
此仅供学习研究使用,请勿用于其他用途。
wazct 发表于 2023-12-16 20:45
问一下这个get_page模块怎么下载啊,我找不到get_page库
from get_page import...
就是第5项,我随便起的名 :lol
wazct 发表于 2023-12-16 20:45
问一下这个get_page模块怎么下载啊,我找不到get_page库
from get_page import...
自己写的,第5部分的代码就是。你自己新建一个 get_page.py 文件,把代码拷贝进去就行。
能添加一个电脑报吗,万分感谢
非常开心,已经成功运行下载!
注意6个文件名:报纸下载器(调用文件)、UI、get_page(其他代码)、rmrb_downloader、jxrb_downloader、gnrb_downloader
已成功,已打包https://www.lanzoub.com/i6EHS222kecd 高手的技术就是牛 虽然不用,还是点赞
有没有成品软件呢:lol 感谢分享~ 这年头还有人看报纸??今日头条不香吗。。。。 能发一个打包好的嘛 好牛啊1987年的能下载不 问一下这个get_page模块怎么下载啊,我找不到get_page库
from get_page import get_page, make_dir, header, merge_pdf,handle_string
用pip安装时候出现如下报错
ERROR: Could not find a version that satisfies the requirement get_page (from versions: none)
ERROR: No matching distribution found for get_page
不然代码运行不了
from get_page import get_page, make_dir, header, merge_pdf, handle_string
ModuleNotFoundError: No module named 'get_page'