Quite a few people have recently been asking how to crawl and download books from a publisher's site, so I'm sharing my approach and code.
It is built with Python + Selenium: open the page in an automated browser session, take a screenshot, click "next page", screenshot again, and loop; afterwards all the images are stitched together into a PDF.
The code is for research and learning only. It assumes some Python and Selenium knowledge, and you must fill in the site-specific details (login, selectors, crop box) or it will throw errors.
The code is as follows:

import math
import operator
from functools import reduce
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import json
from PIL import Image
import time
import glob
import os
import fitz  # PyMuPDF
"""定义图片合成为pdf"""
def pic2pdf(img_dir,name):
doc = fitz.open()
for img in sorted(glob.glob("{}/*".format(img_dir))):
print(img)
imgdoc = fitz.open(img)
pdfbytes = imgdoc.convertToPDF()
imgpdf = fitz.open("pdf", pdfbytes)
doc.insertPDF(imgpdf)
if os.path.exists(name + ".pdf"):
os.remove(name + ".pdf")
doc.save(img_dir + "/" + name + ".pdf")
doc.close()
"""爬取网页并保存为图片"""
def pqts(url,name):
chromedriver = 'chromedriver.exe'
chome_options = webdriver.ChromeOptions()
wd = webdriver.Chrome(chromedriver, chrome_options=chome_options)
wd.delete_all_cookies()
wd.maximize_window()
wd.implicitly_wait(30)
wd.get("[url=http://www.sklib.cn/]http://www.sklib.cn/[/url]")
print("访问页面!")
#自动输入账号密码
print("登陆成功!")
time.sleep(10)
cookies = wd.get_cookies()
print (type(cookies))
f1 = open('cookie.txt', 'w')
f1.write(json.dumps(cookies))
f1.close
f1 = open('cookie.txt')
cookie = f1.read()
cookie =json.loads(cookie)
for c in cookie:
wd.add_cookie(c)
time.sleep(15)
print("开始寻找!")
print(name)
print(url)
wd.get(url)
print("访问成功!")
element = wd.find_element_by_class_name("原版阅读")
element.click()
print("打开成功!")
time.sleep(10)
    try:
        # Site-specific: replace with the real id of the "next page" button
        element = wd.find_element(By.ID, "下一页")
        dir_name = name            # raw screenshots
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        new_name = dir_name + "1"  # cropped pages
        if not os.path.exists(new_name):
            os.mkdir(new_name)
        for i in range(1, 2000):
            time.sleep(5)
            print("Saving page " + str(i))
            wd.save_screenshot(dir_name + '/' + str(i) + '.png')
            element.click()
            ran = Image.open(dir_name + '/' + str(i) + '.png')
            box = (LEFT, TOP, RIGHT, BOTTOM)  # fill in your crop rectangle in pixels
            if i >= 100:  # the original tested i > 100, which silently skipped page 100
                # On the last page clicking "next" changes nothing, so two
                # consecutive screenshots are identical; an RMS histogram
                # difference of 0 means the book is finished.
                image1 = Image.open(dir_name + '/' + str(i - 1) + '.png')
                image2 = Image.open(dir_name + '/' + str(i) + '.png')
                h1 = image1.histogram()
                h2 = image2.histogram()
                result = math.sqrt(reduce(operator.add, list(map(lambda a, b: (a - b) ** 2, h1, h2))) / len(h1))
                if result == 0.0:
                    break
            # zfill(3) replaces the original's three manual zero-padding branches
            ran.crop(box).save(new_name + '/' + str(i).zfill(3) + '.png')
        wd.quit()  # quit() closes every window; a separate close() is redundant
    except Exception as e:
        print(url)
        print(e)
        wd.quit()
        with open("failed_urls.txt", "a") as sb:
            sb.write(url + "\n")
if __name__ == '__main__':
    url = ""   # reader URL of the book
    name = ""  # book name; also used as the screenshot directory
    pqts(url, name)
    new_name = name + "1"
    pic2pdf(new_name, name)  # the original passed "/" + new_name, an absolute path that does not exist
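Two quick notes. First, the round-trip through cookie.txt inside pqts looks redundant, but it means a later run can reattach the saved session and skip the login entirely. A minimal sketch of that reuse, assuming cookie.txt was written by an earlier run (the helper name is mine):

import json
from selenium import webdriver

def load_saved_cookies(wd, path="cookie.txt"):
    # Cookies can only be added for the domain that is currently open
    wd.get("http://www.sklib.cn/")
    with open(path) as f:
        for c in json.loads(f.read()):
            wd.add_cookie(c)
    wd.refresh()  # reload so the site picks up the restored session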
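Second, the stopping condition: on the last page the "next" button does nothing, so two consecutive screenshots come out pixel-identical and the RMS difference of their colour histograms is exactly 0.0. Isolated, the check looks like this (the file paths are placeholders):

import math
import operator
from functools import reduce
from PIL import Image

def pages_identical(path_a, path_b):
    # RMS difference of the two colour histograms; 0.0 means they match exactly
    h1 = Image.open(path_a).histogram()
    h2 = Image.open(path_b).histogram()
    rms = math.sqrt(reduce(operator.add, ((a - b) ** 2 for a, b in zip(h1, h2))) / len(h1))
    return rms == 0.0

Strictly speaking, equal histograms do not guarantee identical pixels, but for consecutive screenshots of the same reader it is a reliable end-of-book signal.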