本帖最后由 wushaominkk 于 2019-3-11 16:33 编辑
成果下载链接:
https://pan.baidu.com/s/1acbueRlh5SFhh7OBlJL2qw 提取码: qedu
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = 'jiangwenwen'
import os
import time
from urllib.parse import quote, urljoin

import pdfkit
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# Generator of random browser User-Agent strings.
ua = UserAgent()

# Host name of the blog being scraped; reused for the Host and Referer headers.
_BLOG_HOST = "www.yinwang.org"

# Default request headers. The "User-Agent" entry starts out as one random
# value drawn from `ua`.
headers = {
    "Host": _BLOG_HOST,
    "User-Agent": ua.random,
    "Referer": "http://{0}/".format(_BLOG_HOST),
}
# Proxy server address. Each value may be overridden through the named
# environment variable; the literal is only the fallback.
proxyHost = os.environ.get("ABUYUN_PROXY_HOST", "http-dyn.abuyun.com")
proxyPort = os.environ.get("ABUYUN_PROXY_PORT", "9020")

# Proxy tunnel authentication.
# NOTE(review): the fallback credentials below are committed in plain text.
# Prefer supplying ABUYUN_PROXY_USER / ABUYUN_PROXY_PASS from the environment
# and rotating the values that were published with this script.
proxyUser = os.environ.get("ABUYUN_PROXY_USER", "HJEG872M8LONIE4Dee")
proxyPass = os.environ.get("ABUYUN_PROXY_PASS", "26C89049A9EE5BA9")

# Proxy URL in the form http://user:pass@host:port. The credentials are
# percent-encoded so that characters such as "@", ":" or "/" inside them
# cannot be mistaken for URL delimiters.
proxyMeta = "http://{user}:{password}@{host}:{port}".format(
    user=quote(proxyUser, safe=""),
    password=quote(proxyPass, safe=""),
    host=proxyHost,
    port=proxyPort,
)

# Mapping passed to requests: the same proxy is used for both schemes.
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}
# Upper bound, in seconds, for every HTTP request so a stalled proxy or
# server cannot block the script forever.
REQUEST_TIMEOUT = 30

# Root URL of the blog; article links are resolved against it.
BASE_URL = "http://www.yinwang.org/"

# Folder the PDFs are written to. Backslashes are escaped explicitly so the
# value does not depend on unrecognized escape sequences such as "\P".
OUTPUT_DIR = "D:\\Python\\资料\\王垠的博客\\"

# Characters that Windows does not allow in file names, each mapped to "_".
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def _safe_file_name(title):
    """Return *title* with characters Windows rejects in file names replaced by "_"."""
    return title.translate(_UNSAFE_FILENAME_CHARS)


# Download the index page and collect the list items that carry article links.
response = requests.get(
    BASE_URL, headers=headers, proxies=proxies, timeout=REQUEST_TIMEOUT
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
tags = soup.find_all("li", class_="list-group-item title")

# Render every linked article to its own PDF, timing each one.
for child in tags:
    link = child.a
    # Skip list items that carry no usable link instead of crashing on them.
    if link is None or not link.get('href'):
        continue
    start = time.time()
    # urljoin copes with root-relative, relative and absolute hrefs alike.
    url = urljoin(BASE_URL, link.get('href'))
    # get_text() still yields the title when the anchor contains nested tags,
    # where .string would be None.
    file_name = OUTPUT_DIR + _safe_file_name(link.get_text(strip=True)) + ".pdf"
    print("文件打印中...")
    # Use a fresh random User-Agent for each article request.
    headers["User-Agent"] = ua.random
    print("User-Agent是:{0}".format(headers["User-Agent"]))
    try:
        page = requests.get(
            url, headers=headers, timeout=REQUEST_TIMEOUT, proxies=proxies
        )
        # Do not render an HTTP error page into a PDF.
        page.raise_for_status()
    except requests.RequestException as exc:
        # One failed article should not abort the remaining downloads.
        print("下载失败,已跳过:{0} ({1})".format(url, exc))
        continue
    pdfkit.from_string(page.text, file_name)
    end = time.time()
    print("打印成功,本次打印耗时:%0.2f秒" % (end - start))
|