# Baidu image crawler GUI script (forum-paste header converted to a comment so the file parses).
#coding=utf-8
import tkinter
from tkinter.filedialog import askdirectory
from tkinter import *
import requests
import re
from typing import List, Tuple
from urllib.parse import quote
from tkinter import messagebox
def select_out_Path():
    """Prompt for an output directory and store the choice in the global ``out_path`` StringVar."""
    chosen_dir = askdirectory()
    out_path.set(chosen_dir)
    print("--------------存取文件夹设置完成----------")
def download(pic_urls: List[str], num: int, out_path, na: str) -> None:
    """Download up to ``num`` images from the given URL list.

    Args:
        pic_urls: image URLs to fetch.
        num: maximum number of images to download.
        out_path: output directory — either a plain ``str`` path or a Tk
            ``StringVar`` (the GUI caller passes the global StringVar; the
            original annotation claimed ``str`` but ``.get()`` was called on it).
        na: filename prefix (tag) prepended to each saved image.
    """
    print("--------------下载模块启动----------")
    pic_urls = pic_urls[:num]
    # Accept both a StringVar (GUI caller) and a plain string path.
    out_dir = out_path.get() if hasattr(out_path, "get") else out_path
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            # Fix: without this, HTTP error pages (404 HTML, etc.) were
            # silently saved as .png files. HTTPError subclasses IOError,
            # so the existing handler below catches it.
            pic.raise_for_status()
            image_output_path = out_dir + '/' + na + str(i + 1) + '.png'
            with open(image_output_path, 'wb') as f:
                f.write(pic.content)
            print('成功下载第%s张图片,目标网址为: %s' % (str(i + 1), str(pic_url)))
        except IOError as e:
            # requests exceptions derive from IOError/OSError, so network
            # failures and HTTP errors both land here; skip and continue.
            print('下载第%s张图片时失败,对应地址为: %s' % (str(i + 1), str(pic_url)))
            print('系统报错信息如下:')
            print(e)
            continue
def get_page_urls(page_url: str, headers: dict) -> Tuple[List[str], str]:
    """Fetch one Baidu result page and extract image URLs plus the next-page link.

    Args:
        page_url: URL of the current result page ('' or None short-circuits).
        headers: HTTP request headers (User-Agent etc.).

    Returns:
        A tuple of (list of image URLs on this page, absolute URL of the
        next page, or '' when there is no next page / on any failure).
    """
    print("--------------获取链接模块运行中----------")
    if not page_url:
        return [], ''
    try:
        # Fix: added timeout=15 (matching download()) — the original call
        # could block the Tk event loop forever on a stalled connection.
        resp = requests.get(page_url, headers=headers, timeout=15)
        resp.encoding = 'utf-8'
        html = resp.text
    except IOError as e:
        # Network failure: report and return the "no more pages" sentinel.
        print(e)
        return [], ''
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # Original wrapped this pattern in a redundant re.compile(...) with flags=0.
    next_page_matches = re.findall(r'<a href="(.*)" class="n">下一页</a>', html)
    next_page_url = 'http://image.baidu.com' + next_page_matches[0] if next_page_matches else ''
    return pic_urls, next_page_url
def Spider():
    """Callback for the "开始爬取" button: crawl Baidu image search and download results.

    Reads the module-level StringVars (keyword, number, name, out_path),
    walks result pages via get_page_urls() until enough links are gathered
    or the last page is reached, then hands the links to download() and
    notifies the user with a message box.
    """
    print("--------------爬虫模块启动----------")
    key = keyword.get()
    nu = number.get()
    try:
        num = int(nu)
    except ValueError:
        # Robustness fix: a non-numeric count used to raise an uncaught
        # ValueError inside the button callback with no user feedback.
        messagebox.showerror(title='输入错误', message='下载数量必须是整数')
        return
    na = name.get()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows 10 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
    }
    url = 'https://image.baidu.com/search/flip?tn=baiduimage&word='
    url_init = url + quote(key, safe='/')
    all_pic_urls = []
    print("--------------获取链接模块启动----------")
    page_urls, next_page_url = get_page_urls(url_init, headers)
    all_pic_urls.extend(page_urls)
    page = 0
    while 1:
        page_urls, next_page_url = get_page_urls(next_page_url, headers)
        page += 1
        print('正在获取第%s个页面的所有图片链接' % str(page))
        if next_page_url == '' and page_urls == []:
            # Both sentinels empty: get_page_urls hit the end (or failed).
            print('已到最后一页,共计爬取%s个页面' % page)
            break
        all_pic_urls.extend(page_urls)
        if len(all_pic_urls) >= num:
            print('采集的图片链接已达到最大下载数量:%s' % num)
            break
    # set() deduplicates links gathered across pages before downloading.
    download(list(set(all_pic_urls)), num, out_path, na)
    out = out_path.get()
    print("--------------任务执行完毕,即将弹窗提示用户----------")
    messagebox.showinfo(title='爬取完成', message='请打开 %s 目录查看爬虫获取的图片' % out)
print("--------------日志开始记录----------")

# ---- Build the Tkinter UI ----
ui = Tk()
out_path = StringVar()
keyword = StringVar()
number = StringVar()
name = StringVar()
ui.title("图片爬虫 By Error log 成都理工大学毕业设计")

# Row 0: output folder chooser (label + entry + browse button).
Label(ui, text="选择存放文件夹:").grid(row=0, column=0)
Entry(ui, textvariable=out_path).grid(row=0, column=1)
Button(ui, text="选择文件夹", command=select_out_Path).grid(row=0, column=2)

# Rows 1-3: plain labelled entries for keyword, download count and filename tag.
for row_index, (caption, var) in enumerate(
        [("搜索关键词:", keyword), ("下载数量:", number), ("图片命名标签:", name)],
        start=1):
    Label(ui, text=caption).grid(row=row_index, column=0)
    Entry(ui, textvariable=var).grid(row=row_index, column=1)

# Row 4: the button that kicks off the crawl.
Button(ui, text="开始爬取", command=Spider).grid(row=4, column=1)

# Rows 5-6: credits.
Label(ui, text="版权所有:Error log V 0.9版本用于成都理工大学毕业设计").grid(row=5, column=1)
Label(ui, text="本程序基于MIT协议开源").grid(row=6, column=1)

print("--------------UI构建完成----------")
ui.mainloop()