Planning a machine learning program for remote sensing image recognition, asking for help with a few problems
I'm planning to build a machine learning system based on TensorFlow and the YOLOv3 network to recognize objects such as bridges, airports, and harbors in remote sensing imagery. This is my graduation project. Most of my code will probably be adapted from open-source repositories (no way around it, I'm still a beginner). Once the code passes testing, I'll gradually post the source along with my notes and lessons learned to this thread; feedback from the experts here is very welcome. After the project passes the graduation defense, I'll post the complete thesis and source code.
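For context on what the detector will actually be told to look for: YOLO-family networks train against a fixed class list, one integer id per class. A minimal sketch of the three target classes written out as a Darknet-style .names file (the rs_objects.names file name is my own placeholder, nothing here is final code):

# Sketch only: the object classes I plan to annotate in the remote sensing images.
# Darknet/YOLOv3 convention: one class name per line; the line number is the class id.
CLASSES = ['bridge', 'airport', 'harbor']

with open('rs_objects.names', 'w', encoding='utf-8') as f:  # hypothetical file name
    f.write('\n'.join(CLASSES) + '\n')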
Right now I'm stuck on one problem. According to the plan, I need to download a large number of remote sensing images, annotate them, and feed them to the network as training samples. I've started learning crawler design and, following some open-source code, wrote a rather humble novel downloader; I now plan to build a remote-sensing-image crawler on top of it, which means I need websites that provide remote sensing imagery for free. I've searched Baidu half to death and still haven't found anything suitable, hence this thread: could anyone recommend a few websites or APIs where remote sensing images can be downloaded for free?
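One related note on the annotation step: whichever labeling tool I end up using, each box eventually has to be written in the format the detector reads. A minimal sketch of converting a pixel-coordinate box into a YOLOv3/Darknet label line (class id plus normalized center x, center y, width, height); the function name and example numbers are my own:

def to_yolo_line(class_id, x_min, y_min, x_max, y_max, img_w, img_h):
    """Convert a pixel-coordinate box into one Darknet label line:
    '<class_id> <x_center> <y_center> <width> <height>', all normalized to [0, 1]."""
    x_center = (x_min + x_max) / 2.0 / img_w
    y_center = (y_min + y_max) / 2.0 / img_h
    width = (x_max - x_min) / float(img_w)
    height = (y_max - y_min) / float(img_h)
    return "%d %.6f %.6f %.6f %.6f" % (class_id, x_center, y_center, width, height)

# Example: a bridge (class 0) boxed at (120, 80)-(360, 200) in a 1024x768 image
print(to_yolo_line(0, 120, 80, 360, 200, 1024, 768))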
Below is the novel-scraping script I wrote; posting it here for review:
# -*- coding: utf-8 -*-
import requests
from lxml import etree
def get_data():
    url = 'https://read.qidian.com/hankread/1021808365/85814489'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Referer': 'https://read.qidian.com/chapter/ozgK2uJ6kuk1/_Ba1my_qLv_M5j8_3RRvhw2',
        'Cookie': '_csrfToken=5kGAdBD0xnc0kyIJEEumsbvuIFrLk1R6ENJyo1Xj; newstatisticUUID=1595415838_246417717; qdrs=0%7C3%7C0%7C0%7C1; showSectionCommentGuide=1; qdgd=1; e1=%7B%22pid%22%3A%22qd_P_Searchresult%22%2C%22eid%22%3A%22qd_S05%22%2C%22l1%22%3A3%7D; e2=%7B%22pid%22%3A%22qd_P_xuanhuan%22%2C%22eid%22%3A%22qd_F42%22%2C%22l3%22%3A2%2C%22l2%22%3A1%2C%22l1%22%3A11%7D; rcr=3144877%2C1016383202; bc=1016383202%2C3144877; pageOps=1; lrbc=3144877%7C485656806%7C0%2C1016383202%7C551256291%7C0; floatOp=12'
    }
    r = requests.get(url, headers=headers).text
    html = etree.HTML(r)
    chapter = html.xpath('//div[@class="main-text-wrap"]')
    for item in chapter:
        # xpath() returns a list of strings; join them so the filename is a plain string
        title = ''.join(item.xpath('.//span[@class="content-wrap"]/text()'))
        content = item.xpath('.//div[@class="read-content j_readContent"]/p/text()')
        with open('./venv/%s.txt' % title, 'w', encoding='utf-8') as f:
            f.write(''.join(content))
if __name__ == "__main__":
    get_data()

Nice, learning from this.

http://www.gscloud.cn/ has all kinds of geographic data.

川黔 posted on 2021-3-25 18:51:
http://www.gscloud.cn/ has all kinds of geographic data.

Thanks a lot!

Thread update: here is the source code of my image collection script.
#coding=utf-8
import tkinter
from tkinter.filedialog import askdirectory
from tkinter import *
import requests
import re
from typing import List, Tuple
from urllib.parse import quote
from tkinter import messagebox
def select_out_Path():
    path_out = askdirectory()
    out_path.set(path_out)
    print("-------------- output folder set ----------")
def download(pic_urls: List[str], num: int, out_dir: str, na: str) -> None:
    """Download every image in a list of image URLs.

    Args:
        pic_urls: list of image URLs
        num: maximum number of downloads
        out_dir: output directory
        na: label used to name the images
    """
    print("-------------- download module started ----------")
    pic_urls = pic_urls[:num]
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            image_output_path = out_dir + '/' + na + str(i + 1) + '.png'
            with open(image_output_path, 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s from: %s' % (str(i + 1), str(pic_url)))
        except IOError as e:
            print('Failed to download image %s from: %s' % (str(i + 1), str(pic_url)))
            print('Error message:')
            print(e)
            continue
def get_page_urls(page_url: str, headers: dict) -> Tuple[List[str], str]:
    """Collect all image links on the current result page.

    Args:
        page_url: URL of the current result page
        headers: request headers

    Returns:
        all image links on this page, and the link to the next result page
    """
    print("-------------- link collector running ----------")
    if not page_url:
        return [], ''
    try:
        html = requests.get(page_url, headers=headers)
        html.encoding = 'utf-8'
        html = html.text
    except IOError as e:
        print(e)
        return [], ''
    # The flip-style result page embeds the original image URL in an "objURL" JSON field
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # The anchor text 下一页 is Baidu's "next page" link
    next_page_url = re.findall(r'<a href="(.*)" class="n">下一页</a>', html)
    # re.findall returns a list; take the first match (if any) and make it absolute
    next_page_url = 'http://image.baidu.com' + next_page_url[0] if next_page_url else ''
    return pic_urls, next_page_url
def Spider():
    print("-------------- spider module started ----------")
    key = keyword.get()
    nu = number.get()
    num = int(nu)
    na = name.get()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows 10 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
    }
    # Baidu image search, flip-style result pages
    url = 'https://image.baidu.com/search/flip?tn=baiduimage&word='
    url_init = url + quote(key, safe='/')
    all_pic_urls = []
    print("-------------- link collector started ----------")
    page_urls, next_page_url = get_page_urls(url_init, headers)
    all_pic_urls.extend(page_urls)
    page = 0
    while 1:
        page_urls, next_page_url = get_page_urls(next_page_url, headers)
        page += 1
        print('Collecting image links from result page %s' % str(page))
        if next_page_url == '' and page_urls == []:
            print('Reached the last page; %s pages crawled in total' % page)
            break
        all_pic_urls.extend(page_urls)
        if len(all_pic_urls) >= num:
            print('Collected links have reached the download limit: %s' % num)
            break
    # download() expects a plain directory string, so unwrap the StringVar here
    download(list(set(all_pic_urls)), num, out_path.get(), na)
    out = out_path.get()
    print("-------------- job done, showing completion dialog ----------")
    messagebox.showinfo(title='Crawl finished', message='Open the %s directory to view the downloaded images' % out)
print("--------------日志开始记录----------")
ui = Tk()
out_path = StringVar()
keyword = StringVar()
number = StringVar()
name = StringVar()
ui.title("图片爬虫 By Error log 成都理工大学毕业设计")
Label(ui,text = "选择存放文件夹:").grid(row = 0, column = 0)
Entry(ui, textvariable = out_path).grid(row = 0, column = 1)
Button(ui, text = "选择文件夹", command = select_out_Path).grid(row = 0, column = 2)
Label(ui,text = "搜索关键词:").grid(row = 1, column = 0)
Entry(ui, textvariable = keyword).grid(row = 1, column = 1)
Label(ui,text = "下载数量:").grid(row = 2, column = 0)
Entry(ui, textvariable = number).grid(row = 2, column = 1)
Label(ui,text = "图片命名标签:").grid(row = 3, column = 0)
Entry(ui, textvariable = name).grid(row = 3, column = 1)
Button(ui, text = "开始爬取", command = Spider).grid(row = 4, column = 1)
Label(ui,text = "版权所有:Error log V 0.9版本用于成都理工大学毕业设计").grid(row = 5, column = 1)
Label(ui,text = "本程序基于MIT协议开源").grid(row = 6, column = 1)
print("--------------UI构建完成----------")
ui.mainloop()
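One follow-up note on the crawler above: image search results often include truncated downloads or error pages served instead of images, and a single corrupt file can break the later training step. A small sketch (my own addition, assuming Pillow is installed) for weeding out broken files before labeling:

import os
from PIL import Image  # pip install Pillow

def remove_broken_images(folder):
    """Try to open and verify every file in the folder; delete anything Pillow rejects."""
    for fname in os.listdir(folder):
        fpath = os.path.join(folder, fname)
        try:
            with Image.open(fpath) as img:
                img.verify()  # raises if the file is truncated or not an image
        except Exception as e:
            print('Removing broken file %s (%s)' % (fpath, e))
            os.remove(fpath)

# Example: clean the crawler's output directory before annotating
# remove_broken_images('D:/rs_images')  # hypothetical path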