Posted by 一只码农 on 2020-4-22 16:00
Scraping target: www.netbian.com
Required libraries: requests
bs4 (BeautifulSoup)
random
Runtime environment: Win10 64-bit
Python 3.8.0
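The post doesn't pin versions; assuming a plain Python 3.8 setup, the third-party pieces can be pulled in with pip install requests beautifulsoup4 lxml (lxml is the parser the script hands to BeautifulSoup). UserAgent is not a PyPI package but the helper module shipped in the attachment at the end of the post.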
How to use:
Run the script, then enter the category you want to scrape; here I'll use 王者荣耀 (Honor of Kings) wallpapers as an example:
To stop, just press Ctrl+C.
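For orientation, a run looks roughly like this (the prompts come straight from the ui() function in the code below; the category names printed after 全部 are scraped live from the site's nav bar, so the actual list will differ):

--------------netbian-------------
全部 王者荣耀 ...
请输入分类名:王者荣耀
---------------------------
--------------王者荣耀-------------
---------------------------
--------------王者荣耀: 1-------------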
Because of time constraints I didn't scrape everything, so here's just a partial result.
Preview screenshot:
Scraped images:
Code:

import requests
from bs4 import BeautifulSoup
import os
import time
import random
import UserAgent

index = 'http://www.netbian.com' # site root URL
interval = 0.1 # delay between image downloads
firstDir = 'E:/各路语言开发项目/测试' # root save path; folders aren't created automatically beyond this, so edit it to suit your machine
classificationDict = {} # info about the site's category sub-pages
# fetch a page and return the elements matched by the CSS selector
def screen(url, select):
    headers = UserAgent.get_headers() # grab a random set of headers
    html = requests.get(url = url, headers = headers)
    html.encoding = 'gbk' # the site's encoding
    html = html.text
    soup = BeautifulSoup(html, 'lxml')
    return soup.select(select)

# get the page count of a category
def screenPage(url, select):
    html = requests.get(url = url, headers = UserAgent.get_headers())
    html.encoding = 'gbk'
    html = html.text
    soup = BeautifulSoup(html, 'lxml')
    return soup.select(select)[0].next_sibling.text

# download a single image
def download(src, name, path):
    if(isinstance(src, str)):
        response = requests.get(src)
        path = path + '/' + name + '.jpg'
        while(os.path.exists(path)): # if the file name already exists, append a random digit
            path = path.split(".")[0] + str(random.randint(2, 17)) + '.' + path.split(".")[1]
        with open(path, 'wb') as pic:
            for chunk in response.iter_content(128):
                pic.write(chunk)
# drill down to the 1920x1080 version of each image
def handleImgs(links, path):
    for link in links:
        href = link.get('href')
        if(href == 'http://pic.netbian.com/'): # filter out image ads
            continue
        # first hop
        if('http://' in href): # a very few images don't provide a proper relative URL
            url = href
        else:
            url = index + href
        select = 'div#main div.endpage div.pic div.pic-down a'
        link = screen(url, select)
        if(link == []):
            print(url + ' 无此图片,爬取失败')
            continue
        href = link[0].get('href')
        # second hop
        url = index + href
        # now on the actual image page
        select = 'div#main table a img'
        link = screen(url, select)
        if(link == []):
            print(url + " 该图片需要登录才能爬取,爬取失败")
            continue
        name = link[0].get('alt').replace('\t', '').replace('|', '').replace(':', '').replace('\\', '').replace('/', '').replace('*', '').replace('?', '').replace('"', '').replace('<', '').replace('>', '')
        print(name) # print the file name of the image being downloaded
        src = link[0].get('src')
        if(requests.get(src).status_code == 404):
            print(url + ' 该图片下载链接404,爬取失败')
            print()
            continue
        print()
        download(src, name, path)
        time.sleep(interval)

# download the chosen category sub-page
def select_classification(choice):
    print('---------------------------')
    print('--------------' + choice + '-------------')
    print('---------------------------')
    secondUrl = classificationDict[choice]['url']
    secondDir = classificationDict[choice]['path']
    if(not os.path.exists(secondDir)):
        os.mkdir(secondDir) # create the category directory
    select = '#main > div.page > span.slh'
    pageIndex = screenPage(secondUrl, select)
    lastPagenum = int(pageIndex) # number of the last page
    for i in range(0, lastPagenum):
        if i == 0:
            url = secondUrl
        else:
            url = secondUrl + 'index_%d.htm' %(i+1)
        print('--------------' + choice + ': ' + str(i+1) + '-------------')
        path = secondDir + '/' + str(i+1)
        if(not os.path.exists(path)):
            os.mkdir(path) # create the per-page directory under the category
        select = 'div#main div.list ul li a'
        links = screen(url, select)
        handleImgs(links, path)
# text UI: the user picks which category to download
def ui():
    print('--------------netbian-------------')
    print('全部', end=' ')
    for c in classificationDict.keys():
        print(c, end=' ')
    print()
    choice = input('请输入分类名:')
    if(choice == '全部'):
        for c in classificationDict.keys():
            select_classification(c)
    elif(choice not in classificationDict.keys()):
        print("输入错误,请重新输入!")
        print('----')
        ui()
    else:
        select_classification(choice)

# store the category sub-page info in a dict
def init_classification():
    url = index
    select = '#header > div.head > ul > li:nth-child(1) > div > a'
    classifications = screen(url, select)
    for c in classifications:
        href = c.get('href') # this is a relative URL
        text = c.string # category name
        if(text == '4k壁纸'): # the 4k wallpaper category can't be scraped due to permission restrictions, skip it
            continue
        secondDir = firstDir + '/' + text # category directory
        url = index + href # category sub-page URL
        global classificationDict
        classificationDict[text] = {
            'path': secondDir,
            'url': url
        }

def main():
    if(not os.path.exists(firstDir)):
        os.mkdir(firstDir) # create the root directory
    init_classification()
    ui()

if __name__ == '__main__':
    main()
This is the runnable script, but it needs one more file alongside it or it will throw an error. I've put that file and the source code in the attachment, please download it yourself.
Attachment: 彼岸桌面爬虫.zip (3.69 KB, 939 downloads)
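The attachment itself isn't reproduced in the post, but from the way the script calls it, UserAgent.py only needs to expose a get_headers() function that returns a requests-style headers dict with a randomly chosen User-Agent. A minimal stand-in, with a purely illustrative UA pool, could look like this:

# UserAgent.py -- minimal stand-in for the helper module in the attachment
import random

# a short, illustrative pool of desktop User-Agent strings;
# the real attachment presumably ships a much longer list
_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
]

def get_headers():
    # return a headers dict with a randomly picked User-Agent,
    # matching the UserAgent.get_headers() call in the scraper
    return {'User-Agent': random.choice(_USER_AGENTS)}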