A Python crawler that downloads an image site's content
An original little crawler of mine.
I stumbled on this site and noticed it hosts a huge number of images, many of them genuinely worth a look, so I spent some spare time analyzing its source, routes, and structure, then wrote this crawler for it in PyCharm.
Environment: Python 3
Required libraries: requests and lxml (threading ships with the standard library, so nothing extra to install for it)
Multithreaded download of every image on the site!!!
The site URL is in the code below.
Tested and runs fine on CentOS 7, Debian 9/10, and Windows 7/10.
Remember to install the required libraries before running, e.g. pip install requests lxml.
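If requests and lxml are unfamiliar, the whole crawler boils down to one fetch-then-XPath pattern. A minimal, self-contained sketch of that pattern (the URL and XPath here are placeholders, not the target site's real structure):

import requests
from lxml import etree

# Fetch a page, decode it, and parse the HTML into an element tree.
resp = requests.get("https://example.com", timeout=5)
resp.encoding = "utf-8"
tree = etree.HTML(resp.text)

# XPath queries then pull out whatever the page holds, e.g. every link.
print(tree.xpath('//a/@href'))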
Code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Code by 此称江初
import os
import threading
import requests
from lxml import etree
Url = "https://www.moestack.com/all"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"}

def Mkdir(path):
    # Create the save directory if it does not already exist.
    path = path.strip().rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        return True

def Get_Page(x):
    # Fetch a page and parse it into an lxml element tree.
    Page = requests.get(x, headers=headers)
    Page.encoding = "utf-8"
    return etree.HTML(Page.text)

def end(ImgUrl):
    # Download one image; the URL is passed as an argument rather than read
    # from a shared global, so the worker threads cannot race on it.
    save_img = requests.get(ImgUrl, headers=headers)
    with open(r"Moe/" + Title + "/" + ImgUrl[-27:] + ".jpg", "wb") as fh:
        fh.write(save_img.content)
        fh.flush()

def DownImg():
    path = "Moe/" + Title + "/"
    Mkdir(path)
    threads = []
    # One worker thread per image.
    for url in GetImgUrl:
        t = threading.Thread(target=end, args=(url,), daemon=True)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    print("Download finished")

def OnePageDown(x):
    global Title, GetImgUrl
    Page = Get_Page(x)  # fetch once and reuse for both XPath queries
    GetImgUrl = Page.xpath('//*/div/div/div/p/img/@src')
    # xpath() returns a list; the first hit is the post title.
    Title = Page.xpath('//*[@class="entry-title"]/text()')[0]
    print("Title: " + Title)
    print("This post has %d images" % len(GetImgUrl))
    DownImg()

def PageDown(x):
    ImgPageUrl = Get_Page(x).xpath('//*[@class="entry-media"]/div/a/@href')
    for i in ImgPageUrl:
        OnePageDown(i)

def AllDown(x):
    # The pagination bar; the last numeric link text is the total page count.
    PageNum = Get_Page(x).xpath('/html/body/div/div/div/div/div/div/main/div/ul/li/a/text()')
    PageNum = int([n for n in PageNum if n.strip().isdigit()][-1])
    print("The whole site has %d pages" % PageNum)
    for i in range(1, PageNum + 1):
        if i == 1:
            PageUrl = "https://www.moestack.com/all"  # page 1 has no /page/ suffix
        else:
            PageUrl = "https://www.moestack.com/all/page/" + str(i)
        PageDown(PageUrl)

def main():
    print("Menu:\n1. Single-post download\n2. Page download\n3. Whole-site download (Boom!!!)")
    Choice = input("Your choice: ")
    if Choice == '1':
        ImgPageUrl = input("Enter the post link: ")
        OnePageDown(ImgPageUrl)
    elif Choice == '2':
        PageUrl = input("Enter the page link: ")
        PageDown(PageUrl)
    elif Choice == '3':
        AllDown(Url)

if __name__ == "__main__":
    main()
# Code by 此称江初
Download: https://52pojie.shop
I'm a Python novice; veterans, please go easy on me.
Please credit this post if you repost it.

When downloading the whole site, the script kept hanging or throwing errors, and I'm not sure whether my machine was to blame. So I made a quick improvement and added a timeout-with-retry and a cached-download check.
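As an aside, part of the hanging may simply be that the script starts one thread per image with no upper bound. A bounded worker pool from the standard library's concurrent.futures would cap that; here is a rough sketch under that assumption, where max_workers=8 is an arbitrary choice and download_one is a hypothetical stand-in for the script's end():

from concurrent.futures import ThreadPoolExecutor
import requests

def download_one(url, path):
    # Hypothetical stand-in for end(): fetch one image and save it.
    r = requests.get(url, timeout=5)
    with open(path, "wb") as fh:
        fh.write(r.content)

def download_all(urls, folder):
    # At most 8 downloads in flight at once, instead of one thread per image;
    # the with-block waits for every submitted job before returning.
    with ThreadPoolExecutor(max_workers=8) as pool:
        for url in urls:
            pool.submit(download_one, url, folder + "/" + url[-27:] + ".jpg")

The revised script itself, with the retry and cache check: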
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Code by 此称江初
import os
import threading
import requests
from lxml import etree
Url = "https://www.moestack.com/all"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"}

def Mkdir(path):
    # Create the save directory if it does not already exist.
    path = path.strip().rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        return True

def Get_Page(x):
    Page = gethtml(x, headers=headers)
    Page.encoding = "utf-8"
    return etree.HTML(Page.text)

def gethtml(url, headers):
    # Retry a request up to three times on timeout or connection errors.
    for attempt in range(3):
        try:
            return requests.get(url, headers=headers, timeout=5)
        except requests.exceptions.RequestException:
            print("Timed out: " + url)
    raise requests.exceptions.RequestException("Gave up after 3 tries: " + url)

def end(ImgUrl):
    # The cache check: skip images that are already on disk.
    ImgPath = r"Moe/" + Title + "/" + ImgUrl[-27:] + ".jpg"
    if not os.path.exists(ImgPath):
        save_img = gethtml(ImgUrl, headers=headers)
        with open(ImgPath, "wb") as fh:
            fh.write(save_img.content)
            fh.flush()

def GetFiles(path):
    # Count the files (not subdirectories) already saved under path.
    filenum = 0
    for name in os.listdir(path):
        if os.path.isfile(os.path.join(path, name)):
            filenum += 1
    return filenum

def DownImg():
    path = "Moe/" + Title + "/"
    # If the folder already holds as many files as the post has images,
    # the post is fully cached and can be skipped.
    if os.path.exists(path):
        if GetFiles(path) == len(GetImgUrl):
            print("Already cached: " + Title)
            return
    print("Downloading: " + Title)
    Mkdir(path)
    threads = []
    for url in GetImgUrl:
        t = threading.Thread(target=end, args=(url,), daemon=True)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    print("Download finished")

def OnePageDown(x):
    global Title, GetImgUrl
    Page = Get_Page(x)  # fetch once and reuse for both XPath queries
    GetImgUrl = Page.xpath('//*/div/div/div/p/img/@src')
    Title = Page.xpath('//*[@class="entry-title"]/text()')[0]
    print("Title: " + Title)
    print("This post has %d images" % len(GetImgUrl))
    DownImg()

def PageDown(x):
    ImgPageUrl = Get_Page(x).xpath('//*[@class="entry-media"]/div/a/@href')
    print("This page has %d posts" % len(ImgPageUrl))
    for i in ImgPageUrl:
        OnePageDown(i)

def AllDown(x):
    # The last numeric link text in the pagination bar is the total page count.
    PageNum = Get_Page(x).xpath('/html/body/div/div/div/div/div/div/main/div/ul/li/a/text()')
    PageNum = int([n for n in PageNum if n.strip().isdigit()][-1])
    print("The whole site has %d pages" % PageNum)
    for i in range(1, PageNum + 1):
        if i == 1:
            PageUrl = "https://www.moestack.com/all"  # page 1 has no /page/ suffix
        else:
            PageUrl = "https://www.moestack.com/all/page/" + str(i)
        PageDown(PageUrl)

def main():
    print("Menu:\n1. Single-post download\n2. Page download\n3. Whole-site download (Boom!!!)")
    Choice = input("Your choice: ")
    if Choice == '1':
        ImgPageUrl = input("Enter the post link: ")
        OnePageDown(ImgPageUrl)
    elif Choice == '2':
        PageUrl = input("Enter the page link: ")
        PageDown(PageUrl)
    elif Choice == '3':
        AllDown(Url)

if __name__ == "__main__":
    main()
# Code by 此称江初
There's still no error handling in it, though, so reliability remains a weak point.
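One way to patch that up: catch failures per post inside PageDown, so a single bad link no longer kills a whole-site run. A sketch against the script above; the broad except is deliberate, for illustration:

def PageDown(x):
    ImgPageUrl = Get_Page(x).xpath('//*[@class="entry-media"]/div/a/@href')
    print("This page has %d posts" % len(ImgPageUrl))
    for i in ImgPageUrl:
        try:
            OnePageDown(i)
        except Exception as e:
            # Log the failed post and keep crawling instead of crashing.
            print("Failed, skipping: " + i + " (" + str(e) + ")")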
xccxvb (2020-6-18 21:56): I just want line 8 of your code!

此称江初 (2020-6-18 21:58), replying to xccxvb: Take it. 紫薯布丁

Another reply: Thanks for sharing. Something to learn from.

xccxvb (2020-6-18 22:12), replying to 此称江初: Have you tried the whole-site download? Will it get your IP banned?

此称江初 (2020-6-18 22:15), replying to xccxvb: It won't, but have 50 GB of disk ready. I ran it on a server several times and completely filled the 15 GB disk.
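For anyone nervous about bans anyway, slowing the crawl down is cheap: a pause before each request inside the gethtml helper above would do it. A sketch, where the 0.5-second delay is an arbitrary choice:

import time
import requests

def gethtml(url, headers):
    time.sleep(0.5)  # politeness delay so the crawl does not hammer the server
    for attempt in range(3):
        try:
            return requests.get(url, headers=headers, timeout=5)
        except requests.exceptions.RequestException:
            print("Timed out: " + url)
    raise requests.exceptions.RequestException("Gave up after 3 tries: " + url)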
xccxvb, replying to 此称江初: I also figured the disk might run out, so I'm off to put it on an external drive. Here's hoping the site doesn't die, haha.

Another reply: I suspect you're up to ghs; the very first image I opened, well...

barnett2016 (2020-6-18 22:59): Do you have the source for that shop site? It looks like a very clean file index.

此称江初, replying to barnett2016: Powered by h5ai.