A small Python image-scraping script
The post I made a few days ago about proxy IPs was deleted today. I've only just started using this forum, so if anything here breaks the rules again, please ask the moderators to remove the post. This time the script scrapes girl-photo galleries; it's nothing really indecent, but to stay on the safe side the sample images are mosaicked. The code has been uploaded, download it and analyze it yourself.

You even wrapped the threads in a lock; just use a thread pool!!! (a thread-pool sketch follows the script below)
import os
import requests
from bs4 import BeautifulSoup
from threading import Lock, Thread

basePath = r"D:\Users\Quincy_C\PycharmProjects\S6\爬取图片"
threadLimit = 10
threadNum = 0
os.chdir(basePath)

# Build the list of gallery pages to crawl
urlPool = ["http://www.meitulu.com/item/{}.html".format(i) for i in range(3465, 5645)]
mutex = Lock()


def downloadImg(url):
    # Use the numeric item id from the URL as the directory name
    dirname = url.split("/")[-1].split(".")[0]
    print(dirname)
    if not os.path.isdir(dirname):
        os.mkdir(dirname)
    ordinal = 1
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36"}
    linkPool = []
    # Walk every page of the gallery and collect the image URLs
    while True:
        try:
            resp = requests.get(url, headers=headers).text
            soup = BeautifulSoup(resp, "html.parser")
            links = soup.select("body > div.content > center > img")
            for urlLink in links:
                linkPool.append(urlLink.get("src"))
            # The last "a1" anchor is the next-page link; when it points back at the
            # current page we have reached the last page
            nextPageUrl = soup.find_all("a", {"class": "a1"})[-1].get("href")
            if nextPageUrl == url:
                break
            else:
                url = nextPageUrl
        except Exception:
            print("Connection error, or BeautifulSoup went wrong, skipping:", url)
            break
    # Download every collected image into the gallery's directory
    for link in linkPool:
        try:
            content = requests.get(link, headers=headers)
            title = str(ordinal) + ".jpg"
            # Files are saved under the working directory
            with open(dirname + "/" + title, "wb") as file:
                print(dirname + "/" + title)
                file.write(content.content)
            ordinal += 1
        except Exception:
            print("Couldn't download!", link)
            break
    print("Gallery finished")


class MyThread(Thread):
    def __init__(self, url):
        self.url = url
        Thread.__init__(self)

    def run(self):
        downloadImg(self.url)
        # Decrement the running-thread counter under the lock when this download finishes
        mutex.acquire()
        global threadNum
        threadNum -= 1
        mutex.release()


# Busy-waits until a thread slot is free, then starts a new download thread
while urlPool != []:
    if threadNum < threadLimit:
        newUrl = urlPool.pop()
        threadNum += 1
        newThread = MyThread(newUrl)
        newThread.start()
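As the reply above suggests, the hand-rolled thread counter and lock can be replaced by a thread pool. A minimal sketch of the same crawl driven by concurrent.futures.ThreadPoolExecutor, reusing the downloadImg, urlPool and threadLimit defined in the script:

from concurrent.futures import ThreadPoolExecutor

# The pool caps concurrency at threadLimit, so no manual counter or lock is needed
with ThreadPoolExecutor(max_workers=threadLimit) as pool:
    pool.map(downloadImg, urlPool)

The with-block also waits for all downloads to finish before the script exits, which the original busy-wait loop does not guarantee.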
Go ahead, newbies, just swap in your own path and it will work.
Hi! What does basePath = "./111" mean?
It reports an error:
Traceback (most recent call last):
  File "D:/ProductPython/picture.py", line 11, in <module>
    os.chdir(basePath)
FileNotFoundError: The system cannot find the file specified: './111'
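The traceback means the relative directory ./111 does not exist when os.chdir is called. A minimal fix, assuming you want the script to create the download directory itself instead of requiring it to exist beforehand:

import os

basePath = "./111"                      # the relative path from the reply above
os.makedirs(basePath, exist_ok=True)    # create the directory (and parents) if it is missing
os.chdir(basePath)                      # chdir can no longer raise FileNotFoundError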
I'm new to Python and don't quite get it; could an expert explain?
What more can I say??
I don't know how to use it.
OP, could you share a detailed tutorial?
Funny thing: without the mosaic I wouldn't have thought much of it, but the mosaic makes it feel dirty.
Seems like without the mosaic it really would be quite dirty.
Sorry, this attachment cannot be read.
I'm learning Python scraping; this is a useful reference.
I thought it was links to the finished pictures, so I downloaded it for nothing.
Not bad, learned something; this requests library is really handy.
Direct connections to pic.yiipic.com stall for me; I added a few lines to go through a proxy and can just about download.
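That reply does not include its extra lines, but a minimal sketch of routing a download through a proxy with requests could look like this; the proxy address and image URL below are placeholders, not values from the thread:

import requests

headers = {"User-Agent": "Mozilla/5.0"}
# Placeholder proxy address; substitute your own working proxy
proxies = {
    "http": "http://127.0.0.1:1080",
    "https": "http://127.0.0.1:1080",
}
link = "http://pic.yiipic.com/example.jpg"  # hypothetical image URL for illustration
resp = requests.get(link, headers=headers, proxies=proxies, timeout=30)
with open("example.jpg", "wb") as f:
    f.write(resp.content)

In the script above, the same proxies= argument would be added to both requests.get calls inside downloadImg.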