For learning purposes only! For learning only!! For learning only!!!
I'm just a student and self-taught; I know I'm not as skilled as you all. If you think my code is bad, feel free to point out what's wrong.
I'm genuinely happy to get pointers, but please don't be the kind of person who has to step on me just to look impressive.
————————————————————————————
There is no thread-stop mechanism, because I didn't find a good way to stop. My first attempt was to check whether the queue was empty, but when there are only a few pages, ParsesThread never starts.
Since this is just for learning I didn't polish it further; improvements are welcome. A minimal sketch of one possible workaround follows below.
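One common fix for this is the sentinel ("poison pill") pattern: after all the real work items, the producer puts one sentinel per worker onto the queue, and each worker exits as soon as it pulls a sentinel, instead of blocking on get() forever. Here is a minimal, generic sketch of the idea (the worker just prints its items; it is not wired into the crawler code further down):
[Python]
import threading
from queue import Queue

SENTINEL = None  # one "poison pill" per worker signals it to exit


def worker(work_queue):
    while True:
        item = work_queue.get()
        if item is SENTINEL:
            break  # clean exit instead of blocking forever
        print("processing", item)


def main():
    work_queue = Queue()
    threads = [threading.Thread(target=worker, args=(work_queue,)) for _ in range(5)]
    for t in threads:
        t.start()
    for page in range(1, 5):    # the real work items
        work_queue.put(page)
    for _ in threads:           # one pill per worker
        work_queue.put(SENTINEL)
    for t in threads:
        t.join()                # now join() actually returns
    print("all workers stopped")


if __name__ == "__main__":
    main()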
——————————————————————————————
If this is useful to you, you can wrap it up into a standalone tool yourself; by changing the parameters you can crawl different kinds of images. I crawled landscape photos ("风景"); see the attached screenshots. A small sketch of parameterizing the search keyword is at the end of the post.
[Python]
import requests
import re
import threading
from queue import Queue


class CrawlThread(threading.Thread):
    # Request headers copied from a real browser session so pixabay serves the page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
        "cookie": "__cfduid=db12901348dc11f6695ac98e3bb7cf9251576048442; lang=zh; anonymous_user_id=7233a3d8-b6f5-4b4d-af09-c348290cae74; _ga=GA1.2.485160559.1576048444; _gid=GA1.2.1835149362.1576048444; is_human=1; dwf_attribution_template_ads=True; client_width=1903; _sp_id.aded=0cf1e1a4-c45f-4755-ae5d-240531a4a702.1576048444.2.1576051689.1576049566.32edf551-700c-4a10-b647-bde70f44234a; _sp_ses.aded=*",
        "referer": "https://pixabay.com/zh/photos/search/%E9%A3%8E%E6%99%AF/"
    }

    def __init__(self, crawl_queue, parses_queue):
        super(CrawlThread, self).__init__()
        self.crawl_queue = crawl_queue
        self.parses_queue = parses_queue

    def run(self):
        # No stop condition (see the note above): once crawl_queue is drained,
        # get() blocks forever
        while True:
            url = self.crawl_queue.get()
            response = requests.get(url, headers=self.headers).text
            # The 2x srcset entry is the image URL; the alt text becomes the filename
            img_urls = re.findall('1x, (.*?) 2x"', response)
            img_names = re.findall('jpg" alt="(.*?)"></a>', response)
            for x in range(len(img_urls)):
                self.parses_queue.put((img_names[x], img_urls[x]))


class ParsesThread(threading.Thread):
    def __init__(self, parses_queue):
        super(ParsesThread, self).__init__()
        self.parses_queue = parses_queue

    def run(self):
        while True:
            img_name, img_url = self.parses_queue.get()
            print(img_name, img_url)
            img = requests.get(img_url)
            # Assumes the e:/pixabay directory already exists
            with open("e:/pixabay/%s.jpg" % img_name, "wb") as file:
                file.write(img.content)


def main():
    url = "https://pixabay.com/zh/photos/search/%E9%A3%8E%E6%99%AF/?pagi={}"
    # A maxsize below 1 makes the queue unbounded
    crawl_queue = Queue(-1)
    parses_queue = Queue(-1)
    # Queue up result pages 1-4
    for i in range(1, 5):
        urls = url.format(i)
        crawl_queue.put(urls)
    # Five crawler threads and five downloader threads
    for i in range(5):
        crawl_thread = CrawlThread(crawl_queue, parses_queue)
        crawl_thread.start()
    for i in range(5):
        parses_thread = ParsesThread(parses_queue)
        parses_thread.start()
    # Note: this only joins the last thread of each pool, and because the
    # workers loop forever these joins never return, so the final print is
    # never reached (the known limitation described above)
    crawl_thread.join()
    parses_thread.join()
    print("下载完成")  # "download finished"


if __name__ == '__main__':
    main()
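For reference, here is a small sketch of what "changing the parameters" could look like: percent-encoding a plain keyword into the search URL instead of hard-coding it. build_search_url is a hypothetical helper, not part of the script above:
[Python]
from urllib.parse import quote

# Hypothetical helper: builds the same style of search URL the script hard-codes
def build_search_url(keyword, page):
    # quote() percent-encodes non-ASCII keywords such as "风景" (landscape)
    return "https://pixabay.com/zh/photos/search/%s/?pagi=%d" % (quote(keyword), page)


# build_search_url("风景", 1) reproduces the first page URL used in main()
print(build_search_url("风景", 2))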