本帖最后由 骑狗的猴子 于 2020-11-18 13:18 编辑
[Asm] 纯文本查看 复制代码 import requests
from lxml import etree
import os
import re
from threading import Thread
from queue import Queue
from fake_useragent import UserAgent
base_url = "xxx"  # placeholder: listing-page URL (redacted by the original poster)
base_host = "xxx"  # placeholder: image-host prefix (redacted by the original poster)
# Static fallback User-Agent; the worker threads build fresh random ones via
# fake_useragent, so this module-level dict is only used if referenced directly.
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
class CrawlInfo(Thread):
    """Worker thread that drains a queue of {name: relative_url} dicts and
    downloads every image found on each page into ``tupian/<name>/``.

    Bug fixes vs. the original:
    - ``run`` read the *global* ``base_url_queue`` instead of the queue passed
      to ``__init__``, silently ignoring the constructor argument.
    - ``empty()`` followed by a blocking ``get()`` races between worker
      threads: a thread could hang forever on ``get()`` after a sibling
      drained the queue. ``get_nowait()`` + ``Empty`` closes that window.
    """

    def __init__(self, url_queue):
        Thread.__init__(self)
        self.url_queue = url_queue  # shared Queue of {name: relative_url} dicts

    def run(self):
        from queue import Empty  # local import keeps this fix self-contained

        base_host = "xxx"  # placeholder: image host (redacted by the poster)
        headers = {
            "user-agent": UserAgent().random  # random UA per worker thread
        }
        while True:
            try:
                url_map = self.url_queue.get_nowait()
            except Empty:
                break  # queue drained -> this worker is done
            for name, value in url_map.items():
                response_img = requests.get(base_host + value, headers=headers, timeout=3)
                response_img.encoding = 'utf-8'
                img_html = etree.HTML(response_img.text)
                img_url_list = img_html.xpath(
                    "//html/body/div/div[@class='contentList']/div[@class='content']/p/img/@src")
                # Keep only CJK ideographs and ASCII alphanumerics so the
                # category name is a safe directory component.
                name = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", name)
                if not os.path.exists("tupian/" + name):
                    os.makedirs("tupian/" + name)
                for inx, img in enumerate(img_url_list):
                    with open("tupian/" + name + '/' + str(inx) + ".jpg", 'wb') as f:
                        f.write(requests.get(img).content)
                    print(name + "---" + str(inx) + "-----> 下载完成" + self.name)
def getURL_Map(base_url_queue):
    """Scrape the category listing page and enqueue one ``{name: url}`` dict
    per category onto *base_url_queue*.

    Returns the same queue object, now filled with work items for the
    ``CrawlInfo`` downloader threads.
    """
    base_url = "xxx"  # placeholder: listing-page URL (redacted by the poster)
    headers = {
        "user-agent": UserAgent().random  # random UA for the listing request
    }
    resp = requests.get(base_url, headers=headers)
    resp.encoding = 'utf-8'
    # print(resp.text)
    page = etree.HTML(resp.text)
    hrefs = page.xpath("//html/body/div[@class='main']/div[@class='classList']/ul/li/a/@href")
    titles = page.xpath("//html/body/div[@class='main']/div[@class='classList']/ul/li/a/text()")
    # One single-entry dict per category keeps each queue item independent.
    for title, href in zip(titles, hrefs):
        base_url_queue.put({title: href})
    return base_url_queue
if __name__ == '__main__':
    # Number of downloader threads; the original hard-coded range(0, 9).
    WORKER_COUNT = 9

    # Fill the shared queue with {category_name: relative_url} work items,
    # then spin up a small pool of downloader threads to drain it.
    base_url_queue = Queue()
    work_queue = getURL_Map(base_url_queue)  # original misnamed this url_list
    workers = [CrawlInfo(work_queue) for _ in range(WORKER_COUNT)]
    for worker in workers:
        worker.start()
    for worker in workers:
        # Explicit join; the original only exited cleanly because the
        # threads are non-daemon, which hid the missing synchronization.
        worker.join()
网址进行了一个加密,有能力的自己解密吧. 希望有收获的给点支持吧,谢谢 |