I'm just getting started with Python web scraping and still learning.
This first script crawls with a single process; a process-pool version is further down. The pool version is aggressive, so go easy on the site.
import os.path
import random
import time
import webbrowser

import requests
from lxml import etree

# Open the site so you can copy your browser's User-Agent from its developer tools
webbrowser.open('https://pic.netbian.com')
ua = input("Enter your browser's User-Agent (search online if you are not sure how to find it)\n")
print('''For example: https://pic.netbian.com/4kmeinv
https://pic.netbian.com/4kfengjing
https://pic.netbian.com
''')
uuuu = input("Enter the netbian category URL, without a trailing '/'\n")
headers = {
    'User-Agent': ua
}
url = uuuu

def huoqu(urll):
    # Fetch one listing page and download every picture on it
    response = requests.get(urll, headers=headers)
    mg = response.content.decode("gbk")
    tree = etree.HTML(mg)
    img_url = tree.xpath('//ul[@class="clearfix"]//li')
    for a in img_url:
        c = a.xpath('./a//img/@alt')[0]
        img_mg = a.xpath('./a//img/@src')
        img_mgg = requests.get('https://pic.netbian.com' + ''.join(img_mg))
        if not os.path.exists('彼岸图网'):
            os.mkdir("彼岸图网")
        with open(f'彼岸图网/{c}.jpg', 'wb') as f:
            f.write(img_mgg.content)
        print(f'彼岸图网/{c}.jpg saved')

def zongpage(url):
    # Read the total page count from the pagination bar of the first page
    print(url)
    response = requests.get(url, headers=headers)
    mg = response.content.decode("gbk")
    tree = etree.HTML(mg)
    page = tree.xpath('//div[@class="page"]/a/text()')
    zongpage = page[-2]
    print('Total pages: ' + zongpage)
    return zongpage

zongpagee = zongpage(url)
# Page 1 is the bare URL; pages 2..N use index_<n>.html
for a in range(1, int(zongpagee) + 1):
    if a == 1:
        huoqu(url)
    else:
        uu = f'{url}/index_{a}.html'
        print(uu)
        huoqu(uu)
    # Random delay between pages so we do not hammer the site
    time.sleep(random.randint(2, 5))
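If the site is slow or you rerun the script, the bare requests.get for each picture can fail halfway or re-download files you already have. Below is a minimal, optional sketch of a download helper with a timeout, a few retries, and a skip-if-exists check. The download_image name, the retry count, and the timeout are my own choices, not part of the original script; it reuses the same headers dict defined above.

import os
import time

import requests

def download_image(img_src_url, save_path, headers, retries=3, timeout=10):
    # Skip files that are already on disk so reruns do not download them again
    if os.path.exists(save_path):
        print(save_path, 'already exists, skipping')
        return True
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(img_src_url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            with open(save_path, 'wb') as f:
                f.write(resp.content)
            return True
        except requests.RequestException as e:
            print(f'Attempt {attempt} failed for {img_src_url}: {e}')
            time.sleep(2)
    return False

Inside huoqu you could then swap the requests.get / f.write pair for a call like download_image('https://pic.netbian.com' + ''.join(img_mg), f'彼岸图网/{c}.jpg', headers).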
This is the process-pool version:
import os
import random
import time
from multiprocessing import Pool

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
url = 'https://pic.netbian.com'

def huoqu(urll):
    print('Worker started:', urll)
    # Random delay inside each worker so the pool does not hammer the site
    time.sleep(random.randint(2, 5))
    # Send the HTTP request and fetch the listing page
    response = requests.get(urll, headers=headers)
    mg = response.content.decode("gbk")
    tree = etree.HTML(mg)
    # Use XPath to pull out each picture's URL and title
    img_url = tree.xpath('//ul[@class="clearfix"]//li')
    for a in img_url:
        c = a.xpath('./a//img/@alt')[0]
        img_mg = a.xpath('./a//img/@src')
        img_mgg = requests.get('https://pic.netbian.com' + ''.join(img_mg))
        # makedirs(exist_ok=True) avoids a race when several workers create the folder
        os.makedirs('彼岸图网', exist_ok=True)
        with open(f'彼岸图网/{c}.jpg', 'wb') as f:
            f.write(img_mgg.content)
        print(f'彼岸图网/{c}.jpg saved')

def zongpage(url):
    # Send the HTTP request and fetch the first page
    response = requests.get(url, headers=headers)
    mg = response.content.decode("gbk")
    tree = etree.HTML(mg)
    # Use XPath to read the total page count from the pagination bar
    page = tree.xpath('//div[@class="page"]/a/text()')
    zongpage = page[-2]
    print('Total pages: ' + zongpage)
    return zongpage

if __name__ == '__main__':
    # Do the page-count request inside the main guard so spawned worker
    # processes do not repeat it when they import this module
    zongpagee = zongpage(url)
    # Page 1 is the bare URL; pages 2..N use index_<n>.html
    pages = [url] + [f'{url}/index_{a}.html' for a in range(2, int(zongpagee) + 1)]
    # Pool(5): adjust the number of worker processes; with no argument it uses every
    # CPU core, which can overwhelm your machine and the site, so keep it under 10
    pool = Pool(5)
    # map() hands one listing page to each worker and blocks until all are done
    pool.map(huoqu, pages)
    pool.close()
    pool.join()
    print('Done')