- requests version: the upside is that the code is straightforward; the downside is that it runs slowly, since pages are fetched one at a time. Please be gentle and don't break the site for everyone. The site itself exposes a JSON interface, but it has no keyword search, so this demo drives the site's built-in search instead. If there is any problem, contact me and I will take the post down. This is purely for learning.
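For reference, the JSON interface mentioned above is the doubanId endpoint that api_date() calls further down; here is a minimal sketch of querying it directly (the ID and the response shape are assumptions inferred from how the code below unpacks the payload):

import requests

douban_id = '12345678'  # hypothetical Douban ID; substitute a real one
# Assumed response shape, per api_date() below:
# {douban_id: {'doubanId': ..., 'name': ..., 'bbjPosterUrl': [...]}}
resp = requests.get(f'http://bbj.icu/BBJ-json?doubanId={douban_id}', timeout=10)
resp.raise_for_status()
print(resp.json())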
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Aixiu
# @Time : 2023/06/09 17:08:38
import random
from urllib import parse
import requests
import json
from parsel import Selector
from datetime import datetime
import time
class bbj(object):
    # Initialize the search URL
def __init__(self):
self.Burl = r'http://bbj.icu/search?' # parse.urlencode
    # Pick a random headers dict
def get_headers(self):
'''
        Return a headers dict with a randomly chosen User-Agent.
'''
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
headers = {'User-Agent': random.choice(user_agents)}
return headers
    # Generic requests fetch wrapper
def get_Html_Text(self, url):
        try:
            r = requests.get(url=url, headers=self.get_headers(), timeout=10)
            # Raise HTTPError if the status code is not 200
            r.raise_for_status()
            # Set the correct encoding
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException as e:
            # Catch only requests errors instead of a bare except, and return an
            # empty string so the Selector() downstream still receives valid text
            print(e)
            return ''
    # Collect detail-page URLs
def get_search_list(self, url):
        selector = Selector(self.get_Html_Text(url=url))  # Build the Selector object
        '''Use XPath to grab all the <a> tags in the search-result list'''
        # Effectively a list of links like: <a href="/haibao/631.html">2018《斗罗大陆》高清无水印海报轮播图</a>
items_list = selector.xpath('//div[1]/div[3]/ul/li/h4/a')
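        # Note: this position-based XPath is tied to the current page layout;
        # an id- or class-based selector would be less brittle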
        result_text = [item.xpath('./text()').get() for item in items_list]  # Node text content
        result_href = [item.xpath('./@href').get() for item in items_list]  # Node href attribute
url_list = []
for index, data in enumerate(zip(result_text, result_href)):
url_list.append(f'http://bbj.icu{data[1]}')
print(f"{index}、影片:{data[0]} ==> 详情地址:http://bbj.icu{data[1]}")
return url_list
    # Fetch detail-page data
def get_details_date(self, url_list, keyword):
out_data = []
for Burl in url_list:
selector = Selector(self.get_Html_Text(url=Burl))
data_img_url = selector.xpath(
'//div[1]/div[1]/div/ul/p/img/@data-original'
).getall()
url_title = selector.xpath('//div/div[1]/div[1]/div/h2/text()').get()
out_data_dict = {url_title: data_img_url}
# print(json.dumps(out_data_dict, ensure_ascii=False, indent=4))
out_data.append(out_data_dict)
return self.info_data(
code=200, url=out_data, msg=f'与--{parse.unquote(keyword)}--相关的影片海报,解析成功'
)
    # Douban-ID API
def api_date(self, doubanId):
if doubanId.isdigit():
Burl = f'http://bbj.icu/BBJ-json?doubanId={doubanId}'
json_data = requests.get(
url=Burl, headers=self.get_headers(), timeout=10
).json()
            # Use a default dict so a missing doubanId key cannot raise AttributeError
            if json_data.get(doubanId, {}).get('doubanId'):
return self.info_data(
code=200,
url=json_data.get(doubanId).get('bbjPosterUrl'),
msg=f'《{json_data.get(doubanId).get("name")}》影片海报,解析成功',
doubanId=json_data.get(doubanId).get('doubanId'),
)
else:
return self.info_data(
code=400,
url='None',
msg=f'暂无与豆瓣ID为: {doubanId} 相关的影片海报',
doubanId="None",
)
        else:
            # Return a dict here too, so every branch yields the same type
            return self.info_data(code=400, url=None, msg='doubanId不正确')
    # Unified response builder (start)
def info_data(self, code, url, msg, doubanId=None):
info_list = [
'本是清灯不归客,却因浊酒恋红尘。',
'一切有为法,如梦幻泡影,如露亦如电,当作如是观。',
'念才生,万法齐现,假指心性,而明易道。',
'你可以拥有爱,但不要执着,因为分离是必然的。',
'见利忘义,不知道恩德,这就是造作罪业。',
'满纸荒唐言,一把辛酸泪。都云作者痴,谁解其中味。',
'懵逼树上懵逼果,懵逼树下你和我。',
'缘来则去,缘聚则散,缘起则生,缘落则灭。',
'心中装满着自己的看法与想法的人,永远听不见别人的心声。',
'逆境是必经的过程,能勇于接受逆境的人,生命就会日渐的茁壮。',
'势不可使尽,福不可享尽,便宜不可占尽,聪明不可用尽。',
'请你用慈悲心和温和的态度,把你的不满与委屈说出来,别人就容易接受。',
]
info = {
'code': code,
'status': msg,
'doubanId': doubanId,
'url': url,
'info': random.choice(info_list),
# 'msg': '请输入 /help 获取更多帮助。',
'date': datetime.now().strftime(r"%Y-%m-%d %H:%M:%S"),
}
        # Drop keys whose values are falsy
for key in list(info.keys()):
if not info.get(key):
del info[key]
return info
    # Unified response builder (end)
def main(self, keyword):
url = f'{self.Burl}{parse.urlencode({"content": keyword})}'
# url = f'{self.Burl}{parse.quote(keyword)}'
url_list = self.get_search_list(url=url)
out_data = self.get_details_date(url_list=url_list, keyword=keyword)
return out_data
if __name__ == '__main__':
new_obj = bbj()
    # Keyword
    keyword = input('搜索片名 ==> ')  # urlencode() accepts the str directly; no manual encode needed
start_time = time.time()
out_info = new_obj.main(keyword)
print(out_info)
print("总耗时: ", time.time()-start_time)
    # Douban ID
# doubanId = input('输入豆瓣ID ==> ')
# json_data = new_obj.api_date(doubanId=str(doubanId))
# print(json_data)
Result screenshot: (image omitted)
- aiohttp version: the same scraper rewritten with asyncio + aiohttp so the detail pages are fetched concurrently.
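The core of the rewrite is the semaphore-bounded asyncio.gather pattern used in main() below; here is a minimal self-contained sketch of just that pattern, with asyncio.sleep standing in for the HTTP round trip:

import asyncio

async def fetch(i, sem):
    # The semaphore caps how many coroutines execute this block at once
    async with sem:
        await asyncio.sleep(0.5)  # stand-in for one HTTP request
        return i

async def demo():
    sem = asyncio.Semaphore(5)  # same limit as the script below
    tasks = [asyncio.create_task(fetch(i, sem)) for i in range(20)]
    # gather() runs the tasks concurrently and returns results in task order
    return await asyncio.gather(*tasks)

print(asyncio.run(demo()))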
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Aixiu
# @Time : 2023/06/16 16:10:10
import asyncio
import aiohttp
from parsel import Selector
from urllib import parse
import random
from datetime import datetime
import time
import json
import sys
# Pick a random headers dict
def get_headers():
'''
    Return a headers dict with a randomly chosen User-Agent.
'''
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
headers = {'User-Agent': random.choice(user_agents)}
return headers
# Async helper that sends the GET request
async def get_Html_Text(session, url):
    try:
        # 10 s total timeout, mirroring the requests version above
        async with session.get(
            url, headers=get_headers(), timeout=aiohttp.ClientTimeout(total=10)
        ) as response:
            # Raise if the status code is not 200
            if response.status != 200:
                raise Exception(f"Bad status code: {response.status}")
            # Return the response body as text
            return await response.text()
    except Exception as e:
        # Log the error and return None; callers must handle the None case
        print(e)
        return None
# Collect every detail-page link for the keyword
async def get_search_list(session, url):
    html = await get_Html_Text(session=session, url=url)
    if html is None:  # Fetch failed; Selector(text=None) would raise
        return []
    selector = Selector(text=html)
    '''Use XPath to grab all the <a> tags in the search-result list'''
    # Effectively a list of links like: <a href="/haibao/631.html">2018《斗罗大陆》高清无水印海报轮播图</a>
items_list = selector.xpath('//div[1]/div[3]/ul/li/h4/a')
    result_text = [item.xpath('./text()').get() for item in items_list]  # Node text content
    result_href = [item.xpath('./@href').get() for item in items_list]  # Node href attribute
url_list = []
for index, data in enumerate(zip(result_text, result_href)):
url_list.append(f'http://bbj.icu{data[1]}')
# print(f"{index}、影片:{data[0]} ==> 详情地址:http://bbj.icu{data[1]} ==> 耗时:{time.time()}")
return url_list
# Fetch detail-page data
async def get_details_date(session, url):
    html = await get_Html_Text(session=session, url=url)
    if html is None:  # Fetch failed; return an empty record
        return {}
    selector = Selector(text=html)
data_img_url = selector.xpath(
'//div[1]/div[1]/div/ul/p/img/@data-original'
).getall()
url_title = selector.xpath('//div/div[1]/div[1]/div/h2/text()').get()
out_data_dict = {url_title: data_img_url}
# out_data_dict = {url_title: data_img_url, 'run time':time.time()}
# print(f"{time.time()}")
return out_data_dict
# Douban-ID API
async def api_date(session, doubanId):
Burl = f'http://bbj.icu/BBJ-json?doubanId={doubanId}'
async with session.get(url=Burl, headers=get_headers()) as response:
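        # Note: response.json() raises aiohttp.ContentTypeError when the server
        # does not send an application/json Content-Type; passing
        # response.json(content_type=None) would skip that check if needed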
return await response.json()
async def json_api_date(doubanId):
if doubanId.isdigit():
start_time = time.time()
async with aiohttp.ClientSession() as session:
json_data = await api_date(session=session, doubanId=doubanId)
run_time = f'{time.time()-start_time:.2f} second'
        # Use a default dict so a missing doubanId key cannot raise AttributeError
        if json_data.get(doubanId, {}).get('doubanId'):
return info_data(
code=200,
url=json_data.get(doubanId).get('bbjPosterUrl'),
msg=f'《{json_data.get(doubanId).get("name")}》影片海报,解析成功',
doubanId=json_data.get(doubanId).get('doubanId'),
run_time=run_time,
)
else:
return info_data(
code=400,
url='None',
msg=f'暂无与豆瓣ID为: {doubanId} 相关的影片海报',
doubanId="None",
)
    else:
        # Return a dict here too, so every branch yields the same type
        return info_data(code=400, url=None, msg='doubanId不正确')
# Unified response builder (start)
def info_data(code, url, msg, run_time=None, doubanId=None):
info_list = [
'本是清灯不归客,却因浊酒恋红尘。',
'一切有为法,如梦幻泡影,如露亦如电,当作如是观。',
'念才生,万法齐现,假指心性,而明易道。',
'你可以拥有爱,但不要执着,因为分离是必然的。',
'见利忘义,不知道恩德,这就是造作罪业。',
'满纸荒唐言,一把辛酸泪。都云作者痴,谁解其中味。',
'懵逼树上懵逼果,懵逼树下你和我。',
'缘来则去,缘聚则散,缘起则生,缘落则灭。',
'心中装满着自己的看法与想法的人,永远听不见别人的心声。',
'逆境是必经的过程,能勇于接受逆境的人,生命就会日渐的茁壮。',
'势不可使尽,福不可享尽,便宜不可占尽,聪明不可用尽。',
'请你用慈悲心和温和的态度,把你的不满与委屈说出来,别人就容易接受。',
]
info = {
'code': code,
'status': msg,
'doubanId': doubanId,
'url': url,
'info': random.choice(info_list),
'run_time': run_time,
'date': datetime.now().strftime(r"%Y-%m-%d %H:%M:%S"),
}
    # Drop keys whose values are falsy
for key in list(info.keys()):
if not info.get(key):
del info[key]
return info
# Unified response builder (end)
async def main(keyword):
Burl = r'http://bbj.icu/search?' # parse.urlencode
url = f'{Burl}{parse.urlencode({"content": keyword})}'
    sem = asyncio.Semaphore(5)  # Cap the number of concurrent detail-page requests
    start_time = time.time()
    async with aiohttp.ClientSession() as session:
        url_list = await get_search_list(session, url)

        async def bounded_fetch(detail_url):
            # The semaphore must wrap each task; throttling only the initial
            # search request would leave gather() running everything at once
            async with sem:
                return await get_details_date(session=session, url=detail_url)

        tasks = [asyncio.create_task(bounded_fetch(u)) for u in url_list]
        # gather() runs the tasks concurrently and returns results in task order
        out_data_list = await asyncio.gather(*tasks)
run_time = f'{time.time()-start_time:.2f} second'
return info_data(
code=200,
url=out_data_list,
msg=f'与--{parse.unquote(keyword)}--相关的影片海报,解析成功',
run_time=run_time,
)
if __name__ == "__main__":
    keyword = input('搜索片名 ==> ')  # urlencode() accepts the str directly; no manual encode needed
start_time = time.time()
    if sys.platform == 'win32':
        # WindowsSelectorEventLoopPolicy only exists on Windows; guard it so the
        # script still runs on Linux/macOS
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
out_info = asyncio.run(main(keyword))
print(out_info)
print("总耗时: ", time.time() - start_time)
# doubanId = input('输入豆瓣ID ==> ')
# start_time = time.time()
# asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
# out_info = asyncio.run(json_api_date(doubanId))
# print(out_info)
# print("总耗时: ", time.time()-start_time)
Result screenshot: (image omitted)
The improvement is far from trivial.
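To see where that gain comes from, here is a toy comparison, with sleeps standing in for ten page fetches of about 0.5 s each (the numbers are illustrative, not measurements of this site):

import asyncio
import time

async def fake_fetch():
    await asyncio.sleep(0.5)  # stand-in for one HTTP round trip

async def concurrent(n=10):
    # All ten waits overlap, so the total is roughly one round trip
    await asyncio.gather(*(fake_fetch() for _ in range(n)))

start = time.perf_counter()
asyncio.run(concurrent())
print(f'concurrent: {time.perf_counter() - start:.2f} s')  # ~0.5 s

start = time.perf_counter()
for _ in range(10):
    time.sleep(0.5)  # the sequential version pays this cost per page
print(f'sequential: {time.perf_counter() - start:.2f} s')  # ~5.0 s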