import httpx
import logging
import asyncio
import json
from motor.motor_asyncio import AsyncIOMotorClient
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
semaphore = asyncio.Semaphore(5)
session = httpx.AsyncClient(http2=True)
async def scrape_api(url):
async with semaphore:
# 忽略ssl证书与开启http2.0一致,需要在client提供参数
try:
async with httpx.AsyncClient(http2=True) as client:
r = await client.get(url)
return r.json()
except Exception as error:
logging.log(level=logging.WARNING, msg=f"{url}获取失败,\n原因如下:{error}")
async def get_index(page):
url = f"https://spa16.scrape.center/api/book/?limit=18&offset={page * 18}"
return await scrape_api(url)
async def get_detail(id):
url = f"https://spa16.scrape.center/api/book/{id}/"
data = await scrape_api(url)
await save_data(data)
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'Books'
MONGO_COLLECTION_NAME = 'Books'
MONGO = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
db = MONGO[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]
async def save_data(data):
logging.info('saving data %s', data)
if data:
return await collection.update_one({
'id': data.get('id')
}, {
'$set': data
}, upsert=True)
async def main():
booklist = [asyncio.ensure_future(get_index(i)) for i in range(20)]
result = await asyncio.gather(*booklist)
logging.info('results %s', json.dumps(result, ensure_ascii=False, indent=2))
ids = []
for data in result:
if not data:
continue
for bid in data["results"]:
ids.append(bid.get("id"))
details = [asyncio.ensure_future(get_detail(bid)) for bid in ids]
await asyncio.wait(details)
if __name__ == '__main__':
asyncio.run(main())
程序可以正常运行到sava_data函数,并且logging也正常打印,但是最后会报错:
RuntimeError: Task <Task pending name='Task-401' coro=<get_detail() running at E:\code\Python\ScrapeCenter\SPA16\LikeBook.py:32> cb=[_wait.<locals>._on_completion() at E:\Python\lib\asyncio\tasks.py:479]> got Future <Future pending cb=[_chain_future.<locals>._call_check_cancel() at E:\Python\lib\asyncio\futures.py:384]> attached to a different loop