Python实例记录----aiohttp异步爬取实战
import asyncio
import json
import logging

import aiohttp
from motor.motor_asyncio import AsyncIOMotorClient
# Include a timestamp and severity in every log record so the progress of the
# concurrent crawl can be followed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
)

PAGE_SIZE = 18  # number of items returned per index page
PAGE_NUMBER = 100  # number of index pages to crawl
CONCURRENCY = 5  # maximum number of requests in flight at once

# The `limit` query parameter is derived from PAGE_SIZE so the two cannot
# drift apart; `{offset}` is left as a placeholder for str.format().
INDEX_URL = f'https://spa5.scrape.center/api/book/?limit={PAGE_SIZE}&offset={{offset}}'
DETAIL_URL = 'https://spa5.scrape.center/api/book/{id}'

# Acquired by scrape_api() around each request to cap concurrency.
semaphore = asyncio.Semaphore(CONCURRENCY)
# Shared aiohttp session; main() assigns the real session before any request.
session = None
async def scrape_api(url):
    """Request *url* through the shared session and return its JSON body.

    The module-level semaphore is held for the whole request, so no more
    requests than the semaphore allows run at the same time.

    Args:
        url: URL of the API endpoint to request.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails or
        times out (the failure is logged together with its traceback).
    """
    async with semaphore:
        try:
            logging.info('scraping %s', url)
            async with session.get(url) as response:
                return await response.json()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # aiohttp signals an exceeded timeout with asyncio.TimeoutError,
            # which is not an aiohttp.ClientError; catching it here keeps a
            # single slow request from aborting every gathered task.
            logging.error('error occurred while scraping %s', url, exc_info=True)
            return None
async def scrape_index(page):
    """Fetch one page of the index listing.

    Args:
        page: Page number to fetch; page 1 corresponds to offset 0.

    Returns:
        Whatever scrape_api() returns for the computed index URL.
    """
    offset = (page - 1) * PAGE_SIZE
    index_url = INDEX_URL.format(offset=offset)
    payload = await scrape_api(index_url)
    return payload
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'book'
MONGO_COLLECTION_NAME = 'books'

client = AsyncIOMotorClient(MONGO_CONNECTION_STRING)
# Select the database and collection by name. Without these subscripts
# `collection` would be the client itself, and save_data()'s call to
# `collection.update_one(...)` could not reach a real collection.
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]
async def save_data(data):
    """Upsert one scraped record into the MongoDB collection, keyed by ``id``.

    Args:
        data: Mapping holding the record to store. Falsy values (``None``,
            an empty dict) are skipped without touching the database.

    Returns:
        The result of ``collection.update_one``, or ``None`` when there was
        nothing to save.
    """
    # Guard first so that a failed scrape is not logged as a save.
    if not data:
        return None
    logging.info('saving data %s', data)
    return await collection.update_one(
        {'id': data.get('id')},
        {'$set': data},
        upsert=True,  # insert the record if no document has this id yet
    )
async def scrape_detail(id):
    """Fetch the detail record for the given id and pass it to save_data().

    Args:
        id: Identifier substituted into DETAIL_URL.
    """
    record = await scrape_api(DETAIL_URL.format(id=id))
    await save_data(record)
async def main():
    """Crawl all index pages, then fetch and store the detail of every item.

    Opens the shared aiohttp session, scrapes index pages 1..PAGE_NUMBER
    concurrently, collects the item ids they list, scrapes each item's
    detail, and always closes the session before returning.
    """
    global session
    session = aiohttp.ClientSession()
    try:
        scrape_index_tasks = [
            asyncio.ensure_future(scrape_index(page))
            for page in range(1, PAGE_NUMBER + 1)
        ]
        results = await asyncio.gather(*scrape_index_tasks)
        logging.info('result %s', json.dumps(results, ensure_ascii=False, indent=2))

        ids = []
        for index_data in results:
            # A failed index request yields None; skip it.
            if not index_data:
                continue
            # Tolerate a payload without a 'results' list.
            for item in index_data.get('results') or []:
                ids.append(item.get('id'))

        # asyncio.wait() raises ValueError on an empty collection, so stop
        # here when no index page produced any id.
        if not ids:
            logging.warning('no ids collected from index pages, nothing to scrape')
            return

        scrape_detail_tasks = [
            asyncio.ensure_future(scrape_detail(book_id)) for book_id in ids
        ]
        await asyncio.wait(scrape_detail_tasks)
    finally:
        # Close the session even when scraping fails, so no
        # "Unclosed client session" is reported on exit.
        await session.close()
if __name__ == '__main__':
    # Drive the crawl to completion on the default event loop.
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())
感谢老哥分享,拿下来跑了下。第73行报错,不知如何解决
await asyncio.wait(scrape_detail_tasks)
File "C:\Python37\lib\asyncio\tasks.py", line 380, in wait
raise ValueError('Set of coroutines/Futures is empty.')
ValueError: Set of coroutines/Futures is empty.
2022-03-06 19:01:40,039 - ERROR: Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x000002762E071A48>
感谢分享
页:
[1]