酷狗音乐网页歌曲爬取优化

daofengyi · 发表于 2022-11-28 11:28

本帖最后由 daofengyi 于 2022-11-28 14:43 编辑

import re
import time
import json
import aiohttp
import aiofiles
import asyncio
import hashlib
import os.path
import argparse
import logging

# Changelog and usage notes for this script; the text is written to the log
# at startup by the __main__ block.
read_me = """
# 日期: 20221128
# 功能: 酷狗网页版MP3音乐下载
   1.增加日志模块
   2.对mp3_url为空的异常处理
   3.增加歌手过滤
   4.增加页数选择
# 用法: python kugouDownload.py keyword -s singer -p page
# 示例: 搜索歌曲名:给你给我
      python kugouDownload.py 给你给我
   搜索歌曲名:给你给我歌手: 毛不易
      python kugouDownload.py 给你给我 -s 毛不易
   搜索歌曲名:给你给我歌手: 毛不易两页结果: 2
      python kugouDownload.py 给你给我 -s 毛不易 -p 2
# 来源: https://www.52pojie.cn/thread-1638865-1-1.html
"""

# HTTP headers passed to every session.get() call in this script. The
# user-agent string presents the client as mobile Chrome on an Android Nexus 5.
headers = {
"user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) "
               "Chrome/89.0.4389.114 Mobile Safari/537.36 "
}

def generate_signature(t, search_keyword, page):
    """Return the MD5 signature sent with a song-search request.

    The signature is the hex MD5 digest of a fixed secret, then the
    ``key=value`` request parameters in the order listed below, then the same
    secret again, all concatenated without any separator.

    Args:
        t: Client timestamp. Its ``str()`` form is used for the
            ``clienttime``, ``mid`` and ``uuid`` fields.
        search_keyword: Keyword being searched.
        page: Result page number.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.
    """
    secret = 'NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt'
    stamp = str(t)
    # These values mirror the query parameters built in main(); the two must
    # be kept in sync or the computed signature will not match the request.
    sign_params = [
        secret,
        'bitrate=0',
        'callback=callback123',
        'clienttime=' + stamp,
        'clientver=2000',
        'dfid=-',
        'inputtype=0',
        'iscorrection=1',
        'isfuzzy=0',
        f'keyword={search_keyword}',
        f'mid={stamp}',
        f'page={page}',
        'pagesize=30',
        'platform=WebFilter',
        'privilege_filter=0',
        'srcappid=2919',
        'token=',
        'userid=0',
        f'uuid={stamp}',
        secret,
    ]
    joined = ''.join(sign_params)
    return hashlib.md5(joined.encode(encoding='UTF-8')).hexdigest()

async def main(search_keyword, singer_name, page):
    """Search one result page and download every song that matches.

    Sends a signed search request, builds a list of song records from the
    response, optionally keeps only the records whose sanitised singer name
    equals ``singer_name``, and then runs ``get_song_play_addr`` for all of
    them concurrently. Failures are logged rather than raised.

    Args:
        search_keyword: Keyword to search for.
        singer_name: Exact singer name to keep, or ``None`` for no filtering.
        page: Result page number to request.
    """
    url = 'https://complexsearch.kugou.com/v2/search/song'
    t = time.time()
    stamp = str(t)
    params = {
        'callback': 'callback123',
        'page': page,
        'keyword': search_keyword,
        'pagesize': '30',
        'bitrate': '0',
        'isfuzzy': '0',
        'inputtype': '0',
        'platform': 'WebFilter',
        'userid': '0',
        'clientver': '2000',
        'iscorrection': '1',
        'privilege_filter': '0',
        'token': '',
        'srcappid': '2919',
        'clienttime': stamp,
        'mid': stamp,
        'uuid': stamp,
        'dfid': '-',
        'signature': generate_signature(t, search_keyword, page)
    }

    # Fetch the search result and release the connection before downloading.
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers, params=params) as resp:
            if resp.status != 200:
                logging.error(f'错误代码: {resp.status}, 稍后重试')
                return
            resp_text = await resp.text()

    # The body is JSONP: callback123(<json>) plus a trailer. Locate the
    # outermost parentheses instead of relying on fixed character offsets.
    try:
        payload = resp_text[resp_text.index('(') + 1:resp_text.rindex(')')]
        json_data = json.loads(payload)
    except ValueError:  # no parentheses found, or invalid JSON inside them
        logging.error('获取歌曲列表失败: 响应内容无法解析')
        return

    if json_data.get('status') != 1:
        logging.error(f'获取歌曲列表失败: {json_data.get("error_msg")}')
        return

    song_list = []
    for item in json_data['data']['lists']:
        song_list.append({
            # Names end up in file names, so replace characters that are
            # not allowed in Windows paths.
            'SongName': re.sub(r"[/\\:*?\"<>|]", "_", item['SongName']),
            'AlbumID': item['AlbumID'],
            'FileHash': item['FileHash'],
            'SQFileHash': item['SQFileHash'],
            'HQFileHash': item['HQFileHash'],
            'MvHash': item['MvHash'],
            'Audioid': item['Audioid'],
            'SingerName': re.sub(r"[/\\:*?\"<>|]", "_", item['SingerName']),
        })

    if singer_name is not None:  # keep only exact singer-name matches
        song_list = [song for song in song_list if song["SingerName"] == singer_name]

    n = len(song_list)
    if n == 0:
        logging.error(f'歌曲搜索结果为: {n}, 请更换关键词或者歌手重试')
        return

    logging.info(f'获取歌曲列表成功，总共{len(song_list)}首，准备下载...')
    # return_exceptions=True lets every download run to completion and hands
    # back any exception as a result, so one failure is logged instead of lost.
    results = await asyncio.gather(
        *(get_song_play_addr(song) for song in song_list),
        return_exceptions=True,
    )
    for song, result in zip(song_list, results):
        if isinstance(result, BaseException):
            logging.error(f'{song["SongName"]}--{song["SingerName"]}--下载失败: {result!r}')

async def get_song_play_addr(song_info):
    """Look up the play URL for one song and download it via ``save_mp3``.

    Args:
        song_info: Song record with at least the keys ``FileHash``,
            ``AlbumID``, ``SongName`` and ``SingerName``.

    A non-200 response, an unparsable body, or a missing ``play_url`` is
    logged as an error and the song is skipped.
    """
    url = 'https://wwwapi.kugou.com/yy/index.php'
    params = {
        'r': 'play/getdata',
        'callback': 'jQuery191035601158181920933_1653052693184',
        'hash': song_info['FileHash'],
        'dfid': '2mSZvv2GejpK2VDsgh0K7U0O',
        'appid': '1014',
        'mid': 'c18aeb062e34929c6e90e3af8f7e2512',
        'platid': '4',
        'album_id': song_info['AlbumID'],
        '_': '1653050047389'
    }
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers, params=params) as resp:
            if resp.status != 200:
                logging.error(f'错误代码: {resp.status}, 请稍后再试')
                return
            resp_text = await resp.text()

    # The body is JSONP: <callback>(<json>) plus a trailer. Take the text
    # between the outermost parentheses and let json.loads handle the escapes.
    # Stripping backslashes by hand would break escaped quotes, and
    # unicode_escape would mangle non-ASCII text.
    try:
        payload = resp_text[resp_text.index('(') + 1:resp_text.rindex(')')]
        json_data = json.loads(payload)
    except ValueError:  # no parentheses found, or invalid JSON inside them
        logging.error('获取播放地址失败: 响应内容无法解析')
        return

    # Only read play_url when "data" is actually an object.
    data = json_data.get('data') if isinstance(json_data, dict) else None
    mp3_url = data.get('play_url') if isinstance(data, dict) else None
    if not mp3_url:
        logging.error("未找到mp3_url下载链接")
        return
    await save_mp3(mp3_url, song_info['SongName'], song_info['SingerName'])

async def save_mp3(url, song_name, singer_name, default_sv_dir="music"):
    """Download ``url`` to ``<default_sv_dir>/<song_name>-<singer_name>.mp3``.

    Args:
        url: Direct URL of the audio file.
        song_name: Song title used in the file name.
        singer_name: Singer name used in the file name.
        default_sv_dir: Target directory; created if it does not exist.

    A non-200 response is logged as an error and no file is written.
    """
    os.makedirs(default_sv_dir, exist_ok=True)
    target = os.path.join(default_sv_dir, f'{song_name}-{singer_name}.mp3')
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as resp:
            # Do not save an error page under an .mp3 name.
            if resp.status != 200:
                logging.error(f'{song_name}--{singer_name}--下载失败, 错误代码: {resp.status}')
                return
            async with aiofiles.open(target, mode='wb') as f:
                # Stream in chunks so the whole file is never held in memory.
                async for chunk in resp.content.iter_chunked(64 * 1024):
                    await f.write(chunk)
    logging.info(f'{song_name}--{singer_name}--下载完成')

if __name__ == '__main__':
    # Command-line entry point: parse arguments, then search and download the
    # requested number of result pages one after another.
    logging.basicConfig(
        # Explicit UTF-8 so the Chinese log messages are written correctly
        # whatever the platform's default encoding is.
        handlers=[logging.FileHandler("kgDownloader.log", encoding="utf-8")],
        format="%(asctime)s %(levelname)s %(message)s",
        level=logging.INFO,
    )
    logging.info(read_me)
    parser = argparse.ArgumentParser(
        prog='kuGouDownloader',
        description='酷狗音乐下载')
    parser.add_argument('keyword', help='歌曲关键词')
    parser.add_argument('-s', '--singer', help='歌手名')
    parser.add_argument('-p', '--page', help='页数', type=int)
    args = parser.parse_args()
    keyword, singer = args.keyword, args.singer
    total_page = args.page if args.page is not None else 1  # default: first page only
    if total_page < 1:
        # range(1, total_page + 1) would be empty and the script would exit
        # without doing anything.
        parser.error('页数必须大于等于1')
    for i in range(1, total_page + 1):
        logging.info(f"搜索: {keyword}, 歌手: {singer if singer is not None else '' }, 搜索页数: {i}")
        # asyncio.run creates and closes a fresh event loop per page; main()
        # opens its own HTTP sessions, so nothing is shared between pages.
        asyncio.run(main(keyword, singer, i))

基于原帖 https://www.52pojie.cn/thread-1638865-1-1.html 优化修改。

chenshime · 发表于 2022-12-5 19:48

很不错，但是下载的有的只有一分钟。还有不会用的，可以参考一下。（以下针对小小白）
1. 安装 Python 3.8 或更高版本。
2. 如果运行 .py 文件时 Python 提示缺少 aiofiles 和 aiohttp，请按下面的步骤安装。
3. 添加环境变量：运行 CMD，输入 sysdm.cpl，在 Path 变量中加入 Python 的安装路径。
4. python -m pip install --upgrade pip
5. pip3 install aiohttp
6. pip3 install aiofiles

封心棒棒糖 · 发表于 2022-11-28 21:32

可以的，很棒

gaoliying · 发表于 2022-11-29 09:33

支持楼主！感谢分享！

xssaxhhyy · 发表于 2022-11-29 12:40

谢谢分享，学习一下

lfordch · 发表于 2022-12-3 02:07

酷狗哭了T﹏T

bearseasay · 发表于 2022-12-14 16:16

酷狗哭了T﹏T

Mr.救赎 · 发表于 2023-1-7 20:41

协程爬虫的时候，下载数据有的不是很完整，怎么解决这个问题？

smith017 · 发表于 2023-1-9 22:00

可以爬取vip整首歌吗，

hao5205420 · 发表于 2023-1-12 21:41

不能下高音质的吗?只能下载普通音质的啊

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] 酷狗音乐网页歌曲爬取优化