本帖最后由 sakura32 于 2024-1-13 13:04 编辑
功能:
搜索歌曲/歌手,返回一个结果列表,然后选择列表中的编号进行下载。自动合并专辑封面和歌词(合并歌词代码有问题,无法正常合并)
使用说明:
1.需要配置好playwright
2.无法在python控制台中直接运行(会闪退,不知道什么原因),在pycharm中能正常运行
其他说明:
爬的网站曲库一般,音质一般,lrc歌词质量较差
额外补充说明:爬取的网站是gequbao.com。网站本身可以直接正常使用,但通过直链下载几次后,网站会隐藏下载链接并要求关注公众号。解决方案:网站有试听功能,试听所指向的链接就是下载链接,藏得很浅且没有加密,所以只要抓到这个链接即可——用浏览器的“检查/审查元素→网络”抓包,或者网页资源嗅探类插件都能抓到。
再次补充说明:新做了一个网站的爬虫,曲库更多,但是下架了一些版权歌(例如周董的)
截图:
源码:https://github.com/PPJUST/Music-Spider
main.py
[Python] 纯文本查看 复制代码 # 主程序
import re
import time
from lxml import html
from tqdm import tqdm
from down_music import *
from music_info import *
# Alias for lxml's etree module; none of the functions shown in main.py use it.
etree = html.etree
# Search endpoint: the search keyword is appended directly after this prefix.
baseurl_search = r'https://www.gequbao.com/s/'
# Site root, prepended to the relative "/music/<id>" paths found in search results.
baseurl_homepage = r'https://www.gequbao.com'
# Desktop-browser User-Agent header sent with the search request.
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
def get_search_result(keyword):
    """Fetch the raw HTML of the search-results page for ``keyword``.

    :param keyword: song title or artist name typed by the user.
    :return: the response body as text when the server answers 200;
        an empty string when the request fails or the status is not 200,
        so callers can test the result for truthiness.
    """
    # Function-scope import so the module's import block stays untouched.
    from urllib.parse import quote

    # Percent-encode the keyword: characters such as '/', '?' or '#' would
    # otherwise change the path/query of the request instead of being
    # searched for.
    url_search = baseurl_search + quote(keyword, safe='')
    try:
        # A timeout keeps the prompt from hanging forever on a dead connection.
        response = requests.get(url_search, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f'请求失败: {exc}')
        return ''
    if response.status_code == 200:
        return response.text
    print('响应状态码错误')
    return ''
def get_urls(html_str: str):
    """Extract the absolute song-page URLs from a search-results page.

    :param html_str: HTML text of the search page; ``None`` or an empty
        string (a failed search) is accepted and yields no URLs.
    :return: list of absolute URLs in page order, without duplicates.
    """
    if not html_str:
        # A failed search produces no text; report "no results" rather than
        # letting re.findall raise TypeError on None.
        return []
    pattern = r'<a href="(/music/\d+)" target'
    short_urls = re.findall(pattern, html_str)  # relative links, e.g. /music/402856
    # dict.fromkeys drops repeated links while keeping first-seen order, so
    # the same song page is not scraped twice.
    return [baseurl_homepage + short for short in dict.fromkeys(short_urls)]
def get_music_info(urls: list):
    """Scrape every song page in ``urls`` and collect the results.

    :param urls: list of absolute song-page URLs.
    :return: dict mapping each URL to the info dict returned by
        ``MusicInfo(url).get_info()``.
    """
    collected = {}  # {url: info_dict, ...}
    progress = tqdm(urls, bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}')
    for page_url in progress:
        collected[page_url] = MusicInfo(page_url).get_info()
        # Short pause between pages.
        time.sleep(0.2)
    return collected
def show_music_list(url_info_dict: dict):
    """Print the song names as a list numbered from 1.

    :param url_info_dict: dict mapping song-page URL to its info dict; each
        info dict must contain the key ``'music_name'``.
    """
    position = 0
    for info in url_info_dict.values():
        position += 1
        print(position, info['music_name'])
def down_music(url, info_dict, max_retries=3):
    """Download one song, re-scraping its page when the link has gone stale.

    :param url: song-page URL, used to scrape fresh links on failure.
    :param info_dict: info dict for the song as returned by
        ``MusicInfo.get_info()``.
    :param max_retries: how many times the page is re-scraped before giving
        up. The bound matters because every retry launches a browser, and a
        song whose link can never be captured would otherwise retry until
        the interpreter's recursion limit is hit.
    """
    retries_used = 0
    while True:
        spider = DownMusic(info_dict)
        if not spider.is_error():
            print('完成下载')
            return
        if retries_used >= max_retries:
            print('多次重试后仍无法下载,已放弃')
            return
        retries_used += 1
        # The download failed: fetch the page again to obtain fresh links.
        print('下载链接已失效,尝试重新获取')
        info_dict = MusicInfo(url).get_info()
def main():
    """Run the interactive search-and-download loop.

    Outer loop: read a keyword, search, scrape every result page and print
    a numbered list. Inner loop: read a number and download that song;
    entering 0 returns to the search prompt. Invalid input is reported and
    the prompt is repeated instead of terminating the program.
    """
    while True:
        keyword = input('输入歌名/歌手,回车后查询:').strip()
        if not keyword:
            continue  # nothing typed: ask again instead of searching for ''
        html_str = get_search_result(keyword)
        if not html_str:
            continue  # search failed (no page text); back to the prompt
        urls = get_urls(html_str)
        if not urls:
            print('未找到相关歌曲')
            continue
        url_info_dict = get_music_info(urls)
        show_music_list(url_info_dict)
        # Materialise once so the displayed numbers can index it directly.
        entries = list(url_info_dict.items())
        while True:
            raw_number = input('输入歌曲编号,回车后下载歌曲(输入0返回搜索栏):').strip()
            try:
                number = int(raw_number)
            except ValueError:
                print('请输入有效的数字编号')
                continue
            if number == 0:
                break
            # Reject negatives too: a negative index would silently pick a
            # song counted from the end of the list.
            if not 1 <= number <= len(entries):
                print(f'编号超出范围(1-{len(entries)})')
                continue
            select_url, select_info_dict = entries[number - 1]
            down_music(select_url, select_info_dict)


if __name__ == '__main__':
    main()
music_info.py
[Python] 纯文本查看 复制代码 # 该模块用于获取歌曲的封面、文件名、下载链接等信息
from playwright.sync_api import sync_playwright
class MusicInfo:
    """Scrape one song page for its download links and song name.

    Constructing an instance opens the page in headless Chromium right away;
    the audio and cover URLs are taken from the network responses observed
    while the page loads, and the song name and lyrics link are parsed from
    the page source. Read the result with :meth:`get_info`.
    """

    def __init__(self, music_page: str):
        """
        :param music_page: URL of the song page.
        """
        self._music_download_link = ''  # audio download URL
        self._cover_download_link = ''  # cover image URL
        self._lrc_download_link = ''  # lyrics (.lrc) download URL
        self._music_name = ''  # song name, later used as the file name
        self._goto_page(music_page)

    def _goto_page(self, music_page: str):
        """Load the page in a headless browser and harvest its data.

        :param music_page: URL of the song page.
        """
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                # Inspect every response the page triggers while loading.
                page.on('response', self._on_response)
                page.goto(music_page)
                page.wait_for_load_state('networkidle')
                html = page.content()  # page source after loading
            finally:
                # Close the browser even when navigation raises.
                browser.close()
        self._get_music_name_and_lrc(html)

    def _on_response(self, response):
        """Record audio / cover URLs seen among the page's network responses.

        Only URLs containing ``kuwo`` (Kuwo endpoint) or ``music.126``
        (NetEase Cloud Music endpoint) are considered; a URL containing
        ``.mp3`` is stored as the audio link, otherwise one containing
        ``.jpg`` is stored as the cover link.
        """
        url = response.url
        if 'kuwo' not in url and 'music.126' not in url:
            return
        if '.mp3' in url:
            self._music_download_link = url
        elif '.jpg' in url:
            self._cover_download_link = url

    @staticmethod
    def _attr_value(text: str, attr: str):
        """Return the quoted value of ``attr="..."`` in ``text``, or None."""
        key = attr + '='
        pos = text.find(key)
        if pos == -1:
            return None
        quote_pos = pos + len(key)
        quote_char = text[quote_pos:quote_pos + 1]
        if quote_char not in ('"', "'"):
            return None
        end = text.find(quote_char, quote_pos + 1)
        if end == -1:
            return None
        return text[quote_pos + 1:end]

    def _get_music_name_and_lrc(self, html: str):
        """Parse the song name and the lyrics link out of the page source.

        The page is scanned line by line. A line is used only when the
        expected markers are actually present; lines that merely mention
        "description" or the button id are skipped, so they cannot
        overwrite a value that was already parsed correctly.
        """
        for line in html.split('\n'):
            if 'description' in line:
                # e.g. <meta name="description" content="青花瓷-周杰伦.mp3免费在线下载播放,...
                start = line.find('content=')
                if start == -1:
                    continue
                # +1 skips the opening quote of the attribute value.
                value_start = start + len('content=') + 1
                end = line.find('.mp3', value_start)
                if end == -1:
                    continue
                self._music_name = line[value_start:end]
            elif 'btn-download-lrc' in line and 'href' in line:
                # e.g. <a id="btn-download-lrc" href="/download/lrc/1655094" class="btn btn-primary"
                marker = line.find('btn-download-lrc')
                tag_start = line.rfind('<', 0, marker)
                if tag_start == -1:
                    continue  # the id is mentioned outside an HTML tag
                tag_end = line.find('>', marker)
                tag = line[tag_start:tag_end if tag_end != -1 else len(line)]
                # Reading the attribute inside the tag makes the result
                # independent of the order in which attributes are written.
                short_lrc_url = self._attr_value(tag, 'href')
                if short_lrc_url:
                    self._lrc_download_link = 'https://www.gequbao.com' + short_lrc_url

    def get_info(self):
        """Return the scraped data.

        :return: dict with the keys ``music_download_link``,
            ``cover_download_link``, ``lrc_download_link`` and
            ``music_name``; a value is ``''`` when it was not found.
        """
        info_dict = {
            'music_download_link': self._music_download_link,
            'cover_download_link': self._cover_download_link,
            'lrc_download_link': self._lrc_download_link,
            'music_name': self._music_name
        }
        return info_dict
down_music.py
[Python] 纯文本查看 复制代码 # 该模块用于下载歌曲、歌词和封面,并把封面和歌词写入歌曲文件的ID3标签
import os
import requests
from mutagen.id3 import ID3, APIC, USLT
class DownMusic:
    """Download a song and embed its cover and lyrics into the MP3's ID3 tag.

    All work happens in the constructor; call :meth:`is_error` afterwards to
    find out whether the audio file was downloaded. Cover and lyrics are
    optional extras: when either is unavailable the song is still kept and
    only that part of the tag is skipped.
    """

    # Characters that cannot appear in a file name on at least one common OS;
    # each is replaced by '_' when the song name is used as a file name.
    _UNSAFE_FILENAME_CHARS = str.maketrans('\\/:*?"<>|', '_' * 9)

    def __init__(self, info_dict: dict):
        """
        :param info_dict: dict with the keys ``music_download_link``,
            ``cover_download_link``, ``lrc_download_link`` and ``music_name``.
        """
        self._music_download_link = info_dict['music_download_link']
        self._cover_download_link = info_dict['cover_download_link']
        self._lrc_download_link = info_dict['lrc_download_link']
        self._music_name = info_dict['music_name']
        self._file_stem = self._safe_stem(self._music_name)
        self._has_lrc = False
        self._has_cover = False
        self._is_error = True
        if not self._music_download_link:
            return  # no audio link was captured: nothing to download
        # The audio link expires after a while; a failed download is how an
        # expired link shows up.
        if not self._down_music():
            return
        self._is_error = False
        self._has_lrc = self._down_lrc()
        self._has_cover = self._down_cover()
        self._join_music_metadata()
        self._delete_useless_file()

    def is_error(self):
        """Return True when the audio file could not be downloaded."""
        return self._is_error

    @staticmethod
    def _safe_stem(name: str) -> str:
        """Turn a song name into a string usable as a file name."""
        stem = name.translate(DownMusic._UNSAFE_FILENAME_CHARS).strip()
        return stem or 'untitled'

    def _down_music(self):
        """Download the audio file; return True on success."""
        filename = self._file_stem + '.mp3'
        return self._download_file(self._music_download_link, filename)

    def _down_lrc(self):
        """Download the lyrics file; return True on success."""
        filename = self._file_stem + '.lrc'
        return self._download_file(self._lrc_download_link, filename)

    def _down_cover(self):
        """Download the cover image; return True on success."""
        filename = self._file_stem + '.jpg'
        return self._download_file(self._cover_download_link, filename)

    def _join_music_metadata(self):
        """Write the downloaded cover and lyrics into the MP3's ID3 tag."""
        # Function-scope import so the module's import block stays untouched.
        from mutagen.id3 import ID3NoHeaderError

        if not (self._has_cover or self._has_lrc):
            return  # nothing to embed; leave the audio file as downloaded
        file_music = self._file_stem + '.mp3'
        file_lrc = self._file_stem + '.lrc'
        file_cover = self._file_stem + '.jpg'
        try:
            audio = ID3(file_music)
        except ID3NoHeaderError:
            # The MP3 carries no ID3 tag yet: start a new, empty one.
            audio = ID3()
        # Add the cover
        if self._has_cover:
            with open(file_cover, 'rb') as f:
                cover = f.read()
            audio['APIC'] = APIC(
                encoding=3,  # utf-8
                mime='image/jpeg',  # image/jpeg or image/png
                type=3,  # cover image
                desc=u'Cover',
                data=cover
            )
        # Add the lyrics
        if self._has_lrc:
            try:
                with open(file_lrc, 'r', encoding='utf-8') as f:
                    lyrics = f.read()
            except UnicodeDecodeError:
                # Not valid UTF-8: skip the lyrics rather than lose the song.
                print('歌词文件不是UTF-8编码,已跳过写入歌词')
            else:
                audio['USLT'] = USLT(
                    encoding=3,  # utf-8
                    lang='chi',  # lyrics language
                    desc=u'Lyrics',
                    text=lyrics
                )
        # Pass the path explicitly: a freshly created tag has no file name.
        audio.save(file_music)

    def _delete_useless_file(self):
        """Remove the temporary lyrics and cover files, if present."""
        for suffix in ('.lrc', '.jpg'):
            try:
                os.remove(self._file_stem + suffix)
            except FileNotFoundError:
                pass  # that file was never downloaded

    @staticmethod
    def _download_file(url, filename):
        """Download ``url`` to ``filename``.

        :return: True when a non-empty successful response was saved; False
            when the URL is empty, the request fails, the server answers
            with an error status, or the body is empty. Nothing is written
            in the False cases.
        """
        if not url:
            return False
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
        }
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            return False
        # An error page must not be saved as if it were the requested file.
        if not response.ok or not response.content:
            return False
        with open(filename, 'wb') as f:
            f.write(response.content)
        return True
|