前言
偶然一次看到语音合成的相关功能,挺感兴趣的。经过一番了解后,看到了某浏览器的大声朗读功能。尝试抓包,并使用js调用,实现了这个小功能。
准备
某浏览器打开一个本地的txt文档,右键大声朗读,使用charles进行抓包,得到以下结果:
抓包结果1
一个是获取语音列表的接口,包含各种语音(晓晓、云希等等);另一个是websocket请求,用于获取音频二进制流。两个接口的TrustedClientToken参数都是固定的,可以直接使用。websocket请求中,客户端会发送两条数据,如下:
抓包结果2
接下来就通过h5页面简单实现这个功能。
代码
页面模板
<!-- Root template for the TTS demo component (referenced by `template: '#main'`). -->
<template id="main">
<!-- Locale picker; switching locale resets the voice index to the first entry. -->
<div class="item-line">
<p class="item-title">选择语言:</p>
<select v-model="state.lan" @change="state.voice = 0">
<option v-for="item in lanList" :value="item">{{item}}</option>
</select>
</div>
<!-- Voice picker, populated from the voices grouped under the chosen locale. -->
<div class="item-line">
<p class="item-title">选择语音:</p>
<select v-model="state.voice">
<option v-for="(item, index) in localVoiceList[state.lan]" :value="index">{{item.ShortName}}</option>
</select>
</div>
<!-- Pitch offset (Hz) and rate offset (%); bound as numbers for numToString(). -->
<div class="item-line">
<p class="item-title">设置音调:</p>
<input class="ipt-number" step="10" v-model.number="state.pitch" type="number">
</div>
<div class="item-line">
<p class="item-title">设置语速:</p>
<input class="ipt-number" step="10" v-model.number="state.rate" type="number">
</div>
<!-- Text to synthesize; Enter key triggers playback. -->
<div class="item-line">
<textarea class="ipt-box" placeholder="请输入要合成的内容" v-model="state.text" @keydown.enter="speak"></textarea>
</div>
<div class="item-line">
<div class="submit-btn" @click="speak">播放语音</div>
</div>
<div class="item-line">
<div class="submit-btn" @click="download">下载语音</div>
</div>
</template>
js代码
import { reactive } from 'https://unpkg.com/vue@3.2.45/dist/vue.esm-browser.prod.js'
// Shared UTF-8 encoder for building byte patterns from header strings.
const textEncoder = new TextEncoder()
// Comma-joined decimal byte values of the UTF-8 encoding of 'Path:audio\r\n'
// (Uint8Array.prototype.toString()), e.g. "80,97,116,104,...". Used to locate
// the end of the textual header inside each binary websocket frame.
const binaryHeadEnd = textEncoder.encode('Path:audio\r\n').toString()
/**
 * Fetches the read-aloud voice catalogue and exposes it reactively.
 *
 * Returns `[voicesByLocale, localeList]`: a reactive object mapping each
 * locale (e.g. 'zh-CN') to its voice entries, and a reactive array of the
 * locales in first-seen order. Both start empty and fill in asynchronously
 * once the request resolves; failures are logged instead of being left as
 * an unhandled rejection (the original floating promise swallowed errors).
 */
function getVoiceList() {
  const _voiceList = reactive({})
  const _lanList = reactive([])
  fetch('https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4')
    .then(res => {
      if (!res.ok) {
        throw new Error(`voice list request failed: ${res.status}`)
      }
      return res.json()
    })
    .then(voices => {
      // Group voices by locale, recording each locale the first time it appears.
      voices.forEach(voice => {
        if (!_voiceList[voice.Locale]) {
          _lanList.push(voice.Locale)
          _voiceList[voice.Locale] = []
        }
        _voiceList[voice.Locale].push(voice)
      })
    })
    .catch(err => {
      console.error('获取语音列表失败', err)
    })
  return [_voiceList, _lanList]
}
/**
 * Generates a 32-character lowercase hexadecimal pseudo-random request ID.
 * Built from eight 4-hex-digit segments; not cryptographically secure
 * (Math.random), which is fine for correlating websocket requests.
 */
function guid() {
  // (1 + random) * 0x10000 is in [0x10000, 0x1ffff], so toString(16) always
  // yields 5 digits starting with '1'; dropping it leaves 4 uniform hex digits.
  const segment = () => (((1 + Math.random()) * 0x10000) | 0)
    .toString(16)
    .slice(1)
  return Array.from({ length: 8 }, segment).join('')
}
/**
 * Formats a number with an explicit leading sign, as the TTS prosody
 * attributes require: 5 -> "+5", 0 -> "+0", -3 -> "-3".
 */
function numToString(num) {
  const sign = num >= 0 ? '+' : ''
  return `${sign}${num}`
}
// Builds the `speech.config` control frame sent as the first websocket message.
// Requests word-boundary metadata and the given audio output format.
// NOTE(review): `${new Date()}` serializes via Date.prototype.toString() — a
// locale-dependent string, not ISO 8601. The service appears to accept it
// (this mirrors the captured traffic), but confirm whether an ISO timestamp
// is expected.
const speechConfig = (audioOutputFormat = 'webm-24khz-16bit-mono-opus') => `X-Timestamp:${new Date()}\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},"outputFormat":"${audioOutputFormat}"}}}}`
// Builds the SSML request frame: headers (request id, content type, timestamp)
// followed by a <speak> document. `pitch` is suffixed with Hz and `rate`/`volume`
// with %, and each is expected to already carry an explicit sign (see
// numToString). `text` is interpolated unescaped, so XML-special characters in
// the input ('<', '&', ...) would produce invalid SSML.
const ssmlText = ({requestId = guid(), lan = 'zh-CN', voiceName, pitch = '+0', rate = '+0', volume = '+0', text}) => `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:${new Date()}\r\nPath:ssml\r\n\r\n<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='${lan}'><voice name='${voiceName}'><prosody pitch='${pitch}Hz' rate ='${rate}%' volume='${volume}%'>${text}</prosody></voice></speak>`
// Returns the index of the first occurrence of `needle` (Uint8Array) inside
// `haystack` (Uint8Array), or -1 if absent.
function indexOfBytes(haystack, needle) {
  outer: for (let i = 0; i + needle.length <= haystack.length; i++) {
    for (let j = 0; j < needle.length; j++) {
      if (haystack[i + j] !== needle[j]) continue outer
    }
    return i
  }
  return -1
}
/**
 * Synthesizes `state.text` via the Edge read-aloud websocket endpoint and
 * resolves with an object URL for the assembled audio blob.
 *
 * Rejects with "请输入文字" when the text is empty, with the error event on
 * socket failure, or with the close code on an abnormal close.
 *
 * Fixes over the original implementation:
 * - frames that do not contain the 'Path:audio\r\n' header no longer throw
 *   (previously `.split(...)[1].split` crashed with a TypeError);
 * - audio bytes are collected as Uint8Array chunks instead of spreading every
 *   byte into `push(...)`, which could overflow the call stack on large
 *   frames and did a costly byte -> string -> number round-trip.
 */
function getAudio(state, localVoiceList) {
  // One Uint8Array per binary frame, in arrival order.
  const audioChunks = []
  // Byte pattern terminating the textual header of each binary frame.
  const headerEnd = textEncoder.encode('Path:audio\r\n')
  return new Promise((resolve, reject) => {
    if (!state.text) {
      reject("请输入文字")
      return
    }
    const requestId = guid()
    const ws = new WebSocket("wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=6A5AA1D4EAFF4E9FB37E23D68491D6F4")
    ws.addEventListener('open', () => {
      // First frame configures the output format, second carries the SSML.
      ws.send(speechConfig())
      ws.send(ssmlText({
        requestId,
        text: state.text,
        lan: state.lan,
        voiceName: localVoiceList[state.lan][state.voice].Name,
        pitch: numToString(state.pitch),
        rate: numToString(state.rate),
      }))
    })
    ws.addEventListener('message', async ({ data }) => {
      // Text frames (turn.start / metadata) are ignored; only binary frames
      // carry audio.
      if (!(data instanceof Blob)) {
        return
      }
      const view = new Uint8Array(await data.arrayBuffer())
      // Strip the textual frame header: keep only the bytes after
      // 'Path:audio\r\n'. Frames without the marker carry no audio payload.
      const start = indexOfBytes(view, headerEnd)
      if (start !== -1) {
        audioChunks.push(view.subarray(start + headerEnd.length))
      }
      // Per the captured traffic, a frame starting with 0x00 0x67 0x58 marks
      // the end of the audio stream.
      if (view[0] === 0x00 && view[1] === 0x67 && view[2] === 0x58) {
        ws.close(1000)
      }
    })
    ws.addEventListener("error", (err) => {
      console.log('------出错了', err)
      reject(err)
    })
    ws.addEventListener('close', (event) => {
      // 1000 is the normal-closure code we request above; anything else is
      // a failure.
      if (event.code != 1000) {
        console.error('----关闭了', event)
        reject(event.code)
        return
      }
      const blob = new Blob(audioChunks, {type: 'audio/webm'})
      resolve(URL.createObjectURL(blob))
    })
  })
}
export default {
  template: '#main',
  setup() {
    // UI state: the text to synthesize plus voice/prosody settings bound to
    // the template controls.
    const state = reactive({
      text: '',
      pitch: 0,
      rate: 0,
      volume: 0,
      lan: 'zh-CN',
      voice: 0
    })
    const [localVoiceList, lanList] = getVoiceList()
    // Synthesize and play. The object URL is revoked once playback ends —
    // the original never revoked it, leaking a blob per click.
    function speak() {
      getAudio(state, localVoiceList).then(url => {
        const audio = new Audio(url)
        audio.addEventListener('ended', () => URL.revokeObjectURL(url), { once: true })
        audio.play()
      }).catch(err => {
        alert(err)
      })
    }
    // Synthesize and trigger a file download via a temporary hidden link.
    function download() {
      getAudio(state, localVoiceList).then(url => {
        const link = document.createElement('a')
        link.download = `audio_${Date.now()}.webm`
        link.href = url
        link.style = 'display: none';
        document.body.append(link)
        link.click();
        link.remove();
        // The click has started the download; release the blob URL.
        URL.revokeObjectURL(url)
      }).catch(err => {
        alert(err)
      })
    }
    return {
      state,
      lanList,
      localVoiceList,
      speak,
      download
    }
  }
}