前言
偶然一次看到语音合成的相关功能,挺感兴趣的。经过一番了解后,看到了某浏览器的大声朗读功能。尝试抓包,并使用js调用,实现了这个小功能。
准备
某浏览器打开一个本地的txt文档,右键大声朗读,使用charles进行抓包,得到以下结果:
抓包结果1
一个是获取语音列表的接口,包含各种语音(晓晓、云希等等);另一个是websocket请求,用于获取音频二进制流。两个接口的TrustedClientToken参数都是固定的,可以直接使用。websocket请求中,客户端会发送两条数据,如下:
抓包结果2
接下来就通过h5页面简单实现这个功能。
代码
页面模板
<!-- Root template for the TTS demo component (referenced by `template: '#main'`). -->
<template id="main">
<!-- Locale picker; switching locale resets the voice index to the first entry. -->
<div class="item-line">
<p class="item-title">选择语言:</p>
<select v-model="state.lan" @change="state.voice = 0">
<option v-for="item in lanList" :value="item">{{item}}</option>
</select>
</div>
<!-- Voice picker, populated from the voices grouped under the chosen locale. -->
<div class="item-line">
<p class="item-title">选择语音:</p>
<select v-model="state.voice">
<option v-for="(item, index) in localVoiceList[state.lan]" :value="index">{{item.ShortName}}</option>
</select>
</div>
<!-- Pitch offset (Hz) and rate offset (%); bound as numbers for numToString(). -->
<div class="item-line">
<p class="item-title">设置音调:</p>
<input class="ipt-number" step="10" v-model.number="state.pitch" type="number">
</div>
<div class="item-line">
<p class="item-title">设置语速:</p>
<input class="ipt-number" step="10" v-model.number="state.rate" type="number">
</div>
<!-- Text to synthesize; Enter key triggers playback. -->
<div class="item-line">
<textarea class="ipt-box" placeholder="请输入要合成的内容" v-model="state.text" @keydown.enter="speak"></textarea>
</div>
<div class="item-line">
<div class="submit-btn" @click="speak">播放语音</div>
</div>
<div class="item-line">
<div class="submit-btn" @click="download">下载语音</div>
</div>
</template>
js代码
import { reactive } from 'https://unpkg.com/vue@3.2.45/dist/vue.esm-browser.prod.js'
// Shared UTF-8 encoder for building byte patterns from header strings.
const textEncoder = new TextEncoder()
// Comma-joined decimal byte values of the UTF-8 encoding of 'Path:audio\r\n'
// (Uint8Array.prototype.toString()), e.g. "80,97,116,104,...". Used to locate
// the end of the textual header inside each binary websocket frame.
const binaryHeadEnd = textEncoder.encode('Path:audio\r\n').toString()
/**
 * Fetches the read-aloud voice catalogue and exposes it reactively.
 *
 * Returns `[voicesByLocale, localeList]`: a reactive object mapping each
 * locale (e.g. 'zh-CN') to its voice entries, and a reactive array of the
 * locales in first-seen order. Both start empty and fill in asynchronously
 * once the request resolves; failures are logged instead of being left as
 * an unhandled rejection (the original floating promise swallowed errors).
 */
function getVoiceList() {
  const _voiceList = reactive({})
  const _lanList = reactive([])
  fetch('https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4')
    .then(res => {
      if (!res.ok) {
        throw new Error(`voice list request failed: ${res.status}`)
      }
      return res.json()
    })
    .then(voices => {
      // Group voices by locale, recording each locale the first time it appears.
      voices.forEach(voice => {
        if (!_voiceList[voice.Locale]) {
          _lanList.push(voice.Locale)
          _voiceList[voice.Locale] = []
        }
        _voiceList[voice.Locale].push(voice)
      })
    })
    .catch(err => {
      console.error('获取语音列表失败', err)
    })
  return [_voiceList, _lanList]
}
/**
 * Generates a 32-character lowercase hexadecimal pseudo-random request ID.
 * Built from eight 4-hex-digit segments; not cryptographically secure
 * (Math.random), which is fine for correlating websocket requests.
 */
function guid() {
  // (1 + random) * 0x10000 is in [0x10000, 0x1ffff], so toString(16) always
  // yields 5 digits starting with '1'; dropping it leaves 4 uniform hex digits.
  const segment = () => (((1 + Math.random()) * 0x10000) | 0)
    .toString(16)
    .slice(1)
  return Array.from({ length: 8 }, segment).join('')
}
/**
 * Formats a number with an explicit leading sign, as the TTS prosody
 * attributes require: 5 -> "+5", 0 -> "+0", -3 -> "-3".
 */
function numToString(num) {
  const sign = num >= 0 ? '+' : ''
  return `${sign}${num}`
}
// Builds the `speech.config` control frame sent as the first websocket message.
// Requests word-boundary metadata and the given audio output format.
// NOTE(review): `${new Date()}` serializes via Date.prototype.toString() — a
// locale-dependent string, not ISO 8601. The service appears to accept it
// (this mirrors the captured traffic), but confirm whether an ISO timestamp
// is expected.
const speechConfig = (audioOutputFormat = 'webm-24khz-16bit-mono-opus') => `X-Timestamp:${new Date()}\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},"outputFormat":"${audioOutputFormat}"}}}}`
// Builds the SSML request frame: headers (request id, content type, timestamp)
// followed by a <speak> document. `pitch` is suffixed with Hz and `rate`/`volume`
// with %, and each is expected to already carry an explicit sign (see
// numToString). `text` is interpolated unescaped, so XML-special characters in
// the input ('<', '&', ...) would produce invalid SSML.
const ssmlText = ({requestId = guid(), lan = 'zh-CN', voiceName, pitch = '+0', rate = '+0', volume = '+0', text}) => `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:${new Date()}\r\nPath:ssml\r\n\r\n<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='${lan}'><voice name='${voiceName}'><prosody pitch='${pitch}Hz' rate ='${rate}%' volume='${volume}%'>${text}</prosody></voice></speak>`
// Returns the index of the first occurrence of `needle` (Uint8Array) inside
// `haystack` (Uint8Array), or -1 if absent.
function indexOfBytes(haystack, needle) {
  outer: for (let i = 0; i + needle.length <= haystack.length; i++) {
    for (let j = 0; j < needle.length; j++) {
      if (haystack[i + j] !== needle[j]) continue outer
    }
    return i
  }
  return -1
}
/**
 * Synthesizes `state.text` via the Edge read-aloud websocket endpoint and
 * resolves with an object URL for the assembled audio blob.
 *
 * Rejects with "请输入文字" when the text is empty, with the error event on
 * socket failure, or with the close code on an abnormal close.
 *
 * Fixes over the original implementation:
 * - frames that do not contain the 'Path:audio\r\n' header no longer throw
 *   (previously `.split(...)[1].split` crashed with a TypeError);
 * - audio bytes are collected as Uint8Array chunks instead of spreading every
 *   byte into `push(...)`, which could overflow the call stack on large
 *   frames and did a costly byte -> string -> number round-trip.
 */
function getAudio(state, localVoiceList) {
  // One Uint8Array per binary frame, in arrival order.
  const audioChunks = []
  // Byte pattern terminating the textual header of each binary frame.
  const headerEnd = textEncoder.encode('Path:audio\r\n')
  return new Promise((resolve, reject) => {
    if (!state.text) {
      reject("请输入文字")
      return
    }
    const requestId = guid()
    const ws = new WebSocket("wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=6A5AA1D4EAFF4E9FB37E23D68491D6F4")
    ws.addEventListener('open', () => {
      // First frame configures the output format, second carries the SSML.
      ws.send(speechConfig())
      ws.send(ssmlText({
        requestId,
        text: state.text,
        lan: state.lan,
        voiceName: localVoiceList[state.lan][state.voice].Name,
        pitch: numToString(state.pitch),
        rate: numToString(state.rate),
      }))
    })
    ws.addEventListener('message', async ({ data }) => {
      // Text frames (turn.start / metadata) are ignored; only binary frames
      // carry audio.
      if (!(data instanceof Blob)) {
        return
      }
      const view = new Uint8Array(await data.arrayBuffer())
      // Strip the textual frame header: keep only the bytes after
      // 'Path:audio\r\n'. Frames without the marker carry no audio payload.
      const start = indexOfBytes(view, headerEnd)
      if (start !== -1) {
        audioChunks.push(view.subarray(start + headerEnd.length))
      }
      // Per the captured traffic, a frame starting with 0x00 0x67 0x58 marks
      // the end of the audio stream.
      if (view[0] === 0x00 && view[1] === 0x67 && view[2] === 0x58) {
        ws.close(1000)
      }
    })
    ws.addEventListener("error", (err) => {
      console.log('------出错了', err)
      reject(err)
    })
    ws.addEventListener('close', (event) => {
      // 1000 is the normal-closure code we request above; anything else is
      // a failure.
      if (event.code != 1000) {
        console.error('----关闭了', event)
        reject(event.code)
        return
      }
      const blob = new Blob(audioChunks, {type: 'audio/webm'})
      resolve(URL.createObjectURL(blob))
    })
  })
}
export default {
  template: '#main',
  setup() {
    // UI state: the text to synthesize plus voice/prosody settings bound to
    // the template controls.
    const state = reactive({
      text: '',
      pitch: 0,
      rate: 0,
      volume: 0,
      lan: 'zh-CN',
      voice: 0
    })
    const [localVoiceList, lanList] = getVoiceList()
    // Synthesize and play. The object URL is revoked once playback ends —
    // the original never revoked it, leaking a blob per click.
    function speak() {
      getAudio(state, localVoiceList).then(url => {
        const audio = new Audio(url)
        audio.addEventListener('ended', () => URL.revokeObjectURL(url), { once: true })
        audio.play()
      }).catch(err => {
        alert(err)
      })
    }
    // Synthesize and trigger a file download via a temporary hidden link.
    function download() {
      getAudio(state, localVoiceList).then(url => {
        const link = document.createElement('a')
        link.download = `audio_${Date.now()}.webm`
        link.href = url
        link.style = 'display: none';
        document.body.append(link)
        link.click();
        link.remove();
        // The click has started the download; release the blob URL.
        URL.revokeObjectURL(url)
      }).catch(err => {
        alert(err)
      })
    }
    return {
      state,
      lanList,
      localVoiceList,
      speak,
      download
    }
  }
}