Building a Local Voice Q&A Assistant with PaddleSpeech and Ollama

Without further ado, let's get straight to it.

If you don't have PaddleSpeech installed yet, see this article for the installation steps. There are plenty of tutorials online for setting up Ollama; just search for one, it's straightforward. I'm using the qwen2.5:latest model, but you can switch to any other model depending on your machine's specs.
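
If you want to confirm Ollama is up and the model responds before wiring everything together, a quick non-streaming request works. This is a minimal sketch; the endpoint and model name match what server.py below uses, so adjust them if yours differ:

import requests

# One-off, non-streaming test of the Ollama generate API
resp = requests.post('http://localhost:11434/api/generate', json={
    'model': 'qwen2.5:latest',
    'prompt': '你好',
    'stream': False
})
print(resp.json().get('response'))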

Copy the code below into a folder, then double-click index.html to open it in your browser.
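
All three files sit side by side in one folder; server.py creates the uploads/ directory next to itself on first run:

index.html
script.js
server.py
uploads/    (created automatically by server.py)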

Press and hold the talk button. The first time you do this, the browser will ask for microphone access; just click Allow.

Then install the Python dependencies (server.py uses flask, flask-cors, and requests, and calls ffmpeg, which must be on your PATH) and start server.py. Remember to change the Ollama and PaddleSpeech URLs to match your setup, and update the request addresses in the JS file as well.
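
Before starting it, you can sanity-check the PaddleSpeech TTS endpoint with the same request shape server.py uses. This is a minimal sketch; change the host and port to wherever your PaddleSpeech server is listening:

import requests

# The request body mirrors the TTS call in server.py
tts = requests.post('http://localhost:8090/paddlespeech/tts', json={
    'text': '测试',
    'spk_id': 0,
    'speed': 1.0,
    'volume': 1.0,
    'sample_rate': 0,
})
print(tts.json().get('success'))  # True means the service returned audio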

Now hold the button on the page and ask a question. The assistant streams its answer as text while reading it aloud at the same time.
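
Under the hood, the page talks to server.py over server-sent events; each data: line carries one small JSON payload, which is what script.js switches on:

data: {"text": "..."}              a text chunk from the model
data: {"audio": "<base64 WAV>"}    a synthesized speech segment
data: {"error": "..."}             an error message
data: {"done": true}               end of the answer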

The code follows.

server.py
from flask import Flask, request, jsonify, Response, stream_with_context
from flask_cors import CORS
import os
import base64
import requests
import json
from werkzeug.utils import secure_filename
import subprocess
import random
import string


def generate_random_string(length):
    characters = string.ascii_letters + string.digits
    random_string = ''.join(random.choice(characters) for _ in range(length))
    return random_string


app = Flask(__name__)
CORS(app)

# Configure the upload folder
UPLOAD_FOLDER = 'uploads'
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)

# PaddleSpeech service endpoints
PADDLE_SPEECH_URL = 'http://localhost:8090/paddlespeech/asr'
PADDLE_SPEECH_TTS_URL = 'http://localhost:8090/paddlespeech/tts'

# Ollama service endpoint
OLLAMA_URL = 'http://localhost:11434/api/generate'

@app.route('/recognize', methods=['POST'])
def recognize_speech():
    try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file received'}), 400

        audio_file = request.files['audio']
        if audio_file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        # Save the uploaded audio file
        ogg_path = os.path.join(UPLOAD_FOLDER, secure_filename(audio_file.filename))
        audio_file.save(ogg_path)

        wavfilename = generate_random_string(10) + '.wav'
        # Convert the uploaded audio to 16 kHz PCM WAV with ffmpeg
        wav_path = os.path.join(UPLOAD_FOLDER, wavfilename)
        # Pass arguments as a list to avoid shell quoting problems; -y overwrites leftovers
        subprocess.run(['ffmpeg', '-y', '-i', ogg_path, '-acodec', 'pcm_s16le', '-ar', '16000', wav_path], check=True)

        # Read the WAV file and base64-encode it
        with open(wav_path, 'rb') as wav_file:
            audio_base64 = base64.b64encode(wav_file.read()).decode('utf-8')

        # Call the PaddleSpeech ASR service
        response = requests.post(PADDLE_SPEECH_URL, json={
            'audio': audio_base64,
            'audio_format': 'wav',
            'lang': "zh_cn",
            'sample_rate': 16000
        })

        if response.status_code != 200:
            raise Exception('PaddleSpeech ASR request failed')

        result = response.json()
        recognized_text = result.get('result').get('transcription')

        # Clean up the temporary files
        os.remove(ogg_path)
        os.remove(wav_path)

        return jsonify({'text': recognized_text})

    except Exception as e:
        return jsonify({'error': str(e)}), 500


@app.route('/chat', methods=['GET', 'POST'])
def chat_with_ollama():
    try:
        user_message = request.args.get('message')
        print(user_message)
        if not user_message:
            return jsonify({'error': 'No message received'}), 400

        # Build the request payload for Ollama
        ollama_data = {
            "model": "qwen2.5:latest",  # adjust to the model you are running
            # "model": "deepseek-r1:14b",  # an alternative model
            "prompt": user_message,
            "stream": True
        }

        # Stream text and synthesized audio back as server-sent events
        def generate():
            try:
                # Send the request to the Ollama service
                response = requests.post(OLLAMA_URL, json=ollama_data, stream=True)

                if response.status_code != 200:
                    yield f"data: {json.dumps({'error': 'Ollama服务调用失败'})}\n\n"
                    return

                # Accumulate text so TTS can be called on sentence-sized chunks
                accumulated_text = ""

                # Once the model reaches its reference section, stop sending text to TTS
                endspeekstat = False
                # Process the streamed NDJSON response line by line
                for line in response.iter_lines():
                    if line:
                        try:
                            json_response = json.loads(line.decode('utf-8'))
                            print(json_response)
                            if 'response' in json_response:
                                response_text = json_response['response']
                                accumulated_text += response_text

                                # The model marks its reference section with '引用内容'; skip TTS from there on
                                if '引用内容' in response_text:
                                    endspeekstat = True
                                # Strip Markdown markers so they are not read aloud
                                accumulated_text = accumulated_text.strip().replace('*', '').replace('-', '').replace('#', '')
                                # Forward the text chunk to the front end
                                yield f"data: {json.dumps({'text': response_text})}\n\n"

                                # When enough text accumulates or a sentence ends, synthesize it
                                if not endspeekstat:
                                    if len(accumulated_text) >= 20 or any(p in response_text for p in ['。', '!', '?', '.', '!', '?', '\n']):
                                        print(accumulated_text)

                                        # Call the PaddleSpeech TTS service
                                        tts_response = requests.post(PADDLE_SPEECH_TTS_URL, json={
                                            'text': accumulated_text,
                                            "spk_id": 0,
                                            "speed": 1.0,
                                            "volume": 1.0,
                                            "sample_rate": 0,
                                        })
                                        tts_result = tts_response.json()
                                        if tts_response.status_code == 200 and tts_result.get('success'):
                                            audio_base64 = tts_result.get('result').get('audio')

                                            if audio_base64:
                                                # Send the audio segment to the front end
                                                yield f"data: {json.dumps({'audio': audio_base64})}\n\n"
                                                # Reset the accumulated text
                                                accumulated_text = ""
                                            else:
                                                yield f"data: {json.dumps({'error': 'TTS service returned no audio data'})}\n\n"
                                        else:
                                            print('TTS request failed')
                                            accumulated_text = ""
                                            continue

                        except json.JSONDecodeError:
                            continue

                # Flush any remaining text through TTS before signalling completion;
                # if 'done' were sent first, the front end would close the connection
                # and the final audio segment would be lost
                if accumulated_text:
                    try:
                        # Call the PaddleSpeech TTS service with the same parameters as above
                        tts_response = requests.post(PADDLE_SPEECH_TTS_URL, json={
                            'text': accumulated_text,
                            "spk_id": 0,
                            "speed": 1.0,
                            "volume": 1.0,
                            "sample_rate": 0,
                        })

                        if tts_response.status_code == 200:
                            tts_result = tts_response.json()
                            audio_base64 = tts_result.get('result', {}).get('audio')

                            if audio_base64:
                                # Send the final audio segment
                                yield f"data: {json.dumps({'audio': audio_base64})}\n\n"
                            else:
                                yield f"data: {json.dumps({'error': 'TTS service returned no audio data'})}\n\n"
                        else:
                            yield f"data: {json.dumps({'error': 'TTS request failed'})}\n\n"
                    except Exception as e:
                        yield f"data: {json.dumps({'error': f'TTS error: {str(e)}'})}\n\n"

                # Signal completion
                yield f"data: {json.dumps({'done': True})}\n\n"

            except Exception as e:
                yield f"data: {json.dumps({'error': str(e)})}\n\n"

        return Response(stream_with_context(generate()), mimetype='text/event-stream')

    except Exception as e:
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
index.html
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
    <title>Voice Recognition and Chat Service</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            display: flex;
            flex-direction: column;
            align-items: center;
            padding: 20px;
            max-width: 800px;
            margin: 0 auto;
        }
        .container {
            width: 100%;
            display: flex;
            flex-direction: column;
            align-items: center;
        }
        #recordButton {
            padding: 20px 40px;
            font-size: 18px;
            background-color: #4CAF50;
            color: white;
            border: none;
            border-radius: 5px;
            cursor: pointer;
            margin: 20px;
        }
        #recordButton:active {
            background-color: #45a049;
        }
        .instructions {
            margin: 20px;
            padding: 15px;
            background-color: #f8f9fa;
            border-radius: 5px;
            width: 100%;
        }
        .instructions h3 {
            margin-top: 0;
            color: #333;
        }
        .instructions ul {
            padding-left: 20px;
        }
        .instructions li {
            margin: 10px 0;
        }
        .chat-container {
            width: 100%;
            margin-top: 20px;
            border: 1px solid #ddd;
            border-radius: 5px;
            overflow: hidden;
        }
        .chat-messages {
            height: 300px;
            overflow-y: auto;
            padding: 15px;
            background-color: #f9f9f9;
        }
        .message {
            margin-bottom: 10px;
            padding: 10px;
            border-radius: 5px;
            max-width: 80%;
        }
        .user-message {
            background-color: #e1f5fe;
            margin-left: auto;
        }
        .bot-message {
            background-color: #f1f1f1;
        }
        .typing-indicator {
            display: inline-block;
            margin-left: 10px;
            color: #666;
        }
        .status {
            margin-top: 10px;
            padding: 5px 10px;
            border-radius: 3px;
            background-color: #f0f0f0;
            font-size: 14px;
        }
        .audio-controls {
            margin-top: 10px;
            display: flex;
            gap: 10px;
        }
        .audio-button {
            padding: 5px 10px;
            background-color: #4CAF50;
            color: white;
            border: none;
            border-radius: 3px;
            cursor: pointer;
            font-size: 12px;
        }
        .audio-button:hover {
            background-color: #45a049;
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="instructions">
            <h3>Instructions:</h3>
            <ul>
                <li>On first use the browser will ask for microphone access; click "Allow"</li>
                <li>If you previously denied permission, click the icon on the left of the address bar to re-grant it</li>
                <li>Press and hold the "Hold to Talk" button to record; releasing the button sends the audio automatically</li>
                <li>Make sure a microphone is properly connected to your system</li>
                <li>If the browser blocks autoplay, click anywhere on the page and try again</li>
            </ul>
        </div>

        <button id="recordButton">按住说话</button>
        <div id="status" class="status">准备就绪</div>
        
        <div class="chat-container">
            <div id="chatMessages" class="chat-messages">
                <div class="message bot-message">您好!我是AI助手,请按住按钮说话,我会回答您的问题。</div>
            </div>
        </div>
    </div>
    
    <script src="script.js"></script>
</body>
</html>
script.js
let mediaRecorder;
let audioChunks = [];
let hasMicrophonePermission = false;
let audioStream = null;
let currentEventSource = null;
let audioQueue = [];
let isPlaying = false;

document.getElementById('recordButton').addEventListener('mousedown', startRecording);
document.getElementById('recordButton').addEventListener('mouseup', stopRecording);
document.getElementById('recordButton').addEventListener('touchstart', startRecording);
document.getElementById('recordButton').addEventListener('touchend', stopRecording);

// Check the microphone permission state
async function checkMicrophonePermission() {
    try {
        const result = await navigator.permissions.query({ name: 'microphone' });
        hasMicrophonePermission = result.state === 'granted';
        console.log('Microphone permission state:', result.state);
        
        // Listen for permission changes
        result.onchange = () => {
            hasMicrophonePermission = result.state === 'granted';
            console.log('Microphone permission state changed:', result.state);
        };
    } catch (err) {
        console.error('Failed to check microphone permission:', err);
    }
}

// Check permission on page load
checkMicrophonePermission();

async function startRecording(e) {
    e.preventDefault();
    audioChunks = [];
    
    try {
        // Check whether the browser supports getUserMedia
        if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
            throw new Error('Your browser does not support audio recording');
        }

        // If we already have permission and a stream, reuse it
        if (hasMicrophonePermission && audioStream) {
            console.log('Reusing the existing audio stream');
            initializeMediaRecorder(audioStream);
            return;
        }

        // List the available audio input devices
        const devices = await navigator.mediaDevices.enumerateDevices();
        const audioDevices = devices.filter(device => device.kind === 'audioinput');
        console.log('Available audio devices:', audioDevices);

        if (audioDevices.length === 0) {
            throw new Error('No microphone detected');
        }

        // Request microphone access
        audioStream = await navigator.mediaDevices.getUserMedia({
            audio: {
                echoCancellation: true,
                noiseSuppression: true,
                sampleRate: 16000
            }
        });

        // Make sure we actually got an audio track
        if (!audioStream.getAudioTracks().length) {
            throw new Error('Failed to acquire an audio stream');
        }

        console.log('Audio stream acquired:', audioStream.getAudioTracks());
        hasMicrophonePermission = true;

        initializeMediaRecorder(audioStream);
    } catch (err) {
        console.error('Recording failed:', err);
        let errorMessage = 'Recording failed: ';
        if (err.name === 'NotAllowedError') {
            errorMessage += 'please allow microphone access';
            hasMicrophonePermission = false;
        } else if (err.name === 'NotFoundError') {
            errorMessage += 'no microphone found';
        } else if (err.name === 'NotReadableError') {
            errorMessage += 'the microphone is in use by another application';
        } else {
            errorMessage += err.message;
        }
        updateStatus(errorMessage);
    }
}

function initializeMediaRecorder(stream) {
    // Pick a supported MIME type
    const mimeType = MediaRecorder.isTypeSupported('audio/webm')
        ? 'audio/webm'
        : MediaRecorder.isTypeSupported('audio/mp4')
            ? 'audio/mp4'
            : 'audio/wav';

    console.log('Recording format:', mimeType);

    // Create the MediaRecorder instance
    mediaRecorder = new MediaRecorder(stream, {
        mimeType: mimeType
    });

    mediaRecorder.ondataavailable = (event) => {
        if (event.data.size > 0) {
            audioChunks.push(event.data);
            console.log('Received audio chunk:', event.data.size, 'bytes');
        }
    };

    mediaRecorder.onerror = (error) => {
        console.error('MediaRecorder error:', error);
        updateStatus('Recording error, please try again');
    };

    mediaRecorder.start();
    document.getElementById('recordButton').textContent = 'Recording...';
    updateStatus('Recording...');
    console.log('Recording started');
}

async function stopRecording() {
    if (!mediaRecorder) return;

    // Assign the onstop handler before calling stop() so the event can't be missed
    mediaRecorder.onstop = async () => {
        try {
            const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType });
            console.log('Audio size:', audioBlob.size, 'bytes');
            await sendAudioToServer(audioBlob);
        } catch (err) {
            console.error('Failed to process the audio data:', err);
            updateStatus('Failed to process audio, please try again');
        }
    };

    try {
        mediaRecorder.stop();
        document.getElementById('recordButton').textContent = 'Hold to Talk';
        updateStatus('Processing...');
        console.log('Recording stopped');

        // Note: don't stop the stream's tracks, so the stream can be reused next time
        // mediaRecorder.stream.getTracks().forEach(track => track.stop());
    } catch (err) {
        console.error('Failed to stop recording:', err);
        updateStatus('Failed to stop recording, please try again');
    }
}

async function sendAudioToServer(audioBlob) {
    try {
        const formData = new FormData();
        formData.append('audio', audioBlob);

        updateStatus('Recognizing speech...');
        console.log('Sending audio data to the server...');

        const response = await fetch('http://localhost:5000/recognize', {
            method: 'POST',
            body: formData
        });

        if (!response.ok) {
            throw new Error(`Server error: ${response.status}`);
        }

        const result = await response.json();
        console.log('Server response:', result);
        
        if (result.error) {
            updateStatus(`Recognition failed: ${result.error}`);
            return;
        }
        
        const recognizedText = result.text || 'Nothing recognized';
        updateStatus('Speech recognized, generating an answer...');
        
        // Add the user's message to the chat window
        addMessage(recognizedText, 'user');
        
        // Send the text to the Ollama model
        await chatWithOllama(recognizedText);
        
    } catch (error) {
        console.error('Failed to send audio:', error);
        updateStatus('Recognition failed, please try again');
    }
}

async function chatWithOllama(message) {
    try {
        // Close the previous EventSource, if any
        if (currentEventSource) {
            currentEventSource.close();
        }
        
        // Reset the audio queue
        audioQueue = [];
        isPlaying = false;
        
        // Create a new message element for streaming output
        const botMessageElement = document.createElement('div');
        botMessageElement.className = 'message bot-message';
        document.getElementById('chatMessages').appendChild(botMessageElement);
        
        // Scroll to the bottom
        const chatMessages = document.getElementById('chatMessages');
        chatMessages.scrollTop = chatMessages.scrollHeight;

        // Open the EventSource connection
        currentEventSource = new EventSource(`http://localhost:5000/chat?message=${encodeURIComponent(message)}`);

        let fullResponse = '';
        
        currentEventSource.onmessage = function(event) {
            console.log('Server message:', event.data);
            const data = JSON.parse(event.data);

            if (data.error) {
                botMessageElement.textContent = `Error: ${data.error}`;
                updateStatus('Chat error');
                currentEventSource.close();
                return;
            }
            
            if (data.done) {
                updateStatus('Done');
                currentEventSource.close();
                return;
            }
            
            if (data.text) {
                fullResponse += data.text;
                // Render the accumulated Markdown
                botMessageElement.innerHTML = marked.parse(fullResponse);

                // Scroll to the bottom
                chatMessages.scrollTop = chatMessages.scrollHeight;
            }
            
            // Handle TTS audio data
            if (data.audio) {
                updateStatus('Playing the spoken answer...');
                // Queue the audio segment
                audioQueue.push(data.audio);
                // Start playback if nothing is playing yet
                if (!isPlaying) {
                    playNextAudio();
                }
            }
        };
        
        currentEventSource.onerror = function(error) {
            console.error('EventSource error:', error);
            updateStatus('Chat connection error');
            currentEventSource.close();
        };
        
    } catch (error) {
        console.error('Chat request failed:', error);
        updateStatus('Chat request failed');
    }
}

// Play the next audio segment in the queue
function playNextAudio() {
    if (audioQueue.length === 0) {
        isPlaying = false;
        return;
    }
    
    isPlaying = true;
    const audioBase64 = audioQueue.shift();
    playAudio(audioBase64);
}

// Decode and play one base64-encoded WAV segment
function playAudio(audioBase64) {
    try {
        // Convert the base64 string to a Blob
        const binaryString = atob(audioBase64);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
            bytes[i] = binaryString.charCodeAt(i);
        }
        const audioBlob = new Blob([bytes], { type: 'audio/wav' });
        
        // Create an object URL for the audio
        const audioUrl = URL.createObjectURL(audioBlob);
        
        // Create a fresh audio player
        const audioPlayer = new Audio(audioUrl);
        
        // When playback finishes, move on to the next segment
        audioPlayer.onended = function() {
            URL.revokeObjectURL(audioUrl);
            playNextAudio();
        };
        
        audioPlayer.onerror = function(e) {
            console.error('Audio playback error:', e);
            updateStatus('Audio playback failed');
            URL.revokeObjectURL(audioUrl);
            // Keep the queue moving even on error
            playNextAudio();
        };
        
        // Start playback
        audioPlayer.play().catch(e => {
            console.error('Failed to play audio:', e);
            updateStatus('Failed to play audio');
            // Keep the queue moving even on error
            playNextAudio();
        });
    } catch (error) {
        console.error('Failed to process the audio data:', error);
        updateStatus('Failed to process audio data');
        // Keep the queue moving even on error
        playNextAudio();
    }
}

function addMessage(text, sender) {
    const messageElement = document.createElement('div');
    messageElement.className = `message ${sender}-message`;
    messageElement.textContent = text;
    
    document.getElementById('chatMessages').appendChild(messageElement);
    
    // Scroll to the bottom
    const chatMessages = document.getElementById('chatMessages');
    chatMessages.scrollTop = chatMessages.scrollHeight;
}

function updateStatus(message) {
    document.getElementById('status').textContent = message;
}
