Building a Local Voice Q&A Assistant with PaddleSpeech and Ollama
Let's skip the preamble and get straight to it.
If you haven't installed PaddleSpeech yet, see this article for installation. There are plenty of tutorials online for setting up Ollama; just search for one, it's straightforward. I'm using the qwen2.5:latest model; you can switch to another depending on your machine's specs.
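Before wiring anything up, it helps to confirm that Ollama answers from Python. Here's a minimal sanity check, assuming the default Ollama port 11434 and the model name above (adjust both to your setup):

import requests

# One-off, non-streaming request: if this prints an answer, Ollama is ready
resp = requests.post('http://localhost:11434/api/generate',
                     json={'model': 'qwen2.5:latest', 'prompt': '你好', 'stream': False},
                     timeout=120)
resp.raise_for_status()
print(resp.json()['response'])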
Copy the code below into a folder, then double-click index.html to open it in your browser.
Press and hold "Hold to Talk"; the first time, the browser will prompt for microphone access. Just click Allow.
Then install the server's dependencies (flask, flask-cors, and requests, plus ffmpeg available on your PATH) and start server.py. Remember to change the Ollama and PaddleSpeech endpoints to your own, and update the request URLs in the JS file as well.
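It's also worth checking the PaddleSpeech TTS endpoint on its own before opening the page. A minimal sketch, assuming the server listens on port 8090 as configured in server.py below:

import base64
import requests

# Synthesize a short phrase and save it; if check.wav plays, TTS is ready
r = requests.post('http://localhost:8090/paddlespeech/tts',
                  json={'text': '测试', 'spk_id': 0, 'speed': 1.0,
                        'volume': 1.0, 'sample_rate': 0})
r.raise_for_status()
audio_b64 = r.json()['result']['audio']
with open('check.wav', 'wb') as f:
    f.write(base64.b64decode(audio_b64))
print('Wrote check.wav')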
Hold the button on the page and ask your question. The assistant streams its answer as text while reading it aloud chunk by chunk.
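Under the hood, the page and server.py talk over Server-Sent Events: each event is a single data: line of JSON carrying a text chunk, a base64-encoded WAV chunk, an error, or a completion flag. An abbreviated stream looks like this:

data: {"text": "你好"}
data: {"text": ",我是AI助手"}
data: {"audio": "UklGRi...base64 WAV data...Q=="}
data: {"done": true}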
The full code follows.
server.py
from flask import Flask, request, jsonify, Response, stream_with_context
from flask_cors import CORS
import os
import base64
import requests
import json
from werkzeug.utils import secure_filename
import subprocess
import random
import string
def generate_random_string(length):
characters = string.ascii_letters + string.digits
random_string = ''.join(random.choice(characters) for _ in range(length))
return random_string
app = Flask(__name__)
CORS(app)
# Folder for uploaded audio files
UPLOAD_FOLDER = 'uploads'
if not os.path.exists(UPLOAD_FOLDER):
os.makedirs(UPLOAD_FOLDER)
# PaddleSpeech service endpoints (change these to match your deployment)
PADDLE_SPEECH_URL = 'http://localhost:8090/paddlespeech/asr'
PADDLE_SPEECH_TTS_URL = 'http://localhost:8090/paddlespeech/tts'
# Ollama service endpoint
OLLAMA_URL = 'http://localhost:11434/api/generate'
@app.route('/recognize', methods=['POST'])
def recognize_speech():
try:
        if 'audio' not in request.files:
            return jsonify({'error': 'No audio file received'}), 400
        audio_file = request.files['audio']
        if audio_file.filename == '':
            return jsonify({'error': 'No file selected'}), 400
        # Save the uploaded audio (the browser sends WebM/OGG from MediaRecorder)
        src_path = os.path.join(UPLOAD_FOLDER, secure_filename(audio_file.filename))
        audio_file.save(src_path)
        wav_filename = generate_random_string(10) + '.wav'
        # Re-encode to 16 kHz mono 16-bit PCM WAV, which PaddleSpeech ASR expects;
        # passing an argument list to subprocess avoids shell-quoting issues
        wav_path = os.path.join(UPLOAD_FOLDER, wav_filename)
        subprocess.run(['ffmpeg', '-y', '-i', src_path, '-acodec', 'pcm_s16le',
                        '-ac', '1', '-ar', '16000', wav_path], check=True)
        # Read the WAV file and encode it as base64
        with open(wav_path, 'rb') as wav_file:
            audio_base64 = base64.b64encode(wav_file.read()).decode('utf-8')
        # Call the PaddleSpeech ASR service
        response = requests.post(PADDLE_SPEECH_URL, json={
            'audio': audio_base64,
            'audio_format': 'wav',
            'lang': "zh_cn",
            'sample_rate': 16000
        })
        if response.status_code != 200:
            raise Exception('PaddleSpeech ASR request failed')
        result = response.json()
        recognized_text = result.get('result', {}).get('transcription', '')
        # Clean up the temporary files
        os.remove(src_path)
        os.remove(wav_path)
return jsonify({'text': recognized_text})
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/chat', methods=['GET', 'POST'])
def chat_with_ollama():
try:
user_message = request.args.get('message')
print(user_message)
if not user_message:
return jsonify({'error': '没有收到消息'}), 400
# 准备发送给Ollama的请求
ollama_data = {
"model": "qwen2.5:latest", # 可以根据实际使用的模型进行调整
# "model": "deepseek-r1:14b", # 可以根据实际使用的模型进行调整
"prompt": user_message,
"stream": True
}
        # Stream the response back to the browser as Server-Sent Events
        def generate():
            try:
                # Forward the prompt to the Ollama service
                response = requests.post(OLLAMA_URL, json=ollama_data, stream=True)
                if response.status_code != 200:
                    yield f"data: {json.dumps({'error': 'Ollama request failed'})}\n\n"
                    return
                # Accumulate text so TTS can be called on sentence-sized chunks
                accumulated_text = ""
                stop_speaking = False
                # Process the streamed response line by line
                for line in response.iter_lines():
                    if line:
                        try:
                            data_str = line.decode('utf-8')
                            json_response = json.loads(data_str.replace('data: ', ''))
                            print(json_response)
                            if 'response' in json_response:
                                response_text = json_response['response']
                                accumulated_text += response_text
                                # Once the answer reaches quoted reference material, stop speaking it
                                if '引用内容' in response_text:
                                    stop_speaking = True
                                # Strip Markdown markers so they are not read aloud
                                accumulated_text = accumulated_text.strip().replace('*', '').replace('-', '').replace('#', '')
                                # Forward the text chunk to the frontend
                                yield f"data: {json.dumps({'text': response_text})}\n\n"
                                # When enough text has accumulated, or a sentence ends, synthesize it
                                if not stop_speaking and (len(accumulated_text) >= 20 or any(p in response_text for p in ['。', '!', '?', '.', '!', '?', '\n'])):
                                    print(accumulated_text)
                                    # Call the PaddleSpeech TTS service
                                    tts_response = requests.post(PADDLE_SPEECH_TTS_URL, json={
                                        'text': accumulated_text,
                                        "spk_id": 0,
                                        "speed": 1.0,
                                        "volume": 1.0,
                                        "sample_rate": 0,
                                    })
                                    print(tts_response.json())
                                    if tts_response.status_code == 200 and tts_response.json().get('success'):
                                        tts_result = tts_response.json()
                                        audio_base64 = tts_result.get('result', {}).get('audio')
                                        if audio_base64:
                                            # Forward the audio chunk to the frontend
                                            yield f"data: {json.dumps({'audio': audio_base64})}\n\n"
                                            # Reset the accumulated text
                                            accumulated_text = ""
                                        else:
                                            yield f"data: {json.dumps({'error': 'TTS returned no audio data'})}\n\n"
                                    else:
                                        print('TTS request failed')
                                        accumulated_text = ""
                        except json.JSONDecodeError:
                            continue
                # If any text is left over, synthesize it before finishing
                if accumulated_text:
                    try:
                        # Call the PaddleSpeech TTS service one last time
                        tts_response = requests.post(PADDLE_SPEECH_TTS_URL, json={
                            'text': accumulated_text,
                            "spk_id": 0,
                            "speed": 1.0,
                            "volume": 1.0,
                            "sample_rate": 0,
                        })
                        if tts_response.status_code == 200:
                            tts_result = tts_response.json()
                            audio_base64 = tts_result.get('result', {}).get('audio')
                            if audio_base64:
                                # Forward the final audio chunk
                                yield f"data: {json.dumps({'audio': audio_base64})}\n\n"
                            else:
                                yield f"data: {json.dumps({'error': 'TTS returned no audio data'})}\n\n"
                        else:
                            yield f"data: {json.dumps({'error': 'TTS request failed'})}\n\n"
                    except Exception as e:
                        yield f"data: {json.dumps({'error': f'TTS error: {str(e)}'})}\n\n"
                # Send the completion signal last, so the frontend does not close
                # the connection before the final audio arrives
                yield f"data: {json.dumps({'done': True})}\n\n"
except Exception as e:
yield f"data: {json.dumps({'error': str(e)})}\n\n"
return Response(stream_with_context(generate()), mimetype='text/event-stream')
except Exception as e:
return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000)
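You can exercise the recognition route without the page by posting any short recording directly; the file name here is just an example, and ffmpeg re-encodes whatever you send on the server side:

import requests

# Post a recording to the Flask server and print the transcription
with open('my_question.webm', 'rb') as f:
    r = requests.post('http://localhost:5000/recognize',
                      files={'audio': ('my_question.webm', f)})
print(r.json())  # e.g. {'text': '...'}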
index.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<title>Voice Recognition and Chat Service</title>
<style>
body {
font-family: Arial, sans-serif;
display: flex;
flex-direction: column;
align-items: center;
padding: 20px;
max-width: 800px;
margin: 0 auto;
}
.container {
width: 100%;
display: flex;
flex-direction: column;
align-items: center;
}
#recordButton {
padding: 20px 40px;
font-size: 18px;
background-color: #4CAF50;
color: white;
border: none;
border-radius: 5px;
cursor: pointer;
margin: 20px;
}
#recordButton:active {
background-color: #45a049;
}
.instructions {
margin: 20px;
padding: 15px;
background-color: #f8f9fa;
border-radius: 5px;
width: 100%;
}
.instructions h3 {
margin-top: 0;
color: #333;
}
.instructions ul {
padding-left: 20px;
}
.instructions li {
margin: 10px 0;
}
.chat-container {
width: 100%;
margin-top: 20px;
border: 1px solid #ddd;
border-radius: 5px;
overflow: hidden;
}
.chat-messages {
height: 300px;
overflow-y: auto;
padding: 15px;
background-color: #f9f9f9;
}
.message {
margin-bottom: 10px;
padding: 10px;
border-radius: 5px;
max-width: 80%;
}
.user-message {
background-color: #e1f5fe;
margin-left: auto;
}
.bot-message {
background-color: #f1f1f1;
}
.typing-indicator {
display: inline-block;
margin-left: 10px;
color: #666;
}
.status {
margin-top: 10px;
padding: 5px 10px;
border-radius: 3px;
background-color: #f0f0f0;
font-size: 14px;
}
.audio-controls {
margin-top: 10px;
display: flex;
gap: 10px;
}
.audio-button {
padding: 5px 10px;
background-color: #4CAF50;
color: white;
border: none;
border-radius: 3px;
cursor: pointer;
font-size: 12px;
}
.audio-button:hover {
background-color: #45a049;
}
</style>
</head>
<body>
<div class="container">
<div class="instructions">
            <h3>Instructions:</h3>
            <ul>
                <li>On first use, the browser will ask for microphone access; click "Allow"</li>
                <li>If you previously denied access, click the icon to the left of the address bar to re-grant it</li>
                <li>Press and hold the "Hold to Talk" button to record; your recording is sent when you release</li>
                <li>Make sure a microphone is connected to your system</li>
                <li>If the browser blocks autoplay, click anywhere on the page and try again</li>
            </ul>
</div>
        <button id="recordButton">Hold to Talk</button>
        <div id="status" class="status">Ready</div>
<div class="chat-container">
<div id="chatMessages" class="chat-messages">
<div class="message bot-message">您好!我是AI助手,请按住按钮说话,我会回答您的问题。</div>
</div>
</div>
</div>
<script src="script.js"></script>
</body>
</html>
script.js
let mediaRecorder;
let audioChunks = [];
let hasMicrophonePermission = false;
let audioStream = null;
let currentEventSource = null;
let audioQueue = [];
let isPlaying = false;
document.getElementById('recordButton').addEventListener('mousedown', startRecording);
document.getElementById('recordButton').addEventListener('mouseup', stopRecording);
document.getElementById('recordButton').addEventListener('touchstart', startRecording);
document.getElementById('recordButton').addEventListener('touchend', stopRecording);
// Check the current microphone permission state
async function checkMicrophonePermission() {
    try {
        const result = await navigator.permissions.query({ name: 'microphone' });
        hasMicrophonePermission = result.state === 'granted';
        console.log('Microphone permission state:', result.state);
        // Watch for permission changes
        result.onchange = () => {
            hasMicrophonePermission = result.state === 'granted';
            console.log('Microphone permission changed:', result.state);
        };
    } catch (err) {
        console.error('Failed to check microphone permission:', err);
    }
}
// Check permission when the page loads
checkMicrophonePermission();
async function startRecording(e) {
    e.preventDefault();
    audioChunks = [];
    try {
        // Make sure the browser supports getUserMedia
        if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
            throw new Error('Your browser does not support audio recording');
        }
        // Reuse the existing stream if we already have permission
        if (hasMicrophonePermission && audioStream) {
            console.log('Reusing existing audio stream');
            initializeMediaRecorder(audioStream);
            return;
        }
        // List the available audio input devices
        const devices = await navigator.mediaDevices.enumerateDevices();
        const audioDevices = devices.filter(device => device.kind === 'audioinput');
        console.log('Available audio devices:', audioDevices);
        if (audioDevices.length === 0) {
            throw new Error('No microphone detected');
        }
        // Request microphone access
        audioStream = await navigator.mediaDevices.getUserMedia({
            audio: {
                echoCancellation: true,
                noiseSuppression: true,
                sampleRate: 16000
            }
        });
        // Confirm we actually got an audio track
        if (!audioStream.getAudioTracks().length) {
            throw new Error('Failed to acquire an audio stream');
        }
        console.log('Audio stream acquired:', audioStream.getAudioTracks());
        hasMicrophonePermission = true;
        initializeMediaRecorder(audioStream);
    } catch (err) {
        console.error('Recording failed:', err);
        let errorMessage = 'Recording failed: ';
        if (err.name === 'NotAllowedError') {
            errorMessage += 'please allow microphone access';
            hasMicrophonePermission = false;
        } else if (err.name === 'NotFoundError') {
            errorMessage += 'no microphone found';
        } else if (err.name === 'NotReadableError') {
            errorMessage += 'the microphone is in use by another application';
        } else {
            errorMessage += err.message;
        }
        updateStatus(errorMessage);
    }
}
function initializeMediaRecorder(stream) {
    // Pick a supported MIME type
    const mimeType = MediaRecorder.isTypeSupported('audio/webm')
        ? 'audio/webm'
        : MediaRecorder.isTypeSupported('audio/mp4')
            ? 'audio/mp4'
            : 'audio/wav';
    console.log('Using audio format:', mimeType);
    // Create the MediaRecorder instance
    mediaRecorder = new MediaRecorder(stream, {
        mimeType: mimeType
    });
    mediaRecorder.ondataavailable = (event) => {
        if (event.data.size > 0) {
            audioChunks.push(event.data);
            console.log('Received audio chunk:', event.data.size, 'bytes');
        }
    };
    mediaRecorder.onerror = (error) => {
        console.error('MediaRecorder error:', error);
        updateStatus('Recording error, please try again');
    };
    mediaRecorder.start();
    document.getElementById('recordButton').textContent = 'Recording...';
    updateStatus('Recording...');
    console.log('Recording started');
}
async function stopRecording() {
    if (!mediaRecorder) return;
    // Assign the handler before calling stop() so the stop event is not missed
    mediaRecorder.onstop = async () => {
        try {
            const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType });
            console.log('Audio blob size:', audioBlob.size, 'bytes');
            await sendAudioToServer(audioBlob);
        } catch (err) {
            console.error('Failed to process audio data:', err);
            updateStatus('Failed to process audio, please try again');
        }
    };
    try {
        mediaRecorder.stop();
        document.getElementById('recordButton').textContent = 'Hold to Talk';
        updateStatus('Processing...');
        console.log('Recording stopped');
        // Note: keep the audio stream open so it can be reused next time
        // mediaRecorder.stream.getTracks().forEach(track => track.stop());
    } catch (err) {
        console.error('Failed to stop recording:', err);
        updateStatus('Failed to stop recording, please try again');
    }
}
async function sendAudioToServer(audioBlob) {
    try {
        const formData = new FormData();
        formData.append('audio', audioBlob);
        updateStatus('Recognizing speech...');
        console.log('Sending audio data to the server...');
        // Change the host/port here if server.py runs elsewhere
        const response = await fetch('http://localhost:5000/recognize', {
            method: 'POST',
            body: formData
        });
        if (!response.ok) {
            throw new Error(`Server responded with an error: ${response.status}`);
        }
        const result = await response.json();
        console.log('Server response:', result);
        if (result.error) {
            updateStatus(`Recognition failed: ${result.error}`);
            return;
        }
        const recognizedText = result.text || 'Nothing recognized';
        updateStatus('Speech recognized, generating answer...');
        // Show the user's message in the chat
        addMessage(recognizedText, 'user');
        // Send it to the Ollama model for a reply
        await chatWithOllama(recognizedText);
    } catch (error) {
        console.error('Failed to send audio:', error);
        updateStatus('Recognition failed, please try again');
    }
}
async function chatWithOllama(message) {
    try {
        // Close the previous EventSource, if any
        if (currentEventSource) {
            currentEventSource.close();
        }
        // Reset the audio queue
        audioQueue = [];
        isPlaying = false;
        // Create a new message element for the streamed reply
        const botMessageElement = document.createElement('div');
        botMessageElement.className = 'message bot-message';
        document.getElementById('chatMessages').appendChild(botMessageElement);
        // Scroll to the bottom
        const chatMessages = document.getElementById('chatMessages');
        chatMessages.scrollTop = chatMessages.scrollHeight;
        // Open the EventSource connection (adjust the host if needed)
        currentEventSource = new EventSource(`http://localhost:5000/chat?message=${encodeURIComponent(message)}`);
        let fullResponse = '';
        currentEventSource.onmessage = function(event) {
            console.log('Message from server:', event.data);
            const data = JSON.parse(event.data);
            if (data.error) {
                botMessageElement.textContent = `Error: ${data.error}`;
                updateStatus('Chat error');
                currentEventSource.close();
                return;
            }
            if (data.done) {
                updateStatus('Chat complete');
                currentEventSource.close();
                return;
            }
            if (data.text) {
                fullResponse += data.text;
                // Render the accumulated Markdown
                botMessageElement.innerHTML = marked.parse(fullResponse);
                // Scroll to the bottom
                chatMessages.scrollTop = chatMessages.scrollHeight;
            }
            // Handle TTS audio data
            if (data.audio) {
                updateStatus('Playing the spoken answer...');
                // Queue the audio chunk
                audioQueue.push(data.audio);
                // Start playback if nothing is playing
                if (!isPlaying) {
                    playNextAudio();
                }
            }
        };
        currentEventSource.onerror = function(error) {
            console.error('EventSource error:', error);
            updateStatus('Chat connection error');
            currentEventSource.close();
        };
    } catch (error) {
        console.error('Chat request failed:', error);
        updateStatus('Chat request failed');
    }
}
// Play the next audio chunk in the queue
function playNextAudio() {
if (audioQueue.length === 0) {
isPlaying = false;
return;
}
isPlaying = true;
const audioBase64 = audioQueue.shift();
playAudio(audioBase64);
}
// Decode and play one base64-encoded WAV chunk
function playAudio(audioBase64) {
    try {
        // Convert base64 to a Blob
        const binaryString = atob(audioBase64);
        const bytes = new Uint8Array(binaryString.length);
        for (let i = 0; i < binaryString.length; i++) {
            bytes[i] = binaryString.charCodeAt(i);
        }
        const audioBlob = new Blob([bytes], { type: 'audio/wav' });
        // Create an object URL for the audio
        const audioUrl = URL.createObjectURL(audioBlob);
        // Create a fresh audio player
        const audioPlayer = new Audio(audioUrl);
        // Clean up when playback finishes
        audioPlayer.onended = function() {
            URL.revokeObjectURL(audioUrl);
            // Play the next chunk in the queue
            playNextAudio();
        };
        audioPlayer.onerror = function(e) {
            console.error('Audio playback error:', e);
            updateStatus('Audio playback failed');
            URL.revokeObjectURL(audioUrl);
            // Keep going with the next chunk even on error
            playNextAudio();
        };
        // Start playback
        audioPlayer.play().catch(e => {
            console.error('Failed to play audio:', e);
            updateStatus('Failed to play audio');
            // Keep going with the next chunk even on error
            playNextAudio();
        });
    } catch (error) {
        console.error('Failed to process audio data:', error);
        updateStatus('Failed to process audio data');
        // Keep going with the next chunk even on error
        playNextAudio();
    }
}
function addMessage(text, sender) {
const messageElement = document.createElement('div');
messageElement.className = `message ${sender}-message`;
messageElement.textContent = text;
document.getElementById('chatMessages').appendChild(messageElement);
    // Scroll to the bottom
const chatMessages = document.getElementById('chatMessages');
chatMessages.scrollTop = chatMessages.scrollHeight;
}
function updateStatus(message) {
document.getElementById('status').textContent = message;
}