Feat/chat support voice input (#532)

This commit is contained in:
zxhlyh
2023-07-07 17:50:42 +08:00
committed by GitHub
parent feebb5dd1f
commit a03a92e9db
70 changed files with 1420 additions and 26 deletions

View File

@@ -4,6 +4,7 @@ import uuid
from core.constant import llm_constant
from models.account import Account
from services.dataset_service import DatasetService
from core.llm.llm_builder import LLMBuilder
class AppModelConfigService:
@@ -109,6 +110,26 @@ class AppModelConfigService:
if not isinstance(config["suggested_questions_after_answer"]["enabled"], bool):
raise ValueError("enabled in suggested_questions_after_answer must be of boolean type")
# speech_to_text
if 'speech_to_text' not in config or not config["speech_to_text"]:
config["speech_to_text"] = {
"enabled": False
}
if not isinstance(config["speech_to_text"], dict):
raise ValueError("speech_to_text must be of dict type")
if "enabled" not in config["speech_to_text"] or not config["speech_to_text"]["enabled"]:
config["speech_to_text"]["enabled"] = False
if not isinstance(config["speech_to_text"]["enabled"], bool):
raise ValueError("enabled in speech_to_text must be of boolean type")
provider_name = LLMBuilder.get_default_provider(account.current_tenant_id)
if config["speech_to_text"]["enabled"] and provider_name != 'openai':
raise ValueError("provider not support speech to text")
# more_like_this
if 'more_like_this' not in config or not config["more_like_this"]:
config["more_like_this"] = {
@@ -277,6 +298,7 @@ class AppModelConfigService:
"opening_statement": config["opening_statement"],
"suggested_questions": config["suggested_questions"],
"suggested_questions_after_answer": config["suggested_questions_after_answer"],
"speech_to_text": config["speech_to_text"],
"more_like_this": config["more_like_this"],
"model": {
"provider": config["model"]["provider"],

View File

@@ -0,0 +1,43 @@
import io
from werkzeug.datastructures import FileStorage
from core.llm.llm_builder import LLMBuilder
from core.llm.provider.llm_provider_service import LLMProviderService
from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
from core.llm.whisper import Whisper
from models.provider import ProviderName
FILE_SIZE_LIMIT = 1 * 1024 * 1024
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
class AudioService:
@classmethod
def transcript(cls, tenant_id: str, file: FileStorage):
if file is None:
raise NoAudioUploadedServiceError()
extension = file.mimetype
if extension not in [f'audio/{ext}' for ext in ALLOWED_EXTENSIONS]:
raise UnsupportedAudioTypeServiceError()
file_content = file.read()
file_size = len(file_content)
if file_size > FILE_SIZE_LIMIT:
message = f"({file_size} > {FILE_SIZE_LIMIT})"
raise AudioTooLargeServiceError(message)
provider_name = LLMBuilder.get_default_provider(tenant_id)
if provider_name != ProviderName.OPENAI.value:
raise ProviderNotSupportSpeechToTextServiceError('haha')
provider_service = LLMProviderService(tenant_id, provider_name)
buffer = io.BytesIO(file_content)
buffer.name = 'temp.wav'
return Whisper(provider_service.provider).transcribe(buffer)

View File

@@ -1,7 +1,7 @@
# -*- coding:utf-8 -*-
__all__ = [
'base', 'conversation', 'message', 'index', 'app_model_config', 'account', 'document', 'dataset',
'app', 'completion'
'app', 'completion', 'audio'
]
from . import *

View File

@@ -0,0 +1,23 @@
from services.errors.base import BaseServiceError
class NoAudioUploadedServiceError(BaseServiceError):
error_code = 'no_audio_uploaded'
description = "Please upload your audio."
code = 400
class AudioTooLargeServiceError(BaseServiceError):
error_code = 'audio_too_large'
description = "Audio size exceeded. {message}"
code = 413
class UnsupportedAudioTypeServiceError(BaseServiceError):
error_code = 'unsupported_audio_type'
description = "Audio type not allowed."
code = 415
class ProviderNotSupportSpeechToTextServiceError(BaseServiceError):
error_code = 'provider_not_support_speech_to_text'
description = "Provider not support speech to text. {message}"
code = 400