From a4fc057a1cd517186e93583bd258185084adc704 Mon Sep 17 00:00:00 2001 From: "SiliconFlow, Inc" Date: Mon, 25 Nov 2024 11:04:13 +0800 Subject: [PATCH] ISSUE=11042: add tts model in siliconflow (#11043) --- .../siliconflow/llm/_position.yaml | 1 - .../siliconflow/siliconflow.yaml | 1 + .../siliconflow/tts/__init__.py | 0 .../siliconflow/tts/fish-speech-1.4.yaml | 37 ++++++ .../model_providers/siliconflow/tts/tts.py | 105 ++++++++++++++++++ 5 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 api/core/model_runtime/model_providers/siliconflow/tts/__init__.py create mode 100644 api/core/model_runtime/model_providers/siliconflow/tts/fish-speech-1.4.yaml create mode 100644 api/core/model_runtime/model_providers/siliconflow/tts/tts.py diff --git a/api/core/model_runtime/model_providers/siliconflow/llm/_position.yaml b/api/core/model_runtime/model_providers/siliconflow/llm/_position.yaml index f010e4c82..b52df3e4e 100644 --- a/api/core/model_runtime/model_providers/siliconflow/llm/_position.yaml +++ b/api/core/model_runtime/model_providers/siliconflow/llm/_position.yaml @@ -24,4 +24,3 @@ - meta-llama/Meta-Llama-3.1-8B-Instruct - google/gemma-2-27b-it - google/gemma-2-9b-it -- deepseek-ai/DeepSeek-V2-Chat diff --git a/api/core/model_runtime/model_providers/siliconflow/siliconflow.yaml b/api/core/model_runtime/model_providers/siliconflow/siliconflow.yaml index 71f9a9238..73a9e8076 100644 --- a/api/core/model_runtime/model_providers/siliconflow/siliconflow.yaml +++ b/api/core/model_runtime/model_providers/siliconflow/siliconflow.yaml @@ -18,6 +18,7 @@ supported_model_types: - text-embedding - rerank - speech2text + - tts configurate_methods: - predefined-model - customizable-model diff --git a/api/core/model_runtime/model_providers/siliconflow/tts/__init__.py b/api/core/model_runtime/model_providers/siliconflow/tts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/api/core/model_runtime/model_providers/siliconflow/tts/fish-speech-1.4.yaml b/api/core/model_runtime/model_providers/siliconflow/tts/fish-speech-1.4.yaml new file mode 100644 index 000000000..4adfd05c6 --- /dev/null +++ b/api/core/model_runtime/model_providers/siliconflow/tts/fish-speech-1.4.yaml @@ -0,0 +1,37 @@ +model: fishaudio/fish-speech-1.4 +model_type: tts +model_properties: + default_voice: 'fishaudio/fish-speech-1.4:alex' + voices: + - mode: "fishaudio/fish-speech-1.4:alex" + name: "Alex(男声)" + language: [ "zh-Hans", "en-US" ] + - mode: "fishaudio/fish-speech-1.4:benjamin" + name: "Benjamin(男声)" + language: [ "zh-Hans", "en-US" ] + - mode: "fishaudio/fish-speech-1.4:charles" + name: "Charles(男声)" + language: [ "zh-Hans", "en-US" ] + - mode: "fishaudio/fish-speech-1.4:david" + name: "David(男声)" + language: [ "zh-Hans", "en-US" ] + - mode: "fishaudio/fish-speech-1.4:anna" + name: "Anna(女声)" + language: [ "zh-Hans", "en-US" ] + - mode: "fishaudio/fish-speech-1.4:bella" + name: "Bella(女声)" + language: [ "zh-Hans", "en-US" ] + - mode: "fishaudio/fish-speech-1.4:claire" + name: "Claire(女声)" + language: [ "zh-Hans", "en-US" ] + - mode: "fishaudio/fish-speech-1.4:diana" + name: "Diana(女声)" + language: [ "zh-Hans", "en-US" ] + audio_type: 'mp3' + max_workers: 5 + # stream: false +pricing: + input: '0.015' + output: '0' + unit: '0.001' + currency: RMB diff --git a/api/core/model_runtime/model_providers/siliconflow/tts/tts.py b/api/core/model_runtime/model_providers/siliconflow/tts/tts.py new file mode 100644 index 000000000..a5554abb7 --- /dev/null +++ b/api/core/model_runtime/model_providers/siliconflow/tts/tts.py @@ -0,0 +1,105 @@ +import concurrent.futures +from typing import Any, Optional + +from openai import OpenAI + +from core.model_runtime.errors.invoke import InvokeBadRequestError +from core.model_runtime.errors.validate import CredentialsValidateFailedError +from core.model_runtime.model_providers.__base.tts_model import TTSModel +from 
class SiliconFlowText2SpeechModel(_CommonOpenAI, TTSModel):
    """
    Model class for the SiliconFlow text-to-speech (TTS) model.

    Streams MP3 audio synthesized from input text via SiliconFlow's
    OpenAI-compatible ``/audio/speech`` endpoint.
    """

    def _invoke(
        self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, user: Optional[str] = None
    ) -> Any:
        """
        Invoke the text-to-speech model.

        :param model: model name
        :param tenant_id: user tenant id
        :param credentials: model credentials
        :param content_text: text content to be synthesized
        :param voice: model timbre (voice id)
        :param user: unique user id
        :return: generator yielding MP3 audio byte chunks
        """
        # Fall back to the model's default voice when the requested one is
        # missing or not in the model's supported voice list.
        if not voice or voice not in [
            d["value"] for d in self.get_tts_model_voices(model=model, credentials=credentials)
        ]:
            voice = self._get_model_default_voice(model, credentials)
        return self._tts_invoke_streaming(model=model, credentials=credentials, content_text=content_text, voice=voice)

    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
        """
        Validate credentials by performing a minimal synthesis call.

        :param model: model name
        :param credentials: model credentials
        :param user: unique user id
        :raises CredentialsValidateFailedError: when the API rejects the call
        """
        try:
            # BUG FIX: _tts_invoke_streaming is a generator function, so merely
            # calling it executes no body code and the original try/except could
            # never observe an API/auth failure (validation always passed).
            # Consume the first chunk to actually hit the endpoint.
            audio = self._tts_invoke_streaming(
                model=model,
                credentials=credentials,
                content_text="Hello SiliconFlow!",
                voice=self._get_model_default_voice(model, credentials),
            )
            next(audio, None)
        except Exception as ex:
            raise CredentialsValidateFailedError(str(ex))

    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, voice: str) -> Any:
        """
        Stream synthesized audio for the given text.

        Long inputs (> 4096 chars) are split into sentences and synthesized
        concurrently; chunks are yielded in original sentence order.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be synthesized
        :param voice: model timbre (voice id)
        :return: generator yielding MP3 audio byte chunks
        :raises InvokeBadRequestError: on any underlying API failure
        """
        try:
            # doc: https://docs.siliconflow.cn/capabilities/text-to-speech
            self._add_custom_parameters(credentials)
            credentials_kwargs = self._to_credential_kwargs(credentials)
            client = OpenAI(**credentials_kwargs)
            model_support_voice = [
                x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)
            ]
            if not voice or voice not in model_support_voice:
                voice = self._get_model_default_voice(model, credentials)
            if len(content_text) > 4096:
                sentences = self._split_text_into_sentences(content_text, max_length=4096)
                # `with` guarantees worker threads are shut down even if a
                # request fails (the original never called executor.shutdown(),
                # leaking threads on error).
                with concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences))) as executor:
                    futures = [
                        executor.submit(
                            client.audio.speech.with_streaming_response.create,
                            model=model,
                            response_format="mp3",
                            input=sentence,
                            voice=voice,
                        )
                        for sentence in sentences
                    ]
                    for future in futures:
                        # Enter/exit the streaming response via `with` so the
                        # underlying HTTP connection is always released (the
                        # original called __enter__ without a matching __exit__).
                        with future.result() as response:
                            yield from response.iter_bytes(1024)
            else:
                with client.audio.speech.with_streaming_response.create(
                    model=model, voice=voice, response_format="mp3", input=content_text.strip()
                ) as response:
                    yield from response.iter_bytes(1024)
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))

    @classmethod
    def _add_custom_parameters(cls, credentials: dict) -> None:
        # Point the OpenAI SDK at SiliconFlow's OpenAI-compatible endpoint and
        # map the provider's api_key into the SDK's expected credential name.
        credentials["openai_api_base"] = "https://api.siliconflow.cn"
        credentials["openai_api_key"] = credentials["api_key"]