Source code for voicebox.tts.voiceai

from io import BytesIO
from typing import Any, Optional

import requests

from voicebox.audio import Audio
from voicebox.tts import TTS
from voicebox.tts.utils import add_optional_items, get_audio_from_wav_file
from voicebox.types import StrOrSSML

# Endpoint that VoiceAiTTS POSTs speech requests to when the caller does not
# pass its own ``api_url``.
DEFAULT_VOICE_AI_API_URL: str = "https://dev.voice.ai/api/v1/tts/speech"


class VoiceAiTTS(TTS):
    """
    TTS using the `Voice.AI API <https://voice.ai/docs/api-reference/text-to-speech/generate-speech>`_.

    Supports `SSML <https://www.w3.org/TR/speech-synthesis/>`_: ✔
    (`docs <https://elevenlabs.io/docs/speech-synthesis/prompting#pronunciation>`_)

    Each call to :meth:`get_speech` sends one POST request to ``api_url``
    asking for WAV output and decodes the response body into audio.

    Args:
        api_key:
            Your Voice.AI API key. Create one here:
            https://voice.ai/app/dashboard/developers
        voice_id:
            (Optional) Voice ID. If omitted, the default built-in voice is
            used.
        temperature:
            (Optional) Temperature for generation (0.0-2.0).
        top_p:
            (Optional) Top-p sampling parameter (0.0-1.0).
        model:
            (Optional) TTS model to use. See here for options:
            https://voice.ai/docs/api-reference/text-to-speech/generate-speech#body-model-one-of-0
        language:
            (Optional) Language code (ISO 639-1 format, e.g., 'en', 'es',
            'fr'). Defaults to 'en' if not provided.
        api_url:
            (Optional) Override the default API URL.
        extra_json:
            (Optional) Extra request parameters to put in the JSON payload.
            These are merged last, so they override any key this class sets
            itself (including ``"text"`` and ``"audio_format"``).
        extra_headers:
            (Optional) Extra headers to add to the request. These are merged
            last, so they override the ``Authorization`` and
            ``Content-Type`` headers this class sets.
        request_kwargs:
            (Optional) Extra kwargs to pass to the ``requests.post()`` call.
            This class does not set a ``timeout`` itself; supply one here if
            the request must not wait indefinitely.
    """

    # NOTE(review): the SSML "docs" link in the docstring above points at
    # ElevenLabs rather than Voice.AI -- looks like a copy-paste from another
    # TTS class. Confirm Voice.AI's SSML support and update the link.

    def __init__(
            self,
            api_key: str,
            voice_id: Optional[str] = None,
            temperature: Optional[float] = None,
            top_p: Optional[float] = None,
            model: Optional[str] = None,
            language: Optional[str] = None,
            api_url: str = DEFAULT_VOICE_AI_API_URL,
            extra_json: Optional[dict[str, Any]] = None,
            extra_headers: Optional[dict[str, str]] = None,
            request_kwargs: Optional[dict[str, Any]] = None,
    ):
        self.api_key = api_key
        self.voice_id = voice_id
        self.temperature = temperature
        self.top_p = top_p
        self.model = model
        self.language = language
        self.api_url = api_url

        # Normalize the optional mappings to empty dicts so the request
        # builders can merge/unpack them unconditionally.
        self.extra_json = extra_json or {}
        self.extra_headers = extra_headers or {}
        self.request_kwargs = request_kwargs or {}

    def get_speech(self, text: StrOrSSML) -> Audio:
        """
        Request speech for ``text`` from the API and return it as audio.

        Args:
            text: The text (or SSML) to send as the ``"text"`` field.

        Returns:
            The audio decoded from the WAV bytes in the response body.

        Raises:
            requests.HTTPError: If the API answers with an error status
                (raised by ``response.raise_for_status()``).
        """
        response = requests.post(
            self.api_url,
            headers=self._build_headers(),
            json=self._build_json(text),
            **self.request_kwargs,
        )
        response.raise_for_status()

        # The payload asks for "wav", so the body is handed to the WAV reader
        # through an in-memory file object.
        with BytesIO(response.content) as wav_file:
            return get_audio_from_wav_file(wav_file)

    def _build_headers(self) -> dict[str, str]:
        """Return the request headers; ``extra_headers`` entries win on key clashes."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            **self.extra_headers,
        }

    def _build_json(self, text: StrOrSSML) -> dict[str, Any]:
        """Return the JSON payload for ``text``; ``extra_json`` entries win on key clashes."""
        payload: dict[str, Any] = {
            "text": text,
            "audio_format": "wav",
        }

        # Presumably only adds the pairs whose value is set (not None) --
        # the helper lives in voicebox.tts.utils; verify there.
        add_optional_items(
            payload,
            [
                ("voice_id", self.voice_id),
                ("temperature", self.temperature),
                ("top_p", self.top_p),
                ("model", self.model),
                ("language", self.language),
            ],
        )

        # Applied last so caller-supplied parameters take precedence.
        payload.update(self.extra_json)

        return payload