Source code for voicebox.tts.elevenlabs

from typing import Any, Iterator, Optional

import numpy as np
from elevenlabs.client import ElevenLabs

from voicebox.audio import Audio
from voicebox.tts import TTS
from voicebox.tts.utils import get_audio_from_samples
from voicebox.types import StrOrSSML



[docs]
class ElevenLabsTTS(TTS):
    """
    TTS using the `ElevenLabs API <https://elevenlabs.io/>`_.

    Supports `SSML <https://www.w3.org/TR/speech-synthesis/>`_: ✔
    (`docs <https://elevenlabs.io/docs/speech-synthesis/prompting#pronunciation>`_)

    Args:
        voice_id:
            Voice to use. See `here <https://elevenlabs.io/docs/api-reference/get-voices>`_
            for a list of valid voice IDs.
        api_key:
            (Optional) Your ElevenLabs API key. If this and client are not given,
            then the client will pull the API key from the ``ELEVENLABS_API_KEY``
            env var. Note: Cannot be used with the ``client`` arg!
        client:
            (Optional) An :class:`elevenlabs.client.ElevenLabs` instance.
            Use this if you want to further customize the client behavior.
            Note: Cannot be used with the ``api_key`` arg!
        sample_rate:
            (Optional) PCM audio sample rate. Defaults to 32kHz.
            This is used to set the ``output_format`` of the request.
            See `here <https://elevenlabs.io/docs/api-reference/text-to-speech/convert#request.query.output_format>`_
            for valid options. Note: You must pick a sample rate from one of the
            ``output_format`` options beginning with ``pcm_``! Other codecs are
            not supported.
        convert_kwargs:
            (Optional) Additional kwargs to pass to the ``client.text_to_speech.convert``
            call. See here for all options:
            https://elevenlabs.io/docs/api-reference/text-to-speech/convert
            Note: ``voice_id``, ``text`` and ``output_format`` are always
            passed explicitly by :meth:`get_speech`, so repeating any of them
            here raises a ``TypeError`` (duplicate keyword argument).

    Raises:
        ValueError: If both ``api_key`` and ``client`` are given.
    """

    client: ElevenLabs
    voice_id: str
    sample_rate: int
    convert_kwargs: dict[str, Any]

    def __init__(
        self,
        *,
        voice_id: str,
        api_key: Optional[str] = None,
        client: Optional[ElevenLabs] = None,
        sample_rate: int = 32_000,
        convert_kwargs: Optional[dict[str, Any]] = None,
    ) -> None:
        # Test the client by identity, not truthiness: any supplied client
        # object counts as "given", whatever its boolean value is.
        if api_key and client is not None:
            raise ValueError("Cannot give both api_key and client args.")

        if client is None:
            # Without an explicit key, construct the client with no arguments
            # and leave API key resolution entirely to the client.
            client = ElevenLabs(api_key=api_key) if api_key else ElevenLabs()

        self.voice_id = voice_id
        self.client = client
        self.sample_rate = sample_rate
        self.convert_kwargs = convert_kwargs or {}

    @property
    def api_key(self) -> str:
        """
        The API key held by the underlying client.

        NOTE: This reads private attributes of the client
        (``_client_wrapper._api_key``), so it depends on the client's
        internal layout rather than on a public API.
        """
        # noinspection PyProtectedMember
        return self.client._client_wrapper._api_key

    @property
    def output_format(self) -> str:
        """The ``output_format`` request value: ``pcm_<sample_rate>``."""
        return f"pcm_{self.sample_rate}"

    def get_speech(self, text: StrOrSSML) -> Audio:
        """
        Requests speech for ``text`` and returns it as :class:`Audio`.

        Calls ``client.text_to_speech.convert`` with this instance's
        ``voice_id``, ``output_format`` and ``convert_kwargs``, then decodes
        the response as little-endian signed 16-bit PCM samples at
        ``sample_rate``.
        """
        response = self.client.text_to_speech.convert(
            voice_id=self.voice_id,
            text=text,
            output_format=self.output_format,
            **self.convert_kwargs,
        )

        # The response may be an iterator of byte chunks; collapse it into a
        # single contiguous buffer before decoding.
        if isinstance(response, Iterator):
            raw_pcm = b"".join(response)
        else:
            raw_pcm = response

        samples = np.frombuffer(
            raw_pcm,
            # Little-endian, signed int, 2 bytes per int
            dtype="<i2",
        )

        return get_audio_from_samples(samples, self.sample_rate)