Source code for voicebox.tts.espeakng

import subprocess
from dataclasses import dataclass, field
from typing import List, Optional, Union

from voicebox.audio import Audio
from voicebox.ssml import SSML
from voicebox.tts.tts import TTS
from voicebox.tts.utils import get_audio_from_wav_file
from voicebox.types import StrOrSSML



[docs]
@dataclass
class ESpeakConfig:
    """
    Configuration for the eSpeak NG engine.

    Options left as ``None`` (or ``False`` for the boolean flags) are not
    passed on the command line, so eSpeak NG's own defaults apply.

    Run "``espeak-ng -h``" for more information on these options.
    """

    # Passed as ``-a``.
    amplitude: Optional[int] = None
    # Gap between words in seconds; converted to units of 10 ms and passed
    # as ``-g``.
    word_gap_seconds: Optional[float] = None
    # Passed as ``-k``.
    capitals: Optional[int] = None
    # Passed as ``-l``.
    line_length: Optional[int] = None
    # Passed as ``-p``.
    pitch: Optional[int] = None
    # Passed as ``-s``.
    speed: Optional[int] = None
    # Passed as ``-v``.
    voice: Optional[str] = None
    # When true, ``-z`` is passed.
    no_final_pause: bool = False
    # ``True`` passes a bare ``--punct``; a non-empty string is passed as the
    # value of ``--punct``.
    speak_punctuation: Union[bool, str] = False

    # Name or path of the eSpeak NG executable to run.
    exe_path: str = "espeak-ng"
    # Seconds to wait for the process to exit once its output has been read;
    # ``None`` waits indefinitely.
    timeout: Optional[float] = None




[docs]
@dataclass
class ESpeakNG(TTS):
    """
    TTS using the `eSpeak NG <https://github.com/espeak-ng/espeak-ng>`_ engine.

    Speech is produced by running the eSpeak NG executable as a subprocess:
    the text is written to its stdin and the audio is read from its stdout.

    You may need to install it:

    - On Debian/Ubuntu: ``sudo apt install espeak-ng``

    Supports `SSML <https://www.w3.org/TR/speech-synthesis/>`_: ✔
    (`docs <https://github.com/espeak-ng/espeak-ng/blob/master/docs/markup.md>`_)

    Args:
        config:
            Optional configuration for the eSpeak NG engine.
            If not given, a default config will be used.
    """

    # default_factory builds a fresh ESpeakConfig for every instance rather
    # than sharing one default object between instances.
    config: ESpeakConfig = field(default_factory=ESpeakConfig)


[docs]
    def get_speech(self, text: StrOrSSML) -> Audio:
        """
        Synthesize ``text`` with eSpeak NG and return the resulting audio.

        Args:
            text: Plain text or SSML to speak.

        Returns:
            The audio produced by ``get_audio_from_wav_file`` from the
            process's stdout.

        Raises:
            FileNotFoundError: If the eSpeak NG executable cannot be started.
            subprocess.TimeoutExpired: If the process has not exited within
                ``config.timeout`` seconds after its output was consumed.
                The process is killed before the exception propagates.
        """
        proc = self._get_proc(text)

        try:
            return get_audio_from_wav_file(proc.stdout)
        finally:
            # Close our end of the pipe first: this releases the descriptor
            # and, if reading stopped early, keeps the child from blocking
            # forever on a full pipe while we wait for it.
            proc.stdout.close()
            try:
                proc.wait(timeout=self.config.timeout)
            except subprocess.TimeoutExpired:
                # wait() does not stop the child on timeout; kill and reap it
                # so no stray process is left behind.
                proc.kill()
                proc.wait()
                raise


    def _get_proc(self, text: StrOrSSML) -> subprocess.Popen:
        """
        Start eSpeak NG and feed it ``text`` on stdin.

        Args:
            text: Plain text or SSML to speak; sent UTF-8 encoded.

        Returns:
            The running process. Its stdin has already been closed; its
            stdout is a pipe that the caller must read and close.

        Raises:
            FileNotFoundError: If the configured executable does not exist.
        """
        args = self._get_args(text)

        try:
            proc = subprocess.Popen(
                args,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
            )
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"{e}; is espeak-ng installed? Try: sudo apt install espeak-ng"
            ) from e

        try:
            # The with-block closes stdin even if the write fails, which
            # signals end of input to the child.
            with proc.stdin:
                proc.stdin.write(text.encode("utf-8"))
        except BaseException:
            # Feeding the text failed (e.g. the child exited early): do not
            # leave an orphaned process or an open pipe behind.
            proc.kill()
            proc.stdout.close()
            proc.wait()
            raise

        return proc

    def _get_args(self, text: StrOrSSML) -> List[str]:
        """
        Build the eSpeak NG command line for ``text``.

        Only options that are set in the config are included. The text
        itself is not part of the command line; it is only inspected to
        decide whether SSML markup interpretation (``-m``) is needed.

        Args:
            text: The text that will be spoken.

        Returns:
            The argument list, starting with the executable path.
        """
        c = self.config

        args = [
            c.exe_path,
            "--stdin",  # Get input from stdin
            "-b",
            "1",  # Input text encoding UTF-8
            "--stdout",  # Write output to stdout
        ]

        # Units of 10ms
        word_gap = (
            None
            if c.word_gap_seconds is None
            else round(c.word_gap_seconds * 100)
        )

        # (flag, value) pairs in the order they are emitted; unset options
        # are skipped so eSpeak NG falls back to its own defaults.
        valued_options = (
            ("-a", c.amplitude),
            ("-g", word_gap),
            ("-k", c.capitals),
            ("-l", c.line_length),
            ("-p", c.pitch),
            ("-s", c.speed),
            ("-v", c.voice),
        )
        for flag, value in valued_options:
            if value is not None:
                args.extend((flag, str(value)))

        if c.no_final_pause:
            args.append("-z")

        if isinstance(text, SSML):
            args.append("-m")

        if c.speak_punctuation:
            if isinstance(c.speak_punctuation, str):
                # No shell is involved, so the value must not be wrapped in
                # quotes: they would reach eSpeak NG as literal characters.
                args.append(f"--punct={c.speak_punctuation}")
            else:
                args.append("--punct")

        return args