Source code for voicebox.tts.espeakng

import subprocess
from dataclasses import dataclass, field
from typing import List, Optional, Union

from voicebox.audio import Audio
from voicebox.ssml import SSML
from voicebox.tts.tts import TTS
from voicebox.tts.utils import get_audio_from_wav_file
from voicebox.types import StrOrSSML


@dataclass
class ESpeakConfig:
    """
    Configuration for the eSpeak NG engine.

    Run "``espeak-ng -h``" for more information on these options.

    Every option that defaults to ``None`` is simply left off the command
    line, so eSpeak NG falls back to its own built-in default for it.

    Args:
        amplitude:
            Value for the ``-a`` option.
        word_gap_seconds:
            Pause between words, in seconds. Converted to eSpeak NG's units
            of 10 ms (rounded to the nearest unit) and passed as ``-g``.
        capitals:
            Value for the ``-k`` option.
        line_length:
            Value for the ``-l`` option.
        pitch:
            Value for the ``-p`` option.
        speed:
            Value for the ``-s`` option.
        voice:
            Value for the ``-v`` option.
        no_final_pause:
            If true, pass the ``-z`` flag.
        speak_punctuation:
            If ``True``, pass ``--punct`` with no value. If a non-empty
            string, pass it as the value of ``--punct``. If falsy, the
            option is omitted.
        exe_path:
            Name of, or path to, the eSpeak NG executable to run.
        timeout:
            Seconds to wait for the eSpeak NG process to exit once its
            output has been consumed. ``None`` waits indefinitely.
    """

    # The annotations are Optional because ``None`` is the "not set" marker
    # that the argument builder checks for.
    amplitude: Optional[int] = None
    word_gap_seconds: Optional[float] = None
    capitals: Optional[int] = None
    line_length: Optional[int] = None
    pitch: Optional[int] = None
    speed: Optional[int] = None
    voice: Optional[str] = None
    no_final_pause: bool = False
    speak_punctuation: Union[bool, str] = False
    exe_path: str = "espeak-ng"
    timeout: Optional[float] = None
@dataclass
class ESpeakNG(TTS):
    """
    TTS using the `eSpeak NG <https://github.com/espeak-ng/espeak-ng>`_ engine.

    You may need to install it:

    - On Debian/Ubuntu: ``sudo apt install espeak-ng``

    Supports `SSML <https://www.w3.org/TR/speech-synthesis/>`_: ✔
    (`docs <https://github.com/espeak-ng/espeak-ng/blob/master/docs/markup.md>`_)

    Args:
        config:
            Optional configuration for the eSpeak NG engine. If not given,
            a default config will be used.
    """

    config: ESpeakConfig = field(default_factory=ESpeakConfig)

    def get_speech(self, text: StrOrSSML) -> Audio:
        """
        Synthesize ``text`` by running eSpeak NG and reading its stdout.

        Args:
            text: Plain text or SSML to speak.

        Returns:
            Whatever ``get_audio_from_wav_file`` builds from the process's
            standard output.

        Raises:
            FileNotFoundError: If the eSpeak NG executable cannot be found.
            subprocess.TimeoutExpired: If the process has not exited within
                ``config.timeout`` seconds after its output was consumed.
        """
        proc = self._get_proc(text)
        try:
            return get_audio_from_wav_file(proc.stdout)
        finally:
            try:
                proc.wait(timeout=self.config.timeout)
            except subprocess.TimeoutExpired:
                # wait() does not stop the child on timeout; kill and reap
                # it so no stray process is left behind, then report the
                # timeout to the caller as before.
                proc.kill()
                proc.wait()
                raise
            finally:
                # Release the pipe's file descriptor; closing twice is safe.
                proc.stdout.close()

    def _get_proc(self, text: StrOrSSML):
        """
        Start eSpeak NG and feed it ``text`` on stdin.

        Returns:
            The running ``subprocess.Popen`` object, with stdin already
            closed and stdout left open for the caller to read.

        Raises:
            FileNotFoundError: If the executable cannot be found. The
                message includes an installation hint and the original
                error is attached as the cause.
        """
        args = self._get_args(text)

        try:
            proc = subprocess.Popen(
                args,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
            )
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"{e}; is espeak-ng installed? Try: sudo apt install espeak-ng"
            ) from e

        # Closing stdin signals end-of-input to the child; the ``with``
        # block guarantees the close even if the write fails.
        with proc.stdin:
            proc.stdin.write(text.encode("utf-8"))

        return proc

    def _get_args(self, text: StrOrSSML) -> List[str]:
        """
        Build the eSpeak NG command line for ``text`` from ``self.config``.

        Only the type of ``text`` matters here: SSML input adds ``-m``.
        The text itself is supplied on stdin, not on the command line.
        """
        c = self.config

        args = [
            c.exe_path,
            "--stdin",  # Get input from stdin
            "-b", "1",  # Input text encoding UTF-8
            "--stdout",  # Write output to stdout
        ]

        # The -g option is expressed in units of 10ms.
        word_gap = (
            None
            if c.word_gap_seconds is None
            else round(c.word_gap_seconds * 100)
        )

        # Options that take a value; ``None`` means "not configured".
        valued_options = (
            ("-a", c.amplitude),
            ("-g", word_gap),
            ("-k", c.capitals),
            ("-l", c.line_length),
            ("-p", c.pitch),
            ("-s", c.speed),
            ("-v", c.voice),
        )
        for flag, value in valued_options:
            if value is not None:
                args.extend((flag, str(value)))

        if c.no_final_pause:
            args.append("-z")

        if isinstance(text, SSML):
            args.append("-m")

        if c.speak_punctuation:
            if isinstance(c.speak_punctuation, str):
                # No shell is involved, so the value must not be wrapped in
                # quotes: they would reach eSpeak NG as literal characters.
                args.append(f"--punct={c.speak_punctuation}")
            else:
                args.append("--punct")

        return args