# Source code for voicebox.tts.amazonpolly

from contextlib import closing
from dataclasses import dataclass
from typing import Literal, Optional, Sequence

import numpy as np
from mypy_boto3_polly.client import PollyClient
from mypy_boto3_polly.literals import LanguageCodeType, EngineType, VoiceIdType

from voicebox.audio import Audio
from voicebox.ssml import SSML
from voicebox.tts.tts import TTS
from voicebox.tts.utils import add_optional_items, get_audio_from_samples
from voicebox.types import StrOrSSML


@dataclass
class AmazonPolly(TTS):
    """
    TTS using `Amazon Polly <https://aws.amazon.com/polly/>`_.

    See the `Amazon Polly documentation
    <https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html>`_
    for full descriptions of the parameters.

    Supports `SSML <https://www.w3.org/TR/speech-synthesis/>`_: ✔
    (`docs <https://docs.aws.amazon.com/polly/latest/dg/ssml.html>`_)
    """

    client: PollyClient
    """
    Boto3 Polly client, created by e.g.

    >>> session = boto3.Session(...)
    >>> client = session.client('polly')
    """

    voice_id: VoiceIdType
    """Voice ID to use for the synthesis."""

    engine: Optional[EngineType] = None
    """
    Specifies the engine (``standard`` or ``neural``) for Amazon Polly to use
    when processing input text for speech synthesis.
    """

    language_code: Optional[LanguageCodeType] = None
    """Optional language code for the Synthesize Speech request."""

    lexicon_names: Optional[Sequence[str]] = None
    """
    List of one or more pronunciation lexicon names you want the service to
    apply during synthesis.
    """

    sample_rate: Literal[8000, 16000] = 16000
    """Sample rate of returned audio. Must be ``8000`` or ``16000``."""

    def get_speech(self, text: StrOrSSML) -> Audio:
        """
        Synthesize ``text`` with Amazon Polly and return it as audio.

        The request asks for raw PCM output at :attr:`sample_rate`. The text
        type is sent as ``"ssml"`` when ``text`` is an :class:`SSML` instance
        and as ``"text"`` otherwise.

        :param text: Plain text or SSML to synthesize.
        :return: The audio built from the decoded samples by
            ``get_audio_from_samples``.
        """
        kwargs = dict(
            OutputFormat="pcm",
            Text=text,
            VoiceId=self.voice_id,
            # The Polly API takes the sample rate as a string, not an int.
            SampleRate=str(self.sample_rate),
            TextType="ssml" if isinstance(text, SSML) else "text",
        )

        # NOTE(review): add_optional_items is presumed to add only the items
        # whose value is set (not None) -- confirm in voicebox.tts.utils.
        add_optional_items(
            kwargs,
            [
                ("Engine", self.engine),
                ("LanguageCode", self.language_code),
                ("LexiconNames", self.lexicon_names),
            ],
        )

        response = self.client.synthesize_speech(**kwargs)

        # Read the whole stream, closing it even if the read fails.
        with closing(response["AudioStream"]) as audio_stream:
            signal_bytes = audio_stream.read()

        # Polly's "pcm" format is signed 16-bit little-endian. Spell out the
        # byte order so decoding does not depend on the host's native
        # endianness (np.int16 would be wrong on big-endian machines).
        samples = np.frombuffer(signal_bytes, dtype=np.dtype("<i2"))

        return get_audio_from_samples(samples, self.sample_rate)