from contextlib import closing
from dataclasses import dataclass
from typing import Literal, Optional, Sequence

import numpy as np
from mypy_boto3_polly.client import PollyClient
from mypy_boto3_polly.literals import LanguageCodeType, EngineType, VoiceIdType

from voicebox.audio import Audio
from voicebox.ssml import SSML
from voicebox.tts.tts import TTS
from voicebox.tts.utils import add_optional_items, get_audio_from_samples
from voicebox.types import StrOrSSML
@dataclass
class AmazonPolly(TTS):
    """
    TTS using `Amazon Polly <https://aws.amazon.com/polly/>`_.

    See the `Amazon Polly documentation
    <https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html>`_
    for full descriptions of the parameters.

    Supports `SSML <https://www.w3.org/TR/speech-synthesis/>`_: ✔
    (`docs <https://docs.aws.amazon.com/polly/latest/dg/ssml.html>`_)
    """

    client: PollyClient
    """
    Boto3 Polly client, created by e.g.

    >>> session = boto3.Session(...)
    >>> client = session.client('polly')
    """

    voice_id: VoiceIdType
    """Voice ID to use for the synthesis."""

    engine: Optional[EngineType] = None
    """
    Specifies the engine (``standard`` or ``neural``) for Amazon Polly to use
    when processing input text for speech synthesis.
    """

    language_code: Optional[LanguageCodeType] = None
    """Optional language code for the Synthesize Speech request."""

    lexicon_names: Optional[Sequence[str]] = None
    """
    List of one or more pronunciation lexicon names you want the service to
    apply during synthesis.
    """

    sample_rate: Literal[8000, 16000] = 16000
    """Sample rate of returned audio. Must be ``8000`` or ``16000``."""

    def get_speech(self, text: StrOrSSML) -> Audio:
        """
        Synthesize ``text`` with Amazon Polly and return the resulting audio.

        The request asks for raw PCM at :attr:`sample_rate`. ``text`` is sent
        as SSML when it is an :class:`SSML` instance and as plain text
        otherwise.

        :param text: Plain text or SSML to speak.
        :return: Audio built from the returned 16-bit samples and
            :attr:`sample_rate` via ``get_audio_from_samples``.
        """
        kwargs = dict(
            OutputFormat="pcm",
            Text=text,
            VoiceId=self.voice_id,
            # The Polly API takes the sample rate as a string.
            SampleRate=str(self.sample_rate),
            TextType="ssml" if isinstance(text, SSML) else "text",
        )

        # Updates ``kwargs`` in place (the return value is not used).
        # NOTE(review): presumably entries whose value is None are skipped so
        # unset options are not sent to the API -- confirm against the helper.
        add_optional_items(
            kwargs,
            [
                ("Engine", self.engine),
                ("LanguageCode", self.language_code),
                ("LexiconNames", self.lexicon_names),
            ],
        )

        response = self.client.synthesize_speech(**kwargs)

        # Make sure the streaming body is closed even if reading fails.
        with closing(response["AudioStream"]) as audio_stream:
            signal_bytes = audio_stream.read()

        # Polly's "pcm" output is signed 16-bit little-endian mono. Decode it
        # explicitly as little-endian so the samples are also correct on
        # big-endian hosts, then expose them as native int16. On little-endian
        # machines the two dtypes are identical and ``copy=False`` makes the
        # conversion a no-op.
        samples = np.frombuffer(signal_bytes, dtype="<i2").astype(
            np.int16, copy=False
        )
        return get_audio_from_samples(samples, self.sample_rate)