Source code for voicebox.voiceboxes.splitter

import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Iterable, Union

import nltk
import nltk.data
from nltk.tokenize import PunktTokenizer
from nltk.tokenize.api import TokenizerI

from voicebox.ssml import SSML
from voicebox.types import StrOrSSML


class Splitter(ABC):
    """Abstract interface for strategies that break text into chunks."""

    @abstractmethod
    def split(self, text: StrOrSSML) -> Iterable[StrOrSSML]:
        """
        Break ``text`` into an iterable of chunks.

        A :class:`voicebox.SSML` instance is never broken up: it is handed
        back unchanged as the only chunk, i.e. ``[text]``.

        :param text: Plain string or SSML to split.
        :return: The chunks, in their original order.
        """
        ...  # pragma: no cover
class NoopSplitter(Splitter):
    """Pass-through splitter that leaves its input in one piece."""

    def split(self, text: StrOrSSML) -> Iterable[StrOrSSML]:
        """Yield ``text`` itself as the one and only chunk."""
        yield from (text,)
class RegexSplitter(Splitter):
    """
    Splits text on a regex pattern.

    :param pattern: Pattern (string or pre-compiled) whose matches delimit
        the chunks. Text matched by capturing groups in the pattern is kept
        in the output; text matched outside of any group is dropped.
    :param join_split_group: If true, the captured delimiter text is appended
        to the chunk that precedes it (e.g. ``"Hello."``); if false, captured
        delimiters are emitted as chunks of their own.
    """

    pattern: re.Pattern
    join_split_group: bool

    def __init__(
        self,
        pattern: Union[str, re.Pattern],
        join_split_group: bool = True,
    ):
        self.pattern = (
            pattern if isinstance(pattern, re.Pattern) else re.compile(pattern)
        )
        self.join_split_group = join_split_group

    def split(self, text: StrOrSSML) -> Iterable[StrOrSSML]:
        """
        Split ``text`` on :attr:`pattern`.

        Every piece is stripped of surrounding whitespace and empty chunks
        are discarded.

        :param text: Text to split. SSML is never split.
        :return: ``[text]`` for SSML input, otherwise the list of non-empty
            chunks in their original order.
        """
        # Do not split SSML
        if isinstance(text, SSML):
            return [text]

        # With k capturing groups, ``re.split`` returns
        # [text, g1..gk, text, g1..gk, ..., text]; groups that did not
        # take part in a match show up as None.
        parts = self.pattern.split(text)

        if not self.join_split_group:
            stripped = (part.strip() for part in parts if part is not None)
            return [part for part in stripped if part]

        # Pair each text piece with the delimiter groups that follow it
        # *before* dropping empty pieces. Filtering first would shift the
        # pairing whenever a piece is empty (leading delimiter, adjacent
        # delimiters, or a pattern without capturing groups).
        stride = self.pattern.groups + 1
        chunks = []
        for start in range(0, len(parts), stride):
            pieces = parts[start:start + stride]
            chunk = "".join(
                piece.strip() for piece in pieces if piece is not None
            )
            if chunk:
                chunks.append(chunk)
        return chunks
class SimpleSentenceSplitter(RegexSplitter):
    """Regex-based sentence splitter that breaks on '.', '!', and '?'."""

    def __init__(self):
        # One capturing group: a run of sentence punctuation followed by
        # whitespace or the end of the text.
        sentence_end = r"([.!?]+(?:\s+|$))"
        super().__init__(sentence_end)
@dataclass
class NltkTokenizerSplitter(Splitter):
    """
    Splits text by delegating to an
    `NLTK tokenizer <https://www.nltk.org/api/nltk.tokenize.html>`_.
    """

    # Tokenizer whose ``tokenize`` method produces the chunks.
    tokenizer: TokenizerI

    def split(self, text: StrOrSSML) -> Iterable[StrOrSSML]:
        """Return ``[text]`` for SSML, otherwise the tokenizer's tokens."""
        if isinstance(text, SSML):
            return [text]
        return self.tokenizer.tokenize(text)
class PunktSentenceSplitter(NltkTokenizerSplitter):
    """
    Sentence splitter backed by the
    `Punkt <https://www.nltk.org/api/nltk.tokenize.punkt.html>`_
    sentence tokenizer from `NLTK <https://www.nltk.org>`_.

    Punkt splits text into sentences more intelligently than a simple
    pattern-based splitter. It can handle instances of mid-sentence
    punctuation very well; e.g. "Mr. Jones went to see Dr. Sherman" would
    be correctly "split" into only one sentence.

    This requires that the Punkt NLTK resources be located on disk,
    e.g. by downloading via one of these methods:

        >>> PunktSentenceSplitter.download_resources()

    or

        >>> import nltk; nltk.download('punkt_tab')

    or

        $ python -m nltk.downloader punkt_tab

    See here for all NLTK Data installation methods:
    https://www.nltk.org/data.html

    :param language: Language name passed to ``PunktTokenizer``.
    """

    def __init__(self, language: str = "english"):
        super().__init__(PunktTokenizer(language))

    @staticmethod
    def download_resources(**kwargs):
        """
        Download the Punkt NLTK resources (``punkt_tab``).

        Any keyword arguments are forwarded to ``nltk.download``.
        """
        nltk.download("punkt_tab", **kwargs)  # pragma: no cover
def default_splitter() -> Splitter:
    """Return the default splitter: a new :class:`NoopSplitter`."""
    splitter = NoopSplitter()
    return splitter