# Source code for banterbot.models.speech_recognition_input

import datetime
import json
from typing import Iterator, Optional

import azure.cognitiveservices.speech as speechsdk
import numba as nb
from typing_extensions import Self

from banterbot.data.enums import SpaCyLangModel
from banterbot.models.word import Word
from banterbot.types.wordjson import WordJSON
from banterbot.utils.nlp import NLP


class SpeechRecognitionInput:
    """
    A class that encapsulates the speech-to-text output data.

    Instances are lightweight: most attributes start out as None and are only computed
    the first time the corresponding property is accessed.
    """

    @classmethod
    def from_recognition_result(
        cls, recognition_result: speechsdk.SpeechRecognitionResult, language: Optional[str] = None
    ) -> Self:
        """
        Constructor for the `SpeechRecognitionInput` class. Designed to create lightweight instances with most
        attributes initially set to None. Computation-intensive operations are performed on-demand when respective
        properties are accessed, instead of during initialization.

        Args:
            recognition_result (speechsdk.SpeechRecognitionResult): The result from a speech recognition event.
            language (str, optional): The language used during the speech-to-text recognition, if not auto-detected.
        """
        data = json.loads(recognition_result.json)
        language = language if language is not None else cls._extract_language(recognition_result=recognition_result)
        return cls(data=data, language=language)

    @classmethod
    def _extract_language(cls, recognition_result: speechsdk.SpeechRecognitionResult) -> Optional[str]:
        """
        If the language is not provided (as it should be if auto-detection is disabled) then extract the auto-detected
        language as a string. Return None if the process fails.

        Args:
            recognition_result (speechsdk.SpeechRecognitionResult): The result from a speech recognition event.

        Returns:
            str, optional: The auto-detected language from the speech-to-text output.
        """
        language_key = speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult
        language = None
        if language_key in recognition_result.properties:
            language = recognition_result.properties[language_key]
        return language

    def __init__(
        self,
        data: dict,
        language: str,
        offset: Optional[datetime.timedelta] = None,
        duration: Optional[datetime.timedelta] = None,
        offset_end: Optional[datetime.timedelta] = None,
        sents: Optional[tuple[str, ...]] = None,
        words: Optional[list[Word]] = None,
        display: Optional[str] = None,
    ) -> None:
        """
        Constructor for the `SpeechRecognitionInput` class. Designed to create lightweight instances with most
        attributes initially set to None unless provided explicitly on initialization. Computation-intensive
        operations are instead performed lazily: i.e., when respective properties are accessed.

        Args:
            data (dict): The JSON data output from a speech recognition event.
            language (str): The language used during the speech-to-text recognition.
            offset (optional, datetime.timedelta): The number of milliseconds between recognition start and the output.
            duration (optional, datetime.timedelta): The duration of the output in milliseconds.
            offset_end (optional, datetime.timedelta): The sum of the offset and duration of the output.
            sents (optional, tuple[str, ...]): The split sentences of the output as a tuple.
            words (optional, list[Word]): A list of words from the output.
            display (optional, str): The display form of the output.
        """
        self._data = data
        self._language = language
        self._offset = offset
        self._duration = duration
        self._offset_end = offset_end
        self._sents = sents
        self._words = words
        self._display = display

    @staticmethod
    @nb.njit(cache=True)
    def _get_words(
        input_words: list[str], input_tokens: list[str], indices: list[int], offsets: list[int], cutoff: int
    ) -> tuple[int, int]:
        """
        Private method that extracts the words from the input text that are within a cutoff interval of the recognized
        speech. This is used to determine the words that were spoken during a specific interval of the audio stream.
        Uses Numba to speed up the process.

        Args:
            input_words (list[str]): The words in the input text.
            input_tokens (list[str]): The tokens in the input text.
            indices (list[int]): The indices of the tokens in the input text.
            offsets (list[int]): The total-microsecond offsets of the words in the input text.
            cutoff (int): The cutoff interval in total microseconds.

        Returns:
            tuple[int, int]: The index of the first word in the cutoff interval and the index of the last word in the
            cutoff interval.
        """
        word_idx = 0
        display_idx = 0
        for token_word, index in zip(input_tokens, indices):
            for display_word, offset in zip(input_words[word_idx:], offsets[word_idx:]):
                if token_word == display_word and offset <= cutoff:
                    word_idx += 1
                    display_idx = index + len(token_word)
                    break
        return word_idx, display_idx

    def from_cutoff(self, cutoff: datetime.timedelta) -> Optional[Self]:
        """
        Create a new instance of class `SpeechRecognitionInput` that only contains the text spoken within a cutoff
        interval.

        Args:
            cutoff (datetime.timedelta): The upper cutoff time (or duration) of the new instance.

        Returns:
            SpeechRecognitionInput, optional: The new instance of `SpeechRecognitionInput`, or None if no words fall
            within the cutoff interval.
        """
        doc = NLP.model(SpaCyLangModel.EN_CORE_WEB_SM)(self.display)

        input_words = [word.text for word in self.words]
        input_tokens = [token.text.lower() for token in doc]
        indices = [token.idx for token in doc]
        # Compare *total* microseconds on both sides. `timedelta.microseconds` is only the
        # sub-second component (0-999999) and wraps every second, which broke the cutoff
        # comparison for any audio stream longer than one second.
        offsets = [word.offset // datetime.timedelta(microseconds=1) for word in self.words]
        cutoff_us = cutoff // datetime.timedelta(microseconds=1)

        word_idx, display_idx = self._get_words(
            input_words=input_words, input_tokens=input_tokens, indices=indices, offsets=offsets, cutoff=cutoff_us
        )

        if not word_idx:
            return None

        duration = sum((word.duration for word in self.words[:word_idx]), datetime.timedelta())
        data = {
            "Offset": self._data["Offset"],
            # Store Duration in the same 100-nanosecond tick unit the Azure JSON payload uses,
            # so the lazily-computed `duration` property of derived instances remains correct.
            "Duration": (duration // datetime.timedelta(microseconds=1)) * 10,
            "NBest": [
                {
                    "Display": self._data["NBest"][0]["Display"][:display_idx],
                    "Words": self._data["NBest"][0]["Words"][:word_idx],
                }
            ],
        }
        return self.__class__(
            data=data,
            language=self._language,
            offset=self.offset,
            duration=duration,
            words=self.words[:word_idx],
            display=self.display[:display_idx],
        )

    def _extract_words(self, words_raw: list[WordJSON]) -> list[Word]:
        """
        Private method that extracts `Word` objects from raw data.

        Args:
            words_raw (list[WordJSON]): A list of dictionaries containing raw word data.

        Returns:
            list[Word]: A list of `Word` objects.
        """
        words = []
        for word in words_raw:
            words.append(
                Word(
                    text=word["Word"],
                    # Azure reports times in 100-ns ticks; divide by 10 for microseconds.
                    offset=datetime.timedelta(microseconds=word["Offset"] / 10),
                    duration=datetime.timedelta(microseconds=word["Duration"] / 10),
                )
            )
        return words

    @property
    def words(self) -> list[Word]:
        """
        A getter property that returns a list of Word objects. If the list is not already computed, it triggers
        computation.

        Returns:
            list[Word]: A list of words.
        """
        if self._words is None:
            self._words = self._extract_words(words_raw=self._data["NBest"][0]["Words"])
        return self._words

    @property
    def sents(self) -> tuple[str, ...]:
        """
        A getter property that returns a tuple of sentences. If the tuple is not already computed, it triggers
        computation.

        Returns:
            tuple[str, ...]: A tuple of sentences.
        """
        if self._sents is None:
            self._sents = NLP.segment_sentences(string=self.display, whitespace=True)
        return self._sents

    @property
    def offset(self) -> datetime.timedelta:
        """
        A getter property that returns the offset of the recognized speech in the audio stream.

        Returns:
            datetime.timedelta: The offset in the form of a datetime.timedelta object.
        """
        if self._offset is None:
            self._offset = datetime.timedelta(microseconds=self._data["Offset"] / 10)
        return self._offset

    @property
    def duration(self) -> datetime.timedelta:
        """
        A getter property that returns the duration of the recognized speech in the audio stream.

        Returns:
            datetime.timedelta: The duration in the form of a datetime.timedelta object.
        """
        if self._duration is None:
            self._duration = datetime.timedelta(microseconds=self._data["Duration"] / 10)
        return self._duration

    @property
    def offset_end(self) -> datetime.timedelta:
        """
        A getter property that returns the offset + duration of the recognized speech in the audio stream.

        Returns:
            datetime.timedelta: The duration + offset in the form of a datetime.timedelta object.
        """
        if self._offset_end is None:
            self._offset_end = self.offset + self.duration
        return self._offset_end

    @property
    def display(self) -> str:
        """
        A getter property that returns the display form of the recognized speech. The display form is fully processed
        with Inverse Text Normalization (ITN), Capitalization, Disfluency Removal, and Punctuation.

        Returns:
            str: The display form of the speech.
        """
        if self._display is None:
            self._display = self._data["NBest"][0]["Display"]
        return self._display

    def __getitem__(self, idx: int) -> Word:
        """
        Overloads the indexing operator to retrieve words at specific positions.

        Args:
            idx (int): The index of the word to retrieve.

        Returns:
            Word: The word at the specified index.
        """
        return self.words[idx]

    def __iter__(self) -> Iterator[Word]:
        """
        Overloads the iterator to allow for iteration over the `Word` objects in the output.

        Yields:
            Word: The next Word object in the output.
        """
        for word in self.words:
            yield word

    def __len__(self) -> int:
        """
        Overloads the len() operator to return the number of words in the output.

        Returns:
            int: The number of words in the output.
        """
        return len(self.words)

    def __str__(self) -> str:
        """
        Overloads the str() operator to return the fully processed speech-to-text output. The processing includes
        Inverse Text Normalization (ITN), Capitalization, Disfluency Removal, and Punctuation.

        Returns:
            str: The fully processed string representation of the speech-to-text output.
        """
        return self.display