import logging
import re
from typing import Generator, Union

import spacy

from banterbot.data.enums import SpaCyLangModel


class NLP:
    """
    A comprehensive toolkit that provides a set of Natural Language Processing utilities. It leverages the
    capabilities of the spaCy package. The toolkit is designed to automatically download the necessary models if
    they are not available.

    One of the main features of this toolkit is the intelligent model selection mechanism. It is designed to select
    the most appropriate and lightweight model for each specific task, balancing between computational efficiency
    and task performance.
    """

    _models = {
        "senter": None,
        "splitter": None,
        SpaCyLangModel.EN_CORE_WEB_SM: None,
        SpaCyLangModel.EN_CORE_WEB_MD: None,
        SpaCyLangModel.EN_CORE_WEB_LG: None,
    }

    @classmethod
    def install_upgrade_all_models(cls) -> None:
        """
        Lazily checks which models are already installed, and installs any that are missing.
        """
        installed = spacy.util.get_installed_models()
        for model in SpaCyLangModel:
            if model.value not in installed:
                cls._download_model(model.value)
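
    # A minimal usage sketch (illustrative): the call is safe to repeat, since
    # models already reported by `spacy.util.get_installed_models()` are skipped.
    #
    #   >>> NLP.install_upgrade_all_models()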

    @classmethod
    def load_all_models(cls) -> None:
        """
        Preloads all available models.
        """
        for model in cls._models:
            cls.model(model)

    @classmethod
    def model(cls, name: Union[str, SpaCyLangModel]) -> spacy.language.Language:
        """
        Returns the specified spaCy model lazily: each model is loaded the first time it is requested, then cached
        in the `cls._models` dictionary.

        Args:
            name (Union[str, SpaCyLangModel]): The name of the spaCy model to return, either one of the custom
                keys ("senter", "splitter") or a `SpaCyLangModel` member.

        Returns:
            spacy.language.Language: The requested spaCy model.
        """
        if name == "senter" and cls._models["senter"] is None:
            # Define the set of pipeline components to disable for the sentence segmenter.
            senter_disable = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"]
            senter = cls._load_model(name=SpaCyLangModel.EN_CORE_WEB_SM.value, disable=senter_disable)
            # Enable the sentence segmentation pipeline component for the senter model.
            senter.enable_pipe("senter")
            logging.debug("NLP initializing model: `senter`")
            cls._models["senter"] = senter
        elif name == "splitter" and cls._models["splitter"] is None:
            rules = {}
            splitter = cls._load_model(name=SpaCyLangModel.EN_CORE_WEB_MD.value)
            # Customize the tokenization rules for the word splitter in order to prevent splitting of contractions.
            ignore = ("'", "’", "‘")
            for key, value in splitter.tokenizer.rules.items():
                if all(i not in key for i in ignore):
                    rules[key] = value
            splitter.tokenizer.rules = rules
            logging.debug("NLP initializing model: `splitter`")
            cls._models["splitter"] = splitter
        elif name == SpaCyLangModel.EN_CORE_WEB_SM and cls._models[SpaCyLangModel.EN_CORE_WEB_SM] is None:
            logging.debug(f"NLP initializing model: `{SpaCyLangModel.EN_CORE_WEB_SM.value}`")
            cls._models[SpaCyLangModel.EN_CORE_WEB_SM] = cls._load_model(name=SpaCyLangModel.EN_CORE_WEB_SM.value)
        elif name == SpaCyLangModel.EN_CORE_WEB_MD and cls._models[SpaCyLangModel.EN_CORE_WEB_MD] is None:
            logging.debug(f"NLP initializing model: `{SpaCyLangModel.EN_CORE_WEB_MD.value}`")
            cls._models[SpaCyLangModel.EN_CORE_WEB_MD] = cls._load_model(name=SpaCyLangModel.EN_CORE_WEB_MD.value)
        elif name == SpaCyLangModel.EN_CORE_WEB_LG and cls._models[SpaCyLangModel.EN_CORE_WEB_LG] is None:
            logging.debug(f"NLP initializing model: `{SpaCyLangModel.EN_CORE_WEB_LG.value}`")
            cls._models[SpaCyLangModel.EN_CORE_WEB_LG] = cls._load_model(name=SpaCyLangModel.EN_CORE_WEB_LG.value)
        return cls._models[name]
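
    # A minimal usage sketch (illustrative): models are cached after the first call,
    # so repeated lookups return the same instance.
    #
    #   >>> senter = NLP.model("senter")
    #   >>> senter is NLP.model("senter")
    #   True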

    @classmethod
    def segment_sentences(cls, string: str, whitespace: bool = True) -> tuple[str, ...]:
        """
        Splits a text string into individual sentences using a specialized spaCy model. The model is a lightweight
        version of `en_core_web_sm` designed specifically for sentence segmentation.

        Args:
            string (str): The input text string.
            whitespace (bool): If True, keep whitespace at the beginning/end of sentences; if False, strip it.

        Returns:
            tuple[str, ...]: A tuple of individual sentences as strings.
        """
        return tuple(
            sentence.text_with_ws if whitespace else sentence.text for sentence in cls.model("senter")(string).sents
        )
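
    # A minimal usage sketch (illustrative; exact splits depend on the model version):
    #
    #   >>> NLP.segment_sentences("It's sunny. Let's go.", whitespace=False)
    #   ("It's sunny.", "Let's go.")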

    @classmethod
    def segment_words(cls, string: str, whitespace: bool = True) -> tuple[str, ...]:
        """
        Splits a text string into individual words using a specialized spaCy model. The model is a customized
        version of `en_core_web_md` in which words are not split on apostrophes, in order to preserve contractions.

        Args:
            string (str): The input text string.
            whitespace (bool): If True, include whitespace characters between words; if False, omit them.

        Returns:
            tuple[str, ...]: A tuple of individual words as strings.
        """
        words = []
        for word in cls.model("splitter")(string):
            words.append(word.text)
            if whitespace and word.whitespace_:
                words.append(word.whitespace_)
        return tuple(words)
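
    # A minimal usage sketch (illustrative): contractions such as "don't" survive intact,
    # while punctuation is still split off as separate tokens.
    #
    #   >>> NLP.segment_words("I don't know.", whitespace=False)
    #   ('I', "don't", 'know', '.')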

    @classmethod
    def tokenize(cls, strings: list[str]) -> Generator[spacy.tokens.doc.Doc, None, None]:
        """
        Given a list of strings, returns tokenized versions of the strings as a generator.

        Args:
            strings (list[str]): A list of strings.

        Returns:
            Generator[spacy.tokens.doc.Doc, None, None]: A stream of `spacy.tokens.doc.Doc` instances.
        """
        return cls.model(SpaCyLangModel.EN_CORE_WEB_LG).pipe(strings)
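
    # A minimal usage sketch (illustrative): `pipe` yields one `Doc` per input string.
    #
    #   >>> docs = NLP.tokenize(["The cat sat.", "Dogs bark."])
    #   >>> [token.text for token in next(docs)]
    #   ['The', 'cat', 'sat', '.']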

    @classmethod
    def _download_model(cls, name: str) -> None:
        """
        Downloads a spaCy model by its name. It also provides information about the download process to the user.

        Args:
            name (str): The name of the spaCy model to download.
        """
        logging.info(f'Downloading spaCy language model "{name}". This will only happen once.\n\n\n')
        spacy.cli.download(name)
        logging.info(f'\n\n\nFinished download of spaCy language model "{name}".')

    @classmethod
    def _update_model(cls, name: str) -> None:
        """
        Re-downloads a spaCy model by its name so that it matches the installed version of spaCy. It also provides
        information about the update process to the user.

        Args:
            name (str): The name of the spaCy model to update.
        """
        logging.info(f'Updating spaCy language model "{name}" for spaCy version {spacy.__version__}.\n\n\n')
        spacy.cli.download(name)
        logging.info(f'\n\n\nFinished update of spaCy language model "{name}" for spaCy version {spacy.__version__}.')

    @classmethod
    def _load_model(cls, *, name: str, **kwargs) -> spacy.language.Language:
        """
        Loads a spaCy model by its name. If the model is not available, it is downloaded automatically.

        Args:
            name (str): The name of the spaCy model to load.
            **kwargs: Additional keyword arguments for the spacy.load function.

        Returns:
            spacy.language.Language: The loaded spaCy model.
        """
        # If the model is not available, download it.
        try:
            model = spacy.load(name, **kwargs)
        except OSError:
            cls._download_model(name)
            model = spacy.load(name, **kwargs)
        # Version checking to ensure that the model is compatible with the installed version of spaCy. Only the
        # major.minor prefix is compared, so e.g. model version "3.7.1" is compatible with spaCy "3.7.4".
        model_version = re.match(r"(\d+\.\d+)\.\d+", model.meta["version"])[1]
        spacy_version = re.match(r"(\d+\.\d+)\.\d+", spacy.__version__)[1]
        # If the model is not compatible with the installed version of spaCy, update the model.
        if model_version != spacy_version:
            cls._update_model(name)
            model = spacy.load(name, **kwargs)
        return model


# Upon import, NLP downloads any missing required models.
NLP.install_upgrade_all_models()