import logging
import shutil
from typing import Optional
from typing_extensions import Self
import banterbot.paths
from banterbot import config
from banterbot.models.memory import Memory
from banterbot.protos import memory_pb2
from banterbot.utils.nlp import NLP
[docs]
class MemoryChain:
    """
    MemoryChain manages a persona's collection of memories and persists them using Protocol Buffers.

    Every memory is written to its own binary file inside a per-UUID directory, and a separate index file maps each
    keyword to the UUIDs of the memories tagged with it. Memory files are discovered on construction but are only read
    from disk the first time a search returns them. Keyword look-ups can be exact or fuzzy (similarity-based).
    """

    @classmethod
    def create(cls) -> Self:
        """
        Generate a new empty set of memories and associated UUID.

        Creates the memories directory for the new UUID and writes an empty memory index file for it.

        Returns:
            MemoryChain: A new instance of MemoryChain with an empty memory index and a unique UUID.
        """
        uuid = config.generate_uuid().hex

        # Create new directory for the new UUID and memory files
        directory = banterbot.paths.personae / uuid / banterbot.paths.memories
        directory.mkdir(exist_ok=True, parents=True)

        # Create new (empty) memory index file
        memory_index = memory_pb2.MemoryIndex()
        with open(banterbot.paths.personae / uuid / banterbot.paths.memory_index, "wb+") as fs:
            fs.write(memory_index.SerializeToString())

        logging.debug(f"MemoryChain created new UUID: `{uuid}`")
        return cls(uuid=uuid, memory_index={})

    @classmethod
    def load(cls, uuid: str) -> Self:
        """
        Load an existing memory index from its binary file and create a MemoryChain instance from it.

        Only the index is deserialized here; individual memory files are read later, when a search needs them.

        Args:
            uuid (str): The UUID of the memory files to load.

        Returns:
            MemoryChain: A new instance of MemoryChain backed by the stored index.
        """
        logging.debug(f"MemoryChain loading UUID: `{uuid}`")

        # Read memory index file
        memory_index_object = memory_pb2.MemoryIndex()
        with open(banterbot.paths.personae / uuid / banterbot.paths.memory_index, "rb") as fs:
            memory_index_object.ParseFromString(fs.read())

        # Parse the memory index file into a dictionary of keyword -> list of memory UUIDs
        memory_index = {entry.keyword: list(entry.memory_uuids) for entry in memory_index_object.entries}
        return cls(uuid, memory_index)

    @classmethod
    def delete(cls, uuid: str) -> None:
        """
        Delete the directory associated with a MemoryChain, including everything stored inside it.

        Args:
            uuid (str): The UUID associated with the set of memories to remove.
        """
        shutil.rmtree(banterbot.paths.personae / uuid)

    def __init__(self, uuid: str, memory_index: dict[str, list[str]]) -> None:
        """
        Initialize a new instance of MemoryChain.

        Args:
            uuid (str): The UUID associated with this set of memories.
            memory_index (dict[str, list[str]]): The dictionary mapping from keyword to list of memory UUIDs. This
                index is used to look up memories based on keywords.
        """
        logging.debug(f"MemoryChain initialized with UUID: `{uuid}`")
        self.uuid = uuid
        self._directory = banterbot.paths.personae / self.uuid / banterbot.paths.memories

        # keyword -> list of memory UUIDs
        self._index_cache = memory_index
        # memory UUID -> Memory, or None while the memory file has not been read yet
        self._memories = {}
        # (searched keyword, indexed keyword) -> similarity score
        self._similarity_cache = {}
        # keyword -> token produced by NLP.tokenize
        self._token_cache = {}

        self._update_token_cache(self._index_cache.keys())
        self._find_memories()

    def append(self, memory: Memory) -> None:
        """
        Append a single memory to the current set of memories and persist the change immediately.

        Args:
            memory (Memory): The memory to append.
        """
        self._memories[memory.uuid] = memory
        self._save_memory(memory=memory)
        self._update_index(memory=memory)
        self._save_index()

    def extend(self, memories: list[Memory]) -> None:
        """
        Extend the current set of memories with a list of memories and persist the changes.

        Each memory file is written as it is processed; the index file is written once after all memories have been
        added to the in-memory index.

        Args:
            memories (list[Memory]): The list of memories to append.
        """
        for memory in memories:
            self._memories[memory.uuid] = memory
            self._save_memory(memory=memory)
            self._update_index(memory=memory)
        self._save_index()

    def search(self, keywords: list[str], fuzzy_threshold: Optional[float] = None) -> list[Memory]:
        """
        Look up memories based on keywords, optionally including memories indexed under similar keywords.

        The `keywords` argument is never modified. Each matching memory is returned once, ordered by the first keyword
        that matched it (requested keywords first, then fuzzy matches).

        Args:
            keywords (list[str]): The list of keywords to look up.
            fuzzy_threshold (Optional[float]): The threshold for fuzzy matching. If None, only returns exact matches.
                If a value is provided, memories indexed under keywords whose similarity score to a requested keyword
                is greater than or equal to the threshold are also returned.

        Returns:
            list[Memory]: The list of matching memories.
        """
        # Work on a copy so that the fuzzy expansion below does not leak into the caller's list.
        lookup = list(keywords)

        if fuzzy_threshold is not None:
            self._update_similarity_cache(keywords=keywords)

            # Find additional indexed keywords that are similar to the specified keywords
            requested = set(keywords)
            candidates = [indexed for indexed in self._index_cache if indexed not in requested]
            for keyword in keywords:
                for indexed in candidates:
                    if self._similarity_cache[(keyword, indexed)] >= fuzzy_threshold:
                        lookup.append(indexed)

        # A dict keyed by UUID de-duplicates the results while keeping a stable order.
        matches = {}
        for keyword in lookup:
            for memory_uuid in self._index_cache.get(keyword, ()):
                # Memory files are read from disk the first time they are needed.
                if self._memories[memory_uuid] is None:
                    self._load_memory(memory_uuid=memory_uuid)
                matches[memory_uuid] = self._memories[memory_uuid]

        return list(matches.values())

    def _save_memory(self, memory: Memory) -> None:
        """
        Save an instance of class Memory to its own file using protocol buffers.

        Args:
            memory (Memory): The memory to write; its UUID determines the file name.
        """
        filename = memory.uuid + banterbot.paths.protobuf_extension
        with open(self._directory / filename, "wb+") as fs:
            fs.write(memory.serialize())

    def _save_index(self) -> None:
        """
        Save the current state of the memory index to file, overwriting the previous index file.
        """
        memory_index = memory_pb2.MemoryIndex()
        for keyword, memory_uuids in self._index_cache.items():
            memory_index_entry = memory_pb2.MemoryIndexEntry()
            memory_index_entry.keyword = keyword
            memory_index_entry.memory_uuids.extend(memory_uuids)
            memory_index.entries.append(memory_index_entry)

        with open(banterbot.paths.personae / self.uuid / banterbot.paths.memory_index, "wb+") as fs:
            fs.write(memory_index.SerializeToString())

    def _find_memories(self) -> None:
        """
        Find all memory files associated with this UUID and register them in the `_memories` dictionary.

        Every discovered file is registered under its file stem with a value of None, marking it as "known but not yet
        loaded". Any previous contents of `_memories` are replaced.
        """
        pattern = "*" + banterbot.paths.protobuf_extension
        self._memories = {path.stem: None for path in self._directory.glob(pattern)}

    def _update_index(self, memory: Memory) -> None:
        """
        Update the in-memory index (and the token cache) with the keywords of a new memory.

        Index values are kept as lists, matching the structure produced by `load`, so that the same code path works
        for both freshly created and loaded chains. A memory UUID is listed at most once per keyword.

        Args:
            memory (Memory): The memory to update the index with.
        """
        for keyword in memory.keywords:
            memory_uuids = self._index_cache.setdefault(keyword, [])
            if memory.uuid not in memory_uuids:
                memory_uuids.append(memory.uuid)
        self._update_token_cache(memory.keywords)

    def _load_memory(self, memory_uuid: str) -> None:
        """
        Load a memory from its memory file and store it in the `_memories` dictionary.

        Args:
            memory_uuid (str): The UUID of the memory to load.
        """
        filename = memory_uuid + banterbot.paths.protobuf_extension
        with open(self._directory / filename, "rb") as fs:
            self._memories[memory_uuid] = Memory.deserialize(fs.read())

    def _update_token_cache(self, keywords: list[str]) -> None:
        """
        Update the token cache with any keywords that have not been tokenized yet.

        Keywords already present in the cache are skipped, duplicates are tokenized only once, and the tokenizer is
        not invoked at all when there is nothing new to process.

        Args:
            keywords (list[str]): The keywords to make available in the cache.
        """
        # dict.fromkeys removes duplicates while preserving the order of first appearance.
        new_keywords = list(dict.fromkeys(keyword for keyword in keywords if keyword not in self._token_cache))
        if not new_keywords:
            return

        # NOTE(review): assumes NLP.tokenize yields one token per input string, in input order -- confirm.
        for keyword, token in zip(new_keywords, NLP.tokenize(strings=new_keywords)):
            self._token_cache[keyword] = token

    def _update_similarity_cache(self, keywords: list[str]) -> None:
        """
        Update the similarity cache with scores between the given keywords and every indexed keyword.

        Scores are computed only for (keyword, indexed keyword) pairs that are not cached yet, using the `similarity`
        method of the cached tokens.

        Args:
            keywords (list[str]): The keywords to compare against the indexed keywords.
        """
        self._update_token_cache(keywords=keywords)
        for keyword_indexed in self._index_cache:
            for keyword in keywords:
                pair = (keyword, keyword_indexed)
                if pair not in self._similarity_cache:
                    similarity = self._token_cache[keyword].similarity(self._token_cache[keyword_indexed])
                    self._similarity_cache[pair] = similarity