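# NOTE(review): the text "Spaces: Runtime error / Runtime error" appeared here. It looks like
# page-status residue captured together with the file rather than part of the module.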
import pandas as pd | |
from typing import Union, List | |
from tinytroupe.extraction import logger | |
from tinytroupe import openai_utils | |
import tinytroupe.utils as utils | |
class Normalizer:
    """
    A mechanism to normalize passages, concepts and other textual elements.

    On construction, the given elements are sent to the LLM together with the desired
    number of normalized elements; the JSON found in the reply is stored in
    `normalized_elements`. Later calls to `normalize` map arbitrary elements onto that
    structure, caching every mapping in `normalizing_map`.
    """

    def __init__(self, elements:List[str], n:int, verbose:bool=False):
        """
        Normalizes the specified elements.

        Args:
            elements (list): The elements to normalize. Duplicates are dropped, keeping the
                first occurrence of each element.
            n (int): The number of normalized elements to output.
            verbose (bool, optional): Whether to print debug messages. Defaults to False.
        """
        # Ensure elements are unique. dict.fromkeys keeps first-seen order, so the prompt sent
        # to the LLM is reproducible across runs (iteration order of a set of strings is not).
        self.elements = list(dict.fromkeys(elements))

        self.n = n
        self.verbose = verbose

        # a JSON-based structure, where each output element is a key to a list of input elements that were merged into it
        self.normalized_elements = None
        # a dict that maps each input element to its normalized output. This will be used as cache later.
        self.normalizing_map = {}

        rendering_configs = {"n": n,
                             "elements": self.elements}

        result = self._request_json("normalizer.system.mustache", "normalizer.user.mustache",
                                    rendering_configs)
        logger.debug(result)
        if self.verbose:
            print(result)

        self.normalized_elements = result

    def _request_json(self, system_template:str, user_template:str, rendering_configs:dict):
        """
        Renders the given templates, sends them to the LLM, logs the reply and returns the JSON
        extracted from the reply's "content" field.
        """
        messages = utils.compose_initial_LLM_messages_with_templates(system_template, user_template,
                                                                     base_module_folder="extraction",
                                                                     rendering_configs=rendering_configs)
        next_message = openai_utils.client().send_message(messages, temperature=0.1)

        # The raw reply is logged before extraction so it is still available if extraction fails.
        debug_msg = f"Normalization result message: {next_message}"
        logger.debug(debug_msg)
        if self.verbose:
            print(debug_msg)

        return utils.extract_json(next_message["content"])

    def normalize(self, element_or_elements:Union[str, List[str]]) -> List[str]:
        """
        Normalizes the specified element or elements.

        This method uses a caching mechanism to improve performance. If an element has been normalized before,
        its normalized form is stored in a cache (self.normalizing_map). When the same element needs to be
        normalized again, the method will first check the cache and use the stored normalized form if available,
        instead of normalizing the element again. Elements that are not cached yet are sent to the LLM in a
        single request, each distinct element only once.

        The order of elements in the output will be the same as in the input, with one output item per
        input item (repeated inputs yield repeated outputs).

        Args:
            element_or_elements (Union[str, List[str]]): The element or elements to normalize.

        Returns:
            list: The normalized elements, in input order. A single string input is treated as a
                one-element list, so the result is a one-element list as well (not a bare string).

        Raises:
            ValueError: If element_or_elements is neither a string nor a list.
            AssertionError: If the LLM reply is not a list, or does not contain exactly one item
                per element that was sent.
        """
        if isinstance(element_or_elements, str):
            denormalized_elements = [element_or_elements]
        elif isinstance(element_or_elements, list):
            denormalized_elements = element_or_elements
        else:
            raise ValueError("The element_or_elements must be either a string or a list.")

        # Only distinct, not-yet-cached elements go to the LLM; dict.fromkeys removes repeats
        # while keeping the order in which they first appear.
        elements_to_normalize = [element for element in dict.fromkeys(denormalized_elements)
                                 if element not in self.normalizing_map]

        if elements_to_normalize:
            rendering_configs = {"categories": self.normalized_elements,
                                 "elements": elements_to_normalize}

            normalized_elements_from_llm = self._request_json("normalizer.applier.system.mustache",
                                                              "normalizer.applier.user.mustache",
                                                              rendering_configs)

            # Raised explicitly instead of using `assert`, so the checks are not stripped under
            # `python -O`. The AssertionError type and messages are kept for existing callers.
            if not isinstance(normalized_elements_from_llm, list):
                raise AssertionError("The normalized element must be a list.")
            if len(normalized_elements_from_llm) != len(elements_to_normalize):
                raise AssertionError("The number of normalized elements must be equal to the number of elements to normalize.")

            # The reply is positional: the i-th answer belongs to the i-th element that was sent.
            self.normalizing_map.update(zip(elements_to_normalize, normalized_elements_from_llm))

        return [self.normalizing_map[element] for element in denormalized_elements]