Spaces:
Runtime error
Runtime error
import pytest | |
import os | |
import json | |
import random | |
import logging | |
logger = logging.getLogger("tinytroupe") | |
import sys | |
sys.path.append('../../tinytroupe/') | |
sys.path.append('../../') | |
sys.path.append('..') | |
from testing_utils import * | |
from tinytroupe.extraction import ArtifactExporter, Normalizer | |
from tinytroupe import utils | |
def exporter(): | |
return ArtifactExporter(base_output_folder=EXPORT_BASE_FOLDER) | |
def test_export_json(exporter): | |
# Define the artifact data | |
artifact_data = { | |
"name": "John Doe", | |
"age": 30, | |
"occupation": "Engineer", | |
"content": "This is a sample JSON data." | |
} | |
# Export the artifact data as JSON | |
exporter.export("test_artifact", artifact_data, content_type="record", target_format="json") | |
#check if the JSON file was exported correctly | |
assert os.path.exists(f"{EXPORT_BASE_FOLDER}/record/test_artifact.json"), "The JSON file should have been exported." | |
# does it contain the data? | |
with open(f"{EXPORT_BASE_FOLDER}/record/test_artifact.json", "r") as f: | |
exported_data = json.load(f) | |
assert exported_data == artifact_data, "The exported JSON data should match the original data." | |
def test_export_text(exporter): | |
# Define the artifact data | |
artifact_data = "This is a sample text." | |
# Export the artifact data as text | |
exporter.export("test_artifact", artifact_data, content_type="text", target_format="txt") | |
# check if the text file was exported correctly | |
assert os.path.exists(f"{EXPORT_BASE_FOLDER}/text/test_artifact.txt"), "The text file should have been exported." | |
# does it contain the data? | |
with open(f"{EXPORT_BASE_FOLDER}/text/test_artifact.txt", "r") as f: | |
exported_data = f.read() | |
assert exported_data == artifact_data, "The exported text data should match the original data." | |
def test_export_docx(exporter): | |
# Define the artifact data. Include some fancy markdown formatting so we can test if it is preserved. | |
artifact_data =\ | |
""" | |
# This is a sample markdown text | |
This is a **bold** text. | |
This is an *italic* text. | |
This is a [link](https://www.example.com). | |
""" | |
# Export the artifact data as a docx file | |
exporter.export("test_artifact", artifact_data, content_type="Document", content_format="markdown", target_format="docx") | |
# check if the docx file was exported correctly | |
assert os.path.exists(f"{EXPORT_BASE_FOLDER}/Document/test_artifact.docx"), "The docx file should have been exported." | |
# does it contain the data? | |
from docx import Document | |
doc = Document(f"{EXPORT_BASE_FOLDER}/Document/test_artifact.docx") | |
exported_data = "" | |
for para in doc.paragraphs: | |
exported_data += para.text | |
assert "This is a sample markdown text" in exported_data, "The exported docx data should contain some of the original content." | |
assert "#" not in exported_data, "The exported docx data should not contain Markdown." | |
def test_normalizer(): | |
# Define the concepts to be normalized | |
concepts = ['Antique Book Collection', 'Medical Research', 'Electrical safety', 'Reading', 'Technology', 'Entrepreneurship', 'Multimedia Teaching Tools', 'Photography', | |
'Smart home technology', 'Gardening', 'Travel', 'Outdoors', 'Hiking', 'Yoga', 'Finance', 'Health and wellness', 'Sustainable Living', 'Barista Skills', 'Oral health education', | |
'Patient care', 'Professional Development', 'Project safety', 'Coffee', 'Literature', 'Continuous learning', 'Model trains', 'Education', 'Mental and Physical Balance', 'Kayaking', | |
'Social Justice', 'National Park Exploration', 'Outdoor activities', 'Dental technology', 'Teaching electrical skills', 'Volunteering', 'Cooking', 'Industry trends', | |
'Energy-efficient systems', 'Mentoring', 'Empathetic communication', 'Medical Technology', 'Historical Research', 'Public Speaking', 'Museum Volunteering', 'Conflict Resolution'] | |
unique_concepts = list(set(concepts)) | |
normalizer = Normalizer(concepts, n=10, verbose=True) | |
assert len(normalizer.normalized_elements) == 10, "The number of normalized elements should be equal to the specified value." | |
# sample 5 random elements from concepts using standard python methods | |
random_concepts_buckets = [random.sample(concepts, 15), random.sample(concepts, 15), random.sample(concepts, 15), random.sample(concepts, 15), random.sample(concepts, 15)] | |
assert len(normalizer.normalizing_map.keys()) == 0, "The normalizing map should be empty at the beginning." | |
for bucket in random_concepts_buckets: | |
init_cache_size = len(normalizer.normalizing_map.keys()) | |
normalized_concept = normalizer.normalize(bucket) | |
assert normalized_concept is not None, "The normalized concept should not be None." | |
logger.debug(f"Normalized concept: {bucket} -> {normalized_concept}") | |
print(f"Normalized concept: {bucket} -> {normalized_concept}") | |
next_cache_size = len(normalizer.normalizing_map.keys()) | |
# check same length | |
assert len(normalized_concept) == len(bucket), "The normalized concept should have the same length as the input concept." | |
# assert that all elements from normalized concepts are in normalizing map keys | |
for element in bucket: | |
assert element in normalizer.normalizing_map.keys(), f"{element} should be in the normalizing map keys." | |
assert next_cache_size > 0, "The cache size should be greater than 0 after normalizing a new concept." | |
assert next_cache_size >= init_cache_size, "The cache size should not decrease after normalizing a new concept." | |