Spaces:
Runtime error
Runtime error
File size: 5,704 Bytes
82a7a28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import pytest
import os
import json
import random
import logging
logger = logging.getLogger("tinytroupe")
import sys
sys.path.append('../../tinytroupe/')
sys.path.append('../../')
sys.path.append('..')
from testing_utils import *
from tinytroupe.extraction import ArtifactExporter, Normalizer
from tinytroupe import utils
@pytest.fixture
def exporter():
    """Build an ArtifactExporter whose base output folder is EXPORT_BASE_FOLDER."""
    # EXPORT_BASE_FOLDER is not defined in this file; presumably it comes from
    # the star import of testing_utils -- confirm there.
    artifact_exporter = ArtifactExporter(base_output_folder=EXPORT_BASE_FOLDER)
    return artifact_exporter
def test_export_json(exporter):
    """Export a dict as a JSON record and verify the file exists and round-trips."""
    # Location the assertions below expect the exported file to appear at.
    json_path = f"{EXPORT_BASE_FOLDER}/record/test_artifact.json"

    # Sample record to export.
    payload = {
        "name": "John Doe",
        "age": 30,
        "occupation": "Engineer",
        "content": "This is a sample JSON data.",
    }

    exporter.export("test_artifact", payload, content_type="record", target_format="json")

    # The file must be present on disk...
    assert os.path.exists(json_path), "The JSON file should have been exported."

    # ...and parsing it back must yield exactly what was exported.
    with open(json_path, "r") as exported_file:
        round_tripped = json.load(exported_file)
    assert round_tripped == payload, "The exported JSON data should match the original data."
def test_export_text(exporter):
    """Export a plain string as a .txt artifact and verify the file content matches."""
    # Location the assertions below expect the exported file to appear at.
    text_path = f"{EXPORT_BASE_FOLDER}/text/test_artifact.txt"

    # Sample text to export.
    sample_text = "This is a sample text."

    exporter.export("test_artifact", sample_text, content_type="text", target_format="txt")

    # The file must be present on disk...
    assert os.path.exists(text_path), "The text file should have been exported."

    # ...and reading it back must give the exact same string.
    with open(text_path, "r") as exported_file:
        read_back = exported_file.read()
    assert read_back == sample_text, "The exported text data should match the original data."
def test_export_docx(exporter):
    """Export markdown as a .docx artifact and verify the text survives without markup.

    The input deliberately contains a heading, bold, italic and link syntax so
    the test can check that the heading text is kept while the '#' marker is not.
    """
    # Location the assertions below expect the exported file to appear at.
    docx_path = f"{EXPORT_BASE_FOLDER}/Document/test_artifact.docx"

    # Markdown input; the literal's lines stay at column 0 so its content is unchanged.
    markdown_source = """
# This is a sample markdown text
This is a **bold** text.
This is an *italic* text.
This is a [link](https://www.example.com).
"""

    exporter.export("test_artifact", markdown_source, content_type="Document", content_format="markdown", target_format="docx")

    # The file must be present on disk.
    assert os.path.exists(docx_path), "The docx file should have been exported."

    # Read the document back and concatenate the text of every paragraph.
    from docx import Document

    exported_text = "".join(paragraph.text for paragraph in Document(docx_path).paragraphs)

    assert "This is a sample markdown text" in exported_text, "The exported docx data should contain some of the original content."
    assert "#" not in exported_text, "The exported docx data should not contain Markdown."
def test_normalizer():
    """Exercise Normalizer construction and repeated normalize() calls.

    Checks, in order:
      * a Normalizer built with n=10 exposes exactly 10 normalized_elements;
      * its normalizing_map has no keys before any normalize() call;
      * for each random bucket, normalize() returns a non-None result with the
        same length as the bucket, every bucket element becomes a key of
        normalizing_map, and the number of keys is positive and never shrinks.
    """
    # Define the concepts to be normalized
    concepts = ['Antique Book Collection', 'Medical Research', 'Electrical safety', 'Reading', 'Technology', 'Entrepreneurship', 'Multimedia Teaching Tools', 'Photography',
                'Smart home technology', 'Gardening', 'Travel', 'Outdoors', 'Hiking', 'Yoga', 'Finance', 'Health and wellness', 'Sustainable Living', 'Barista Skills', 'Oral health education',
                'Patient care', 'Professional Development', 'Project safety', 'Coffee', 'Literature', 'Continuous learning', 'Model trains', 'Education', 'Mental and Physical Balance', 'Kayaking',
                'Social Justice', 'National Park Exploration', 'Outdoor activities', 'Dental technology', 'Teaching electrical skills', 'Volunteering', 'Cooking', 'Industry trends',
                'Energy-efficient systems', 'Mentoring', 'Empathetic communication', 'Medical Technology', 'Historical Research', 'Public Speaking', 'Museum Volunteering', 'Conflict Resolution']

    normalizer = Normalizer(concepts, n=10, verbose=True)
    assert len(normalizer.normalized_elements) == 10, "The number of normalized elements should be equal to the specified value."

    # Draw 5 buckets, each holding 15 concepts sampled without replacement.
    # Buckets may overlap one another, so later calls can hit already-mapped elements.
    random_concepts_buckets = [random.sample(concepts, 15) for _ in range(5)]

    assert len(normalizer.normalizing_map.keys()) == 0, "The normalizing map should be empty at the beginning."

    for bucket in random_concepts_buckets:
        # Key count before this call, to verify the map never shrinks.
        init_cache_size = len(normalizer.normalizing_map.keys())

        normalized_concept = normalizer.normalize(bucket)
        assert normalized_concept is not None, "The normalized concept should not be None."
        logger.debug(f"Normalized concept: {bucket} -> {normalized_concept}")
        print(f"Normalized concept: {bucket} -> {normalized_concept}")

        next_cache_size = len(normalizer.normalizing_map.keys())

        # The result must have one entry per input element.
        assert len(normalized_concept) == len(bucket), "The normalized concept should have the same length as the input concept."

        # Every input element must now be a key of the normalizing map.
        for element in bucket:
            assert element in normalizer.normalizing_map.keys(), f"{element} should be in the normalizing map keys."

        assert next_cache_size > 0, "The cache size should be greater than 0 after normalizing a new concept."
        assert next_cache_size >= init_cache_size, "The cache size should not decrease after normalizing a new concept."
|