turing-space / turing /tests /unit /test_dataset.py
papri-ka's picture
Deploy FastAPI ML service to Hugging Face Spaces
5fc6e5d
from pathlib import Path
import pytest
# Project modules are importable thanks to conftest.py
import turing.config as config
from turing.dataset import DatasetManager
@pytest.mark.data_loader
class TestDatasetManager:
    """
    Unit tests for the DatasetManager class.

    This test suite validates initialization, data transformation logic,
    and data loading mechanisms, including error handling.
    """

    def test_initialization_paths_are_correct(self, manager: DatasetManager):
        """
        Verifies that the DatasetManager initializes with the correct
        Hugging Face ID and constructs its paths as expected.
        """
        assert manager.hf_id == "NLBSE/nlbse26-code-comment-classification"

        # Render the paths with forward slashes before the substring checks so
        # the assertions do not depend on the platform's path separator
        # (str(Path) uses backslashes on Windows).
        raw_dir = Path(str(manager.raw_data_dir)).as_posix()
        assert "data/raw" in raw_dir

        # base_interim_path should contain either 'base' or 'features'
        interim_path = Path(str(manager.base_interim_path)).as_posix()
        assert "data/interim" in interim_path
        assert "base" in interim_path or "features" in interim_path

    @pytest.mark.parametrize(
        "input_labels, expected_output",
        [
            ([1, 0, 1], "[1, 0, 1]"),  # Case: Standard list
            ("[1, 0, 1]", "[1, 0, 1]"),  # Case: Already a string
            ([], "[]"),  # Case: Empty list
            (None, None),  # Case: None value
        ],
    )
    def test_format_labels_for_csv(self, manager: DatasetManager, input_labels, expected_output):
        """
        Tests the internal _format_labels_for_csv method to ensure
        it correctly serializes label lists (or handles other inputs) to strings.

        Strings and None are expected to pass through unchanged, while lists
        are expected to come back as their string representation.
        """
        # Arrange
        example = {"labels": input_labels}

        # Act
        formatted_example = manager._format_labels_for_csv(example)

        # Assert
        assert formatted_example["labels"] == expected_output

    def test_get_dataset_raises_file_not_found(self, monkeypatch):
        """
        Ensures that get_dataset() raises a FileNotFoundError when
        the target interim CSV files do not exist.
        """
        # Arrange
        # Patch the config to point to a non-existent directory
        fake_dir = Path("/path/that/is/totally/fake")
        monkeypatch.setattr(config, "INTERIM_DATA_DIR", fake_dir)

        # Manager must be initialized *after* patching config
        manager_with_fake_path = DatasetManager()

        # Act & Assert
        # NOTE: `match` is applied as a regular-expression search, so the
        # trailing "." in the pattern matches any single character.
        with pytest.raises(FileNotFoundError, match="Dataset CSV files not found."):
            manager_with_fake_path.get_dataset()

    def test_get_dataset_success_and_label_parsing(self, fake_csv_data_dir: Path, monkeypatch):
        """
        Verifies that get_dataset() successfully loads data from mock CSVs
        and correctly parses the string-formatted labels back into lists.

        The fake_csv_data_dir fixture is presumably defined in conftest.py and
        is expected to contain only the Java train/test CSV files.
        """
        # Arrange
        # Point the config at our temporary fixture directory
        monkeypatch.setattr(config, "INTERIM_DATA_DIR", fake_csv_data_dir)
        manager = DatasetManager()

        # Act
        dataset = manager.get_dataset()

        # Assert
        # Check that the correct splits were loaded
        assert "java_train" in dataset
        assert "java_test" in dataset
        assert "python_train" not in dataset  # Confirms only found files are loaded

        # Check content integrity
        assert len(dataset["java_train"]) == 2
        first_row = dataset["java_train"][0]
        assert first_row["combo"] == "java code text"

        # Check that the string '[1, 0, ...]' was parsed back to a list
        expected_labels = [1, 0, 0, 0, 0, 0, 0]
        assert first_row["labels"] == expected_labels
        assert isinstance(first_row["labels"], list)