Spaces:

turing-team
/

turing-space

Runtime error

App Files Files Community

turing-space / turing /tests /unit /test_dataset.py

papri-ka

Deploy FastAPI ML service to Hugging Face Spaces

5fc6e5d 6 days ago

raw

history blame contribute delete

3.65 kB

	from pathlib import Path

	import pytest

	# Project modules are importable thanks to conftest.py
	import turing.config as config
	from turing.dataset import DatasetManager


	@pytest.mark.data_loader
	class TestDatasetManager:
	    """
	    Unit tests for the DatasetManager class.

	    This test suite validates initialization, data transformation logic,
	    and data loading mechanisms, including error handling.
	    """

	    def test_initialization_paths_are_correct(self, manager: DatasetManager) -> None:
	        """
	        Verifies that the DatasetManager initializes with the correct
	        Hugging Face ID and constructs its paths as expected.

	        Paths are compared in POSIX form so the substring checks do not
	        depend on the platform's path separator.
	        """
	        assert manager.hf_id == "NLBSE/nlbse26-code-comment-classification"

	        raw_dir = Path(manager.raw_data_dir).as_posix()
	        assert "data/raw" in raw_dir

	        # base_interim_path should contain either 'base' or 'features'.
	        # The two conditions are asserted separately so a failure report
	        # shows which one was violated.
	        interim_path = Path(manager.base_interim_path).as_posix()
	        assert "data/interim" in interim_path
	        assert "base" in interim_path or "features" in interim_path

	    @pytest.mark.parametrize(
	        "input_labels, expected_output",
	        [
	            ([1, 0, 1], "[1, 0, 1]"),  # Case: Standard list
	            ("[1, 0, 1]", "[1, 0, 1]"),  # Case: Already a string
	            ([], "[]"),  # Case: Empty list
	            (None, None),  # Case: None value
	        ],
	    )
	    def test_format_labels_for_csv(
	        self, manager: DatasetManager, input_labels, expected_output
	    ) -> None:
	        """
	        Tests the internal _format_labels_for_csv method to ensure
	        it correctly serializes label lists (or handles other inputs) to strings.
	        """
	        # Arrange
	        example = {"labels": input_labels}

	        # Act
	        formatted_example = manager._format_labels_for_csv(example)

	        # Assert
	        assert formatted_example["labels"] == expected_output

	    def test_get_dataset_raises_file_not_found(self, monkeypatch) -> None:
	        """
	        Ensures that get_dataset() raises a FileNotFoundError when
	        the target interim CSV files do not exist.
	        """
	        # Arrange
	        # Patch the config to point to a non-existent directory
	        fake_dir = Path("/path/that/is/totally/fake")
	        monkeypatch.setattr(config, "INTERIM_DATA_DIR", fake_dir)

	        # Manager must be initialized after patching config
	        manager_with_fake_path = DatasetManager()

	        # Act & Assert
	        # NOTE(review): `match` is applied as a regular expression, so the
	        # trailing "." matches any character rather than a literal period.
	        # Left unchanged because the exact message raised by get_dataset()
	        # is not visible from this file - confirm before escaping it.
	        with pytest.raises(FileNotFoundError, match="Dataset CSV files not found."):
	            manager_with_fake_path.get_dataset()

	    def test_get_dataset_success_and_label_parsing(
	        self, fake_csv_data_dir: Path, monkeypatch
	    ) -> None:
	        """
	        Verifies that get_dataset() successfully loads data from mock CSVs
	        and correctly parses the string-formatted labels back into lists.
	        """
	        # Arrange
	        # Point the config at our temporary fixture directory
	        monkeypatch.setattr(config, "INTERIM_DATA_DIR", fake_csv_data_dir)
	        manager = DatasetManager()

	        # Act
	        dataset = manager.get_dataset()

	        # Assert
	        # Check that the correct splits were loaded
	        assert "java_train" in dataset
	        assert "java_test" in dataset
	        assert "python_train" not in dataset  # Confirms only found files are loaded

	        # Check content integrity
	        assert len(dataset["java_train"]) == 2
	        assert dataset["java_train"][0]["combo"] == "java code text"

	        # Check that the string '[1, 0, ...]' was parsed back to a list
	        expected_labels = [1, 0, 0, 0, 0, 0, 0]
	        assert dataset["java_train"][0]["labels"] == expected_labels
	        assert isinstance(dataset["java_train"][0]["labels"], list)