Spaces:

turing-team
/

turing-space

Runtime error

App Files Files Community

turing-space / turing /tests /unit /test_features.py

papri-ka

Deploy FastAPI ML service to Hugging Face Spaces

5fc6e5d 6 days ago

raw

history blame contribute delete

4.1 kB

	import pandas as pd
	import pytest

	from turing.features import (
	FeatureEngineer,
	FeaturePipelineConfig,
	TextProcessor,
	)

	# --- Fixtures ---


	@pytest.fixture(scope="module")
	def full_config():
	    """Build the module-scoped config with stopwords and lemmatization on.

	    The combo feature and augmentation are switched off, and the
	    ``custom_tags`` value is set to ``"test"``.
	    """
	    settings = {
	        "use_stopwords": True,
	        "use_lemmatization": True,
	        "use_combo_feature": False,
	        "max_features": 5000,
	        "min_comment_length": 10,
	        "max_comment_length": 500,
	        "enable_augmentation": False,
	        "custom_tags": "test",
	    }
	    return FeaturePipelineConfig(**settings)


	@pytest.fixture(scope="module")
	def basic_config():
	    """Build the module-scoped config with every optional step switched off.

	    Stopwords, lemmatization, the combo feature and augmentation are all
	    disabled; no ``custom_tags`` value is passed.
	    """
	    settings = {
	        "use_stopwords": False,
	        "use_lemmatization": False,
	        "use_combo_feature": False,
	        "max_features": 100,
	        "min_comment_length": 5,
	        "max_comment_length": 200,
	        "enable_augmentation": False,
	    }
	    return FeaturePipelineConfig(**settings)


	@pytest.fixture(scope="module")
	def full_processor(full_config):
	    """Provide an English TextProcessor driven by the ``full_config`` fixture."""
	    processor = TextProcessor(config=full_config, language="english")
	    return processor


	@pytest.fixture(scope="module")
	def basic_processor(basic_config):
	    """Provide an English TextProcessor driven by the ``basic_config`` fixture."""
	    processor = TextProcessor(config=basic_config, language="english")
	    return processor


	# --- Tests ---


	class TestFeaturePipelineConfig:
	    """Checks on the ``hash_id`` and stored attributes of FeaturePipelineConfig."""

	    def test_config_id_generation(self, full_config, basic_config):
	        """Each fixture config exposes the expected readable ``hash_id``."""
	        expected_ids = (
	            (full_config, "clean-k5000-test"),
	            (basic_config, "clean-k100"),
	        )
	        for config, expected_id in expected_ids:
	            assert config.hash_id == expected_id

	    def test_config_attributes(self, full_config):
	        """Values passed to the constructor are readable back as attributes."""
	        stopwords_flag = full_config.use_stopwords
	        assert stopwords_flag is True

	        lemmatization_flag = full_config.use_lemmatization
	        assert lemmatization_flag is True

	        feature_cap = full_config.max_features
	        assert feature_cap == 5000


	class TestTextProcessor:
	    """Checks on ``TextProcessor.clean_text`` under the basic and full configs."""

	    def test_clean_text_basic(self, basic_processor):
	        """Mixed-case, punctuated input comes back lowercased without punctuation."""
	        cleaned = basic_processor.clean_text("This is a TEST... with punctuation!!")
	        assert cleaned == "this is a test with punctuation"

	    def test_clean_text_stopwords(self, full_processor, basic_processor):
	        """Stopwords are dropped only by the processor that has them enabled."""
	        sentence = "this is a test with a stopword"

	        # Stopword removal on: only the content words survive.
	        assert full_processor.clean_text(sentence) == "test stopword"

	        # Stopword removal off: the sentence passes through unchanged.
	        assert basic_processor.clean_text(sentence) == "this is a test with a stopword"

	    def test_clean_text_lemmatization(self, full_processor, basic_processor):
	        """Lemmatization is applied only by the processor that has it enabled."""
	        sentence = "running tests while dogs are barking"

	        # Lemmatization on; 'are' and 'while' also vanish because they are stopwords.
	        with_lemmas = "running test dog barking"
	        # Lemmatization off: the words keep their original forms.
	        without_lemmas = "running tests while dogs are barking"

	        assert full_processor.clean_text(sentence) == with_lemmas
	        assert basic_processor.clean_text(sentence) == without_lemmas

	    def test_clean_text_handles_none(self, basic_processor):
	        """``None`` and ``pd.NA`` inputs yield an empty string instead of raising."""
	        for missing in (None, pd.NA):
	            assert basic_processor.clean_text(missing) == ""


	class TestFeatureEngineer:
	    """Checks on ``FeatureEngineer.extract_features_for_check``."""

	    def test_extract_numeric_features(self, basic_config):
	        """The output frame gains the metadata columns with the expected values."""
	        engineer = FeatureEngineer(config=basic_config)
	        sentences = ["This is short.", "This one is a bit longer.", ""]
	        frame = pd.DataFrame({"comment_sentence": sentences})
	        result = engineer.extract_features_for_check(frame)

	        # Every metadata column must be present in the returned frame.
	        for column in ("f_length", "f_word_count", "f_starts_verb", "text_hash"):
	            assert column in result.columns

	        # Expected per-row values for the three sentences above.
	        lengths = result["f_length"].tolist()
	        word_counts = result["f_word_count"].tolist()
	        assert lengths == [14, 25, 0]
	        assert word_counts == [3, 6, 0]