Spaces:

turing-team
/

turing-space

Runtime error

File size: 4,099 Bytes

5fc6e5d

import pandas as pd
import pytest

from turing.features import (
    FeatureEngineer,
    FeaturePipelineConfig,
    TextProcessor,
)

# --- Fixtures ---


@pytest.fixture(scope="module")
def full_config():
    """Provide a config with stopword removal and lemmatization switched on.

    The fixture is module-scoped, so a single instance is shared by every
    test in this module. The combo feature and augmentation stay disabled,
    and the custom tag "test" is attached.
    """
    settings = {
        "use_stopwords": True,
        "use_lemmatization": True,
        "use_combo_feature": False,
        "max_features": 5000,
        "min_comment_length": 10,
        "max_comment_length": 500,
        "enable_augmentation": False,
        "custom_tags": "test",
    }
    return FeaturePipelineConfig(**settings)


@pytest.fixture(scope="module")
def basic_config():
    """Provide a minimal config with every optional step switched off.

    Stopword removal, lemmatization, the combo feature and augmentation are
    all disabled, and no custom tags are passed. Module-scoped, so the same
    instance is reused by every test in this module.
    """
    settings = {
        "use_stopwords": False,
        "use_lemmatization": False,
        "use_combo_feature": False,
        "max_features": 100,
        "min_comment_length": 5,
        "max_comment_length": 200,
        "enable_augmentation": False,
    }
    return FeaturePipelineConfig(**settings)


@pytest.fixture(scope="module")
def full_processor(full_config):
    """Provide an English TextProcessor driven by the ``full_config`` fixture."""
    processor = TextProcessor(config=full_config, language="english")
    return processor


@pytest.fixture(scope="module")
def basic_processor(basic_config):
    """Provide an English TextProcessor driven by the ``basic_config`` fixture.

    With that config, stopword removal and lemmatization are both disabled.
    """
    processor = TextProcessor(config=basic_config, language="english")
    return processor


# --- Tests ---


class TestFeaturePipelineConfig:
    """Checks on FeaturePipelineConfig's ``hash_id`` and stored attributes."""

    def test_config_id_generation(self, full_config, basic_config):
        """Each fixture config exposes the expected readable ``hash_id``."""
        expected_ids = (
            (full_config, "clean-k5000-test"),
            (basic_config, "clean-k100"),
        )
        for config, expected_id in expected_ids:
            assert config.hash_id == expected_id

    def test_config_attributes(self, full_config):
        """Values passed to the constructor are readable back as attributes."""
        cfg = full_config
        assert cfg.max_features == 5000
        assert cfg.use_stopwords is True
        assert cfg.use_lemmatization is True


class TestTextProcessor:
    """Exercises ``TextProcessor.clean_text`` under the basic and full configs."""

    def test_clean_text_basic(self, basic_processor):
        """Basic cleaning lowercases the text and strips punctuation."""
        cleaned = basic_processor.clean_text("This is a TEST... with punctuation!!")
        assert cleaned == "this is a test with punctuation"

    def test_clean_text_stopwords(self, full_processor, basic_processor):
        """Stopwords are dropped only when the config enables their removal."""
        sentence = "this is a test with a stopword"

        # Stopword removal on: only the non-stopword tokens remain.
        filtered = full_processor.clean_text(sentence)
        assert filtered == "test stopword"

        # Stopword removal off: the sentence comes back unchanged.
        untouched = basic_processor.clean_text(sentence)
        assert untouched == sentence

    def test_clean_text_lemmatization(self, full_processor, basic_processor):
        """Words are lemmatized only when the config enables lemmatization."""
        sentence = "running tests while dogs are barking"

        # Lemmatization on; 'are' and 'while' also vanish because they are
        # stopwords and the full config removes those too.
        lemmatized = full_processor.clean_text(sentence)
        assert lemmatized == "running test dog barking"

        # Lemmatization off: the sentence comes back unchanged.
        untouched = basic_processor.clean_text(sentence)
        assert untouched == sentence

    def test_clean_text_handles_none(self, basic_processor):
        """Missing values (None, pd.NA) clean to an empty string, not a crash."""
        for missing in (None, pd.NA):
            assert basic_processor.clean_text(missing) == ""


class TestFeatureEngineer:
    """Checks the metadata columns added by ``FeatureEngineer``."""

    def test_extract_numeric_features(self, basic_config):
        """``extract_features_for_check`` adds the expected metadata columns.

        Column presence is checked for all four features; exact values are
        checked only for the character length and the word count.
        """
        engineer = FeatureEngineer(config=basic_config)
        sentences = ["This is short.", "This one is a bit longer.", ""]
        frame = pd.DataFrame({"comment_sentence": sentences})

        result = engineer.extract_features_for_check(frame)

        for column in ("f_length", "f_word_count", "f_starts_verb", "text_hash"):
            assert column in result.columns

        # The empty sentence must yield zero for both numeric features.
        lengths = result["f_length"].tolist()
        assert lengths == [14, 25, 0]
        word_counts = result["f_word_count"].tolist()
        assert word_counts == [3, 6, 0]