File size: 4,099 Bytes
5fc6e5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import pandas as pd
import pytest

from turing.features import (
    FeatureEngineer,
    FeaturePipelineConfig,
    TextProcessor,
)

# --- Fixtures ---


@pytest.fixture(scope="module")
def full_config():
    """Provide the "full" pipeline configuration shared across this module.

    Stopword removal and lemmatization are switched on; the combo feature
    and augmentation are switched off. The config is capped at 5000
    features, uses comment-length bounds of 10 and 500, and carries the
    custom tag ``"test"``.
    """
    settings = {
        "use_stopwords": True,
        "use_lemmatization": True,
        "use_combo_feature": False,
        "max_features": 5000,
        "min_comment_length": 10,
        "max_comment_length": 500,
        "enable_augmentation": False,
        "custom_tags": "test",
    }
    return FeaturePipelineConfig(**settings)


@pytest.fixture(scope="module")
def basic_config():
    """Provide the "basic" pipeline configuration shared across this module.

    Stopword removal, lemmatization, the combo feature and augmentation are
    all switched off. The config is capped at 100 features and uses
    comment-length bounds of 5 and 200; no custom tags are passed.
    """
    settings = {
        "use_stopwords": False,
        "use_lemmatization": False,
        "use_combo_feature": False,
        "max_features": 100,
        "min_comment_length": 5,
        "max_comment_length": 200,
        "enable_augmentation": False,
    }
    return FeaturePipelineConfig(**settings)


@pytest.fixture(scope="module")
def full_processor(full_config):
    """Provide an English ``TextProcessor`` built from ``full_config``."""
    processor = TextProcessor(language="english", config=full_config)
    return processor


@pytest.fixture(scope="module")
def basic_processor(basic_config):
    """Provide an English ``TextProcessor`` built from ``basic_config``."""
    processor = TextProcessor(language="english", config=basic_config)
    return processor


# --- Tests ---


class TestFeaturePipelineConfig:
    """Checks on ``FeaturePipelineConfig`` attributes and its ``hash_id``."""

    def test_config_id_generation(self, full_config, basic_config):
        """Each config exposes the expected readable ``hash_id`` string."""
        expected_ids = (
            (full_config, "clean-k5000-test"),
            (basic_config, "clean-k100"),
        )
        for config, expected_id in expected_ids:
            assert config.hash_id == expected_id

    def test_config_attributes(self, full_config):
        """Constructor arguments are readable back as attributes."""
        # Both boolean switches must be the literal True, not merely truthy.
        for flag_name in ("use_stopwords", "use_lemmatization"):
            assert getattr(full_config, flag_name) is True
        assert full_config.max_features == 5000


class TestTextProcessor:
    """Checks on ``TextProcessor.clean_text`` under the basic and full configs."""

    def test_clean_text_basic(self, basic_processor):
        """Basic cleaning lowercases the text and strips punctuation."""
        cleaned = basic_processor.clean_text("This is a TEST... with punctuation!!")
        assert cleaned == "this is a test with punctuation"

    def test_clean_text_stopwords(self, full_processor, basic_processor):
        """Stopwords are dropped only by the processor that enables them."""
        sentence = "this is a test with a stopword"

        # Full processor first (stopwords removed), then basic (text unchanged).
        cases = (
            (full_processor, "test stopword"),
            (basic_processor, "this is a test with a stopword"),
        )
        for processor, expected in cases:
            assert processor.clean_text(sentence) == expected

    def test_clean_text_lemmatization(self, full_processor, basic_processor):
        """Lemmatization is applied only by the processor that enables it."""
        sentence = "running tests while dogs are barking"

        # Full processor: 'while' and 'are' go away as stopwords and the
        # plurals 'tests' / 'dogs' come back in singular form.
        lemmatized = full_processor.clean_text(sentence)
        assert lemmatized == "running test dog barking"

        # Basic processor: the sentence passes through unchanged.
        untouched = basic_processor.clean_text(sentence)
        assert untouched == "running tests while dogs are barking"

    def test_clean_text_handles_none(self, basic_processor):
        """Missing values (``None`` and ``pd.NA``) clean to an empty string."""
        for missing in (None, pd.NA):
            assert basic_processor.clean_text(missing) == ""


class TestFeatureEngineer:
    """Checks on ``FeatureEngineer.extract_features_for_check``."""

    def test_extract_numeric_features(self, basic_config):
        """The metadata columns are added and hold the expected counts."""
        frame = pd.DataFrame(
            {"comment_sentence": ["This is short.", "This one is a bit longer.", ""]}
        )
        engineer = FeatureEngineer(config=basic_config)
        enriched = engineer.extract_features_for_check(frame)

        # Every metadata column must be present in the returned frame.
        for column in ("f_length", "f_word_count", "f_starts_verb", "text_hash"):
            assert column in enriched.columns

        # Character and word counts per row; the empty comment yields zeros.
        assert enriched["f_length"].tolist() == [14, 25, 0]
        assert enriched["f_word_count"].tolist() == [3, 6, 0]