Spaces:
Runtime error
Runtime error
File size: 4,099 Bytes
5fc6e5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import pandas as pd
import pytest
from turing.features import (
FeatureEngineer,
FeaturePipelineConfig,
TextProcessor,
)
# --- Fixtures ---
@pytest.fixture(scope="module")
def full_config():
    """Build the module-scoped config used by the "everything on" tests.

    Stopword removal and lemmatization are enabled, while the combo feature
    and augmentation are disabled. ``custom_tags`` is set to ``"test"``.
    """
    settings = {
        "use_stopwords": True,
        "use_lemmatization": True,
        "use_combo_feature": False,
        "max_features": 5000,
        "min_comment_length": 10,
        "max_comment_length": 500,
        "enable_augmentation": False,
        "custom_tags": "test",
    }
    return FeaturePipelineConfig(**settings)
@pytest.fixture(scope="module")
def basic_config():
    """Build the module-scoped config with every optional step turned off.

    Stopwords, lemmatization, the combo feature and augmentation are all
    disabled; ``custom_tags`` is deliberately not passed.
    """
    settings = {
        "use_stopwords": False,
        "use_lemmatization": False,
        "use_combo_feature": False,
        "max_features": 100,
        "min_comment_length": 5,
        "max_comment_length": 200,
        "enable_augmentation": False,
    }
    return FeaturePipelineConfig(**settings)
@pytest.fixture(scope="module")
def full_processor(full_config):
    """Provide an English TextProcessor configured by ``full_config``."""
    processor = TextProcessor(config=full_config, language="english")
    return processor
@pytest.fixture(scope="module")
def basic_processor(basic_config):
    """Provide an English TextProcessor configured by ``basic_config``.

    With that config the stopword and lemmatization steps are disabled, so
    the tests use this processor to check the baseline cleaning only.
    """
    processor = TextProcessor(config=basic_config, language="english")
    return processor
# --- Tests ---
class TestFeaturePipelineConfig:
    """Checks on FeaturePipelineConfig identifiers and stored attributes."""

    def test_config_id_generation(self, full_config, basic_config):
        """Each config must expose the expected readable ``hash_id``."""
        expectations = (
            (full_config, "clean-k5000-test"),
            (basic_config, "clean-k100"),
        )
        for cfg, wanted_id in expectations:
            assert cfg.hash_id == wanted_id

    def test_config_attributes(self, full_config):
        """Constructor arguments must be readable back as attributes."""
        # Both boolean switches are checked by identity against True.
        for flag_name in ("use_stopwords", "use_lemmatization"):
            assert getattr(full_config, flag_name) is True
        assert full_config.max_features == 5000
class TestTextProcessor:
    """Checks on ``TextProcessor.clean_text`` under both fixture configs."""

    def test_clean_text_basic(self, basic_processor):
        """Upper-case letters are lowered and punctuation is stripped."""
        raw = "This is a TEST... with punctuation!!"
        cleaned = basic_processor.clean_text(raw)
        assert cleaned == "this is a test with punctuation"

    def test_clean_text_stopwords(self, full_processor, basic_processor):
        """Stopwords are dropped only when the config enables that step."""
        sentence = "this is a test with a stopword"
        cases = (
            # Stopword removal enabled.
            (full_processor, "test stopword"),
            # Stopword removal disabled: text passes through unchanged.
            (basic_processor, "this is a test with a stopword"),
        )
        for processor, wanted in cases:
            assert processor.clean_text(sentence) == wanted

    def test_clean_text_lemmatization(self, full_processor, basic_processor):
        """Words are lemmatized only when the config enables that step."""
        sentence = "running tests while dogs are barking"
        cases = (
            # Lemmatization enabled; 'are' and 'while' go away as stopwords.
            (full_processor, "running test dog barking"),
            # Lemmatization disabled: text passes through unchanged.
            (basic_processor, "running tests while dogs are barking"),
        )
        for processor, wanted in cases:
            assert processor.clean_text(sentence) == wanted

    def test_clean_text_handles_none(self, basic_processor):
        """Missing values (None, pd.NA) yield an empty string, not an error."""
        for missing in (None, pd.NA):
            assert basic_processor.clean_text(missing) == ""
class TestFeatureEngineer:
    """Checks on the columns produced by ``FeatureEngineer``."""

    def test_extract_numeric_features(self, basic_config):
        """``extract_features_for_check`` must add the metadata columns."""
        engineer = FeatureEngineer(config=basic_config)
        frame = pd.DataFrame(
            {
                "comment_sentence": [
                    "This is short.",
                    "This one is a bit longer.",
                    "",
                ]
            }
        )

        result = engineer.extract_features_for_check(frame)

        # Presence of every expected feature column.
        for column in ("f_length", "f_word_count", "f_starts_verb", "text_hash"):
            assert column in result.columns

        # Per-row values for the three input sentences (last one is empty).
        lengths = result["f_length"].tolist()
        assert lengths == [14, 25, 0]
        word_counts = result["f_word_count"].tolist()
        assert word_counts == [3, 6, 0]
|