Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import pytest | |
| from turing.features import ( | |
| FeatureEngineer, | |
| FeaturePipelineConfig, | |
| TextProcessor, | |
| ) | |
| # --- Fixtures --- | |
@pytest.fixture
def full_config():
    """Return a config with stopword removal and lemmatization enabled.

    Registered as a pytest fixture so that tests and other fixtures can
    request it by parameter name; without the decorator pytest cannot
    resolve ``full_config`` as an argument.

    The combo feature and augmentation are switched off, ``max_features``
    is 5000, comment lengths are bounded to 10..500, and a custom tag
    ``"test"`` is attached.
    """
    return FeaturePipelineConfig(
        use_stopwords=True,
        use_lemmatization=True,
        use_combo_feature=False,
        max_features=5000,
        min_comment_length=10,
        max_comment_length=500,
        enable_augmentation=False,
        custom_tags="test",
    )
@pytest.fixture
def basic_config():
    """Return a config with every optional processing step disabled.

    Registered as a pytest fixture so that tests and other fixtures can
    request it by parameter name; without the decorator pytest cannot
    resolve ``basic_config`` as an argument.

    Stopwords, lemmatization, the combo feature and augmentation are all
    off; ``max_features`` is 100 and comment lengths are bounded to 5..200.
    No custom tags are passed.
    """
    return FeaturePipelineConfig(
        use_stopwords=False,
        use_lemmatization=False,
        use_combo_feature=False,
        max_features=100,
        min_comment_length=5,
        max_comment_length=200,
        enable_augmentation=False,
    )
@pytest.fixture
def full_processor(full_config):
    """Return an English ``TextProcessor`` built from the ``full_config`` fixture.

    Registered as a pytest fixture so test methods can request it by name;
    the ``full_config`` argument is itself injected by pytest.
    """
    return TextProcessor(config=full_config, language="english")
@pytest.fixture
def basic_processor(basic_config):
    """Return an English ``TextProcessor`` built from the ``basic_config`` fixture.

    With stopwords and lemmatization disabled in the config, only the basic
    cleaning (lowercasing, punctuation removal) is expected to apply.
    Registered as a pytest fixture so test methods can request it by name;
    the ``basic_config`` argument is itself injected by pytest.
    """
    return TextProcessor(config=basic_config, language="english")
| # --- Tests --- | |
class TestFeaturePipelineConfig:
    """Checks on ``FeaturePipelineConfig``: its derived ID and stored options."""

    def test_config_id_generation(self, full_config, basic_config):
        """Each config exposes the expected human-readable ``hash_id``."""
        # Pair every config with the ID it should report; checked in order.
        expected_ids = (
            (full_config, "clean-k5000-test"),
            (basic_config, "clean-k100"),
        )
        for config, expected_id in expected_ids:
            assert config.hash_id == expected_id

    def test_config_attributes(self, full_config):
        """Values given at construction are readable back as attributes."""
        cfg = full_config
        assert cfg.use_stopwords is True
        assert cfg.use_lemmatization is True
        assert cfg.max_features == 5000
class TestTextProcessor:
    """Checks on ``TextProcessor.clean_text`` under both fixture configs."""

    def test_clean_text_basic(self, basic_processor):
        """Upper-case letters are lowered and punctuation is dropped."""
        cleaned = basic_processor.clean_text("This is a TEST... with punctuation!!")
        assert cleaned == "this is a test with punctuation"

    def test_clean_text_stopwords(self, full_processor, basic_processor):
        """Stopwords are removed only when the config enables it."""
        raw = "this is a test with a stopword"

        # Stopword filtering on: only the content words survive.
        assert full_processor.clean_text(raw) == "test stopword"

        # Stopword filtering off: the already-clean input comes back as is.
        assert basic_processor.clean_text(raw) == "this is a test with a stopword"

    def test_clean_text_lemmatization(self, full_processor, basic_processor):
        """Lemmatization is applied only when the config enables it."""
        raw = "running tests while dogs are barking"

        # Lemmatization on; 'are' and 'while' are also dropped as stopwords.
        assert full_processor.clean_text(raw) == "running test dog barking"

        # Lemmatization off: words keep their inflected form.
        assert (
            basic_processor.clean_text(raw)
            == "running tests while dogs are barking"
        )

    def test_clean_text_handles_none(self, basic_processor):
        """Missing values (``None`` and ``pd.NA``) clean to an empty string."""
        for missing in (None, pd.NA):
            assert basic_processor.clean_text(missing) == ""
class TestFeatureEngineer:
    """Checks on the metadata columns produced by ``FeatureEngineer``."""

    def test_extract_numeric_features(self, basic_config):
        """``extract_features_for_check`` adds the expected feature columns.

        The expected ``f_length`` and ``f_word_count`` values equal the
        character counts and whitespace-separated word counts of the three
        input sentences (the last one being empty).
        """
        engineer = FeatureEngineer(config=basic_config)
        sentences = ["This is short.", "This one is a bit longer.", ""]
        frame = pd.DataFrame({"comment_sentence": sentences})

        result = engineer.extract_features_for_check(frame)

        # Every metadata column must be present in the output frame.
        for column in ("f_length", "f_word_count", "f_starts_verb", "text_hash"):
            assert column in result.columns

        assert result["f_length"].tolist() == [14, 25, 0]
        assert result["f_word_count"].tolist() == [3, 6, 0]