Spaces:
Running
Running
File size: 4,500 Bytes
a6576f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import pytest
import pandas as pd
from pathlib import Path
from datetime import date
from unittest.mock import Mock, patch
from reddit_analysis.summarizer.summarize import (
SummaryManager,
FileManager,
HuggingFaceManager,
)
# --------------------------------------------------------------------------- #
# Fixtures #
# --------------------------------------------------------------------------- #
@pytest.fixture
def mock_config(tmp_path):
    """Build the smallest configuration mapping these tests hand to SummaryManager.

    All local paths are rooted in pytest's ``tmp_path``; the repository
    settings and the token are placeholder values.
    """
    repo_settings = {"repo_id": "test/repo", "repo_type": "dataset"}
    path_settings = {
        "root": tmp_path,
        "scored_dir": tmp_path / "scored",
        # Unlike the other entries, this is a path relative to the Hub repo.
        "hf_scored_dir": "scored",
        "summary_file": tmp_path / "summary.csv",
    }
    return {
        "config": repo_settings,
        "paths": path_settings,
        "secrets": {"HF_TOKEN": "fake"},
    }
@pytest.fixture
def mock_file_manager():
    """Provide a ``Mock`` restricted to the FileManager interface.

    ``write_csv`` is pre-wired to return a ``Path`` so code that consumes its
    result keeps working; ``read_parquet`` is left for each test to configure.
    """
    file_manager_double = Mock(spec=FileManager)
    file_manager_double.configure_mock(
        **{"write_csv.return_value": Path("summary.csv")}
    )
    return file_manager_double
@pytest.fixture
def mock_hf_manager():
    """Provide a ``Mock`` restricted to the HuggingFaceManager interface."""
    hub_double = Mock(spec=HuggingFaceManager)
    return hub_double
# --------------------------------------------------------------------------- #
# Tests #
# --------------------------------------------------------------------------- #
def test_process_date(mock_config, mock_file_manager, mock_hf_manager):
    """End-to-end happy path for ``SummaryManager.process_date``.

    A four-row scored shard is served by the FileManager double, the remote
    summary is patched to be empty, and the test then checks that the shard
    was read once, a CSV was written once and one upload was made.
    """
    # ---------- sample scored shard ---------- #
    sample = pd.DataFrame(
        {
            "subreddit": ["a", "a", "b", "b"],
            "sentiment": [0.8, 0.6, 0.4, 0.2],
            "score": [10, 20, 30, 40],
            "post_id": ["p1", "p2", "p3", "p4"],
            "text": ["t1", "t2", "t3", "t4"],
            # Timestamp.utcnow() is deprecated since pandas 2.2 and warns on
            # use; now(tz="UTC") yields the same tz-aware UTC timestamp.
            "retrieved_at": pd.Timestamp.now(tz="UTC"),
        }
    )
    mock_file_manager.read_parquet.return_value = sample

    # Every download_file call returns this placeholder path; the file is
    # never opened because read_parquet is mocked as well.
    mock_hf_manager.download_file.return_value = Path("dummy.parquet")

    # Keep the patch active for both construction and the call under test,
    # so it applies regardless of when the remote summary is loaded.
    with patch.object(
        SummaryManager, "_load_remote_summary", return_value=pd.DataFrame()
    ):
        mgr = SummaryManager(
            mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
        )
        mgr.process_date("2025-04-20")

    # assertions
    mock_file_manager.read_parquet.assert_called_once()
    mock_file_manager.write_csv.assert_called_once()
    mock_hf_manager.upload_file.assert_called_once()
def test_get_processed_combinations(mock_config, mock_file_manager, mock_hf_manager):
    """An existing summary must come back as a set of (date, subreddit) pairs."""
    summary_columns = {
        "date": ["2025-04-19", "2025-04-19"],
        "subreddit": ["a", "b"],
        "mean_sentiment": [0.5, 0.3],
        "weighted_sentiment": [0.4, 0.2],
        "count": [1, 1],
    }
    remote_summary = pd.DataFrame(summary_columns)
    expected_pairs = {(date(2025, 4, 19), name) for name in ("a", "b")}

    # Stand in for the remote fetch so the manager sees the frame above.
    loader_patch = patch.object(
        SummaryManager, "_load_remote_summary", return_value=remote_summary
    )
    with loader_patch:
        manager = SummaryManager(
            mock_config,
            file_manager=mock_file_manager,
            hf_manager=mock_hf_manager,
        )
        assert manager.get_processed_combinations() == expected_pairs
def test_cli_invalid_date():
    """A malformed date string given to main() must surface as ValueError."""
    from reddit_analysis.summarizer.summarize import main as cli_entry

    malformed_date = "bad‑date‑format"
    with pytest.raises(ValueError):
        cli_entry(malformed_date)
def test_cli_missing_scored_file(mock_config, mock_file_manager, mock_hf_manager):
    """process_date() returns None when the scored file cannot be downloaded."""
    # Every download attempt fails, while the (patched) remote summary loads
    # without trouble.
    mock_hf_manager.download_file.side_effect = Exception("not found")
    empty_summary = pd.DataFrame()

    with patch.object(
        SummaryManager, "_load_remote_summary", return_value=empty_summary
    ):
        manager = SummaryManager(
            mock_config,
            file_manager=mock_file_manager,
            hf_manager=mock_hf_manager,
        )
        # The failure must be absorbed rather than propagated to the caller.
        outcome = manager.process_date("2025-04-20")
        assert outcome is None
|