Spaces:

hblim
/

reddit_sentiment_tracker

Running

App Files Files Community

reddit_sentiment_tracker / reddit_analysis /tests /summarizer /test_summarize.py

hblim

Clean codebase for HF Space (drop Prometheus binary data)

a6576f0 3 months ago

raw

history blame contribute delete

4.5 kB

	import pytest
	import pandas as pd
	from pathlib import Path
	from datetime import date
	from unittest.mock import Mock, patch

	from reddit_analysis.summarizer.summarize import (
	SummaryManager,
	FileManager,
	HuggingFaceManager,
	)


	# --------------------------------------------------------------------------- #
	# Fixtures #
	# --------------------------------------------------------------------------- #
	@pytest.fixture
	def mock_config(tmp_path):
	    """Build the smallest config mapping these tests hand to SummaryManager.

	    Every filesystem location is rooted in the ``tmp_path`` argument, so
	    nothing is written outside the directory pytest provides for the test.
	    """
	    hub_settings = {"repo_id": "test/repo", "repo_type": "dataset"}
	    locations = {
	        "root": tmp_path,
	        "scored_dir": tmp_path / "scored",
	        # relative path in the Hub
	        "hf_scored_dir": "scored",
	        "summary_file": tmp_path / "summary.csv",
	    }
	    credentials = {"HF_TOKEN": "fake"}
	    return {"config": hub_settings, "paths": locations, "secrets": credentials}


	@pytest.fixture
	def mock_file_manager():
	    """Provide a ``Mock`` restricted to the ``FileManager`` interface.

	    ``read_parquet`` is deliberately left unconfigured: each test installs
	    its own sample frame. ``write_csv`` is pre-wired to hand back a ``Path``
	    so code that consumes its result keeps working.
	    """
	    double = Mock(spec=FileManager)
	    double.configure_mock(**{"write_csv.return_value": Path("summary.csv")})
	    return double


	@pytest.fixture
	def mock_hf_manager():
	    """Provide a ``Mock`` restricted to the ``HuggingFaceManager`` interface."""
	    hub_double = Mock(spec=HuggingFaceManager)
	    return hub_double


	# --------------------------------------------------------------------------- #
	# Tests #
	# --------------------------------------------------------------------------- #
	def test_process_date(mock_config, mock_file_manager, mock_hf_manager):
	    """Happy path for ``SummaryManager.process_date``.

	    With a four-row scored shard served by the mocked file manager and an
	    empty remote summary, processing one date must read the shard exactly
	    once, write exactly one CSV, and upload exactly once.
	    """
	    # ---------- sample scored shard --------------------------------------- #
	    sample = pd.DataFrame(
	        {
	            "subreddit": ["a", "a", "b", "b"],
	            "sentiment": [0.8, 0.6, 0.4, 0.2],
	            "score": [10, 20, 30, 40],
	            "post_id": ["p1", "p2", "p3", "p4"],
	            "text": ["t1", "t2", "t3", "t4"],
	            # Timestamp.utcnow() is deprecated since pandas 2.2 and warns;
	            # now(tz="UTC") is the documented tz-aware replacement.
	            "retrieved_at": pd.Timestamp.now(tz="UTC"),
	        }
	    )
	    mock_file_manager.read_parquet.return_value = sample
	    # first call → download scored file, second call (within
	    # _save_and_push_summary) unused here
	    mock_hf_manager.download_file.return_value = Path("dummy.parquet")

	    # Stub the remote-summary loader so no Hub access is attempted.
	    with patch.object(
	        SummaryManager, "_load_remote_summary", return_value=pd.DataFrame()
	    ):
	        mgr = SummaryManager(
	            mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
	        )
	        mgr.process_date("2025-04-20")

	    # assertions
	    mock_file_manager.read_parquet.assert_called_once()
	    mock_file_manager.write_csv.assert_called_once()
	    mock_hf_manager.upload_file.assert_called_once()


	def test_get_processed_combinations(mock_config, mock_file_manager, mock_hf_manager):
	    """``get_processed_combinations`` yields a set of ``(date, subreddit)`` pairs.

	    The remote summary is stubbed with two rows for 2025-04-19, one per
	    subreddit, and the helper's result is compared against the matching
	    tuples.
	    """
	    remote_rows = pd.DataFrame(
	        {
	            "date": ["2025-04-19", "2025-04-19"],
	            "subreddit": ["a", "b"],
	            "mean_sentiment": [0.5, 0.3],
	            "weighted_sentiment": [0.4, 0.2],
	            "count": [1, 1],
	        }
	    )
	    summary_stub = patch.object(
	        SummaryManager, "_load_remote_summary", return_value=remote_rows
	    )

	    with summary_stub:
	        manager = SummaryManager(
	            mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
	        )
	        processed = manager.get_processed_combinations()

	    expected = {(date(2025, 4, 19), name) for name in ("a", "b")}
	    assert processed == expected


	def test_cli_invalid_date():
	    """``main`` must raise ``ValueError`` when given an unparseable date."""
	    from reddit_analysis.summarizer.summarize import main

	    malformed = "bad‑date‑format"
	    with pytest.raises(ValueError):
	        main(malformed)


	def test_cli_missing_scored_file(mock_config, mock_file_manager, mock_hf_manager):
	    """A failed download of the scored shard must not propagate.

	    The Hub download is made to raise while the remote summary still loads
	    (as an empty frame); ``process_date`` is then expected to return ``None``
	    instead of raising.
	    """
	    mock_hf_manager.download_file.side_effect = Exception("not found")
	    summary_stub = patch.object(
	        SummaryManager, "_load_remote_summary", return_value=pd.DataFrame()
	    )

	    with summary_stub:
	        manager = SummaryManager(
	            mock_config, file_manager=mock_file_manager, hf_manager=mock_hf_manager
	        )
	        # Should simply return after printing error, not raise.
	        outcome = manager.process_date("2025-04-20")
	        assert outcome is None