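"""Download and prepare the fake-news training data.

Fetches the Kaggle "fake-and-real-news" dataset and the LIAR dataset,
converts both to binary-labeled CSVs (0 = real, 1 = fake), and writes
kaggle_processed.csv, liar_processed.csv, and combined_dataset.csv to
data/processed/. The Kaggle step needs API credentials in
~/.kaggle/kaggle.json (or KAGGLE_USERNAME/KAGGLE_KEY).
"""
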
import logging
import os
import zipfile
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DatasetDownloader:
    def __init__(self):
        # Resolve the project root, assuming this file sits two directories
        # below it (e.g. <root>/src/data/download_datasets.py).
        self.project_root = Path(__file__).parent.parent.parent
        self.raw_data_dir = self.project_root / "data" / "raw"
        self.processed_data_dir = self.project_root / "data" / "processed"

        # Create directories if they don't exist
        os.makedirs(self.raw_data_dir, exist_ok=True)
        os.makedirs(self.processed_data_dir, exist_ok=True)

    def download_kaggle_dataset(self):
        """Download the fake-and-real-news dataset from Kaggle."""
        logger.info("Downloading dataset from Kaggle...")

        # Kaggle dataset ID
        dataset_id = "clmentbisaillon/fake-and-real-news-dataset"

        try:
            # Imported lazily: the kaggle package authenticates on import and
            # raises if no credentials are configured, so importing here keeps
            # that failure catchable instead of crashing at module load.
            import kaggle

            kaggle.api.dataset_download_files(
                dataset_id,
                path=self.raw_data_dir,
                unzip=True,
            )
            logger.info("Successfully downloaded dataset from Kaggle")
        except Exception as e:
            logger.error(f"Error downloading from Kaggle: {e}")
            logger.info(
                "Please download the dataset manually from: "
                "https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset"
            )

    def download_liar(self):
        """Download the LIAR dataset."""
        logger.info("Downloading LIAR dataset...")

        # URL for the LIAR dataset
        url = "https://www.cs.ucsb.edu/~william/data/liar_dataset.zip"
        output_path = self.raw_data_dir / "liar_dataset.zip"

        if not output_path.exists():
            try:
                response = requests.get(url, stream=True, timeout=30)
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))

                with open(output_path, 'wb') as f, tqdm(
                    desc="Downloading LIAR dataset",
                    total=total_size,
                    unit='iB',
                    unit_scale=True,
                ) as pbar:
                    for data in response.iter_content(chunk_size=1024):
                        size = f.write(data)
                        pbar.update(size)

                # Extract the zip file (it ships train.tsv, valid.tsv, test.tsv)
                with zipfile.ZipFile(output_path, 'r') as zip_ref:
                    zip_ref.extractall(self.raw_data_dir / "liar")
            except Exception as e:
                # Remove a partial download so the next run retries it
                output_path.unlink(missing_ok=True)
                logger.error(f"Error downloading LIAR dataset: {e}")
                logger.info(f"Please download the LIAR dataset manually from: {url}")

    def process_kaggle_dataset(self):
        """Process the Kaggle dataset."""
        logger.info("Processing Kaggle dataset...")

        fake_path = self.raw_data_dir / "Fake.csv"
        true_path = self.raw_data_dir / "True.csv"
        if not fake_path.exists() or not true_path.exists():
            logger.error("Kaggle dataset not found!")
            return

        # Read fake and real news files
        fake_df = pd.read_csv(fake_path)
        true_df = pd.read_csv(true_path)

        # Add labels
        fake_df['label'] = 1  # 1 for fake
        true_df['label'] = 0  # 0 for real

        # Combine datasets
        combined_df = pd.concat([fake_df, true_df], ignore_index=True)

        # Save processed data
        combined_df.to_csv(self.processed_data_dir / "kaggle_processed.csv", index=False)
        logger.info(f"Saved {len(combined_df)} articles from Kaggle dataset")

    def process_liar(self):
        """Process the LIAR dataset (train split only)."""
        logger.info("Processing LIAR dataset...")

        liar_file = self.raw_data_dir / "liar" / "train.tsv"
        if not liar_file.exists():
            logger.error("LIAR dataset not found!")
            return

        # Read the headerless TSV file
        df = pd.read_csv(liar_file, sep='\t', header=None)

        # Name the 14 LIAR columns
        df.columns = [
            'id', 'label', 'statement', 'subject', 'speaker',
            'job_title', 'state_info', 'party_affiliation',
            'barely_true', 'false', 'half_true', 'mostly_true',
            'pants_on_fire', 'venue'
        ]

        # Collapse the six-way truthfulness ratings into binary labels:
        # half-true and better count as real (0), barely-true and worse as fake (1)
        label_map = {
            'true': 0,
            'mostly-true': 0,
            'half-true': 0,
            'barely-true': 1,
            'false': 1,
            'pants-fire': 1
        }
        df['label'] = df['label'].map(label_map)

        # Keep the relevant columns and align names with the Kaggle output
        df = df[['statement', 'label', 'subject', 'speaker', 'party_affiliation']]
        df.columns = ['text', 'label', 'subject', 'speaker', 'party']

        # Save processed data
        df.to_csv(self.processed_data_dir / "liar_processed.csv", index=False)
        logger.info(f"Saved {len(df)} statements from LIAR dataset")

    def combine_datasets(self):
        """Combine the processed datasets."""
        logger.info("Combining datasets...")

        # Read processed datasets
        kaggle_df = pd.read_csv(self.processed_data_dir / "kaggle_processed.csv")
        liar_df = pd.read_csv(self.processed_data_dir / "liar_processed.csv")

        # Combine datasets, keeping only the shared columns
        combined_df = pd.concat([
            kaggle_df[['text', 'label']],
            liar_df[['text', 'label']]
        ], ignore_index=True)

        # Save combined dataset
        combined_df.to_csv(self.processed_data_dir / "combined_dataset.csv", index=False)
        logger.info(f"Combined dataset contains {len(combined_df)} texts")


def main():
    downloader = DatasetDownloader()

    # Download datasets
    downloader.download_kaggle_dataset()
    downloader.download_liar()

    # Process datasets
    downloader.process_kaggle_dataset()
    downloader.process_liar()

    # Combine datasets
    downloader.combine_datasets()
    logger.info("Dataset preparation completed!")


if __name__ == "__main__":
    main()
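
# Example invocation from the project root (path shown is illustrative,
# assuming this file lives at src/data/download_datasets.py):
#   python src/data/download_datasets.py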