Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3 | |
""" | |
Main pipeline script for the Iain Morris Article Generator | |
Orchestrates the complete workflow from data collection to model deployment | |
""" | |
import argparse | |
import sys | |
import os | |
import logging | |
from pathlib import Path | |
# Add src to path | |
sys.path.append(str(Path(__file__).parent / "src")) | |
from scraper import LightReadingScraper | |
from preprocess import ArticlePreprocessor | |
from finetune import IainMorrisFineTuner | |
from utils import setup_logging, ensure_directories, print_project_status, check_system_requirements | |
def run_data_collection(max_articles: int = 200):
    """
    Step 1: Collect articles from Light Reading

    Asks ``LightReadingScraper`` for articles by "Iain Morris" and, when any
    are returned, hands them to ``scraper.save_articles`` with the path
    ``data/raw_articles.json``.

    Any exception raised while scraping or saving is reported and turned
    into a ``False`` result (the same convention ``run_training`` uses), so
    a scraper error stops the pipeline cleanly instead of surfacing as an
    unhandled traceback.

    Args:
        max_articles: Maximum number of articles to scrape

    Returns:
        True if at least one article was returned and saved, False otherwise.
    """
    print("\nπ STEP 1: Data Collection")
    print("="*50)

    try:
        scraper = LightReadingScraper(delay=2.0)

        print("Scraping articles by Iain Morris from Light Reading...")
        print(f"Target: {max_articles} articles")
        print("This may take 30-60 minutes depending on network speed...")

        articles = scraper.scrape_author_articles("Iain Morris", max_articles=max_articles)
        if articles:
            scraper.save_articles(articles, "data/raw_articles.json")
    except Exception as e:
        # Keep the full traceback available to anyone running with DEBUG
        # logging, while the console gets the short message.
        logging.getLogger(__name__).debug("Data collection raised", exc_info=True)
        print(f"β Failed to collect articles: {e}")
        return False

    if not articles:
        print("β Failed to collect articles")
        return False

    print(f"β Successfully collected {len(articles)} articles")

    # Show sample titles
    print("\nSample article titles:")
    for i, article in enumerate(articles[:3], start=1):
        print(f" {i}. {article['title']}")

    return True
def run_preprocessing():
    """
    Step 2: Preprocess articles for training

    Requires ``data/raw_articles.json`` to exist, then runs
    ``ArticlePreprocessor().process_articles`` on it.

    Any exception raised by the preprocessor is reported and turned into a
    ``False`` result (the same convention ``run_training`` uses) instead of
    propagating as an unhandled traceback.

    Returns:
        True if ``data/train_dataset.json`` exists after processing.
        False if the raw articles are missing, the preprocessor raised, or
        the training dataset is absent afterwards.
    """
    print("\nπ STEP 2: Data Preprocessing")
    print("="*50)

    if not os.path.exists("data/raw_articles.json"):
        print("β Raw articles not found. Run data collection first.")
        return False

    try:
        preprocessor = ArticlePreprocessor()

        print("Processing articles for training...")
        print("- Cleaning content")
        print("- Creating instruction-response pairs")
        print("- Splitting into train/validation sets")

        preprocessor.process_articles("data/raw_articles.json")
    except Exception as e:
        # Keep the full traceback available to anyone running with DEBUG
        # logging, while the console gets the short message.
        logging.getLogger(__name__).debug("Preprocessing raised", exc_info=True)
        print(f"β Preprocessing failed: {e}")
        return False

    # NOTE(review): this only checks that the file exists, so a dataset left
    # over from an earlier run also counts as success.
    if os.path.exists("data/train_dataset.json"):
        print("β Preprocessing completed successfully")
        return True

    print("β Preprocessing failed")
    return False
def run_training():
    """
    Step 3: Fine-tune the model

    Needs ``data/train_dataset.json`` to be present. Prints a summary of the
    values returned by ``check_system_requirements()`` and then runs
    ``IainMorrisFineTuner().run_full_pipeline()``.

    Returns:
        True when the fine-tuning pipeline finishes without raising.
        False when the training data is missing or an exception is raised.
    """
    print("\nπ€ STEP 3: Model Fine-tuning")
    print("="*50)

    training_data_present = os.path.exists("data/train_dataset.json")
    if not training_data_present:
        print("β Training data not found. Run preprocessing first.")
        return False

    # Report what the environment offers before starting the long-running job.
    report = check_system_requirements()
    print("System Requirements Check:")

    torch_mark = 'β ' if report['torch_available'] else 'β'
    print(f" PyTorch: {torch_mark}")

    has_cuda = report['cuda_available']
    cuda_mark = 'β ' if has_cuda else 'β'
    print(f" CUDA: {cuda_mark}")

    if not has_cuda:
        print("β οΈ Warning: No CUDA available. Training will be very slow on CPU.")
    else:
        gpu_gb = report['gpu_memory']
        print(f" GPU Memory: {gpu_gb:.1f} GB")
        if gpu_gb < 8:
            print("β οΈ Warning: Low GPU memory. Training may be slow or fail.")

    for notice in (
        "\nStarting fine-tuning...",
        "This will take several hours depending on your hardware.",
        "You can monitor progress in the terminal output.",
    ):
        print(notice)

    # Any failure inside the fine-tuner is reported and mapped to False.
    try:
        IainMorrisFineTuner().run_full_pipeline()
        print("β Fine-tuning completed successfully")
    except Exception as err:
        print(f"β Fine-tuning failed: {err}")
        return False
    return True
def run_app():
    """
    Step 4: Launch the Gradio app

    Runs ``app.py`` (a path relative to the current working directory) with
    the current Python interpreter and blocks until that process exits.

    A non-zero exit status of the child process is reported. A missing or
    crashing ``app.py`` does not raise in this process, so without the check
    such a failure would pass silently.

    Returns:
        None. Failures are reported on stdout only.
    """
    print("\nπ STEP 4: Launching Gradio App")
    print("="*50)

    print("Starting the Iain Morris Article Generator app...")
    print("The app will be available at: http://localhost:7860")
    print("Press Ctrl+C to stop the app")

    try:
        # Import and run the app
        import subprocess
        result = subprocess.run([sys.executable, "app.py"])
    except KeyboardInterrupt:
        print("\nπ App stopped by user")
    except Exception as e:
        print(f"β Failed to launch app: {e}")
    else:
        # subprocess.run() without check=True never raises on a failing
        # child, so the exit status has to be inspected explicitly.
        if result.returncode != 0:
            print(f"β App exited with an error (exit code {result.returncode})")
def main():
    """
    Main function with command line interface

    Parses the command line, sets up logging and directories, and runs the
    requested pipeline steps in order: collect, preprocess, train, app.
    A step is skipped once an earlier requested step has failed.

    Returns:
        Process exit code: 0 when every requested step succeeded (or only
        ``--status`` was requested), 1 when a step reported failure. The
        ``__main__`` guard passes this to ``sys.exit`` so shell callers can
        detect a failed run.
    """
    parser = argparse.ArgumentParser(
        description="Iain Morris Article Generator Pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "Examples:\n"
            "  python run_pipeline.py --all                          # Run complete pipeline\n"
            "  python run_pipeline.py --collect --max-articles 100   # Collect 100 articles\n"
            "  python run_pipeline.py --preprocess                   # Only preprocess data\n"
            "  python run_pipeline.py --train                        # Only train model\n"
            "  python run_pipeline.py --app                          # Only launch app\n"
            "  python run_pipeline.py --status                       # Show project status\n"
        ),
    )

    parser.add_argument("--all", action="store_true",
                        help="Run the complete pipeline")
    parser.add_argument("--collect", action="store_true",
                        help="Collect articles from Light Reading")
    parser.add_argument("--preprocess", action="store_true",
                        help="Preprocess articles for training")
    parser.add_argument("--train", action="store_true",
                        help="Fine-tune the model")
    parser.add_argument("--app", action="store_true",
                        help="Launch the Gradio app")
    parser.add_argument("--status", action="store_true",
                        help="Show project status")
    parser.add_argument("--max-articles", type=int, default=200,
                        help="Maximum number of articles to collect (default: 200)")
    parser.add_argument("--log-level", default="INFO",
                        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        help="Logging level (default: INFO)")

    args = parser.parse_args()

    # Setup logging and directories
    setup_logging(args.log_level)
    ensure_directories()

    # Show header
    print("\n" + "="*60)
    print("ποΈ IAIN MORRIS ARTICLE GENERATOR")
    print(" AI-Powered Telecom Journalism in Iain's Style")
    print("="*60)

    # Show status if requested
    if args.status:
        print_project_status()
        return 0

    # Run pipeline steps
    success = True

    if args.all or args.collect:
        success = run_data_collection(args.max_articles)
        if not success and args.all:
            print("β Pipeline stopped due to data collection failure")
            return 1

    if (args.all or args.preprocess) and success:
        success = run_preprocessing()
        if not success and args.all:
            print("β Pipeline stopped due to preprocessing failure")
            return 1

    if (args.all or args.train) and success:
        success = run_training()
        if not success and args.all:
            print("β Pipeline stopped due to training failure")
            return 1

    if (args.all or args.app) and success:
        run_app()

    # Final status. Under --all every failing step has already returned
    # above, so reaching this point with --all means all steps succeeded.
    if args.all:
        print("\nπ Pipeline completed successfully!")
        print("Your Iain Morris style article generator is ready!")

    # Show final project status
    print_project_status()

    return 0 if success else 1


if __name__ == "__main__":
    sys.exit(main())