import json
import os
import re
import uuid
from typing import Dict, Generator, List, Optional, Tuple

import gradio as gr
import openai
import requests
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
from pypdf import PdfReader

# Configuration
OPENAI_API_BASE = "https://openrouter.ai/api/v1"
OPENAI_API_KEY = os.environ.get("OR_KEY", "")
REPO_NAME = "LPX55/ArxivPapers"
SAVE_DATA_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"
HF_TOKEN = os.environ.get("HF_TOKEN", "")
api = HfApi(token=HF_TOKEN)

# Initialize the OpenAI client (legacy pre-1.0 `openai` SDK interface,
# pointed at OpenRouter's OpenAI-compatible endpoint)
openai.api_base = OPENAI_API_BASE
openai.api_key = OPENAI_API_KEY
VERBOSE = True  # Set to False to disable debug logging
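

def log(msg: str) -> None:
    """Print a debug message when VERBOSE is enabled; errors that must
    always be visible use plain print()."""
    if VERBOSE:
        print(msg)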


# Indexing Constants
INDEX_PROMPT = """Compile this data into a structured JSON format with these keys:
- "keywords": List of important keywords
- "title": Descriptive title
- "description": Brief summary
- "content": Main content
- "url": Source URL if available
"""

def extract_paper_metadata(content: str) -> Dict:
    """Extract structured metadata from a paper's content."""
    metadata = {
        "keywords": [],
        "title": "Untitled",
        "description": "No description",
        "content": content[:1000],
        "url": ""
    }
    
    # Extract URL
    url_match = re.search(r'https?://[^\s]+', content)
    if url_match:
        metadata['url'] = url_match.group(0)
    
    # Extract title: first reasonably long line that starts uppercase and
    # ends like a sentence (lines are stripped so trailing whitespace does
    # not defeat the end-of-sentence check)
    for line in (ln.strip() for ln in content.split('\n')):
        if len(line) > 20 and line[0].isupper() and line[-1] in ('.', '?', '!'):
            metadata['title'] = line
            break
    
    # Extract description (first paragraph)
    paragraphs = [p for p in content.split('\n\n') if len(p) > 50]
    if paragraphs:
        metadata['description'] = paragraphs[0]
    
    # Extract keywords (from title and description)
    text_for_keywords = f"{metadata['title']} {metadata['description']}"
    words = [w.lower() for w in re.findall(r'\w+', text_for_keywords) if len(w) > 3]
    metadata['keywords'] = sorted(set(words))[:10]  # First 10 unique keywords, alphabetically
    
    return metadata
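
# A minimal illustration of the heuristics above (hypothetical input):
#   meta = extract_paper_metadata(
#       "Attention Is All You Need.\n\n"
#       "We propose a new architecture: https://arxiv.org/abs/1706.03762"
#   )
#   meta["title"] == "Attention Is All You Need."
#   meta["url"]   == "https://arxiv.org/abs/1706.03762"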


def save_paper_to_memory(content: str) -> Dict:
    """Save a paper to memory with proper metadata extraction."""
    metadata = extract_paper_metadata(content)
    
    # Additional processing for academic papers
    if 'arxiv' in metadata['url'].lower():
        metadata['keywords'].extend(['arxiv', 'paper', 'research'])
        metadata['description'] = f"Academic paper: {metadata['description']}"
    
    return metadata
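
# Note: save_paper_to_memory() layers arXiv-specific tagging on top of
# extract_paper_metadata(); summarize() below goes through save_memory(),
# which skips that extra step.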

def create_index() -> None:
    """Create or update the search index from memory files."""
    uid = uuid.uuid4()
    
    # Load existing index
    index_url = f"{SAVE_DATA_URL}mem-test2/index.json"
    r = requests.get(index_url, timeout=30)
    index_data = json.loads(r.text) if r.status_code == 200 else [{}]
    
    # Load main memory data
    main_url = f"{SAVE_DATA_URL}mem-test2/main.json"
    m = requests.get(main_url, timeout=30)
    main_data = json.loads(m.text) if m.status_code == 200 else []
    
    # Update index: map each keyword to the list of files that mention it
    for entry in main_data:
        try:
            for keyword in entry.get('keywords', []):
                if keyword in index_data[0]:
                    if entry['file_name'] not in index_data[0][keyword]:
                        index_data[0][keyword].append(entry['file_name'])
                else:
                    index_data[0][keyword] = [entry['file_name']]
        except Exception as e:
            print(f"Indexing error: {e}")
    log(f"Indexed {len(main_data)} memory entries")
    
    # Save the updated index and clean up the temporary file
    index_path = f"tmp-index-{uid}.json"
    with open(index_path, "w") as f:
        json.dump(index_data, f)
    
    api.upload_file(
        path_or_fileobj=index_path,
        path_in_repo="mem-test2/index.json",
        repo_id=REPO_NAME,
        repo_type="dataset",
    )
    os.remove(index_path)

 
def fetch_url_content(url: str) -> Tuple[bool, str]:
    """Fetch content from a URL and return status and content."""
    try:
        if not url:
            return False, "Enter valid URL"
            
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "lxml")
            return True, str(soup)
        return False, f"Status: {response.status_code}"
    except Exception as e:
        return False, f"Error: {e}"

def read_file_content(file_path: str) -> str:
    """Read content from a file (txt or pdf)."""
    if file_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        # extract_text() can return None for image-only pages
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    elif file_path.endswith(".txt"):
        with open(file_path, "r") as f:
            return f.read()
    return ""

def generate_response(prompt: str, model: str = "meta-llama/llama-4-maverick:free") -> str:
    """Generate response using OpenRouter API."""
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"


def process_pdf_url(pdf_url: str) -> str:
    """Process PDF from URL and extract text."""
    try:
        response = requests.get(pdf_url, stream=True, timeout=60)
        if response.status_code == 200:
            temp_path = f"temp_{uuid.uuid4()}.pdf"
            with open(temp_path, "wb") as f:
                f.write(response.content)
            try:
                return read_file_content(temp_path)
            finally:
                os.remove(temp_path)  # Clean up the temporary download
        return f"Error: Status {response.status_code}"
    except Exception as e:
        return f"Error: {e}"


def save_memory(purpose: str, content: str) -> List[Dict]:
    """Save processed content to memory with proper metadata extraction."""
    metadata = extract_paper_metadata(content)
    return [metadata]
   
def summarize(
    inp: str,
    history: List[Tuple[str, str]],
    report_check: bool,
    sum_check: bool,
    mem_check: bool,
    data: str = "",
    file: Optional[str] = None,
    url: str = "",
    pdf_url: str = "",
    model: str = "meta-llama/llama-4-maverick:free"
) -> Generator[Tuple[str, List[Tuple[str, str]], str, Dict], None, None]:
    """Main summarization function with memory support."""
    history = [(inp, "Processing...")]
    yield "", history, "", {}

    processed_data = ""
    if pdf_url.startswith("http"):
        processed_data += f"PDF URL: {pdf_url}\n"
    if url.startswith("http"):
        processed_data += f"URL: {url}\n"
    if file:
        processed_data += f"File: {file}\n"
    if data:
        processed_data += f"Data: {data[:1000]}\n"

    summary = f"Summary for: {inp[:100]}\n{processed_data[:500]}"
    memory_entries = []

    if mem_check:
        memory_entries = save_memory(inp, processed_data)
        if memory_entries:
            summary += "\n\nSaved to memory"
        else:
            summary += "\n\nMemory save failed"

    yield summary, history, "", memory_entries[0] if memory_entries else {}
    
def create_app():
    with gr.Blocks() as app:
        gr.Markdown("## Llama 4 Maverick Summarizer")
        
        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(label="Instruction")
            with gr.Column(scale=1):
                report_check = gr.Checkbox(label="Return report", value=True)
                sum_check = gr.Checkbox(label="Summarize", value=True)
                mem_check = gr.Checkbox(label="Memory", value=True)
                submit_btn = gr.Button("Submit")
        
        with gr.Row():
            with gr.Tab("Text"):
                data = gr.Textbox(label="Input text")
            with gr.Tab("File"):
                file = gr.File(label="Upload file")
            with gr.Tab("URL"):
                url = gr.Textbox(label="Website URL")
            with gr.Tab("PDF"):
                pdf_url = gr.Textbox(label="PDF URL")
        
        chatbot = gr.Chatbot()
        error_box = gr.Textbox(label="Errors")
        json_output = gr.JSON(label="Memory entry")

        submit_btn.click(
            summarize,
            [prompt, chatbot, report_check, sum_check, mem_check, data, file, url, pdf_url],
            [prompt, chatbot, error_box, json_output]
        )

    return app

if __name__ == "__main__":
    app = create_app()
    # summarize() is a generator, so enable the queue for streaming output
    app.queue().launch()