# file-indexing / app.py
# Author: LPX55 — "Update app.py" (commit 79e3e7f, verified)
import json
import os
import random
import uuid
import datetime
import re
from typing import List, Tuple, Dict, Optional, Generator, Any
from agent import (
PREFIX,
COMPRESS_DATA_PROMPT_SMALL,
COMPRESS_DATA_PROMPT,
LOG_PROMPT,
LOG_RESPONSE
)
import gradio as gr
import requests
from bs4 import BeautifulSoup
from pypdf import PdfReader
import openai
from huggingface_hub import HfApi
# Configuration
OPENAI_API_BASE = "https://openrouter.ai/api/v1"  # OpenRouter's OpenAI-compatible endpoint
OPENAI_API_KEY = os.environ.get("OR_KEY", "")  # OpenRouter API key; empty string when unset
REPO_NAME = "LPX55/ArxivPapers"  # HF dataset repo used as persistent memory store
SAVE_DATA_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"  # raw-file base URL for reads
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # HF write token; empty string when unset
api = HfApi(token=HF_TOKEN)  # shared Hub client used for dataset uploads
# Initialize OpenAI client
# NOTE(review): this is the legacy (openai<1.0) module-level configuration style —
# confirm the pinned openai version supports `api_base`/`ChatCompletion`.
openai.api_base = OPENAI_API_BASE
openai.api_key = OPENAI_API_KEY
VERBOSE = True  # Set to False to disable debug logging
# Indexing Constants
INDEX_PROMPT = """Compile this data into a structured JSON format with these keys:
- "keywords": List of important keywords
- "title": Descriptive title
- "description": Brief summary
- "content": Main content
- "url": Source URL if available
"""
def extract_paper_metadata(content: str) -> Dict:
    """Derive a metadata record (keywords/title/description/content/url) from raw paper text."""
    result = {
        "keywords": [],
        "title": "Untitled",
        "description": "No description",
        "content": content[:1000],
        "url": "",
    }

    # URL: first http(s) link found anywhere in the text.
    link = re.search(r'https?://[^\s]+', content)
    if link is not None:
        result['url'] = link.group(0)

    # Title: first line that is reasonably long, capitalized,
    # and terminated like a sentence.
    candidates = (
        ln for ln in content.split('\n')
        if len(ln) > 20 and ln[0].isupper() and ln[-1] in ('.', '?', '!')
    )
    result['title'] = next(candidates, result['title'])

    # Description: first blank-line-separated chunk longer than 50 chars.
    for chunk in content.split('\n\n'):
        if len(chunk) > 50:
            result['description'] = chunk
            break

    # Keywords: unique words (>3 chars) drawn from title + description,
    # alphabetically ordered and capped at 10.
    keyword_source = f"{result['title']} {result['description']}"
    unique_words = {w.lower() for w in re.findall(r'\w+', keyword_source) if len(w) > 3}
    result['keywords'] = sorted(unique_words)[:10]
    return result
def save_paper_to_memory(content: str) -> Dict:
    """Extract metadata from paper text, enriching entries that come from arXiv."""
    record = extract_paper_metadata(content)
    # arXiv-hosted papers get extra tags and a prefixed description
    # so they are easier to find in the index later.
    if 'arxiv' in record['url'].lower():
        record['keywords'] += ['arxiv', 'paper', 'research']
        record['description'] = f"Academic paper: {record['description']}"
    return record
def create_index() -> None:
    """Create or update the keyword -> file-name search index in the dataset repo.

    Downloads the existing index and main memory file, merges every memory
    entry's keywords into the index, then uploads the updated index back to
    the Hugging Face dataset repo. Best-effort: network/decode failures fall
    back to empty defaults and indexing errors are logged, not raised.
    """
    uid = uuid.uuid4()

    def _fetch_json(url: str, fallback):
        # Best-effort JSON fetch: any network or decode problem yields the fallback.
        try:
            resp = requests.get(url, timeout=30)
            if resp.status_code == 200:
                return resp.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Fetch error for {url}: {e}")
        return fallback

    index_data = _fetch_json(f"{SAVE_DATA_URL}mem-test2/index.json", [{}])
    # Guard against a malformed remote index (e.g. an empty or non-list payload).
    if not isinstance(index_data, list) or not index_data:
        index_data = [{}]
    main_data = _fetch_json(f"{SAVE_DATA_URL}mem-test2/main.json", [])

    # Merge each entry's keywords into the keyword -> [file_name] map.
    for entry in main_data:
        try:
            for keyword in entry.get('keywords', []):
                files = index_data[0].setdefault(keyword, [])
                if entry['file_name'] not in files:
                    files.append(entry['file_name'])
        except Exception as e:
            print(f"Indexing error: {e}")

    # Save updated index to a scratch file, upload it, then clean up.
    index_path = f"tmp-index-{uid}.json"
    try:
        with open(index_path, "w") as f:
            json.dump(index_data, f)
        api.upload_file(
            path_or_fileobj=index_path,
            # Hub paths are repo-relative; the previous leading "/" was wrong.
            path_in_repo="mem-test2/index.json",
            repo_id=REPO_NAME,
            repo_type="dataset",
        )
    finally:
        # Don't leave temp index files accumulating in the working directory.
        if os.path.exists(index_path):
            os.remove(index_path)
def fetch_url_content(url: str) -> Tuple[bool, str]:
    """Fetch a web page and return (success, content-or-error-message).

    Returns:
        (True, html) on HTTP 200, where html is the lxml-parsed document
        re-serialized by BeautifulSoup; (False, reason) on empty URL,
        non-200 status, or any exception.
    """
    if not url:
        return False, "Enter valid URL"
    try:
        # Timeout prevents the UI from hanging forever on a dead host.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "lxml")
            return True, str(soup)
        return False, f"Status: {response.status_code}"
    except Exception as e:
        return False, f"Error: {e}"
def read_file_content(file_path: str) -> str:
    """Read text from a .txt or .pdf file; unknown extensions yield "".

    Extension matching is case-insensitive. PDF pages whose extraction fails
    (pypdf's extract_text() returns None for image-only pages) contribute an
    empty string instead of crashing the join with a TypeError.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == ".pdf":
        reader = PdfReader(file_path)
        # extract_text() may return None; coerce to "" so join() can't fail.
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    if suffix == ".txt":
        # Explicit encoding so behavior doesn't depend on the platform locale.
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    return ""
def generate_response(prompt: str, model: str = "meta-llama/llama-4-maverick:free") -> str:
    """Send a single-turn chat prompt through the OpenRouter API; return the reply or an error string."""
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        # Legacy (openai<1.0) completion call against the OpenRouter endpoint.
        completion = openai.ChatCompletion.create(
            model=model,
            messages=chat_messages,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"
def process_pdf_url(pdf_url: str) -> str:
    """Download a PDF from a URL and return its extracted text (or an error string).

    The PDF is written to a uniquely-named scratch file which is always
    removed afterwards — the original leaked one temp file per call.
    """
    try:
        # stream=True was pointless since .content reads the whole body anyway;
        # a timeout keeps a dead host from hanging the request forever.
        response = requests.get(pdf_url, timeout=60)
        if response.status_code != 200:
            return f"Error: Status {response.status_code}"
        temp_path = f"temp_{uuid.uuid4()}.pdf"
        try:
            with open(temp_path, "wb") as f:
                f.write(response.content)
            return read_file_content(temp_path)
        finally:
            # Clean up the scratch file so downloads don't accumulate on disk.
            if os.path.exists(temp_path):
                os.remove(temp_path)
    except Exception as e:
        return f"Error: {e}"
def save_memory(purpose: str, content: str) -> List[Dict]:
    """Wrap extracted paper metadata in a single-element list for memory storage.

    `purpose` is currently unused; it is kept for caller compatibility.
    """
    return [extract_paper_metadata(content)]
def summarize(
    inp: str,
    history: List[Tuple[str, str]],
    report_check: bool,
    sum_check: bool,
    mem_check: bool,
    data: str = "",
    file: Optional[str] = None,
    url: str = "",
    pdf_url: str = "",
    model: str = "meta-llama/llama-4-maverick:free"
) -> Generator[Tuple[str, List[Tuple[str, str]], str, Dict], None, None]:
    """Summarize the collected inputs, optionally saving extracted metadata to memory.

    Yields twice: first ("", history, "", {}) with a "Processing..." placeholder
    appended to the chat history, then (summary, history, "", memory_entry).
    `report_check`, `sum_check`, and `model` are accepted for UI compatibility
    but not currently used in the summary path.
    """
    # Append to the existing conversation instead of discarding it
    # (the previous code clobbered the whole chat history on every submit).
    history = list(history or []) + [(inp, "Processing...")]
    yield "", history, "", {}

    # Gradio can hand us None for empty textboxes; normalize so
    # .startswith() and slicing below can't raise AttributeError.
    url = url or ""
    pdf_url = pdf_url or ""
    data = data or ""

    processed_data = ""
    if pdf_url.startswith("http"):
        processed_data += f"PDF URL: {pdf_url}\n"
    if url.startswith("http"):
        processed_data += f"URL: {url}\n"
    if file:
        processed_data += f"File: {file}\n"
    if data:
        processed_data += f"Data: {data[:1000]}\n"

    summary = f"Summary for: {inp[:100]}\n{processed_data[:500]}"

    memory_entries = []
    if mem_check:
        memory_entries = save_memory(inp, processed_data)
        summary += "\n\nSaved to memory" if memory_entries else "\n\nMemory save failed"

    yield summary, history, "", memory_entries[0] if memory_entries else {}
def create_app():
    """Build the Gradio UI and wire the submit button to summarize()."""
    with gr.Blocks() as app:
        # Heading fixed: the configured default model is Llama 4 Maverick
        # served via OpenRouter — not Mixtral 8x7B as the stale title claimed.
        gr.Markdown("## LLM Summarizer")
        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(label="Instruction")
            with gr.Column(scale=1):
                report_check = gr.Checkbox(label="Return report", value=True)
                sum_check = gr.Checkbox(label="Summarize", value=True)
                mem_check = gr.Checkbox(label="Memory", value=True)
                submit_btn = gr.Button("Submit")
        # Input sources: plain text, uploaded file, web page URL, or PDF URL.
        with gr.Row():
            with gr.Tab("Text"):
                data = gr.Textbox(label="Input text")
            with gr.Tab("File"):
                file = gr.File(label="Upload file")
            with gr.Tab("URL"):
                url = gr.Textbox(label="Website URL")
            with gr.Tab("PDF"):
                pdf_url = gr.Textbox(label="PDF URL")
        chatbot = gr.Chatbot()
        error_box = gr.Textbox()
        json_output = gr.JSON()
        # summarize is a generator: first yield shows the "Processing..."
        # placeholder, the second delivers the summary and memory JSON.
        submit_btn.click(
            summarize,
            [prompt, chatbot, report_check, sum_check, mem_check, data, file, url, pdf_url],
            [prompt, chatbot, error_box, json_output]
        )
    return app
if __name__ == "__main__":
    # Launch the Gradio UI when run directly as a script.
    app = create_app()
    app.launch()