# file-indexing / app.py
# Author: LPX55 — "Update app.py" (commit 79e3e7f, verified)
import json
import os
import random
import uuid
import datetime
import re
from typing import List, Tuple, Dict, Optional, Generator, Any
from agent import (
PREFIX,
COMPRESS_DATA_PROMPT_SMALL,
COMPRESS_DATA_PROMPT,
LOG_PROMPT,
LOG_RESPONSE
)
import gradio as gr
import requests
from bs4 import BeautifulSoup
from pypdf import PdfReader
import openai
from huggingface_hub import HfApi
# Configuration
OPENAI_API_BASE = "https://openrouter.ai/api/v1"  # OpenRouter's OpenAI-compatible endpoint
OPENAI_API_KEY = os.environ.get("OR_KEY", "")  # OpenRouter API key; empty string when unset
REPO_NAME = "LPX55/ArxivPapers"  # HF dataset repo used as persistent memory store
SAVE_DATA_URL = f"https://huggingface.co/datasets/{REPO_NAME}/raw/main/"  # raw-file base URL for reads
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # HF write token; empty string when unset
api = HfApi(token=HF_TOKEN)  # shared Hub client used for dataset uploads
# Initialize OpenAI client
# NOTE(review): this is the legacy (openai<1.0) module-level configuration style —
# confirm the pinned openai version supports `api_base`/`ChatCompletion`.
openai.api_base = OPENAI_API_BASE
openai.api_key = OPENAI_API_KEY
VERBOSE = True  # Set to False to disable debug logging
# Indexing Constants
INDEX_PROMPT = """Compile this data into a structured JSON format with these keys:
- "keywords": List of important keywords
- "title": Descriptive title
- "description": Brief summary
- "content": Main content
- "url": Source URL if available
"""
def extract_paper_metadata(content: str) -> Dict:
    """Derive a metadata record (keywords/title/description/content/url) from raw paper text."""
    result = {
        "keywords": [],
        "title": "Untitled",
        "description": "No description",
        "content": content[:1000],
        "url": "",
    }

    # URL: first http(s) link found anywhere in the text.
    link = re.search(r'https?://[^\s]+', content)
    if link is not None:
        result['url'] = link.group(0)

    # Title: first line that is reasonably long, capitalized,
    # and terminated like a sentence.
    candidates = (
        ln for ln in content.split('\n')
        if len(ln) > 20 and ln[0].isupper() and ln[-1] in ('.', '?', '!')
    )
    result['title'] = next(candidates, result['title'])

    # Description: first blank-line-separated chunk longer than 50 chars.
    for chunk in content.split('\n\n'):
        if len(chunk) > 50:
            result['description'] = chunk
            break

    # Keywords: unique words (>3 chars) drawn from title + description,
    # alphabetically ordered and capped at 10.
    keyword_source = f"{result['title']} {result['description']}"
    unique_words = {w.lower() for w in re.findall(r'\w+', keyword_source) if len(w) > 3}
    result['keywords'] = sorted(unique_words)[:10]
    return result
def save_paper_to_memory(content: str) -> Dict:
    """Extract metadata from paper text, enriching entries that come from arXiv."""
    record = extract_paper_metadata(content)
    # arXiv-hosted papers get extra tags and a prefixed description
    # so they are easier to find in the index later.
    if 'arxiv' in record['url'].lower():
        record['keywords'] += ['arxiv', 'paper', 'research']
        record['description'] = f"Academic paper: {record['description']}"
    return record
def create_index() -> None:
    """Create or update the keyword -> file-name search index in the dataset repo.

    Downloads the existing index and main memory file, merges every memory
    entry's keywords into the index, then uploads the updated index back to
    the Hugging Face dataset repo. Best-effort: network/decode failures fall
    back to empty defaults and indexing errors are logged, not raised.
    """
    uid = uuid.uuid4()

    def _fetch_json(url: str, fallback):
        # Best-effort JSON fetch: any network or decode problem yields the fallback.
        try:
            resp = requests.get(url, timeout=30)
            if resp.status_code == 200:
                return resp.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Fetch error for {url}: {e}")
        return fallback

    index_data = _fetch_json(f"{SAVE_DATA_URL}mem-test2/index.json", [{}])
    # Guard against a malformed remote index (e.g. an empty or non-list payload).
    if not isinstance(index_data, list) or not index_data:
        index_data = [{}]
    main_data = _fetch_json(f"{SAVE_DATA_URL}mem-test2/main.json", [])

    # Merge each entry's keywords into the keyword -> [file_name] map.
    for entry in main_data:
        try:
            for keyword in entry.get('keywords', []):
                files = index_data[0].setdefault(keyword, [])
                if entry['file_name'] not in files:
                    files.append(entry['file_name'])
        except Exception as e:
            print(f"Indexing error: {e}")

    # Save updated index to a scratch file, upload it, then clean up.
    index_path = f"tmp-index-{uid}.json"
    try:
        with open(index_path, "w") as f:
            json.dump(index_data, f)
        api.upload_file(
            path_or_fileobj=index_path,
            # Hub paths are repo-relative; the previous leading "/" was wrong.
            path_in_repo="mem-test2/index.json",
            repo_id=REPO_NAME,
            repo_type="dataset",
        )
    finally:
        # Don't leave temp index files accumulating in the working directory.
        if os.path.exists(index_path):
            os.remove(index_path)
def fetch_url_content(url: str) -> Tuple[bool, str]:
    """Fetch a web page and return (success, content-or-error-message).

    Returns:
        (True, html) on HTTP 200, where html is the lxml-parsed document
        re-serialized by BeautifulSoup; (False, reason) on empty URL,
        non-200 status, or any exception.
    """
    if not url:
        return False, "Enter valid URL"
    try:
        # Timeout prevents the UI from hanging forever on a dead host.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "lxml")
            return True, str(soup)
        return False, f"Status: {response.status_code}"
    except Exception as e:
        return False, f"Error: {e}"
def read_file_content(file_path: str) -> str:
    """Read text from a .txt or .pdf file; unknown extensions yield "".

    Extension matching is case-insensitive. PDF pages whose extraction fails
    (pypdf's extract_text() returns None for image-only pages) contribute an
    empty string instead of crashing the join with a TypeError.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == ".pdf":
        reader = PdfReader(file_path)
        # extract_text() may return None; coerce to "" so join() can't fail.
        return "\n".join((page.extract_text() or "") for page in reader.pages)
    if suffix == ".txt":
        # Explicit encoding so behavior doesn't depend on the platform locale.
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    return ""
def generate_response(prompt: str, model: str = "meta-llama/llama-4-maverick:free") -> str:
    """Send a single-turn chat prompt through the OpenRouter API; return the reply or an error string."""
    chat_messages = [{"role": "user", "content": prompt}]
    try:
        # Legacy (openai<1.0) completion call against the OpenRouter endpoint.
        completion = openai.ChatCompletion.create(
            model=model,
            messages=chat_messages,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"
def process_pdf_url(pdf_url: str) -> str:
    """Download a PDF from a URL and return its extracted text (or an error string).

    The PDF is written to a uniquely-named scratch file which is always
    removed afterwards — the original leaked one temp file per call.
    """
    try:
        # stream=True was pointless since .content reads the whole body anyway;
        # a timeout keeps a dead host from hanging the request forever.
        response = requests.get(pdf_url, timeout=60)
        if response.status_code != 200:
            return f"Error: Status {response.status_code}"
        temp_path = f"temp_{uuid.uuid4()}.pdf"
        try:
            with open(temp_path, "wb") as f:
                f.write(response.content)
            return read_file_content(temp_path)
        finally:
            # Clean up the scratch file so downloads don't accumulate on disk.
            if os.path.exists(temp_path):
                os.remove(temp_path)
    except Exception as e:
        return f"Error: {e}"
def save_memory(purpose: str, content: str) -> List[Dict]:
    """Wrap extracted paper metadata in a single-element list for memory storage.

    `purpose` is currently unused; it is kept for caller compatibility.
    """
    return [extract_paper_metadata(content)]
def summarize(
    inp: str,
    history: List[Tuple[str, str]],
    report_check: bool,
    sum_check: bool,
    mem_check: bool,
    data: str = "",
    file: Optional[str] = None,
    url: str = "",
    pdf_url: str = "",
    model: str = "meta-llama/llama-4-maverick:free"
) -> Generator[Tuple[str, List[Tuple[str, str]], str, Dict], None, None]:
    """Summarize the collected inputs, optionally saving extracted metadata to memory.

    Yields twice: first ("", history, "", {}) with a "Processing..." placeholder
    appended to the chat history, then (summary, history, "", memory_entry).
    `report_check`, `sum_check`, and `model` are accepted for UI compatibility
    but not currently used in the summary path.
    """
    # Append to the existing conversation instead of discarding it
    # (the previous code clobbered the whole chat history on every submit).
    history = list(history or []) + [(inp, "Processing...")]
    yield "", history, "", {}

    # Gradio can hand us None for empty textboxes; normalize so
    # .startswith() and slicing below can't raise AttributeError.
    url = url or ""
    pdf_url = pdf_url or ""
    data = data or ""

    processed_data = ""
    if pdf_url.startswith("http"):
        processed_data += f"PDF URL: {pdf_url}\n"
    if url.startswith("http"):
        processed_data += f"URL: {url}\n"
    if file:
        processed_data += f"File: {file}\n"
    if data:
        processed_data += f"Data: {data[:1000]}\n"

    summary = f"Summary for: {inp[:100]}\n{processed_data[:500]}"

    memory_entries = []
    if mem_check:
        memory_entries = save_memory(inp, processed_data)
        summary += "\n\nSaved to memory" if memory_entries else "\n\nMemory save failed"

    yield summary, history, "", memory_entries[0] if memory_entries else {}
def create_app():
    """Build the Gradio UI and wire the submit button to summarize()."""
    with gr.Blocks() as app:
        # Heading fixed: the configured default model is Llama 4 Maverick
        # served via OpenRouter — not Mixtral 8x7B as the stale title claimed.
        gr.Markdown("## LLM Summarizer")
        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(label="Instruction")
            with gr.Column(scale=1):
                report_check = gr.Checkbox(label="Return report", value=True)
                sum_check = gr.Checkbox(label="Summarize", value=True)
                mem_check = gr.Checkbox(label="Memory", value=True)
                submit_btn = gr.Button("Submit")
        # Input sources: plain text, uploaded file, web page URL, or PDF URL.
        with gr.Row():
            with gr.Tab("Text"):
                data = gr.Textbox(label="Input text")
            with gr.Tab("File"):
                file = gr.File(label="Upload file")
            with gr.Tab("URL"):
                url = gr.Textbox(label="Website URL")
            with gr.Tab("PDF"):
                pdf_url = gr.Textbox(label="PDF URL")
        chatbot = gr.Chatbot()
        error_box = gr.Textbox()
        json_output = gr.JSON()
        # summarize is a generator: first yield shows the "Processing..."
        # placeholder, the second delivers the summary and memory JSON.
        submit_btn.click(
            summarize,
            [prompt, chatbot, report_check, sum_check, mem_check, data, file, url, pdf_url],
            [prompt, chatbot, error_box, json_output]
        )
    return app
if __name__ == "__main__":
    # Launch the Gradio UI when run directly as a script.
    app = create_app()
    app.launch()