Spaces:

jlnh
/

RAG_MVP

Sleeping

RAG_MVP / app.py

Jialun He

1st version

2934401 11 days ago

6.19 kB

	import gradio as gr
	import PyPDF2
	import numpy as np
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity
	import re
	import io

	class SimpleRAG:
	    """Minimal retrieval pipeline over a single PDF document.

	    The document text is split into sentence-based chunks, each chunk is
	    embedded with the 'all-MiniLM-L6-v2' sentence-transformers model, and
	    questions are answered by returning the chunks whose embeddings are
	    most cosine-similar to the question embedding. No text is generated.
	    """

	    def __init__(self):
	        """Load the embedding model and start with no document indexed."""
	        self.model = SentenceTransformer('all-MiniLM-L6-v2')
	        # Chunk texts and their embeddings are kept index-aligned.
	        self.chunks = []
	        self.embeddings = None
	        self.document_processed = False

	    def extract_text_from_pdf(self, pdf_file):
	        """Return the concatenated text of all pages of a PDF.

	        Args:
	            pdf_file: The raw bytes of the PDF file.

	        Returns:
	            The text of every page, each page followed by a newline.

	        Raises:
	            Exception: If the PDF cannot be opened or read; the original
	                error is attached as the cause.
	        """
	        try:
	            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
	            # Defensive: treat a missing (None) page text as empty instead
	            # of failing the whole document on a TypeError.
	            return "".join(
	                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
	            )
	        except Exception as e:
	            raise Exception(f"Error reading PDF: {str(e)}") from e

	    @staticmethod
	    def _tail_within(pieces, budget):
	        """Return the longest run of trailing pieces whose total length fits budget."""
	        kept = []
	        used = 0
	        for piece in reversed(pieces):
	            if used + len(piece) > budget:
	                break
	            kept.append(piece)
	            used += len(piece)
	        kept.reverse()
	        return kept

	    def chunk_text(self, text, chunk_size=500, overlap=100):
	        """Split text into sentence-aligned chunks with optional overlap.

	        Sentences are found by splitting on runs of '.', '!' and '?'; every
	        sentence is re-terminated with '. ' when chunks are assembled, so
	        the original terminator is not preserved.

	        Args:
	            text: The text to split.
	            chunk_size: Target maximum chunk length in characters. A single
	                sentence longer than this still becomes its own chunk.
	            overlap: Maximum number of characters of trailing whole
	                sentences repeated from the end of one chunk at the start
	                of the next. 0 (or less) disables overlap.

	        Returns:
	            A list of chunk strings; chunks of 50 characters or fewer are
	            dropped.
	        """
	        stripped = (part.strip() for part in re.split(r'[.!?]+', text))
	        sentences = [sentence for sentence in stripped if sentence]

	        chunks = []
	        current = []      # sentence pieces (each ending in ". ") of the open chunk
	        current_len = 0   # total length of the pieces in `current`

	        for sentence in sentences:
	            if current and current_len + len(sentence) > chunk_size:
	                chunks.append("".join(current).strip())
	                # Carry whole trailing sentences into the next chunk, but
	                # never so many that the carried text plus the incoming
	                # sentence would itself exceed chunk_size.
	                budget = min(overlap, chunk_size - len(sentence))
	                current = self._tail_within(current, budget)
	                current_len = sum(len(piece) for piece in current)

	            piece = sentence + ". "
	            current.append(piece)
	            current_len += len(piece)

	        if current:
	            chunks.append("".join(current).strip())

	        # Very short chunks carry too little context to be useful matches.
	        return [chunk for chunk in chunks if len(chunk) > 50]

	    def process_document(self, pdf_file):
	        """Extract, chunk and embed a PDF, replacing any previous document.

	        Args:
	            pdf_file: The raw bytes of the PDF file.

	        Returns:
	            A human-readable status message (success or error). Errors are
	            reported in the message rather than raised.
	        """
	        # Forget the previous document first so a failed upload can never
	        # leave chunks and embeddings from different files paired together.
	        self.chunks = []
	        self.embeddings = None
	        self.document_processed = False

	        try:
	            text = self.extract_text_from_pdf(pdf_file)
	            if not text.strip():
	                return "Error: Could not extract text from PDF. Please try a different file."

	            chunks = self.chunk_text(text)
	            if not chunks:
	                return "Error: No meaningful content found in the PDF."

	            embeddings = self.model.encode(chunks)
	        except Exception as e:
	            return f"Error processing document: {str(e)}"

	        # Commit only once everything succeeded.
	        self.chunks = chunks
	        self.embeddings = embeddings
	        self.document_processed = True
	        return f"Document processed successfully! Created {len(self.chunks)} text chunks. You can now ask questions."

	    def search_similar_chunks(self, query, top_k=3):
	        """Find the chunks most similar to a query.

	        Args:
	            query: The query text to embed and compare against the chunks.
	            top_k: Maximum number of chunks to return.

	        Returns:
	            A list of dicts with keys 'chunk' (the chunk text) and
	            'similarity' (its cosine similarity to the query), best match
	            first. If no document is loaded or the search fails, a message
	            string is returned instead.
	        """
	        if not self.document_processed:
	            return "Please upload and process a document first."

	        try:
	            query_embedding = self.model.encode([query])
	            similarities = cosine_similarity(query_embedding, self.embeddings)[0]
	            # argsort is ascending; reverse it to rank best matches first.
	            top_indices = np.argsort(similarities)[::-1][:top_k]
	            return [
	                {'chunk': self.chunks[idx], 'similarity': similarities[idx]}
	                for idx in top_indices
	            ]
	        except Exception as e:
	            return f"Error searching: {str(e)}"

	    def answer_question(self, question):
	        """Build a text answer listing the chunks most relevant to a question.

	        Args:
	            question: The user's question; None or blank input is rejected
	                with a prompt message.

	        Returns:
	            The formatted answer, or a message explaining why no answer
	            could be produced.
	        """
	        if not self.document_processed:
	            return "Please upload and process a document first."

	        if not question or not question.strip():
	            return "Please enter a question."

	        results = self.search_similar_chunks(question)

	        # A string result is a status/error message from the search step.
	        if isinstance(results, str):
	            return results

	        if not results:
	            return "No relevant information found in the document."

	        parts = ["Based on the document, here's what I found:\n\n"]
	        for i, result in enumerate(results):
	            parts.append(f"Relevant section {i+1} (similarity: {result['similarity']:.3f}):\n")
	            parts.append(f"{result['chunk']}\n\n")

	        return "".join(parts)

	# Single module-level pipeline instance. The process_pdf and ask_question
	# callbacks below both operate on it, so questions are always answered from
	# whichever document was processed most recently.
	rag_system = SimpleRAG()

	def process_pdf(pdf_file):
	    """Gradio callback: read the uploaded PDF and index it.

	    Args:
	        pdf_file: The value delivered by the upload component, or None when
	            nothing was uploaded. Either a filesystem path or an object
	            exposing the path as its `name` attribute is accepted.

	    Returns:
	        A status message describing success or the error encountered.
	    """
	    if pdf_file is None:
	        return "Please upload a PDF file."

	    # The upload component is configured with type="filepath", so the value
	    # may be a plain path string (which has no .name); file-like values
	    # carry the path in .name instead. Support both.
	    pdf_path = getattr(pdf_file, "name", pdf_file)

	    try:
	        with open(pdf_path, 'rb') as f:
	            pdf_content = f.read()

	        return rag_system.process_document(pdf_content)

	    except Exception as e:
	        return f"Error: {str(e)}"

	def ask_question(question):
	    """Gradio callback: answer a question using the shared pipeline instance."""
	    answer = rag_system.answer_question(question)
	    return answer

	# Build the two-column UI: upload/indexing on the left, Q&A on the right.
	with gr.Blocks(title="RAG MVP - Document Q&A", theme=gr.themes.Soft()) as demo:
	    # Intro banner with usage instructions.
	    gr.Markdown("""
	# RAG MVP - Document Q&A System

	Upload a PDF document and ask questions about its content. The system will find relevant information and provide answers based on the document.

	How to use:
	1. Upload a PDF file
	2. Wait for processing to complete
	3. Ask questions about the document
	""")

	    with gr.Row():
	        # Left column: PDF upload and processing status.
	        with gr.Column(scale=1):
	            pdf_input = gr.File(
	                label="Upload PDF Document", file_types=[".pdf"], type="filepath"
	            )
	            process_btn = gr.Button("Process Document", variant="primary")
	            process_output = gr.Textbox(label="Processing Status", lines=3, interactive=False)

	        # Right column: question entry and answer display.
	        with gr.Column(scale=1):
	            question_input = gr.Textbox(
	                label="Ask a Question", placeholder="What is this document about?", lines=2
	            )
	            ask_btn = gr.Button("Get Answer", variant="secondary")
	            answer_output = gr.Textbox(label="Answer", lines=10, interactive=False)

	    gr.Markdown("""
	### Sample Questions to Try:
	- What is the main topic of this document?
	- Can you summarize the key points?
	- What are the important details mentioned?
	""")

	    # Index the uploaded document when the button is pressed.
	    process_btn.click(fn=process_pdf, inputs=[pdf_input], outputs=[process_output])

	    # Pressing the button and submitting the textbox both ask the question.
	    for register in (ask_btn.click, question_input.submit):
	        register(fn=ask_question, inputs=[question_input], outputs=[answer_output])

	# Start the web server only when executed as a script.
	if __name__ == "__main__":
	    demo.launch()