# RAG_MVP / app.py
# Jialun He
# 1st version
# 2934401
import gradio as gr
import PyPDF2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import io
class SimpleRAG:
    """Minimal retrieval pipeline over a single PDF document.

    A document is split into sentence-aligned chunks, every chunk is embedded
    with the ``all-MiniLM-L6-v2`` sentence-transformer model, and a question is
    answered by returning the chunks whose embeddings have the highest cosine
    similarity to the question's embedding.
    """

    # Sentence boundary = the whitespace that follows a terminator.  Splitting
    # on the whitespace (instead of on the punctuation itself) keeps "?" and
    # "!" in the text and does not break tokens such as "3.14".
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(self):
        """Load the embedding model and start with no document indexed."""
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Text chunks of the currently indexed document.
        self.chunks = []
        # Embeddings of ``self.chunks`` (same order); None until a document
        # has been processed successfully.
        self.embeddings = None
        # True once ``chunks`` and ``embeddings`` describe a usable document.
        self.document_processed = False

    def extract_text_from_pdf(self, pdf_file):
        """Return the text of every page of a PDF.

        Args:
            pdf_file: Raw bytes of the PDF file.

        Returns:
            The page texts concatenated in order, each followed by a newline.

        Raises:
            Exception: If the PDF cannot be read; the underlying error is
                chained as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            # Guard against a page yielding no text object at all.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
        except Exception as e:
            raise Exception(f"Error reading PDF: {str(e)}") from e

    @staticmethod
    def _overlap_tail(sentences, overlap):
        """Return the trailing sentences whose joined length fits in *overlap*."""
        tail = []
        used = 0
        for sentence in reversed(sentences):
            # One extra character for the joining space once the tail is non-empty.
            cost = len(sentence) + (1 if tail else 0)
            if used + cost > overlap:
                break
            tail.append(sentence)
            used += cost
        tail.reverse()
        return tail

    def chunk_text(self, text, chunk_size=500, overlap=100, min_chunk_len=50):
        """Split *text* into sentence-aligned, overlapping chunks.

        Sentences are accumulated until adding the next one would push the
        chunk past ``chunk_size`` characters.  The following chunk then starts
        with the trailing sentences of the previous chunk (at most ``overlap``
        characters) so that context spanning a chunk boundary is not lost.

        Args:
            text: The text to split.
            chunk_size: Target maximum chunk length in characters.  A single
                sentence longer than this is still emitted as its own chunk.
            overlap: Maximum number of characters carried over from the end of
                one chunk to the start of the next; ``0`` disables overlap.
            min_chunk_len: Chunks whose length does not exceed this value are
                discarded as noise.

        Returns:
            A list of chunk strings, in document order.
        """
        sentences = [
            piece.strip()
            for piece in self._SENTENCE_BOUNDARY.split(text)
            if piece.strip()
        ]

        chunks = []
        window = []  # sentences of the chunk currently being built
        for sentence in sentences:
            if window and len(" ".join(window + [sentence])) > chunk_size:
                chunks.append(" ".join(window))
                window = self._overlap_tail(window, overlap)
                # Drop carried-over sentences that would make the new chunk
                # exceed the limit; the new sentence itself always goes in.
                while window and len(" ".join(window + [sentence])) > chunk_size:
                    window.pop(0)
            window.append(sentence)
        if window:
            chunks.append(" ".join(window))

        return [chunk for chunk in chunks if len(chunk) > min_chunk_len]

    def process_document(self, pdf_file):
        """Extract, chunk and embed a PDF, replacing the indexed document.

        The instance state is only updated after every step has succeeded, so
        a failed upload leaves the previously indexed document (if any) intact
        and never pairs new chunks with stale embeddings.

        Args:
            pdf_file: Raw bytes of the PDF file.

        Returns:
            A human-readable status message (success or error).
        """
        try:
            text = self.extract_text_from_pdf(pdf_file)
            if not text.strip():
                return "Error: Could not extract text from PDF. Please try a different file."
            chunks = self.chunk_text(text)
            if not chunks:
                return "Error: No meaningful content found in the PDF."
            embeddings = self.model.encode(chunks)
        except Exception as e:
            return f"Error processing document: {str(e)}"

        self.chunks = chunks
        self.embeddings = embeddings
        self.document_processed = True
        return f"Document processed successfully! Created {len(chunks)} text chunks. You can now ask questions."

    def search_similar_chunks(self, query, top_k=3):
        """Find the chunks most similar to *query*.

        Args:
            query: The question text.
            top_k: Maximum number of chunks to return; values below zero are
                treated as zero.

        Returns:
            A list of ``{'chunk': str, 'similarity': score}`` dicts ordered
            from most to least similar, or a message string when no document
            has been processed or the search fails.
        """
        if not self.document_processed:
            return "Please upload and process a document first."
        try:
            query_embedding = self.model.encode([query])
            similarities = cosine_similarity(query_embedding, self.embeddings)[0]
            # argsort is ascending, so reverse before taking the best top_k.
            top_indices = np.argsort(similarities)[::-1][:max(top_k, 0)]
            return [
                {'chunk': self.chunks[idx], 'similarity': similarities[idx]}
                for idx in top_indices
            ]
        except Exception as e:
            return f"Error searching: {str(e)}"

    def answer_question(self, question):
        """Format the most relevant chunks as an answer to *question*.

        Args:
            question: The user's question; ``None`` or blank input is rejected
                with a prompt to enter a question.

        Returns:
            The formatted answer text, or a message string explaining why no
            answer could be produced.
        """
        if not self.document_processed:
            return "Please upload and process a document first."
        if not question or not question.strip():
            return "Please enter a question."

        results = self.search_similar_chunks(question)
        if isinstance(results, str):
            # search_similar_chunks signals problems with a message string.
            return results
        if not results:
            return "No relevant information found in the document."

        parts = ["Based on the document, here's what I found:\n\n"]
        for i, result in enumerate(results, start=1):
            parts.append(f"**Relevant section {i}** (similarity: {result['similarity']:.3f}):\n")
            parts.append(f"{result['chunk']}\n\n")
        return "".join(parts)
# Module-level RAG engine used by the process_pdf / ask_question callbacks.
# Constructing it runs SimpleRAG.__init__, which loads the embedding model.
rag_system = SimpleRAG()
def process_pdf(pdf_file):
    """Gradio callback: read the uploaded PDF and index it in ``rag_system``.

    Args:
        pdf_file: The value delivered by the upload component: either a
            filesystem path given as a ``str``, or an object exposing the
            path through a ``.name`` attribute.  ``None`` when nothing has
            been uploaded.

    Returns:
        A status message describing success or the error that occurred.
    """
    if pdf_file is None:
        return "Please upload a PDF file."
    try:
        # A string value is already the path; other upload objects carry the
        # path in ``.name``.
        path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        with open(path, 'rb') as f:
            pdf_content = f.read()
        return rag_system.process_document(pdf_content)
    except Exception as e:
        return f"Error: {str(e)}"
def ask_question(question):
    """Gradio callback: forward *question* to ``rag_system`` and return its reply."""
    reply = rag_system.answer_question(question)
    return reply
# Gradio UI definition: an upload/processing column, a question/answer column,
# and the event wiring that connects the widgets to the callbacks.
# NOTE(review): the original indentation was lost; the nesting below (two
# columns inside one row, sample questions and event wiring at Blocks level)
# is a reconstruction - confirm against the intended layout.
with gr.Blocks(title="RAG MVP - Document Q&A", theme=gr.themes.Soft()) as demo:
    # Introductory text and usage instructions.
    gr.Markdown("""
# RAG MVP - Document Q&A System
Upload a PDF document and ask questions about its content. The system will find relevant information and provide answers based on the document.
**How to use:**
1. Upload a PDF file
2. Wait for processing to complete
3. Ask questions about the document
""")
    with gr.Row():
        # Left column: upload a PDF and trigger processing.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDF Document",
                file_types=[".pdf"],
                type="filepath"
            )
            process_btn = gr.Button("Process Document", variant="primary")
            # Read-only status box filled by process_pdf.
            process_output = gr.Textbox(
                label="Processing Status",
                lines=3,
                interactive=False
            )
        # Right column: enter a question and read the answer.
        with gr.Column(scale=1):
            question_input = gr.Textbox(
                label="Ask a Question",
                placeholder="What is this document about?",
                lines=2
            )
            ask_btn = gr.Button("Get Answer", variant="secondary")
            # Read-only box filled by ask_question.
            answer_output = gr.Textbox(
                label="Answer",
                lines=10,
                interactive=False
            )
    # Example prompts shown to the user.
    gr.Markdown("""
### Sample Questions to Try:
- What is the main topic of this document?
- Can you summarize the key points?
- What are the important details mentioned?
""")
    # Clicking "Process Document" sends the uploaded file to process_pdf.
    process_btn.click(
        fn=process_pdf,
        inputs=[pdf_input],
        outputs=[process_output]
    )
    # Both the "Get Answer" button and submitting the question box call
    # ask_question and write the result to the answer box.
    ask_btn.click(
        fn=ask_question,
        inputs=[question_input],
        outputs=[answer_output]
    )
    question_input.submit(
        fn=ask_question,
        inputs=[question_input],
        outputs=[answer_output]
    )
# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()