pradeepsengarr committed
Commit 14b7206 · verified · Parent: 2b518ec

Update app.py

Files changed (1):
  1. app.py +21 -33
app.py CHANGED
@@ -1,6 +1,6 @@
 import os
 import gradio as gr
-import fitz # PyMuPDF
+import fitz
 import faiss
 import numpy as np
 from sentence_transformers import SentenceTransformer
@@ -8,31 +8,21 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from huggingface_hub import login

-# Load Hugging Face Token from environment
 hf_token = os.environ.get("HUGGINGFACE_TOKEN")
 if not hf_token:
-    raise ValueError("⚠️ Please set the HUGGINGFACE_TOKEN environment variable.")
+    raise ValueError("Hugging Face token not found.")
 login(token=hf_token)

-# Load embedding model
 embed_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

-# Load small, fast LLM (great for CPU)
 model_id = "tiiuae/falcon-rw-1b"
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map={"": "cpu"},
-    torch_dtype="auto",
-    token=hf_token
-)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"": "cpu"}, torch_dtype="auto", token=hf_token)
 llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

-# Globals
 index = None
 doc_texts = []

-# Extract text from PDF or TXT (handle Hugging Face Spaces file upload)
 def extract_text(file):
     text = ""
     file_path = file.name if hasattr(file, 'name') else file
@@ -44,14 +34,13 @@ def extract_text(file):
         with open(file_path, "r", encoding="utf-8") as f:
             text = f.read()
     else:
-        return "❌ Unsupported file type."
+        return "Unsupported file type."
     return text

-# Process file and build FAISS index
 def process_file(file):
     global index, doc_texts
     text = extract_text(file)
-    if text.startswith("❌"):
+    if text.startswith("Unsupported"):
         return text

     splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
@@ -62,40 +51,39 @@ def process_file(file):
     index = faiss.IndexFlatL2(dim)
     index.add(embeddings)

-    return "✅ File processed successfully! Ask your question below."
+    return "Document processed successfully. You can now ask questions."

-# Generate answer
 def generate_answer(question):
     global index, doc_texts
     if index is None or not doc_texts:
-        return "⚠️ Please upload and process a document first."
+        return "Please upload and process a document first."

     question_emb = embed_model.encode([question], convert_to_numpy=True)
     _, I = index.search(question_emb, k=3)
     context = "\n".join([doc_texts[i] for i in I[0]])

-    prompt = f"""[System: You are a helpful assistant. Answer based on the context.]
-
-Context:
-{context}
-
-Question: {question}
-Answer:"""
+    prompt = (
+        f"You are an intelligent assistant. Use the context below to answer the user's question clearly, "
+        f"politely, and completely. Do not just extract text — give a helpful response.\n\n"
+        f"Context:\n{context}\n\n"
+        f"User's Question: {question}\n\n"
+        f"Answer:"
+    )

     result = llm(prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
     return result[0]["generated_text"].split("Answer:")[-1].strip()

-# Gradio UI
-with gr.Blocks(title="RAG Chatbot (CPU-Optimized)") as demo:
-    gr.Markdown("## 📚 Upload PDF/TXT and Ask Questions (Fast CPU RAG Bot)")
+with gr.Blocks(title="Document Q&A Assistant") as demo:
+    gr.Markdown("<h1 style='text-align: center;'>📄 Document AI Assistant</h1>")
+    gr.Markdown("Upload a PDF or TXT file, and ask questions about its content. The assistant will provide answers using the document as context.")

     with gr.Row():
-        file_input = gr.File(label="📁 Upload PDF or TXT", file_types=[".pdf", ".txt"])
-        upload_output = gr.Textbox(label="Upload Status", interactive=False)
+        file_input = gr.File(label="Upload PDF or TXT", file_types=[".pdf", ".txt"])
+        upload_output = gr.Textbox(label="Upload Status")

     with gr.Row():
-        question_input = gr.Textbox(label="❓ Ask a Question", placeholder="E.g. What is the document about?")
-        answer_output = gr.Textbox(label="💬 Answer", interactive=False)
+        question_input = gr.Textbox(label="Ask a Question", placeholder="What is this document about?")
+        answer_output = gr.Textbox(label="Answer")

     file_input.change(fn=process_file, inputs=file_input, outputs=upload_output)
     question_input.submit(fn=generate_answer, inputs=question_input, outputs=answer_output)
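The import hunk keeps `import fitz` (PyMuPDF), but the PDF branch of extract_text is unchanged and therefore elided from the diff (old lines 39-43 never appear). For orientation only, a minimal PyMuPDF branch consistent with the surrounding code typically looks like the sketch below; since the elided lines are not visible, the body of the .pdf case is an assumption, not the file's actual code.

import fitz  # PyMuPDF

def extract_text(file):
    # Mirrors the function in app.py; the .pdf branch is a hypothetical
    # reconstruction, since the diff elides those unchanged lines.
    text = ""
    file_path = file.name if hasattr(file, "name") else file
    if file_path.endswith(".pdf"):
        with fitz.open(file_path) as doc:  # open the PDF document
            for page in doc:               # iterate pages in order
                text += page.get_text()    # plain-text extraction per page
    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        return "Unsupported file type."
    return text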
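The changed functions together span the whole retrieval loop: process_file encodes chunks and adds them to a faiss.IndexFlatL2, and generate_answer embeds the question and searches with k=3. The following self-contained round trip uses the same embedding model and the same FAISS calls as the diff; the sample chunks and k=1 are illustrative stand-ins, not data from the Space.

import faiss
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Stand-ins for the chunks RecursiveCharacterTextSplitter would produce.
chunks = [
    "Falcon-RW-1B is a small causal language model.",
    "FAISS IndexFlatL2 performs exact L2 nearest-neighbour search.",
]
embeddings = embed_model.encode(chunks, convert_to_numpy=True)  # float32 array, (2, 768) for this model

index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 index, dimension inferred
index.add(embeddings)                           # index the chunk vectors

query = embed_model.encode(["Which FAISS index is used?"], convert_to_numpy=True)
distances, ids = index.search(query, k=1)       # nearest chunk id per query row
print(chunks[ids[0][0]])                        # prints the FAISS sentence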
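Neither hunk reaches past the event bindings, so whether app.py ends with a launch call is not visible in this diff. A Gradio Space script normally does end that way; if the trailing, unchanged lines do not already include it, the usual closing line is:

# Not part of the diff: a Gradio Space typically launches the Blocks app
# at module level as the last statement of app.py.
demo.launch()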