Spaces:
Sleeping
Sleeping
File size: 5,213 Bytes
b693f8a 2f80f49 b693f8a 2f80f49 b693f8a 2f80f49 f73c93b 2f80f49 f73c93b 2f80f49 b693f8a 2f80f49 b693f8a 2f80f49 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from io import BytesIO
from PyPDF2 import PdfReader, utils
import fitz
from typing import List
import google.generativeai as genai
import gradio as gr
from nltk.tokenize import sent_tokenize
from fastembed import TextEmbedding
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.models import PointStruct
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from a .env file (when one can be found) and read the API keys
# for the two external services from the environment.
load_dotenv(find_dotenv())
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY')
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

# Slurp the source PDF into memory so it can be parsed from a seekable buffer.
input_path = './repaired-www-foxweather-com.pdf'
with open(input_path, 'rb') as input_file:
    input_buffer = BytesIO(input_file.read())

# Sanity check: see whether PyPDF2 can parse the file as-is.
try:
    input_pdf = PdfReader(input_buffer)
except utils.PdfReadError:
    # A failed direct read may point at a compression issue; carry on regardless.
    print("Could not read PDF directly. Proceeding with original file.")
    # Rewind so the buffer can be read again from the start later on.
    # NOTE(review): the original indentation was lost; the rewind is assumed to
    # belong to this failure branch - confirm.
    input_buffer.seek(0)
else:
    print("PDF read successfully.")
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string holding the text of all pages, with nothing inserted
        between pages.
    """
    # The context manager closes the document even if extracting a page fails;
    # previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join(page.get_text() for page in doc)
def extract_text_from_pdfs_in_directory(directory):
    """Write a ``.txt`` file next to every ``.pdf`` file found in *directory*.

    Only names ending in ``.pdf`` (case-sensitive) are processed. Each output
    file shares the PDF's base name, is encoded as UTF-8 and overwrites any
    existing file of the same name.
    """
    for entry in os.listdir(directory):
        if not entry.endswith(".pdf"):
            continue
        content = extract_text_from_pdf(os.path.join(directory, entry))
        stem, _extension = os.path.splitext(entry)
        target_path = os.path.join(directory, stem + ".txt")
        with open(target_path, "w", encoding="utf-8") as out:
            out.write(content)
# Directory that holds the PDF files (and receives the generated .txt files)
directory_path = "./"

# Convert every PDF in that directory into a sibling .txt file
extract_text_from_pdfs_in_directory(directory_path)

# Collect the names of all .txt files in the directory
# NOTE(review): this picks up every .txt file present, not only the ones
# generated from PDFs above.
txt_files = [name for name in os.listdir(directory_path) if name.endswith('.txt')]

# Tokenize each text file into sentences and pool them in one list
all_sentences = []
for txt_name in txt_files:
    with open(os.path.join(directory_path, txt_name), "r", encoding="utf-8") as handle:
        all_sentences += sent_tokenize(handle.read())

# Show the first 10 sentences as a quick sanity check
print(all_sentences[:10])

# Create the FastEmbed model used for both documents and queries
embedding_model = TextEmbedding(model_name="BAAI/bge-base-en", cache_dir="./embeddings")
def embed_documents(documents):
    """Embed every document with the module-level FastEmbed model.

    Args:
        documents: Iterable of strings to embed.

    Returns:
        A list with one entry per document, in input order. Each entry is a
        2-D NumPy array containing a single row (the document's embedding),
        so callers read the vector with ``entry[0]``.
    """
    docs = list(documents)
    if not docs:
        # Nothing to embed; avoid touching the model at all.
        return []
    # Hand the whole corpus to the model in one call instead of invoking it
    # once per document. Each vector is re-wrapped in a one-row array to keep
    # the shape callers already rely on.
    return [np.array([vector]) for vector in embedding_model.embed(docs)]
# The pooled sentences are the documents to index
documents = all_sentences

# embed_documents returns one single-row array per document; keep just the row
embeddings = [entry[0] for entry in embed_documents(documents)]

# Connect to the hosted Qdrant cluster
client = QdrantClient(
    url="https://ec069eb8-1679-4f53-971c-8fef6fe7d057.us-west-2-0.aws.cloud.qdrant.io",
    api_key=QDRANT_API_KEY,
    https=True,
)

# (Re)create the collection with 768-dimensional cosine-distance vectors
collection_name = 'fastembed_collection'
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

# One point per sentence: the position in the corpus is the point id and the
# sentence itself is stored in the payload under "text"
points = [
    PointStruct(id=point_id, vector=embedding.tolist(), payload={"text": sentence})
    for point_id, (embedding, sentence) in enumerate(zip(embeddings, documents))
]
client.upload_points(collection_name=collection_name, points=points)

# Configure the Gemini SDK and create the generation model
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-2.5-pro')
def generate_completion(prompt):
    """Send *prompt* to the module-level Gemini model and return the reply text."""
    return model.generate_content(prompt).text
def embed_query(Question):
    """Embed *Question* and return the result as a NumPy array.

    The question is embedded as a one-item batch, so the returned array has
    one row per input, i.e. a single row.
    """
    vectors = list(embedding_model.embed([Question]))
    return np.array(vectors)
def generate_response(Question):
    """Answer *Question* from context retrieved out of the Qdrant collection.

    The question is embedded, up to 50 nearest stored sentences are fetched
    from the ``fastembed_collection`` collection, and those sentences are
    placed in a prompt that instructs the model to answer from that context
    only.

    Args:
        Question: The user's question as entered in the UI.

    Returns:
        The model's answer as a string, or a short notice when the question
        is empty or contains only whitespace.
    """
    # A blank question gives nothing to retrieve against; skip the embedding,
    # vector search and LLM round-trips entirely.
    if not Question or not Question.strip():
        return "Please enter a question."

    collection_name = 'fastembed_collection'
    retrieved_texts = []
    for query_embedding in embed_query(Question):
        hits = client.search(
            collection_name=collection_name,
            # Send plain Python floats rather than a list of NumPy scalars.
            query_vector=query_embedding.tolist(),
            limit=50,
        )
        retrieved_texts.extend(hit.payload["text"] for hit in hits)

    # Every retrieved sentence is followed by a blank line in the context.
    all_text = "".join(f"{text}\n\n" for text in retrieved_texts)

    prompt = f"You are a helpful chatbot. Use only the following pieces of context to answer the question. Don't make up any new information:\n\n{all_text}\n\nQuestion:{Question}\n\nAnswer:"
    return generate_completion(prompt)
# Set up the Gradio interface: one question box in, one answer box out,
# with generate_response doing the work for each submission.
iface = gr.Interface(
    fn=generate_response,
    inputs=[gr.Textbox(label="Question")],  # Pass input as a list
    outputs=[gr.Textbox(label="Generated Response")],  # Pass output as a list
    title="RAG with Qdrant, FastEmbed and Gemini",
    description="Enter a question and get a generated response based on the retrieved text.",
)

# Start the app. (A stray trailing "|" after this call made the line a syntax
# error and has been removed.)
iface.launch(share=True, debug=True)