Spaces:

Sumukhdev
/

weathernow-rag-model

Sleeping

SumukhP-dev

Add functionality to read environment variables

b693f8a 6 days ago

5.21 kB

	import nltk

	# Download the NLTK tokenizer resources at start-up
	# (presumably the data sent_tokenize needs -- confirm against the NLTK version in use).
	for _resource in ('punkt', 'punkt_tab'):
	    nltk.download(_resource)

	from io import BytesIO
	from PyPDF2 import PdfReader, utils

	import fitz

	from typing import List
	import google.generativeai as genai
	import gradio as gr

	from nltk.tokenize import sent_tokenize

	from fastembed import TextEmbedding
	import numpy as np

	from qdrant_client import QdrantClient
	from qdrant_client.http.models import Distance, VectorParams
	from qdrant_client.models import PointStruct

	import os
	from dotenv import load_dotenv, find_dotenv

	# Load variables from a .env file, if find_dotenv() locates one.
	load_dotenv(find_dotenv())

	# API credentials; each is None when the variable is not set.
	QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY')
	GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

	input_path = './repaired-www-foxweather-com.pdf'

	# Read the whole PDF into an in-memory, seekable buffer.
	with open(input_path, 'rb') as input_file:
	    input_buffer = BytesIO(input_file.read())

	# Check that PyPDF2 can parse the file; a parse failure is reported but not fatal.
	try:
	    input_pdf = PdfReader(input_buffer)
	except utils.PdfReadError:
	    # Direct reading failed, possibly because of a compression issue.
	    print("Could not read PDF directly. Proceeding with original file.")
	    # Rewind so the buffer can be read again from the start.
	    input_buffer.seek(0)
	else:
	    print("PDF read successfully.")

	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF as a single string.

	    Args:
	        pdf_path: Path of the PDF file, passed to ``fitz.open``.

	    Returns:
	        The ``get_text()`` output of each page, concatenated in page order.
	    """
	    # The context manager closes the document (and its file handle) as soon
	    # as extraction finishes, including when a page raises.
	    with fitz.open(pdf_path) as doc:
	        return "".join(page.get_text() for page in doc)

	def extract_text_from_pdfs_in_directory(directory):
	    """Write a ``.txt`` copy of the text of each PDF found in a directory.

	    Every entry of ``directory`` whose name ends with ".pdf" is passed to
	    ``extract_text_from_pdf``. The result is written as UTF-8 into the same
	    directory, under the same base name with a ".txt" extension.

	    Args:
	        directory: Directory to scan; sub-directories are not descended into.
	    """
	    for entry_name in os.listdir(directory):
	        if not entry_name.endswith(".pdf"):
	            continue
	        page_text = extract_text_from_pdf(os.path.join(directory, entry_name))
	        base_name, _ = os.path.splitext(entry_name)
	        target_path = os.path.join(directory, f"{base_name}.txt")
	        with open(target_path, "w", encoding="utf-8") as out:
	            out.write(page_text)

	# Directory containing the source PDF files
	directory_path = "./"

	# Extract text from PDFs in the directory and save as text files
	extract_text_from_pdfs_in_directory(directory_path)

	# Read back only the text files produced from the PDFs. Picking up every
	# ".txt" in the directory would also index unrelated files that happen to
	# live there (for example a requirements.txt). Sorting keeps the sentence
	# order stable between runs.
	txt_files = sorted(
	    os.path.splitext(name)[0] + ".txt"
	    for name in os.listdir(directory_path)
	    if name.endswith(".pdf")
	)

	# Sentences collected from all extracted text files
	all_sentences = []

	# Read each text file, split it into sentences, and store them
	for txt_file in txt_files:
	    file_path = os.path.join(directory_path, txt_file)
	    with open(file_path, "r", encoding="utf-8") as file:
	        all_sentences.extend(sent_tokenize(file.read()))

	# Print the first 10 sentences as a quick sanity check
	print(all_sentences[:10])

	# Initialize the TextEmbedding model
	embedding_model = TextEmbedding(model_name="BAAI/bge-base-en", cache_dir="./embeddings")

	def embed_documents(documents):
	    """Embed documents with the module-level FastEmbed model.

	    Args:
	        documents: Iterable of strings to embed.

	    Returns:
	        A list with one NumPy array per document, in input order. Each
	        array keeps a leading axis of length 1, so the embedding vector
	        itself is element ``[0]``.
	    """
	    # A single embed() call lets FastEmbed batch the inputs instead of
	    # invoking the model separately for every document.
	    return [np.array([vector]) for vector in embedding_model.embed(documents)]

	# Corpus to index: the sentences gathered from the text files
	documents = all_sentences

	# embed_documents returns one single-row array per document; keep only that row
	embeddings = [batch[0] for batch in embed_documents(documents)]

	# Client for the hosted Qdrant cluster
	client = QdrantClient(
	    url="https://ec069eb8-1679-4f53-971c-8fef6fe7d057.us-west-2-0.aws.cloud.qdrant.io",
	    api_key=QDRANT_API_KEY,
	    https=True,
	)

	# (Re)create the collection with 768-dimensional cosine-distance vectors
	collection_name = 'fastembed_collection'
	vector_settings = VectorParams(size=768, distance=Distance.COSINE)
	client.recreate_collection(
	    collection_name=collection_name,
	    vectors_config=vector_settings,
	)

	# One point per document: the id is its position, the payload carries its text
	points = [
	    PointStruct(id=idx, vector=vector.tolist(), payload={"text": text})
	    for idx, (vector, text) in enumerate(zip(embeddings, documents))
	]
	client.upload_points(collection_name=collection_name, points=points)

	# Configure the Gemini client and pick the model used for answers
	genai.configure(api_key=GOOGLE_API_KEY)
	model = genai.GenerativeModel('gemini-2.5-pro')

	# Generate a completion for a prompt
	def generate_completion(prompt):
	    """Send ``prompt`` to the module-level ``model`` and return the response text."""
	    return model.generate_content(prompt).text

	# Embed a query string
	def embed_query(Question):
	    """Embed ``Question`` with the module-level model and return a NumPy array.

	    The question is passed to the model as a one-item list; the vectors the
	    model yields are collected and wrapped in a single array.
	    """
	    question_vectors = list(embedding_model.embed([Question]))
	    return np.array(question_vectors)

	def generate_response(Question):
	    """Answer a question using texts retrieved from Qdrant as context.

	    The question is embedded, up to 50 matching texts are fetched from the
	    ``fastembed_collection`` collection, and the model is asked to answer
	    using only those texts.

	    Args:
	        Question: The user's question.

	    Returns:
	        The generated answer, or a short request to enter a question when
	        the input is empty or only whitespace.
	    """
	    # Skip the embedding, search and LLM calls when there is nothing to answer.
	    if not Question or not Question.strip():
	        return "Please enter a question."

	    collection_name = 'fastembed_collection'
	    retrieved_texts = []

	    # Retrieve all hits for each query embedding
	    for query_embedding in embed_query(Question):
	        hits = client.search(
	            collection_name=collection_name,
	            # Convert to plain Python floats rather than NumPy scalars.
	            query_vector=query_embedding.tolist(),
	            limit=50,
	        )
	        retrieved_texts.extend(hit.payload["text"] for hit in hits)

	    # Each retrieved text is followed by a blank line in the prompt context.
	    all_text = "".join(f"{text}\n\n" for text in retrieved_texts)

	    # Generate completion using all texts as a single prompt
	    prompt = f"You are a helpful chatbot. Use only the following pieces of context to answer the question. Don't make up any new information:\n\n{all_text}\n\nQuestion:{Question}\n\nAnswer:"
	    return generate_completion(prompt)

	# Build the Gradio UI: one text box for the question, one for the answer
	question_box = gr.Textbox(label="Question")
	answer_box = gr.Textbox(label="Generated Response")

	iface = gr.Interface(
	    fn=generate_response,
	    inputs=[question_box],  # Inputs are passed as a list
	    outputs=[answer_box],  # Outputs are passed as a list
	    title="RAG with Qdrant, FastEmbed and Gemini",
	    description="Enter a question and get a generated response based on the retrieved text.",
	)

	# Start the app with sharing and debug mode enabled
	iface.launch(share=True, debug=True)