import os
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify, send_from_directory
from flask_cors import CORS
# --- Import updated RAG-specific libraries ---
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain.chains import RetrievalQA
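
# Approximate pip requirements implied by the imports above (a sketch; exact
# package versions/pins depend on the environment):
#   flask flask-cors requests beautifulsoup4 pypdf chromadb
#   langchain langchain-community langchain-google-genai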

# --- Basic Flask App Setup ---
app = Flask(__name__, static_folder='.', static_url_path='')
CORS(app)

# Global variable to hold the RAG chain
rag_chain = None

# --- Route for the homepage ---
@app.route('/')
def serve_index():
    """Serves the index.html file as the homepage."""
    return send_from_directory('.', 'index.html')

# --- API Route for queries ---
# NOTE: the route path here is an assumption; it must match the URL the frontend posts to.
@app.route('/api/query', methods=['POST'])
def handle_query():
    """Handles incoming queries from the frontend."""
    global rag_chain
    if not rag_chain:
        return jsonify({"error": "RAG pipeline not initialized. Check server logs."}), 500
    data = request.json
    query_text = data.get("query")
    if not query_text:
        return jsonify({"error": "No query provided."}), 400
    try:
        print(f"Received query: {query_text}")
        result = rag_chain.invoke(query_text)
        answer = result.get('result', 'No answer found.')
        sources = [doc.metadata.get('source', 'Unknown source') for doc in result.get('source_documents', [])]
        # Deduplicate sources while preserving their original order
        unique_sources = list(dict.fromkeys(sources))
        print(f"Generated response: {answer}")
        return jsonify({
            "answer": answer,
            "sources": unique_sources
        })
    except Exception as e:
        print(f"Error during query processing: {e}")
        return jsonify({"error": f"An error occurred: {str(e)}"}), 500

# --- RAG Pipeline Initialization ---
def initialize_rag_pipeline():
    """
    This function loads documents, creates the vector store, and initializes the RAG chain.
    """
    global rag_chain
    print("--- Starting RAG pipeline initialization ---")
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print("ERROR: GOOGLE_API_KEY environment variable not set. Halting.")
        return
    print("Step 1: Google API Key found.")
    pdf_files = ["Augusta rule 101 CPE Webinar.pdf", "Augusta rule workshop.pdf"]
    pdf_docs = []
    try:
        for file in pdf_files:
            if os.path.exists(file):
                loader = PyPDFLoader(file)
                pdf_docs.extend(loader.load())
            else:
                print(f"Warning: PDF file not found at {file}")
    except Exception as e:
        print(f"ERROR loading PDFs: {e}. Halting.")
        return
    print(f"Step 2: Loaded {len(pdf_docs)} pages from PDF files.")

    def scrape_url(url):
        try:
            response = requests.get(url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return Document(page_content=soup.get_text(separator=" ", strip=True), metadata={"source": url})
        except requests.RequestException as e:
            print(f"Warning: Could not scrape URL {url}. Error: {e}")
            return None

    urls = [
        "https://www.instead.com/blog/the-augusta-rule-a-tax-strategy-for-business-owners",
        "https://www.instead.com/blog/s-corp-reasonable-salary-guide",
        "https://www.instead.com/blog/how-to-start-an-s-corp"
    ]
    web_docs = [doc for doc in [scrape_url(url) for url in urls] if doc is not None]
    print(f"Step 3: Scraped {len(web_docs)} web pages.")
    all_docs = pdf_docs + web_docs
    if not all_docs:
        print("ERROR: No documents were loaded. Halting.")
        return
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(all_docs)
    print(f"Step 4: Split documents into {len(chunks)} chunks.")
    try:
        embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
        print("Step 5: Gemini embedding model loaded successfully.")
        # Create vector store in-memory for better stability on free servers
        vectorstore = Chroma.from_documents(chunks, embedding)
        print("Step 6: In-memory vector store created successfully.")
        llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0.3)
        print("Step 7: Gemini LLM loaded successfully.")
    except Exception as e:
        print(f"ERROR during AI model initialization: {e}. Halting.")
        return
    # chain_type defaults to "stuff": all retrieved chunks are inserted into a single prompt
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=True
    )
    print("--- RAG pipeline initialized successfully! ---")

# --- Initialize the RAG pipeline when the app starts ---
# This is now called in the global scope to ensure it runs on Hugging Face
initialize_rag_pipeline()

# --- Main Execution Block (Only used for local testing) ---
if __name__ == "__main__":
    if not rag_chain:
        print("\nCould not start the server because the RAG pipeline failed to initialize.")
    else:
        # This app.run is for local development and will not be used by Gunicorn on Hugging Face
        app.run(host='0.0.0.0', port=5000)
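
# Example local test (a sketch; assumes the /api/query path chosen above and
# that this file is named app.py):
#   python app.py
#   curl -X POST http://localhost:5000/api/query \
#        -H "Content-Type: application/json" \
#        -d '{"query": "What is the Augusta rule?"}'
# On Hugging Face, Gunicorn serves the same app object, e.g.: gunicorn app:app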