#!/usr/bin/env python3
"""
Enhanced Adaptive Repo RAG Assistant (Hugging Face Spaces Edition)
"""
import os, subprocess, tempfile, ast
from functools import lru_cache
from typing import List
from dotenv import load_dotenv
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma  # Chroma lives in langchain_community alongside the split langchain_openai package
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
# Load environment variables from .env
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# LangChain setup
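# 1500-character chunks with 300 characters of overlap, so context carries across chunk boundaries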
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
# Build the vector index for a repo; lru_cache keeps the three most recent repos so repeat questions skip re-cloning
@lru_cache(maxsize=3)
def build_index(repo: str):
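    """Clone a GitHub repo, chunk and embed its contents, and return (vector store, answer chain)."""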
    if not api_key:
        raise RuntimeError('OPENAI_API_KEY is not set')
    os.environ['OPENAI_API_KEY'] = api_key
    llm = ChatOpenAI(model='gpt-4', temperature=0)
    embeddings = OpenAIEmbeddings()
    tmp = tempfile.mkdtemp()
    # Shallow-clone without a shell so the repo string cannot inject commands
    subprocess.run(['git', 'clone', '--depth', '1', f'https://github.com/{repo}.git', f'{tmp}/repo'], check=True)
    base = os.path.join(tmp, 'repo')
    ignore = {'.git', 'node_modules', '__pycache__', 'venv', '.venv'}
    texts, metas = [], []
    # Prioritise the README: index its chunks plus a short LLM-written summary
    for fn in ('README.md', 'README.rst', 'README.txt'):
        p = os.path.join(base, fn)
        if os.path.exists(p):
            with open(p, encoding='utf-8', errors='ignore') as fh:
                raw = fh.read()
            for c in splitter.split_text(raw):
                texts.append(c)
                metas.append({'source': fn, 'type': 'overview', 'importance': 'high'})
            summary = LLMChain(llm=llm, prompt=PromptTemplate(
                input_variables=['text'],
                template='Summarise this project README in two concise sentences.\n\n{text}'
            )).predict(text=raw)
            texts.append(summary)
            metas.append({'source': fn, 'type': 'overview_summary', 'importance': 'high'})
            break
    # Walk the tree and index source and doc files
    for root, dirs, files in os.walk(base):
        dirs[:] = [d for d in dirs if d not in ignore]  # prune ignored dirs in place
        for f in files:
            ext = os.path.splitext(f)[1]
            if ext not in ('.py', '.js', '.ts', '.md'):
                continue
            path = os.path.join(root, f)
            with open(path, encoding='utf-8', errors='ignore') as fh:
                content = fh.read().replace('<|endoftext|>', '')
            importance = 'high' if f in ('main.py', 'app.py', 'config.py', 'models.py') else 'normal'
            for c in splitter.split_text(content):
                texts.append(c)
                metas.append({'source': os.path.relpath(path, base), 'type': 'code', 'importance': importance})
            # For Python files, also index each top-level function and class as its own document
            if ext == '.py':
                try:
                    tree = ast.parse(content)
                    for node in tree.body:
                        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                            sig = f"{node.name}({', '.join(a.arg for a in node.args.args)})"
                        elif isinstance(node, ast.ClassDef):
                            sig = node.name  # ClassDef has no .args, so no parameter list
                        else:
                            continue
                        doc = ast.get_docstring(node) or ''
                        snip = '\n'.join(content.splitlines()[node.lineno - 1:node.end_lineno])
                        texts.append(f"## {sig}\n{doc}\n```python\n{snip}\n```")
                        metas.append({'source': os.path.relpath(path, base) + '::' + sig, 'type': 'entity', 'importance': importance})
                except (SyntaxError, ValueError):
                    pass  # skip files that do not parse (templates, Python 2, etc.)
    vect = Chroma.from_texts(texts, embeddings, metadatas=metas)
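    # The index above lives in memory only; passing persist_directory=... to
    # Chroma.from_texts would keep it between restarts (not needed for a Space demo).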
    prompt = PromptTemplate(
        input_variables=['summaries', 'question'],
        template="""
You are a highly skilled codebase analyst.
Using the following retrieved context, provide a detailed answer.

Instructions:
1. Start with a **clear summary** of the answer.
2. Highlight **key modules, components, or functions** that define this repo's functionality.
3. Focus on **unique or core logic**, avoiding generic explanations.
4. Include a **Sources** section at the end with specific files/functions mentioned.

Context:
{summaries}

Question:
{question}
"""
    )
    return vect, LLMChain(llm=llm, prompt=prompt)
# Retrieve context, de-duplicated by source, most important document types first
def get_relevant_docs(vdb, q: str) -> List[Document]:
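    """Similarity-search each document type and return de-duplicated hits, summaries first."""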
    out, seen = [], set()

    def add(batch):
        for d in batch:
            s = d.metadata['source']
            if s not in seen:
                seen.add(s)
                out.append(d)

    # A few hits per type: summaries first, then entities and raw code chunks
    for typ, k in [('overview_summary', 1), ('overview', 2), ('entity', 3), ('code', 4)]:
        add(vdb.similarity_search(q, k=k, filter={'type': typ}))
    return out
# Gradio logic
def qa_fn(repo, question, history):
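    """Handle one chat turn: build or fetch the index, retrieve context, and answer."""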
    try:
        vect, chain = build_index(repo)
        docs = get_relevant_docs(vect, question)
        ctx = '\n---\n'.join(d.page_content for d in docs)
        ans = chain.predict(summaries=ctx, question=question)
        history.append((question, ans))  # Chatbot expects (user_message, bot_message) pairs
    except Exception as e:
        history.append((question, f"Error: {e}"))
    # Returned twice: once to render in the Chatbot, once to persist in State
    return history, history
# UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Repo QA Assistant")
    repo_in = gr.Textbox(label="GitHub Repo", placeholder="owner/repo (e.g. fastapi/full-stack-fastapi-template)")
    q_in = gr.Textbox(label="Your Question", placeholder="e.g. How does this project work?", lines=1)
    chatbox = gr.Chatbot()
    send = gr.Button("Send")
    state = gr.State([])
    # State is both input and output so the conversation persists across turns
    send.click(fn=qa_fn, inputs=[repo_in, q_in, state], outputs=[chatbox, state])

demo.launch()
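# On Hugging Face Spaces this file is run directly; set OPENAI_API_KEY as a
# Space secret (exposed to the app as an environment variable) instead of committing a .env file.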