#!/usr/bin/env python3
"""
Enhanced Adaptive Repo RAG Assistant (Hugging Face Spaces Edition)
"""
import os, subprocess, tempfile, ast
from functools import lru_cache
from typing import List
from dotenv import load_dotenv
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma  # Chroma lives in langchain_community alongside the split langchain_openai package
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
# Load environment variables from .env
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# LangChain setup
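# 1500-character chunks with 300 characters of overlap, so context carries across chunk boundaries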
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
# Build the vector index for a repo; lru_cache keeps the three most recent repos so repeat questions skip re-cloning
@lru_cache(maxsize=3)
def build_index(repo: str):
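    """Clone a GitHub repo, chunk and embed its contents, and return (vector store, answer chain)."""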
    if not api_key:
        raise RuntimeError('OPENAI_API_KEY is not set')
    os.environ['OPENAI_API_KEY'] = api_key
    llm = ChatOpenAI(model='gpt-4', temperature=0)
    embeddings = OpenAIEmbeddings()
    tmp = tempfile.mkdtemp()
    # Shallow-clone without a shell so the repo string cannot inject commands
    subprocess.run(['git', 'clone', '--depth', '1', f'https://github.com/{repo}.git', f'{tmp}/repo'], check=True)
    base = os.path.join(tmp, 'repo')
    ignore = {'.git', 'node_modules', '__pycache__', 'venv', '.venv'}
    texts, metas = [], []
    # Prioritise the README: index its chunks plus a short LLM-written summary
    for fn in ('README.md', 'README.rst', 'README.txt'):
        p = os.path.join(base, fn)
        if os.path.exists(p):
            with open(p, encoding='utf-8', errors='ignore') as fh:
                raw = fh.read()
            for c in splitter.split_text(raw):
                texts.append(c)
                metas.append({'source': fn, 'type': 'overview', 'importance': 'high'})
            summary = LLMChain(llm=llm, prompt=PromptTemplate(
                input_variables=['text'],
                template='Summarise this project README in two concise sentences.\n\n{text}'
            )).predict(text=raw)
            texts.append(summary)
            metas.append({'source': fn, 'type': 'overview_summary', 'importance': 'high'})
            break
    # Walk the tree and index source and doc files
    for root, dirs, files in os.walk(base):
        dirs[:] = [d for d in dirs if d not in ignore]  # prune ignored dirs in place
        for f in files:
            ext = os.path.splitext(f)[1]
            if ext not in ('.py', '.js', '.ts', '.md'):
                continue
            path = os.path.join(root, f)
            with open(path, encoding='utf-8', errors='ignore') as fh:
                content = fh.read().replace('<|endoftext|>', '')
            importance = 'high' if f in ('main.py', 'app.py', 'config.py', 'models.py') else 'normal'
            for c in splitter.split_text(content):
                texts.append(c)
                metas.append({'source': os.path.relpath(path, base), 'type': 'code', 'importance': importance})
            # For Python files, also index each top-level function and class as its own document
            if ext == '.py':
                try:
                    tree = ast.parse(content)
                    for node in tree.body:
                        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                            sig = f"{node.name}({', '.join(a.arg for a in node.args.args)})"
                        elif isinstance(node, ast.ClassDef):
                            sig = node.name  # ClassDef has no .args, so no parameter list
                        else:
                            continue
                        doc = ast.get_docstring(node) or ''
                        snip = '\n'.join(content.splitlines()[node.lineno - 1:node.end_lineno])
                        texts.append(f"## {sig}\n{doc}\n```python\n{snip}\n```")
                        metas.append({'source': os.path.relpath(path, base) + '::' + sig, 'type': 'entity', 'importance': importance})
                except (SyntaxError, ValueError):
                    pass  # skip files that do not parse (templates, Python 2, etc.)
    vect = Chroma.from_texts(texts, embeddings, metadatas=metas)
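    # The index above lives in memory only; passing persist_directory=... to
    # Chroma.from_texts would keep it between restarts (not needed for a Space demo).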
    prompt = PromptTemplate(
        input_variables=['summaries', 'question'],
        template="""
You are a highly skilled codebase analyst.
Using the following retrieved context, provide a detailed answer.

Instructions:
1. Start with a **clear summary** of the answer.
2. Highlight **key modules, components, or functions** that define this repo's functionality.
3. Focus on **unique or core logic**, avoiding generic explanations.
4. Include a **Sources** section at the end with specific files/functions mentioned.

Context:
{summaries}

Question:
{question}
"""
    )
    return vect, LLMChain(llm=llm, prompt=prompt)
# Retrieve context, de-duplicated by source, most important document types first
def get_relevant_docs(vdb, q: str) -> List[Document]:
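    """Similarity-search each document type and return de-duplicated hits, summaries first."""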
    out, seen = [], set()

    def add(batch):
        for d in batch:
            s = d.metadata['source']
            if s not in seen:
                seen.add(s)
                out.append(d)

    # A few hits per type: summaries first, then entities and raw code chunks
    for typ, k in [('overview_summary', 1), ('overview', 2), ('entity', 3), ('code', 4)]:
        add(vdb.similarity_search(q, k=k, filter={'type': typ}))
    return out
# Gradio logic
def qa_fn(repo, question, history):
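    """Handle one chat turn: build or fetch the index, retrieve context, and answer."""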
    try:
        vect, chain = build_index(repo)
        docs = get_relevant_docs(vect, question)
        ctx = '\n---\n'.join(d.page_content for d in docs)
        ans = chain.predict(summaries=ctx, question=question)
        history.append((question, ans))  # Chatbot expects (user_message, bot_message) pairs
    except Exception as e:
        history.append((question, f"Error: {e}"))
    # Returned twice: once to render in the Chatbot, once to persist in State
    return history, history
# UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Repo QA Assistant")
    repo_in = gr.Textbox(label="GitHub Repo", placeholder="owner/repo (e.g. fastapi/full-stack-fastapi-template)")
    q_in = gr.Textbox(label="Your Question", placeholder="e.g. How does this project work?", lines=1)
    chatbox = gr.Chatbot()
    send = gr.Button("Send")
    state = gr.State([])
    # State is both input and output so the conversation persists across turns
    send.click(fn=qa_fn, inputs=[repo_in, q_in, state], outputs=[chatbox, state])

demo.launch()
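# On Hugging Face Spaces this file is run directly; set OPENAI_API_KEY as a
# Space secret (exposed to the app as an environment variable) instead of committing a .env file.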