#!/usr/bin/env python3
"""
Enhanced Adaptive Repo RAG Assistant (Hugging Face Spaces Edition)
"""
import os, tempfile, ast
from functools import lru_cache
from typing import List

from dotenv import load_dotenv
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
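
# Assumed (unpinned) dependencies for the imports above: gradio, python-dotenv, langchain,
# langchain-openai, langchain-community, and chromadb; an OpenAI API key is expected in the environment.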

# Load environment variables from .env
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# LangChain setup
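# 1500-character chunks with 300 characters of overlap are a judgment call: the overlap helps keep
# function bodies and README sections coherent across chunk boundaries, at the cost of some duplication.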
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)

# Build or retrieve vector index from repo
@lru_cache(maxsize=4)  # reuse the cached index for repeat questions about the same repo
def build_index(repo: str):
    if api_key:  # only set the env var when a key was actually loaded from .env
        os.environ['OPENAI_API_KEY'] = api_key
    llm = ChatOpenAI(model='gpt-4', temperature=0)
    embeddings = OpenAIEmbeddings()
    tmp = tempfile.mkdtemp()
    os.system(f'git clone https://github.com/{repo}.git {tmp}/repo')
    base = os.path.join(tmp, 'repo')
    ignore = {'.git', 'node_modules', '__pycache__', 'venv', '.venv'}
    texts, metas = [], []

    # Prioritise README file
    for fn in ('README.md', 'README.rst', 'README.txt'):
        p = os.path.join(base, fn)
        if os.path.exists(p):
            raw = open(p, encoding='utf-8').read()
            for c in splitter.split_text(raw):
                texts.append(c)
                metas.append({'source': fn, 'type': 'overview', 'importance': 'high'})
            summary = LLMChain(llm=llm, prompt=PromptTemplate(
                input_variables=['text'],
                template='Summarise this project README in two concise sentences.\n\n{text}'
            )).predict(text=raw)
            texts.append(summary)
            metas.append({'source': fn, 'type': 'overview_summary', 'importance': 'high'})
            break

    # Walk and process relevant files
    for root, _, files in os.walk(base):
        if any(ig in root for ig in ignore):
            continue
        for f in files:
            ext = os.path.splitext(f)[1]
            if ext in ('.py', '.js', '.ts', '.md'):
                path = os.path.join(root, f)
                content = open(path, encoding='utf-8').read().replace('<|endoftext|>', '')
                importance = 'high' if f in ('main.py', 'app.py', 'config.py', 'models.py') else 'normal'
                for c in splitter.split_text(content):
                    texts.append(c)
                    metas.append({'source': os.path.relpath(path, base), 'type': 'code', 'importance': importance})
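                # For Python files, additionally index each top-level function and class as its own
                # chunk, so a question about a specific entity retrieves its full signature, docstring,
                # and source rather than an arbitrary text window.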
                if ext == '.py':
                    try:
                        tree = ast.parse(content)
                        for node in tree.body:
                            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                                sig = f"{node.name}({', '.join(a.arg for a in node.args.args)})"
                            elif isinstance(node, ast.ClassDef):
                                sig = f"class {node.name}"  # ast.ClassDef has no .args, so use a plain class header
                            else:
                                continue
                            doc = ast.get_docstring(node) or ''
                            snip = '\n'.join(content.splitlines()[node.lineno - 1:node.end_lineno])
                            texts.append(f"## {sig}\n{doc}\n```python\n{snip}\n```")
                            metas.append({'source': os.path.relpath(path, base) + '::' + sig, 'type': 'entity', 'importance': importance})
                    except Exception:
                        pass  # skip files that fail to parse
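
    # Build the Chroma index in memory; with no persist_directory it only lives for this process.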
    vect = Chroma.from_texts(texts, embeddings, metadatas=metas)
    prompt = PromptTemplate(
        input_variables=['summaries', 'question'],
        template="""
You are a highly skilled codebase analyst.
Using the following retrieved context, provide a detailed answer.

Instructions:
1. Start with a **clear summary** of the answer.
2. Highlight **key modules, components, or functions** that define this repo's functionality.
3. Focus on **unique or core logic**, avoiding generic explanations.
4. Include a **Sources** section at the end with specific files/functions mentioned.

Context:
{summaries}

Question:
{question}
"""
    )
    return vect, LLMChain(llm=llm, prompt=prompt)

# Search and prioritize
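# Pull a few hits from each metadata type in priority order (overview_summary, overview, entity, code),
# deduplicating by source so the assembled context stays compact and README-level answers come first.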
def get_relevant_docs(vdb, q: str) -> List[Document]:
    out, seen = [], set()

    def add(batch):
        for d in batch:
            s = d.metadata['source']
            if s not in seen:
                seen.add(s)
                out.append(d)

    for typ, k in [('overview_summary', 1), ('overview', 2), ('entity', 3), ('code', 4)]:
        add(vdb.similarity_search(q, k=k, filter={'type': typ}))
    return out

# Gradio logic
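# Each question runs build_index for the given repo; thanks to the lru_cache above, repeat questions
# about the same repo reuse the cloned checkout and embeddings instead of rebuilding them.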
def qa_fn(repo, question, history):
    try:
        vect, chain = build_index(repo)
        docs = get_relevant_docs(vect, question)
        ctx = '\n---\n'.join(d.page_content for d in docs)
        ans = chain.predict(summaries=ctx, question=question)
        history.append((question, ans))  # gr.Chatbot expects (user_message, assistant_message) pairs
    except Exception as e:
        history.append((question, f"Error: {e}"))
    return history

# UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Repo QA Assistant")
    repo_in = gr.Textbox(label="GitHub Repo", placeholder="owner/repo (e.g. fastapi/full-stack-fastapi-template)")
    q_in = gr.Textbox(label="Your Question", placeholder="question (e.g. How does this project work?)", lines=1)
    chatbox = gr.Chatbot()
    send = gr.Button("Send")
    state = gr.State([])
    send.click(fn=qa_fn, inputs=[repo_in, q_in, state], outputs=[chatbox])

demo.launch()