#!/usr/bin/env python3
"""
Enhanced Adaptive Repo RAG Assistant (Hugging Face Spaces Edition)
"""
import os, tempfile, ast
from functools import lru_cache
from typing import List
from dotenv import load_dotenv
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
# Load environment variables from .env
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
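# The OpenAI key is read from the environment. Locally it would typically come from a
# .env file (assumed example layout, not part of this repo):
#
#   OPENAI_API_KEY=sk-...
#
# On Hugging Face Spaces the same variable would be set as a Space secret.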
# LangChain setup
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
# Build or retrieve vector index from repo
@lru_cache(maxsize=3)
def build_index(repo: str):
    """Clone the given GitHub repo and build a Chroma index plus an answer chain."""
    if api_key:
        os.environ['OPENAI_API_KEY'] = api_key
    llm = ChatOpenAI(model='gpt-4', temperature=0)
    embeddings = OpenAIEmbeddings()
    tmp = tempfile.mkdtemp()
    os.system(f'git clone https://github.com/{repo}.git {tmp}/repo')
    base = os.path.join(tmp, 'repo')
    ignore = {'.git', 'node_modules', '__pycache__', 'venv', '.venv'}
    texts, metas = [], []
    # Prioritise the README: index its chunks plus an LLM-generated two-sentence summary
    for fn in ('README.md', 'README.rst', 'README.txt'):
        p = os.path.join(base, fn)
        if os.path.exists(p):
            with open(p, encoding='utf-8') as fh:
                raw = fh.read()
            for c in splitter.split_text(raw):
                texts.append(c)
                metas.append({'source': fn, 'type': 'overview', 'importance': 'high'})
            summary = LLMChain(llm=llm, prompt=PromptTemplate(
                input_variables=['text'],
                template='Summarise this project README in two concise sentences.\n\n{text}'
            )).predict(text=raw)
            texts.append(summary)
            metas.append({'source': fn, 'type': 'overview_summary', 'importance': 'high'})
            break
    # Walk the repo and index code/docs files, skipping vendored and VCS directories
    for root, _, files in os.walk(base):
        if any(ig in root for ig in ignore):
            continue
        for f in files:
            ext = os.path.splitext(f)[1]
            if ext in ('.py', '.js', '.ts', '.md'):
                path = os.path.join(root, f)
                with open(path, encoding='utf-8', errors='ignore') as fh:
                    content = fh.read().replace('<|endoftext|>', '')
                importance = 'high' if f in ('main.py', 'app.py', 'config.py', 'models.py') else 'normal'
                for c in splitter.split_text(content):
                    texts.append(c)
                    metas.append({'source': os.path.relpath(path, base), 'type': 'code', 'importance': importance})
                if ext == '.py':
                    # Additionally index each top-level function/class as a self-contained entity
                    try:
                        tree = ast.parse(content)
                        for node in tree.body:
                            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                                sig = f"{node.name}({', '.join(a.arg for a in node.args.args)})"
                            elif isinstance(node, ast.ClassDef):
                                sig = f"class {node.name}"
                            else:
                                continue
                            doc = ast.get_docstring(node) or ''
                            snip = '\n'.join(content.splitlines()[node.lineno - 1:node.end_lineno])
                            texts.append(f"## {sig}\n{doc}\n```python\n{snip}\n```")
                            metas.append({'source': os.path.relpath(path, base) + '::' + sig,
                                          'type': 'entity', 'importance': importance})
                    except (SyntaxError, ValueError):
                        # Skip files that fail to parse
                        pass
    vect = Chroma.from_texts(texts, embeddings, metadatas=metas)
    prompt = PromptTemplate(
        input_variables=['summaries', 'question'],
        template="""
You are a highly skilled codebase analyst.
Using the following retrieved context, provide a detailed answer.
Instructions:
1. Start with a **clear summary** of the answer.
2. Highlight **key modules, components, or functions** that define this repo's functionality.
3. Focus on **unique or core logic**, avoiding generic explanations.
4. Include a **Sources** section at the end with specific files/functions mentioned.
Context:
{summaries}
Question:
{question}
"""
    )
    return vect, LLMChain(llm=llm, prompt=prompt)
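# Sketch of a direct (non-UI) call, assuming network access, git on PATH, and a valid
# OPENAI_API_KEY; the repo slug is only the placeholder suggested in the UI below:
#
#   vect, chain = build_index("fastapi/full-stack-fastapi-template")
#   hits = vect.similarity_search("project overview", k=2, filter={'type': 'overview'})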
# Search and prioritize
def get_relevant_docs(vdb, q: str) -> List[Document]:
    """Retrieve documents for a question, prioritised by chunk type and deduplicated by source."""
    out, seen = [], set()

    def add(batch):
        for d in batch:
            s = d.metadata['source']
            if s not in seen:
                seen.add(s)
                out.append(d)

    for typ, k in [('overview_summary', 1), ('overview', 2), ('entity', 3), ('code', 4)]:
        add(vdb.similarity_search(q, k=k, filter={'type': typ}))
    return out
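# Example of the resulting priority for a hypothetical question like "How does auth work?":
# one README-summary chunk is fetched first, then up to two README chunks, three
# function/class entity chunks, and four raw code chunks, deduplicated by source.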
# Gradio callback: answer a question about the repo and append the turn to the chat history
def qa_fn(repo, question, history):
    try:
        vect, chain = build_index(repo)
        docs = get_relevant_docs(vect, question)
        ctx = '\n---\n'.join(d.page_content for d in docs)
        ans = chain.predict(summaries=ctx, question=question)
        history.append((question, ans))
    except Exception as e:
        history.append((question, f"Error: {e}"))
    return history, history
# UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Repo QA Assistant")
    repo_in = gr.Textbox(label="GitHub Repo", placeholder="owner/repo (e.g. fastapi/full-stack-fastapi-template)")
    q_in = gr.Textbox(label="Your Question", placeholder="e.g. How does this project work?", lines=1)
    chatbox = gr.Chatbot()
    send = gr.Button("Send")
    state = gr.State([])
    send.click(fn=qa_fn, inputs=[repo_in, q_in, state], outputs=[chatbox, state])
demo.launch()
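# Assumed runtime dependencies for this Space (versions not pinned here):
#   gradio, python-dotenv, langchain, langchain-openai, langchain-community, chromadb,
#   plus git available in the container for the clone step.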