#!/usr/bin/env python3
"""Enhanced Adaptive Repo RAG Assistant (Hugging Face Spaces Edition)."""
import ast
import os
import subprocess
import tempfile
from functools import lru_cache
from typing import List

import gradio as gr
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load environment variables from .env
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# Shared splitter: 1500-char chunks with 300-char overlap keep a function
# body together with some of its surrounding context.
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)


# Build (or return a cached) vector index and QA chain for a GitHub repo.
@lru_cache(maxsize=3)
def build_index(repo: str):
    if not api_key:
        raise RuntimeError('OPENAI_API_KEY is not set')
    os.environ['OPENAI_API_KEY'] = api_key
    llm = ChatOpenAI(model='gpt-4', temperature=0)
    embeddings = OpenAIEmbeddings()

    # Shallow-clone the repo into a temporary directory; check=True surfaces
    # clone failures instead of silently indexing an empty directory.
    tmp = tempfile.mkdtemp()
    base = os.path.join(tmp, 'repo')
    subprocess.run(
        ['git', 'clone', '--depth', '1', f'https://github.com/{repo}.git', base],
        check=True,
    )

    ignore = {'.git', 'node_modules', '__pycache__', 'venv', '.venv'}
    texts, metas = [], []

    # Prioritise the README: index its chunks plus an LLM-written summary.
    for fn in ('README.md', 'README.rst', 'README.txt'):
        p = os.path.join(base, fn)
        if os.path.exists(p):
            with open(p, encoding='utf-8', errors='ignore') as fh:
                raw = fh.read()
            for c in splitter.split_text(raw):
                texts.append(c)
                metas.append({'source': fn, 'type': 'overview', 'importance': 'high'})
            summary = LLMChain(llm=llm, prompt=PromptTemplate(
                input_variables=['text'],
                template='Summarise this project README in two concise sentences.\n\n{text}',
            )).predict(text=raw)
            texts.append(summary)
            metas.append({'source': fn, 'type': 'overview_summary', 'importance': 'high'})
            break

    # Walk the tree and index source and documentation files.
    for root, _, files in os.walk(base):
        # Compare path components, not substrings, so e.g. a directory named
        # 'convention' is not skipped because it contains 'venv'.
        if any(ig in root.split(os.sep) for ig in ignore):
            continue
        for f in files:
            ext = os.path.splitext(f)[1]
            if ext not in ('.py', '.js', '.ts', '.md'):
                continue
            path = os.path.join(root, f)
            with open(path, encoding='utf-8', errors='ignore') as fh:
                content = fh.read().replace('<|endoftext|>', '')
            importance = 'high' if f in ('main.py', 'app.py', 'config.py', 'models.py') else 'normal'
            rel = os.path.relpath(path, base)
            for c in splitter.split_text(content):
                texts.append(c)
                metas.append({'source': rel, 'type': 'code', 'importance': importance})
            # For Python files, additionally index each top-level function,
            # coroutine, and class as a standalone 'entity' chunk pairing the
            # signature and docstring with the full source snippet.
            if ext == '.py':
                try:
                    tree = ast.parse(content)
                except SyntaxError:
                    continue
                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                        # ClassDef has no .args, so guard the signature build.
                        args = getattr(node, 'args', None)
                        params = ', '.join(a.arg for a in args.args) if args else ''
                        sig = f'{node.name}({params})'
                        doc = ast.get_docstring(node) or ''
                        snip = '\n'.join(content.splitlines()[node.lineno - 1:node.end_lineno])
                        texts.append(f"## {sig}\n{doc}\n```python\n{snip}\n```")
                        metas.append({'source': f'{rel}::{sig}', 'type': 'entity', 'importance': importance})

    vect = Chroma.from_texts(texts, embeddings, metadatas=metas)
    prompt = PromptTemplate(
        input_variables=['summaries', 'question'],
        template="""
You are a highly skilled codebase analyst. Using the following retrieved context, provide a detailed answer.

Instructions:
1. Start with a **clear summary** of the answer.
2. Highlight **key modules, components, or functions** that define this repo's functionality.
3. Focus on **unique or core logic**, avoiding generic explanations.
4. Include a **Sources** section at the end listing the specific files/functions mentioned.

Context:
{summaries}

Question: {question}
""",
    )
    return vect, LLMChain(llm=llm, prompt=prompt)


# Retrieve context in priority order (README summary, README chunks, code
# entities, then raw code chunks), deduplicated by source.
def get_relevant_docs(vdb, q: str) -> List[Document]:
    out, seen = [], set()

    def add(batch):
        for d in batch:
            s = d.metadata['source']
            if s not in seen:
                seen.add(s)
                out.append(d)

    for typ, k in [('overview_summary', 1), ('overview', 2), ('entity', 3), ('code', 4)]:
        add(vdb.similarity_search(q, k=k, filter={'type': typ}))
    return out


# Gradio callback: answer the question and append a (user, assistant) pair.
def qa_fn(repo, question, history):
    try:
        vect, chain = build_index(repo)
        docs = get_relevant_docs(vect, question)
        ctx = '\n---\n'.join(d.page_content for d in docs)
        ans = chain.predict(summaries=ctx, question=question)
        history.append((question, ans))
    except Exception as e:
        history.append((question, f'Error: {e}'))
    return history, history


# UI
with gr.Blocks() as demo:
    gr.Markdown('# 🧠 Repo QA Assistant')
    repo_in = gr.Textbox(label='GitHub Repo',
                         placeholder='owner/repo (e.g. fastapi/full-stack-fastapi-template)')
    q_in = gr.Textbox(label='Your Question',
                      placeholder='e.g. How does this project work?', lines=1)
    chatbox = gr.Chatbot()
    send = gr.Button('Send')
    state = gr.State([])
    # Write the updated history back to both the Chatbot and the State so the
    # conversation persists across turns.
    send.click(fn=qa_fn, inputs=[repo_in, q_in, state], outputs=[chatbox, state])

demo.launch()
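
# A minimal sketch of driving the pipeline without the Gradio UI, assuming
# OPENAI_API_KEY is set and the target repo is public. The repo name below is
# just the placeholder example from the UI, not a requirement:
#
#     vect, chain = build_index('fastapi/full-stack-fastapi-template')
#     docs = get_relevant_docs(vect, 'How does this project work?')
#     ctx = '\n---\n'.join(d.page_content for d in docs)
#     print(chain.predict(summaries=ctx, question='How does this project work?'))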