#!/usr/bin/env python3
"""
Enhanced Adaptive Repo RAG Assistant (Hugging Face Spaces Edition)
"""
import os, tempfile, ast
from functools import lru_cache
from typing import List
from dotenv import load_dotenv
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
# Load environment variables from .env
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
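# The OpenAI key is read from the environment. Locally it would typically come from a
# .env file (assumed example layout, not part of this repo):
#
#   OPENAI_API_KEY=sk-...
#
# On Hugging Face Spaces the same variable would be set as a Space secret.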
# LangChain setup
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
# Build or retrieve vector index from repo
@lru_cache(maxsize=3)
def build_index(repo: str):
    """Clone the given GitHub repo and build a Chroma index plus an answer chain."""
    if api_key:
        os.environ['OPENAI_API_KEY'] = api_key
    llm = ChatOpenAI(model='gpt-4', temperature=0)
    embeddings = OpenAIEmbeddings()
    tmp = tempfile.mkdtemp()
    os.system(f'git clone https://github.com/{repo}.git {tmp}/repo')
    base = os.path.join(tmp, 'repo')
    ignore = {'.git', 'node_modules', '__pycache__', 'venv', '.venv'}
    texts, metas = [], []
    # Prioritise the README: index its chunks plus an LLM-generated two-sentence summary
    for fn in ('README.md', 'README.rst', 'README.txt'):
        p = os.path.join(base, fn)
        if os.path.exists(p):
            with open(p, encoding='utf-8') as fh:
                raw = fh.read()
            for c in splitter.split_text(raw):
                texts.append(c)
                metas.append({'source': fn, 'type': 'overview', 'importance': 'high'})
            summary = LLMChain(llm=llm, prompt=PromptTemplate(
                input_variables=['text'],
                template='Summarise this project README in two concise sentences.\n\n{text}'
            )).predict(text=raw)
            texts.append(summary)
            metas.append({'source': fn, 'type': 'overview_summary', 'importance': 'high'})
            break
    # Walk the repo and index code/docs files, skipping vendored and VCS directories
    for root, _, files in os.walk(base):
        if any(ig in root for ig in ignore):
            continue
        for f in files:
            ext = os.path.splitext(f)[1]
            if ext in ('.py', '.js', '.ts', '.md'):
                path = os.path.join(root, f)
                with open(path, encoding='utf-8', errors='ignore') as fh:
                    content = fh.read().replace('<|endoftext|>', '')
                importance = 'high' if f in ('main.py', 'app.py', 'config.py', 'models.py') else 'normal'
                for c in splitter.split_text(content):
                    texts.append(c)
                    metas.append({'source': os.path.relpath(path, base), 'type': 'code', 'importance': importance})
                if ext == '.py':
                    # Additionally index each top-level function/class as a self-contained entity
                    try:
                        tree = ast.parse(content)
                        for node in tree.body:
                            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                                sig = f"{node.name}({', '.join(a.arg for a in node.args.args)})"
                            elif isinstance(node, ast.ClassDef):
                                sig = f"class {node.name}"
                            else:
                                continue
                            doc = ast.get_docstring(node) or ''
                            snip = '\n'.join(content.splitlines()[node.lineno - 1:node.end_lineno])
                            texts.append(f"## {sig}\n{doc}\n```python\n{snip}\n```")
                            metas.append({'source': os.path.relpath(path, base) + '::' + sig,
                                          'type': 'entity', 'importance': importance})
                    except (SyntaxError, ValueError):
                        # Skip files that fail to parse
                        pass
    vect = Chroma.from_texts(texts, embeddings, metadatas=metas)
    prompt = PromptTemplate(
        input_variables=['summaries', 'question'],
        template="""
You are a highly skilled codebase analyst.
Using the following retrieved context, provide a detailed answer.
Instructions:
1. Start with a **clear summary** of the answer.
2. Highlight **key modules, components, or functions** that define this repo's functionality.
3. Focus on **unique or core logic**, avoiding generic explanations.
4. Include a **Sources** section at the end with specific files/functions mentioned.
Context:
{summaries}
Question:
{question}
"""
    )
    return vect, LLMChain(llm=llm, prompt=prompt)
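# Sketch of a direct (non-UI) call, assuming network access, git on PATH, and a valid
# OPENAI_API_KEY; the repo slug is only the placeholder suggested in the UI below:
#
#   vect, chain = build_index("fastapi/full-stack-fastapi-template")
#   hits = vect.similarity_search("project overview", k=2, filter={'type': 'overview'})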
# Search and prioritize
def get_relevant_docs(vdb, q: str) -> List[Document]:
    """Retrieve documents for a question, prioritised by chunk type and deduplicated by source."""
    out, seen = [], set()

    def add(batch):
        for d in batch:
            s = d.metadata['source']
            if s not in seen:
                seen.add(s)
                out.append(d)

    for typ, k in [('overview_summary', 1), ('overview', 2), ('entity', 3), ('code', 4)]:
        add(vdb.similarity_search(q, k=k, filter={'type': typ}))
    return out
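# Example of the resulting priority for a hypothetical question like "How does auth work?":
# one README-summary chunk is fetched first, then up to two README chunks, three
# function/class entity chunks, and four raw code chunks, deduplicated by source.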
# Gradio callback: answer a question about the repo and append the turn to the chat history
def qa_fn(repo, question, history):
    try:
        vect, chain = build_index(repo)
        docs = get_relevant_docs(vect, question)
        ctx = '\n---\n'.join(d.page_content for d in docs)
        ans = chain.predict(summaries=ctx, question=question)
        history.append((question, ans))
    except Exception as e:
        history.append((question, f"Error: {e}"))
    return history, history
# UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Repo QA Assistant")
    repo_in = gr.Textbox(label="GitHub Repo", placeholder="owner/repo (e.g. fastapi/full-stack-fastapi-template)")
    q_in = gr.Textbox(label="Your Question", placeholder="e.g. How does this project work?", lines=1)
    chatbox = gr.Chatbot()
    send = gr.Button("Send")
    state = gr.State([])
    send.click(fn=qa_fn, inputs=[repo_in, q_in, state], outputs=[chatbox, state])
demo.launch()
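# Assumed runtime dependencies for this Space (versions not pinned here):
#   gradio, python-dotenv, langchain, langchain-openai, langchain-community, chromadb,
#   plus git available in the container for the clone step.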