#!/usr/bin/env python3
"""
Enhanced Adaptive Repo RAG Assistant (Hugging Face Spaces Edition)
"""

import os
import subprocess
import tempfile
import ast
from functools import lru_cache
from typing import List

from dotenv import load_dotenv
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# langchain.vectorstores was removed in langchain >= 0.2; Chroma now lives in
# the community package.
from langchain_community.vectorstores import Chroma
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema import Document

# Load environment variables from .env (or Space secrets)
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; add it to .env or your Space secrets.")

# Shared text splitter: 1500-character chunks with 300-character overlap keep
# most functions intact while preserving context across chunk boundaries.
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)

# Build (or fetch from the in-process cache) the vector index for a repo.
# lru_cache keeps the three most recently indexed repos in memory, so
# follow-up questions skip the clone/embed step.
@lru_cache(maxsize=3)
def build_index(repo: str):
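    """Clone `repo` (an 'owner/name' slug), embed its files, and return a
    (Chroma vector store, answering LLMChain) pair."""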
    os.environ['OPENAI_API_KEY'] = api_key
    llm = ChatOpenAI(model='gpt-4', temperature=0)
    embeddings = OpenAIEmbeddings()

    tmp = tempfile.mkdtemp()
    base = os.path.join(tmp, 'repo')
    # A shallow clone keeps downloads small; a subprocess argument list avoids
    # shell injection via the user-supplied repo name, and check=True surfaces
    # clone failures instead of silently indexing an empty directory.
    subprocess.run(
        ['git', 'clone', '--depth', '1', f'https://github.com/{repo}.git', base],
        check=True,
    )
    ignore = {'.git','node_modules','__pycache__','venv','.venv'}
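    # Parallel lists: texts[i] is a chunk, metas[i] is the metadata Chroma
    # will use for type-based filtering at query time.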
    texts, metas = [], []

    # Prioritise the README: index its chunks plus an LLM-generated summary
    for fn in ('README.md', 'README.rst', 'README.txt'):
        p = os.path.join(base, fn)
        if os.path.exists(p):
            with open(p, encoding='utf-8', errors='ignore') as fh:
                raw = fh.read()
            for c in splitter.split_text(raw):
                texts.append(c)
                metas.append({'source': fn, 'type': 'overview', 'importance': 'high'})
            summary = LLMChain(llm=llm, prompt=PromptTemplate(
                input_variables=['text'],
                template='Summarise this project README in two concise sentences.\n\n{text}'
            )).predict(text=raw)
            texts.append(summary)
            metas.append({'source': fn, 'type': 'overview_summary', 'importance': 'high'})
            break

    # Walk the tree and index the remaining source and docs files
    for root, _, files in os.walk(base):
        if any(ig in root.split(os.sep) for ig in ignore):
            continue
        for f in files:
            ext = os.path.splitext(f)[1]
            if ext not in ('.py', '.js', '.ts', '.md'):
                continue
            path = os.path.join(root, f)
            with open(path, encoding='utf-8', errors='ignore') as fh:
                content = fh.read().replace('<|endoftext|>', '')
            importance = 'high' if f in ('main.py', 'app.py', 'config.py', 'models.py') else 'normal'
            for c in splitter.split_text(content):
                texts.append(c)
                metas.append({'source': os.path.relpath(path, base), 'type': 'code', 'importance': importance})
            if ext == '.py':
                # Additionally index each top-level function/class as its own
                # "entity" chunk so questions can match a specific definition.
                try:
                    tree = ast.parse(content)
                except SyntaxError:
                    continue
                for node in tree.body:
                    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        sig = f"{node.name}({', '.join(a.arg for a in node.args.args)})"
                    elif isinstance(node, ast.ClassDef):
                        sig = f"class {node.name}"  # ClassDef nodes have no .args
                    else:
                        continue
                    doc = ast.get_docstring(node) or ''
                    snip = '\n'.join(content.splitlines()[node.lineno - 1:node.end_lineno])
                    texts.append(f"## {sig}\n{doc}\n```python\n{snip}\n```")
                    metas.append({'source': os.path.relpath(path, base) + '::' + sig, 'type': 'entity', 'importance': importance})

    if not texts:
        raise ValueError(f'No indexable files found in {repo}')
    vect = Chroma.from_texts(texts, embeddings, metadatas=metas)
    prompt = PromptTemplate(
        input_variables=['summaries','question'],
        template="""
You are a highly skilled codebase analyst.

Using the following retrieved context, provide a detailed answer.

Instructions:
1. Start with a **clear summary** of the answer.
2. Highlight **key modules, components, or functions** that define this repo's functionality.
3. Focus on **unique or core logic**, avoiding generic explanations.
4. Include a **Sources** section at the end with specific files/functions mentioned.

Context:
{summaries}

Question:
{question}
"""
    )
    return vect, LLMChain(llm=llm, prompt=prompt)

# Retrieve and prioritise context chunks
def get_relevant_docs(vdb, q: str) -> List[Document]:
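    """Query each chunk type separately so high-level summaries are retrieved
    ahead of entity and raw-code chunks, deduplicating by source."""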
    out, seen = [], set()

    def add(batch):
        for d in batch:
            s = d.metadata['source']
            if s not in seen:
                seen.add(s)
                out.append(d)

    for typ, k in [('overview_summary', 1), ('overview', 2), ('entity', 3), ('code', 4)]:
        add(vdb.similarity_search(q, k=k, filter={'type': typ}))
    return out
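
# A minimal sketch of driving the pipeline directly, without the UI (the repo
# name and question below are illustrative):
#   vect, chain = build_index('pallets/flask')
#   docs = get_relevant_docs(vect, 'How are routes registered?')
#   ctx = '\n---\n'.join(d.page_content for d in docs)
#   print(chain.predict(summaries=ctx, question='How are routes registered?'))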

# Gradio callback: answer one question and append it to the chat history
def qa_fn(repo, question, history):
    try:
        vect, chain = build_index(repo)
        docs = get_relevant_docs(vect, question)
        ctx = '\n---\n'.join(d.page_content for d in docs)
        ans = chain.predict(summaries=ctx, question=question)
        # Chatbot's tuple format expects (user_message, assistant_message) pairs
        history.append((question, ans))
    except Exception as e:
        # Surface indexing/LLM failures in the chat instead of crashing the UI
        history.append((question, f'Error: {e}'))
    return history, history
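
# history uses the Chatbot tuple format, e.g. (values are illustrative):
#   [('How does this project work?', 'This repo implements ...')]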

# UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Repo QA Assistant")
    repo_in = gr.Textbox(label="GitHub Repo", placeholder="owner/repo (e.g. fastapi/full-stack-fastapi-template)")
    q_in = gr.Textbox(label="Your Question", placeholder="e.g. How does this project work?", lines=1)
    chatbox = gr.Chatbot()
    send = gr.Button("Send")

    # Per-session chat history; returned from qa_fn so it persists across turns
    state = gr.State([])

    send.click(fn=qa_fn, inputs=[repo_in, q_in, state], outputs=[chatbox, state])

demo.launch()