import json
from typing import Any, Dict, List, Optional

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Chunk types that are kept for indexing; every other chunk_type is dropped.
_TARGET_CHUNK_TYPES = frozenset(
    {'function', 'class', 'method', 'async_function', 'async_method'}
)


def _load_chunks(json_file_path: str) -> Optional[List[Any]]:
    """Read the chunk list from disk; print the problem and return None on failure."""
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            all_chunks = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{json_file_path}' was not found.")
        return None
    except json.JSONDecodeError as exc:
        print(f"Error: The file '{json_file_path}' is not valid JSON ({exc}).")
        return None

    if not isinstance(all_chunks, list):
        print(f"Error: Expected a JSON list of chunks in '{json_file_path}'.")
        return None
    return all_chunks


def _is_indexable(chunk: Any) -> bool:
    """Return True for dict chunks of a target type that carry string content."""
    return (
        isinstance(chunk, dict)
        and chunk.get('chunk_type') in _TARGET_CHUNK_TYPES
        # A chunk without textual content cannot be embedded; dropping it here
        # (rather than later) keeps FAISS row i aligned with metadata entry i.
        and isinstance(chunk.get('content'), str)
    )


def _build_embedding_text(chunk: Dict[str, Any]) -> str:
    """Prefix the chunk's code with a short type/name/docstring header."""
    docstring = chunk.get('docstring', '') or "No docstring."
    name = chunk.get('name', '')
    chunk_type = chunk.get('chunk_type', '')
    header = f"Type: {chunk_type}, Name: {name}\nDocstring: {docstring}\n---\n"
    return header + chunk['content']


def create_code_vector_db(
    json_file_path: str,
    model_name: str,
    output_index_path: str,
    output_metadata_path: str,
    batch_size: int = 2,
) -> None:
    """
    Loads code chunks, filters them, generates embeddings, and saves a FAISS
    index along with corresponding metadata.

    Entry ``i`` of the saved metadata list describes vector ``i`` of the saved
    index. If the input file is missing, is not valid JSON, is not a JSON
    list, or contains no usable chunks, a message is printed and the function
    returns without writing anything.

    Args:
        json_file_path (str): Path to the code_chunks.json file.
        model_name (str): The name of the SentenceTransformer model to use.
        output_index_path (str): Path to save the FAISS index file.
        output_metadata_path (str): Path to save the chunk metadata JSON file.
        batch_size (int): Number of texts encoded per batch. Defaults to 2;
            raise it if memory allows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # 1. Load and Filter Chunks
    print(f"Loading chunks from '{json_file_path}'...")
    all_chunks = _load_chunks(json_file_path)
    if all_chunks is None:
        return

    filtered_chunks = [chunk for chunk in all_chunks if _is_indexable(chunk)]
    if not filtered_chunks:
        print("No chunks of target types found. Exiting.")
        return

    print(f"Filtered chunks: Kept {len(filtered_chunks)} out of {len(all_chunks)} total chunks.")

    # 2. Prepare Text for Embedding
    # Combine code with metadata for richer semantic representation.
    texts_to_embed = [_build_embedding_text(chunk) for chunk in filtered_chunks]

    # 3. Generate Embeddings
    print(f"Loading SentenceTransformer model: '{model_name}'...")
    model = SentenceTransformer(model_name)
    # Half precision is only applied off-CPU: fp16 inference on CPU is
    # unsupported or very slow on many PyTorch builds.
    if model.device.type != "cpu":
        model = model.half()

    print("Generating embeddings for filtered chunks... (This may take a while)")
    embeddings = model.encode(
        texts_to_embed,
        batch_size=batch_size,
        show_progress_bar=True
    )

    # FAISS expects a contiguous float32 matrix (the model may emit float16).
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = embeddings.shape[1]
    print(f"Embeddings generated with dimension: {dimension}")

    # 4. Build and Save FAISS Index
    print("Building FAISS index...")
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    print(f"Saving FAISS index to '{output_index_path}'...")
    faiss.write_index(index, output_index_path)

    # 5. Save Metadata for Mapping
    # We need to save the original chunk info to map FAISS results back to the source code
    metadata_to_save = [
        {
            "chunk_id": chunk.get("chunk_id"),
            "file_path": chunk.get("file_path"),
            "start_line": chunk.get("start_line"),
            "end_line": chunk.get("end_line"),
            "name": chunk.get("name"),
            "chunk_type": chunk.get("chunk_type")
        }
        for chunk in filtered_chunks
    ]

    print(f"Saving metadata mapping to '{output_metadata_path}'...")
    with open(output_metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata_to_save, f, indent=2)

    print("\nProcess complete!")
    print("FAISS index and metadata have been successfully saved.")


if __name__ == "__main__":
    # --- CONFIGURATION ---
    CHUNKS_JSON_PATH = "code_chunks.json"
    # Embedding model name handed to SentenceTransformer; swap in another
    # SentenceTransformer-compatible model here if desired.
    MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
    OUTPUT_INDEX_PATH = "code_faiss.index"
    OUTPUT_METADATA_PATH = "code_metadata.json"

    # --- EXECUTION ---
    create_code_vector_db(
        json_file_path=CHUNKS_JSON_PATH,
        model_name=MODEL_NAME,
        output_index_path=OUTPUT_INDEX_PATH,
        output_metadata_path=OUTPUT_METADATA_PATH
    )