# RUBY / ingest.py
# itzbhav's picture
# Upload 8 files
# 748b6ee verified
# ingest.py
from sentence_transformers import SentenceTransformer
import whisper
import os
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
# Initialize models
# Single source of truth for the sentence-transformers checkpoint, so the
# local model and the Chroma embedding function cannot drift apart.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

whisper_model = whisper.load_model("base")  # or "medium", "large" depending on system resources
# Kept as a module-level name so anything importing `embedding_model` from
# this module keeps working; Chroma builds its own model from the name below.
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Set up Chroma DB
# NOTE(review): `chroma_db_impl="duckdb+parquet"` is the legacy (pre-0.4)
# Chroma configuration; newer releases expect
# `chromadb.PersistentClient(path=...)` instead -- confirm against the
# installed chromadb version before upgrading.
client = chromadb.Client(
    Settings(chroma_db_impl="duckdb+parquet", persist_directory="chroma_db")
)
collection = client.get_or_create_collection(
    name="meeting_notes",
    # SentenceTransformerEmbeddingFunction takes the model *name* and loads
    # the model itself; handing it an already-loaded SentenceTransformer
    # instance (as the previous code did) is not a supported argument.
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBEDDING_MODEL_NAME
    ),
)
def transcribe_and_ingest(video_path, meeting_id, chunk_size=500):
    """Transcribe a recording with Whisper and store its text in Chroma.

    The transcription is cut into consecutive, non-overlapping slices of at
    most ``chunk_size`` characters. Each slice is added to the module-level
    ``collection`` under the id ``"<meeting_id>_<index>"`` with
    ``{"meeting_id": meeting_id}`` as metadata.

    Args:
        video_path: Media file handed to ``whisper_model.transcribe``.
        meeting_id: Identifier used as the id prefix and stored in each
            chunk's metadata.
        chunk_size: Maximum number of characters per stored chunk.
            Defaults to 500, the previously hard-coded value.

    Returns:
        The full transcription text (``result["text"]``), for PDF export or
        other later use.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    print(f"Transcribing {video_path}...")
    result = whisper_model.transcribe(video_path)
    transcription = result["text"]

    print("Splitting transcription...")
    chunks = [
        transcription[start:start + chunk_size]
        for start in range(0, len(transcription), chunk_size)
    ]
    if not chunks:
        # An empty transcription yields no chunks; skip the add() call rather
        # than handing Chroma empty id/document lists.
        print("Transcription is empty; nothing to store.")
        return transcription

    ids = [f"{meeting_id}_{index}" for index in range(len(chunks))]
    # One dict per chunk: `[{...}] * n` would alias a single shared dict, so a
    # mutation of one entry would show up in all of them.
    metadatas = [{"meeting_id": meeting_id} for _ in chunks]

    print("Storing embeddings in Chroma DB...")
    collection.add(documents=chunks, ids=ids, metadatas=metadatas)
    return transcription  # return full transcription for PDF or future use