# summarization.py
"""Stream LLM-generated transcript summaries using llama.cpp GGUF models."""

from llama_cpp import Llama
from utils import available_gguf_llms, s2tw_converter
import time
from functools import lru_cache


@lru_cache(maxsize=1)
def get_model(gguf_repo_id, gguf_filename):
    """Load (and cache) a GGUF model from the Hugging Face Hub.

    ``maxsize=1`` keeps only the most recently requested model resident:
    switching models evicts the previous instance rather than holding
    several multi-gigabyte models in memory at once.

    Parameters
    ----------
    gguf_repo_id : str
        Hugging Face repository id hosting the GGUF file.
    gguf_filename : str
        Name of the GGUF file within the repository.

    Returns
    -------
    Llama
        A ready-to-use llama.cpp model instance.
    """
    return Llama.from_pretrained(
        repo_id=gguf_repo_id,
        filename=gguf_filename,
        verbose=False,
        n_ctx=32768,         # large context window so long transcripts fit in one prompt
        n_threads=4,
        repeat_penalty=1.2,  # discourage repetitive summary text
    )


def summarize_transcript(transcript, selected_gguf_model, prompt_input):
    """Yield a progressively growing summary of *transcript*.

    Generator intended for streaming UIs: each yield is the full summary
    accumulated so far, converted with ``s2tw_converter`` (presumably
    Simplified → Traditional Chinese (Taiwan) — verify against utils).
    A final yield repeats the complete summary after the stream ends.

    Parameters
    ----------
    transcript : str
        Text to summarize; appended to the user prompt.
    selected_gguf_model : str
        Key into ``available_gguf_llms`` mapping to ``(repo_id, filename)``.
    prompt_input : str
        User instructions prepended to the transcript.
    """
    repo_id, filename = available_gguf_llms[selected_gguf_model]
    llm = get_model(repo_id, filename)

    full_summary = []
    is_1st_token = True
    t1 = time.time()
    stream = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are an expert in transcript summarization."},
            {"role": "user", "content": f'{prompt_input} \n{transcript}'},
        ],
        stream=True,
    )
    for chunk in stream:
        delta = chunk['choices'][0]['delta']
        # Single lookup; also skips role/metadata deltas that carry no text.
        token = delta.get('content')
        if token is None:
            continue
        if is_1st_token:
            # First-token latency is dominated by prompt processing time.
            print(f"Time to 1st Token: {time.time()-t1:.1f} sec")
            is_1st_token = False
        full_summary.append(token)
        yield s2tw_converter.convert("".join(full_summary))
    yield s2tw_converter.convert("".join(full_summary))