# summarization.py
"""Stream LLM-generated transcript summaries using llama.cpp GGUF models."""

from llama_cpp import Llama
from utils import available_gguf_llms, s2tw_converter
import time
from functools import lru_cache


@lru_cache(maxsize=1)
def get_model(gguf_repo_id, gguf_filename):
    """Load (and cache) a GGUF model from the Hugging Face Hub.

    ``maxsize=1`` keeps only the most recently requested model resident:
    switching models evicts the previous instance rather than holding
    several multi-gigabyte models in memory at once.

    Parameters
    ----------
    gguf_repo_id : str
        Hugging Face repository id hosting the GGUF file.
    gguf_filename : str
        Name of the GGUF file within the repository.

    Returns
    -------
    Llama
        A ready-to-use llama.cpp model instance.
    """
    return Llama.from_pretrained(
        repo_id=gguf_repo_id,
        filename=gguf_filename,
        verbose=False,
        n_ctx=32768,         # large context window so long transcripts fit in one prompt
        n_threads=4,
        repeat_penalty=1.2,  # discourage repetitive summary text
    )


def summarize_transcript(transcript, selected_gguf_model, prompt_input):
    """Yield a progressively growing summary of *transcript*.

    Generator intended for streaming UIs: each yield is the full summary
    accumulated so far, converted with ``s2tw_converter`` (presumably
    Simplified → Traditional Chinese (Taiwan) — verify against utils).
    A final yield repeats the complete summary after the stream ends.

    Parameters
    ----------
    transcript : str
        Text to summarize; appended to the user prompt.
    selected_gguf_model : str
        Key into ``available_gguf_llms`` mapping to ``(repo_id, filename)``.
    prompt_input : str
        User instructions prepended to the transcript.
    """
    repo_id, filename = available_gguf_llms[selected_gguf_model]
    llm = get_model(repo_id, filename)

    full_summary = []
    is_1st_token = True
    t1 = time.time()
    stream = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are an expert in transcript summarization."},
            {"role": "user", "content": f'{prompt_input} \n{transcript}'},
        ],
        stream=True,
    )
    for chunk in stream:
        delta = chunk['choices'][0]['delta']
        # Single lookup; also skips role/metadata deltas that carry no text.
        token = delta.get('content')
        if token is None:
            continue
        if is_1st_token:
            # First-token latency is dominated by prompt processing time.
            print(f"Time to 1st Token: {time.time()-t1:.1f} sec")
            is_1st_token = False
        full_summary.append(token)
        yield s2tw_converter.convert("".join(full_summary))
    yield s2tw_converter.convert("".join(full_summary))