Spaces:

Luigi
/

VoxSum

Sleeping

Luigi committed on Sep 26, 2025

Commit

299bf2b

1 Parent(s): 228a065

Convert summarization.py prompts and comments from French to English

- Translate all prompt templates to English
- Update system messages in LLM calls to English
- Convert function docstrings and comments to English
- Update error messages and print statements to English
- Maintain all functionality while ensuring English output

Files changed (1) hide show

src/summarization.py +50 -50

src/summarization.py CHANGED Viewed

@@ -1,11 +1,11 @@
 # summarization.py
 """
-Module de résumé de transcript avec LLM.
-Fournit une fonction robuste pour résumer des textes longs en utilisant
-un chunking intelligent et des modèles de langage locaux.
-Version hybride : utilise LangChain pour le text splitting et les prompts,
-mais llama_cpp directement pour les appels LLM (plus performant).
 """
 import time
@@ -18,13 +18,13 @@ from langchain.prompts import PromptTemplate
 from .utils import available_gguf_llms, num_vcpus, s2tw_converter
-# Détection des cœurs logiques disponibles
 print(f"Detected vCPUs: {num_vcpus}")
 @lru_cache(maxsize=1)
 def get_llm(selected_gguf_model: str) -> Llama:
-    """Cache et retourne le modèle LLM"""
     repo_id, filename = available_gguf_llms[selected_gguf_model]
     return Llama.from_pretrained(
         repo_id=repo_id,
@@ -37,7 +37,7 @@ def get_llm(selected_gguf_model: str) -> Llama:
 def create_text_splitter(chunk_size: int = 4000, chunk_overlap: int = 200) -> RecursiveCharacterTextSplitter:
-    """Crée un splitter de texte avec des séparateurs intelligents"""
     return RecursiveCharacterTextSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
@@ -47,27 +47,27 @@ def create_text_splitter(chunk_size: int = 4000, chunk_overlap: int = 200) -> Re
 def create_chunk_summary_prompt() -> PromptTemplate:
-    """Prompt pour résumer un chunk individuel"""
-    template = """Résumez cette partie du transcript en gardant les points clés et les informations importantes.
 Transcript:
 {text}
-Résumé concis:"""
     return PromptTemplate(template=template, input_variables=["text"])
 def create_final_summary_prompt() -> PromptTemplate:
-    """Prompt pour créer le résumé final à partir des résumés partiels"""
-    template = """Voici les résumés des différentes parties d'un transcript.
-Créez un résumé cohérent et synthétique de l'ensemble.
 {user_prompt}
-Résumés partiels:
 {partial_summaries}
-Résumé final:"""
     return PromptTemplate(
         template=template,
         input_variables=["user_prompt", "partial_summaries"]
@@ -75,14 +75,14 @@ Résumé final:"""
 def summarize_chunk(llm: Llama, text: str, prompt_template: PromptTemplate) -> str:
-    """Résume un chunk individuel en utilisant LangChain pour le prompt"""
     try:
-        # Utilise LangChain pour formater le prompt
         formatted_prompt = prompt_template.format(text=text)
         response = llm.create_chat_completion(
             messages=[
-                {"role": "system", "content": "Vous êtes un expert en résumé de transcript. Produisez des résumés clairs, concis et pertinents."},
                 {"role": "user", "content": formatted_prompt}
             ],
             stream=False,
@@ -90,43 +90,43 @@ def summarize_chunk(llm: Llama, text: str, prompt_template: PromptTemplate) -> s
         summary = response['choices'][0]['message']['content']
         return s2tw_converter.convert(summary)
     except Exception as e:
-        print(f"Erreur lors du résumé du chunk: {e}")
-        return f"[Erreur de résumé: {str(e)}]"
 def summarize_transcript_langchain(transcript: str, selected_gguf_model: str, prompt_input: str) -> Iterator[str]:
     """
-    Version hybride LangChain + llama_cpp du résumé de transcript.
-    Avantages de LangChain utilisés :
-    - RecursiveCharacterTextSplitter : chunking intelligent avec séparateurs naturels
-    - PromptTemplate : gestion propre des templates de prompts
-    - Code plus lisible et maintenable
-    Garde llama_cpp pour les appels LLM (meilleures performances).
     """
     if not transcript or not transcript.strip():
-        yield "Le transcript est vide."
         return
     try:
-        # Initialisation des composants
         llm = get_llm(selected_gguf_model)
         text_splitter = create_text_splitter()
         chunk_prompt = create_chunk_summary_prompt()
         final_prompt = create_final_summary_prompt()
-        # Estimation des tokens
         transcript_tokens = len(llm.tokenize(transcript.encode('utf-8')))
-        # Résumé direct si le texte est court
         if transcript_tokens <= 2000:
-            print(f"[summarize_transcript] Résumé direct: {transcript_tokens} tokens")
-            # Streaming direct avec llama_cpp
             stream = llm.create_chat_completion(
                 messages=[
-                    {"role": "system", "content": "Vous êtes un expert en résumé de transcript. Produisez des résumés clairs, concis et pertinents."},
                     {"role": "user", "content": f"{prompt_input}\n\n{transcript}"}
                 ],
                 stream=True,
@@ -140,36 +140,36 @@ def summarize_transcript_langchain(transcript: str, selected_gguf_model: str, pr
                     yield s2tw_converter.convert(full_response)
             return
-        # Chunking avec LangChain pour les textes longs
         chunks = text_splitter.split_text(transcript)
-        print(f"[summarize_transcript] Texte divisé en {len(chunks)} chunks")
-        # Résumé de chaque chunk
         partial_summaries = []
         for i, chunk in enumerate(chunks, 1):
-            print(f"Résumé du chunk {i}/{len(chunks)}")
             summary = summarize_chunk(llm, chunk, chunk_prompt)
             partial_summaries.append(summary)
-        # Combinaison et résumé final
         combined_summaries = "\n\n".join(partial_summaries)
-        # Vérification de la taille de la combinaison
         combined_tokens = len(llm.tokenize(combined_summaries.encode('utf-8')))
-        if combined_tokens <= 3500:  # Laisser de la marge
-            print(f"[summarize_transcript] Résumé final des {len(partial_summaries)} résumés partiels")
-            # Utilise LangChain pour formater le prompt final
             final_prompt_formatted = final_prompt.format(
                 user_prompt=prompt_input,
                 partial_summaries=combined_summaries
             )
-            # Streaming avec llama_cpp
             stream = llm.create_chat_completion(
                 messages=[
-                    {"role": "system", "content": "Vous êtes un expert en résumé de transcript. Produisez des résumés clairs, concis et pertinents."},
                     {"role": "user", "content": final_prompt_formatted}
                 ],
                 stream=True,
@@ -182,11 +182,11 @@ def summarize_transcript_langchain(transcript: str, selected_gguf_model: str, pr
                     full_response += delta['content']
                     yield s2tw_converter.convert(full_response)
         else:
-            print(f"[summarize_transcript] Combinaison trop longue ({combined_tokens} tokens), résumé simplifié")
-            # Fallback : résumé direct de la combinaison
             stream = llm.create_chat_completion(
                 messages=[
-                    {"role": "system", "content": "Vous êtes un expert en résumé de transcript. Produisez des résumés clairs, concis et pertinents."},
                     {"role": "user", "content": f"{prompt_input}\n\n{combined_summaries}"}
                 ],
                 stream=True,
@@ -200,8 +200,8 @@ def summarize_transcript_langchain(transcript: str, selected_gguf_model: str, pr
                     yield s2tw_converter.convert(full_response)
     except Exception as e:
-        print(f"Erreur générale lors du résumé: {e}")
-        yield f"[Erreur lors du résumé: {str(e)}]"
 # Alias pour maintenir la compatibilité

 # summarization.py
 """
+Transcript summarization module with LLM.
+Provides a robust function for summarizing long texts using
+intelligent chunking and local language models.
+Hybrid version: uses LangChain for text splitting and prompts,
+but llama_cpp directly for LLM calls (better performance).
 """
 import time
 from .utils import available_gguf_llms, num_vcpus, s2tw_converter
+# Detection of available logical cores
 print(f"Detected vCPUs: {num_vcpus}")
 @lru_cache(maxsize=1)
 def get_llm(selected_gguf_model: str) -> Llama:
+    """Cache and return the LLM model"""
     repo_id, filename = available_gguf_llms[selected_gguf_model]
     return Llama.from_pretrained(
         repo_id=repo_id,
 def create_text_splitter(chunk_size: int = 4000, chunk_overlap: int = 200) -> RecursiveCharacterTextSplitter:
+    """Create a text splitter with intelligent separators"""
     return RecursiveCharacterTextSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
 def create_chunk_summary_prompt() -> PromptTemplate:
+    """Prompt for summarizing an individual chunk"""
+    template = """Summarize this part of the transcript while keeping the key points and important information.
 Transcript:
 {text}
+Concise summary:"""
     return PromptTemplate(template=template, input_variables=["text"])
 def create_final_summary_prompt() -> PromptTemplate:
+    """Prompt for creating the final summary from partial summaries"""
+    template = """Here are the summaries of different parts of a transcript.
+Create a coherent and synthetic summary of the whole.
 {user_prompt}
+Partial summaries:
 {partial_summaries}
+Final summary:"""
     return PromptTemplate(
         template=template,
         input_variables=["user_prompt", "partial_summaries"]
 def summarize_chunk(llm: Llama, text: str, prompt_template: PromptTemplate) -> str:
+    """Summarize an individual chunk using LangChain for the prompt"""
     try:
+        # Use LangChain to format the prompt
         formatted_prompt = prompt_template.format(text=text)
         response = llm.create_chat_completion(
             messages=[
+                {"role": "system", "content": "You are an expert in transcript summarization. Produce clear, concise, and relevant summaries."},
                 {"role": "user", "content": formatted_prompt}
             ],
             stream=False,
         summary = response['choices'][0]['message']['content']
         return s2tw_converter.convert(summary)
     except Exception as e:
+        print(f"Error during chunk summarization: {e}")
+        return f"[Summarization error: {str(e)}]"
 def summarize_transcript_langchain(transcript: str, selected_gguf_model: str, prompt_input: str) -> Iterator[str]:
     """
+    Hybrid LangChain + llama_cpp version of transcript summarization.
+    LangChain advantages used:
+    - RecursiveCharacterTextSplitter: intelligent chunking with natural separators
+    - PromptTemplate: clean template management
+    - More readable and maintainable code
+    Keeps llama_cpp for LLM calls (better performance).
     """
     if not transcript or not transcript.strip():
+        yield "The transcript is empty."
         return
     try:
+        # Component initialization
         llm = get_llm(selected_gguf_model)
         text_splitter = create_text_splitter()
         chunk_prompt = create_chunk_summary_prompt()
         final_prompt = create_final_summary_prompt()
+        # Token estimation
         transcript_tokens = len(llm.tokenize(transcript.encode('utf-8')))
+        # Direct summary if text is short
         if transcript_tokens <= 2000:
+            print(f"[summarize_transcript] Direct summary: {transcript_tokens} tokens")
+            # Direct streaming with llama_cpp
             stream = llm.create_chat_completion(
                 messages=[
+                    {"role": "system", "content": "You are an expert in transcript summarization. Produce clear, concise, and relevant summaries."},
                     {"role": "user", "content": f"{prompt_input}\n\n{transcript}"}
                 ],
                 stream=True,
                     yield s2tw_converter.convert(full_response)
             return
+        # Chunking with LangChain for long texts
         chunks = text_splitter.split_text(transcript)
+        print(f"[summarize_transcript] Text divided into {len(chunks)} chunks")
+        # Summary of each chunk
         partial_summaries = []
         for i, chunk in enumerate(chunks, 1):
+            print(f"Summarizing chunk {i}/{len(chunks)}")
             summary = summarize_chunk(llm, chunk, chunk_prompt)
             partial_summaries.append(summary)
+        # Combination and final summary
         combined_summaries = "\n\n".join(partial_summaries)
+        # Check combination size
         combined_tokens = len(llm.tokenize(combined_summaries.encode('utf-8')))
+        if combined_tokens <= 3500:  # Leave some margin
+            print(f"[summarize_transcript] Final summary of {len(partial_summaries)} partial summaries")
+            # Use LangChain to format the final prompt
             final_prompt_formatted = final_prompt.format(
                 user_prompt=prompt_input,
                 partial_summaries=combined_summaries
             )
+            # Streaming with llama_cpp
             stream = llm.create_chat_completion(
                 messages=[
+                    {"role": "system", "content": "You are an expert in transcript summarization. Produce clear, concise, and relevant summaries."},
                     {"role": "user", "content": final_prompt_formatted}
                 ],
                 stream=True,
                     full_response += delta['content']
                     yield s2tw_converter.convert(full_response)
         else:
+            print(f"[summarize_transcript] Combination too long ({combined_tokens} tokens), simplified summary")
+            # Fallback: direct summary of the combination
             stream = llm.create_chat_completion(
                 messages=[
+                    {"role": "system", "content": "You are an expert in transcript summarization. Produce clear, concise, and relevant summaries."},
                     {"role": "user", "content": f"{prompt_input}\n\n{combined_summaries}"}
                 ],
                 stream=True,
                     yield s2tw_converter.convert(full_response)
     except Exception as e:
+        print(f"General error during summarization: {e}")
+        yield f"[Error during summarization: {str(e)}]"
 # Alias pour maintenir la compatibilité