import logging
import re
from itertools import combinations
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

from .hf_embedding import get_model as get_hf_model
from .metrics import compute_all_metrics
from .progressive_loader import MetricType
from .tokenize import tokenize_texts

# FastText removed: always use Sentence Transformers.

logger = logging.getLogger(__name__)


def get_botok_tokens_for_single_text(text: str, mode: str = "syllable") -> list[str]:
    """
    A wrapper around tokenize_texts that makes it suitable as the tokenize_fn
    in generate_embeddings, which expects a function that tokenizes a single
    string. Accepts a 'mode' argument ('syllable' or 'word') to pass through
    to tokenize_texts.
    """
    if not text.strip():
        return []
    # Pass the mode through to the batch tokenizer and unwrap the single result.
    tokenized_list_of_lists = tokenize_texts([text], mode=mode)
    if tokenized_list_of_lists and tokenized_list_of_lists[0]:
        return tokenized_list_of_lists[0]
    return []


def clean_tibetan_text(text: str) -> str:
    """
    Applies light cleaning steps to Tibetan text:
    - Removes lnX/pX page/line markers.
    - Normalizes double shad (།) to a single shad.
    - Normalizes whitespace.
    """
    # Remove lnX/pX page/line markers (e.g., "ln12", "p3a").
    cleaned_text = re.sub(r"\s*(?:[lL][nN]|[pP])\d{1,3}[abAB]?\s*", " ", text)
    # Normalize double shad to a single shad.
    cleaned_text = re.sub(r"།\s*།", "།", cleaned_text)
    # Collapse runs of whitespace and strip leading/trailing spaces.
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
    return cleaned_text
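
# Illustrative behavior of clean_tibetan_text (the input below is a
# hypothetical sample, not a project fixture):
#
#   clean_tibetan_text("བཀྲ་ཤིས། ln12 བདེ་ལེགས།  །")
#   -> "བཀྲ་ཤིས། བདེ་ལེགས།"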

def process_texts(
    text_data: Dict[str, str],
    filenames: List[str],
    enable_semantic: bool = True,
    enable_fuzzy: bool = True,
    fuzzy_method: str = "token_set",
    model_name: str = "sentence-transformers/LaBSE",
    use_stopwords: bool = True,
    use_lite_stopwords: bool = False,
    progress_callback=None,
    progressive_callback=None,
    batch_size: int = 32,
    show_progress_bar: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
    """
    Processes uploaded texts, segments them by chapter marker, and computes
    metrics between chapters of different files.

    Args:
        text_data (Dict[str, str]): A dictionary mapping filenames to their content.
        filenames (List[str]): A list of filenames that were uploaded.
        enable_semantic (bool, optional): Whether to compute semantic similarity
            metrics. Requires loading a sentence-transformer model, which can be
            time-consuming. Defaults to True.
        enable_fuzzy (bool, optional): Whether to compute fuzzy string similarity
            metrics. Uses TheFuzz library for approximate string matching.
            Defaults to True.
        fuzzy_method (str, optional): The fuzzy matching method to use. Options are:
            'token_set' - Order-independent token matching (default)
            'token_sort' - Order-normalized token matching
            'partial' - Best partial token matching
            'ratio' - Simple ratio matching
        model_name (str, optional): The Hugging Face sentence-transformer model to
            use for semantic similarity. Must be a valid model identifier on
            Hugging Face. Defaults to "sentence-transformers/LaBSE".
        use_stopwords (bool, optional): Whether to use stopwords in the metrics
            calculation. Defaults to True.
        use_lite_stopwords (bool, optional): Whether to use the lite stopwords list
            (common particles only) instead of the comprehensive list. Only applies
            if use_stopwords is True. Defaults to False.
        progress_callback (callable, optional): A callback function for reporting
            progress updates. Should accept a float between 0 and 1 and a
            description string. Defaults to None.
        progressive_callback (callable, optional): A callback function for sending
            incremental results. Used for progressive loading of metrics as they
            become available. Defaults to None.
        batch_size (int, optional): Batch size intended for embedding computation;
            accepted for API compatibility but not used directly in this function.
            Defaults to 32.
        show_progress_bar (bool, optional): Whether to show a progress bar during
            embedding computation; accepted for API compatibility but not used
            directly in this function. Defaults to False.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, str]:
            - metrics_df: DataFrame with similarity metrics between corresponding
              chapters of file pairs. Contains columns: 'Text Pair', 'Chapter',
              'Jaccard Similarity (%)', 'Normalized LCS', 'Fuzzy Similarity'
              (if enable_fuzzy=True), 'Semantic Similarity' (if enable_semantic=True).
            - word_counts_df: DataFrame with word counts for each segment (chapter)
              in each file. Contains columns: 'Filename', 'ChapterNumber',
              'SegmentID', 'WordCount'.
            - warning: A string containing any warnings generated during processing
              (e.g., missing chapter markers).

    Raises:
        RuntimeError: If the botok tokenizer fails to initialize.
        ValueError: If the input files cannot be processed or if metrics
            computation fails.
    """
    # Initialize model and model_type variables.
    model, model_type = None, None
    warning = ""
    model_warning = ""

    # Update progress if a callback was provided.
    if progress_callback is not None:
        try:
            progress_callback(0.25, desc="Preparing for text analysis...")
        except Exception as e:
            # Continue processing even if progress reporting fails.
            logger.warning(f"Progress callback error (non-critical): {e}")

    # Load the semantic model if enabled.
    if enable_semantic:
        logger.info("Semantic similarity enabled. Loading embedding model...")
        try:
            logger.info(f"Using model: {model_name}")
            # Always use a Hugging Face sentence-transformers model.
            model, model_type = get_hf_model(model_id=model_name)
            if model:
                logger.info(f"Model '{model_name}' (type: {model_type}) loaded successfully.")
                if progress_callback is not None:
                    try:
                        progress_callback(0.3, desc=f"Model '{model_name}' loaded.")
                    except Exception as e:
                        logger.warning(f"Progress callback error (non-critical): {e}")
            else:
                model_warning = (
                    f"Model ('{model_name}') failed to load. "
                    "Semantic similarity will be disabled."
                )
                logger.warning(model_warning)
                # warning is always defined at this point, so just append.
                warning = f"{warning} {model_warning}".strip()
                enable_semantic = False
                if progress_callback is not None:
                    try:
                        progress_callback(
                            0.3, desc="Unsupported model, continuing without semantic similarity."
                        )
                    except Exception as e:
                        logger.warning(f"Progress callback error (non-critical): {e}")
        except Exception as e:
            # General catch-all for unexpected errors during model loading.
            model_warning = (
                f"An unexpected error occurred while attempting to load model "
                f"'{model_name}': {e}. Semantic similarity will be disabled."
            )
            logger.error(model_warning, exc_info=True)
            enable_semantic = False
            if progress_callback is not None:
                try:
                    progress_callback(
                        0.3, desc="Error loading model, continuing without semantic similarity."
                    )
                except Exception as e_cb:
                    logger.warning(f"Progress callback error (non-critical): {e_cb}")
    else:
        logger.info("Semantic similarity disabled. Skipping model loading.")
        if progress_callback is not None:
            try:
                progress_callback(0.3, desc="Processing text segments")
            except Exception as e:
                logger.warning(f"Progress callback error (non-critical): {e}")

    # Detect the chapter marker and segment texts.
    if progress_callback is not None:
        try:
            progress_callback(0.35, desc="Segmenting texts by chapters...")
        except Exception as e:
            logger.warning(f"Progress callback error (non-critical): {e}")

    chapter_marker = "༈"
    fallback = False
    segment_texts = {}

    # Process each file.
    for i, fname in enumerate(filenames):
        if progress_callback is not None and len(filenames) > 1:
            try:
                progress_callback(
                    0.35 + (0.05 * (i / len(filenames))),
                    desc=f"Segmenting file {i+1}/{len(filenames)}: {fname}",
                )
            except Exception as e:
                logger.warning(f"Progress callback error (non-critical): {e}")

        content = text_data[fname]

        # Skip files that are empty or whitespace-only.
        if not content.strip():
            logger.warning(f"File '{fname}' is empty or contains only whitespace.")
            continue

        # Split by the chapter marker if present.
        if chapter_marker in content:
            segments = [seg.strip() for seg in content.split(chapter_marker) if seg.strip()]
            # Check that splitting produced valid segments.
            if not segments:
                logger.warning(
                    f"File '{fname}' contains chapter markers but no valid text segments."
                )
                continue
            for idx, seg in enumerate(segments):
                seg_id = f"{fname}|chapter {idx+1}"
                segment_texts[seg_id] = clean_tibetan_text(seg)
        else:
            # No chapter markers found; treat the entire file as one segment.
            seg_id = f"{fname}|chapter 1"
            segment_texts[seg_id] = clean_tibetan_text(content.strip())
            fallback = True

    # Start the warning string from any model warnings, then add a fallback
    # notice if chapter markers were missing.
    warning = model_warning
    if fallback:
        chapter_warning = (
            "No chapter marker found in one or more files. "
            "Each file will be treated as a single segment. "
            "For best results, add a unique marker (e.g., ༈) to separate chapters or sections."
        )
        warning = warning + " " + chapter_warning if warning else chapter_warning

    # Abort early if no valid segments were found.
    if not segment_texts:
        logger.error("No valid text segments found in any of the uploaded files.")
        return (
            pd.DataFrame(),
            pd.DataFrame(),
            "No valid text segments found in the uploaded files. "
            "Please check your files and try again.",
        )
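
    # Illustrative shape of segment_texts at this point (filenames and
    # content are hypothetical examples, not project fixtures):
    #   {
    #       "textA.txt|chapter 1": "cleaned chapter text ...",
    #       "textA.txt|chapter 2": "...",
    #       "textB.txt|chapter 1": "...",
    #   }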

    # Tokenize all segments at once for efficiency.
    if progress_callback is not None:
        try:
            progress_callback(0.42, desc="Tokenizing all text segments...")
        except Exception as e:
            logger.warning(f"Progress callback error (non-critical): {e}")

    all_segment_ids = list(segment_texts.keys())
    all_segment_contents = list(segment_texts.values())
    tokenized_segments_list = tokenize_texts(all_segment_contents)
    segment_tokens = dict(zip(all_segment_ids, tokenized_segments_list))

    # Group chapters by filename (preserving order).
    if progress_callback is not None:
        try:
            progress_callback(0.44, desc="Organizing text segments...")
        except Exception as e:
            logger.warning(f"Progress callback error (non-critical): {e}")

    file_to_chapters = {}
    for seg_id in segment_texts:
        fname = seg_id.split("|")[0]
        file_to_chapters.setdefault(fname, []).append(seg_id)

    # For each pair of files, compare corresponding chapters (by index).
    if progress_callback is not None:
        try:
            progress_callback(0.45, desc="Computing similarity metrics...")
        except Exception as e:
            logger.warning(f"Progress callback error (non-critical): {e}")

    results = []
    files = list(file_to_chapters.keys())

    # At least two files are required for pairwise comparison.
    if len(files) < 2:
        logger.warning("Need at least two files to compute similarity metrics.")
        return (
            pd.DataFrame(),
            pd.DataFrame(),
            "Need at least two files to compute similarity metrics.",
        )

    # Track the total number of comparisons for progress reporting.
    total_comparisons = 0
    for file1, file2 in combinations(files, 2):
        chaps1 = file_to_chapters[file1]
        chaps2 = file_to_chapters[file2]
        total_comparisons += min(len(chaps1), len(chaps2))

    # Initialize the results columns for progressive updates.
    results_columns = ["Text Pair", "Chapter", "Jaccard Similarity (%)", "Normalized LCS"]
    if enable_fuzzy:
        results_columns.append("Fuzzy Similarity")
    if enable_semantic:
        results_columns.append("Semantic Similarity")

    # Create an empty DataFrame with the correct columns.
    progressive_df = pd.DataFrame(columns=results_columns)

    # Track which metrics have been completed for progressive updates.
    completed_metrics = []
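
    # Each result row built below looks roughly like this (values are
    # hypothetical):
    #   {"Text Pair": "textA.txt vs textB.txt", "Chapter": 1,
    #    "Jaccard Similarity (%)": 42.5, "Normalized LCS": 0.31,
    #    "Fuzzy Similarity": 58.0, "Semantic Similarity": 0.77}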

    # Process each file pair.
    comparison_count = 0
    for file1, file2 in combinations(files, 2):
        chaps1 = file_to_chapters[file1]
        chaps2 = file_to_chapters[file2]
        min_chaps = min(len(chaps1), len(chaps2))

        if progress_callback is not None:
            try:
                progress_callback(0.45, desc=f"Comparing {file1} with {file2}...")
            except Exception as e:
                logger.warning(f"Progress callback error (non-critical): {e}")

        for idx in range(min_chaps):
            seg1 = chaps1[idx]
            seg2 = chaps2[idx]

            # Update progress.
            comparison_count += 1
            if progress_callback is not None and total_comparisons > 0:
                try:
                    progress_percentage = 0.45 + (0.25 * (comparison_count / total_comparisons))
                    progress_callback(
                        progress_percentage,
                        desc=(
                            f"Computing metrics for chapter {idx+1} "
                            f"({comparison_count}/{total_comparisons})"
                        ),
                    )
                except Exception as e:
                    logger.warning(f"Progress callback error (non-critical): {e}")

            try:
                # Compute metrics for this chapter pair.
                metrics_df = compute_all_metrics(
                    texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
                    token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
                    model=model,
                    enable_semantic=enable_semantic,
                    enable_fuzzy=enable_fuzzy,
                    fuzzy_method=fuzzy_method,
                    use_stopwords=use_stopwords,
                    use_lite_stopwords=use_lite_stopwords,
                )

                # Extract metrics from the DataFrame (should have exactly one row).
                if not metrics_df.empty:
                    pair_metrics = metrics_df.iloc[0].to_dict()
                else:
                    # Fall back to zeroed metrics if nothing was computed.
                    logger.error(f"No metrics computed for {seg1} vs {seg2}")
                    pair_metrics = {
                        "Jaccard Similarity (%)": 0.0,
                        "Normalized LCS": 0.0,
                        "Fuzzy Similarity": 0.0 if enable_fuzzy else np.nan,
                        "Semantic Similarity": 0.0 if enable_semantic else np.nan,
                    }

                # Format the results.
                text_pair = f"{file1} vs {file2}"
                chapter_num = idx + 1
                result_row = {
                    "Text Pair": text_pair,
                    "Chapter": chapter_num,
                    # Jaccard similarity is already expressed as a percentage.
                    "Jaccard Similarity (%)": pair_metrics["Jaccard Similarity (%)"],
                    "Normalized LCS": pair_metrics["Normalized LCS"],
                }

                # Add fuzzy similarity if enabled.
                if enable_fuzzy:
                    result_row["Fuzzy Similarity"] = pair_metrics["Fuzzy Similarity"]

                # Add semantic similarity if enabled and available.
                if enable_semantic and "Semantic Similarity" in pair_metrics:
                    result_row["Semantic Similarity"] = pair_metrics["Semantic Similarity"]

                # Convert the dictionary to a DataFrame before appending.
                result_df = pd.DataFrame([result_row])
                results.append(result_df)

                # Update the progressive DataFrame.
                progressive_df = pd.concat(results, ignore_index=True)

                # Send a progressive update if a callback was provided.
                if progressive_callback is not None:
                    # Determine which metrics are newly complete in this update.
                    current_metrics = []

                    # The basic metrics are always included.
                    if (
                        "Jaccard Similarity (%)" in progressive_df.columns
                        and MetricType.JACCARD not in completed_metrics
                    ):
                        current_metrics.append(MetricType.JACCARD)
                        completed_metrics.append(MetricType.JACCARD)
                    if (
                        "Normalized LCS" in progressive_df.columns
                        and MetricType.LCS not in completed_metrics
                    ):
                        current_metrics.append(MetricType.LCS)
                        completed_metrics.append(MetricType.LCS)

                    # Add fuzzy if enabled and available.
                    if (
                        enable_fuzzy
                        and "Fuzzy Similarity" in progressive_df.columns
                        and MetricType.FUZZY not in completed_metrics
                    ):
                        current_metrics.append(MetricType.FUZZY)
                        completed_metrics.append(MetricType.FUZZY)

                    # Add semantic if enabled and available.
                    if (
                        enable_semantic
                        and "Semantic Similarity" in progressive_df.columns
                        and MetricType.SEMANTIC not in completed_metrics
                    ):
                        current_metrics.append(MetricType.SEMANTIC)
                        completed_metrics.append(MetricType.SEMANTIC)

                    # Build a word counts DataFrame for the progressive update.
                    word_counts_data = []
                    for seg_id, tokens in segment_tokens.items():
                        filename, chapter_info = seg_id.split("|")
                        chapter_num = int(chapter_info.split()[1])
                        word_counts_data.append(
                            {
                                "Filename": filename,
                                "ChapterNumber": chapter_num,
                                "SegmentID": seg_id,
                                "WordCount": len(tokens),
                            }
                        )
                    word_counts_df_progressive = pd.DataFrame(word_counts_data)

                    # Send the update (not complete yet).
                    try:
                        progressive_callback(
                            progressive_df,
                            word_counts_df_progressive,
                            current_metrics,
                            warning,
                            False,
                        )
                    except Exception as e:
                        logger.warning(f"Progressive callback error (non-critical): {e}")

            except Exception as e:
                logger.error(
                    f"Error computing metrics for {seg1} vs {seg2}: {e}", exc_info=True
                )
                # Continue with the remaining comparisons instead of failing completely.
                continue

    # Create the final metrics DataFrame.
    if results:
        # Results are already DataFrames, so they can be concatenated directly.
        metrics_df = pd.concat(results, ignore_index=True)
    else:
        metrics_df = pd.DataFrame()
        warning += " No valid metrics could be computed. Please check your files and try again."
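
    # A minimal progressive_callback sketch (illustrative; the five positional
    # arguments mirror the calls above: metrics so far, word counts so far,
    # newly completed MetricType values, the warning string, and a completion
    # flag):
    #
    #   def on_update(df, wc_df, new_metrics, warn, done):
    #       print(f"{len(df)} rows so far; done={done}; new={new_metrics}")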

    # Calculate word counts.
    if progress_callback is not None:
        try:
            progress_callback(0.75, desc="Calculating word counts...")
        except Exception as e:
            logger.warning(f"Progress callback error (non-critical): {e}")

    word_counts_data = []

    # Process each segment.
    for i, (seg_id, text_content) in enumerate(segment_texts.items()):
        # Update progress.
        if progress_callback is not None and len(segment_texts) > 0:
            try:
                progress_percentage = 0.75 + (0.15 * (i / len(segment_texts)))
                progress_callback(
                    progress_percentage,
                    desc=f"Counting words in segment {i+1}/{len(segment_texts)}",
                )
            except Exception as e:
                logger.warning(f"Progress callback error (non-critical): {e}")

        fname, chapter_info = seg_id.split("|", 1)
        chapter_num = int(chapter_info.replace("chapter ", ""))

        try:
            # Use botok for an accurate word count on raw Tibetan text.
            tokenized_segments = tokenize_texts([text_content])  # Returns a list of lists.
            if tokenized_segments and tokenized_segments[0]:
                word_count = len(tokenized_segments[0])
            else:
                word_count = 0
            word_counts_data.append(
                {
                    "Filename": fname.replace(".txt", ""),
                    "ChapterNumber": chapter_num,
                    "SegmentID": seg_id,
                    "WordCount": word_count,
                }
            )
        except Exception as e:
            logger.error(f"Error calculating word count for segment {seg_id}: {e}")
            # Add an entry with a word count of 0 to maintain consistency.
            word_counts_data.append(
                {
                    "Filename": fname.replace(".txt", ""),
                    "ChapterNumber": chapter_num,
                    "SegmentID": seg_id,
                    "WordCount": 0,
                }
            )

    # Create and sort the word counts DataFrame.
    word_counts_df = pd.DataFrame(word_counts_data)
    if not word_counts_df.empty:
        word_counts_df = word_counts_df.sort_values(
            by=["Filename", "ChapterNumber"]
        ).reset_index(drop=True)

    if progress_callback is not None:
        try:
            progress_callback(0.95, desc="Analysis complete!")
        except Exception as e:
            logger.warning(f"Progress callback error (non-critical): {e}")

    # Send a final progressive update if a callback was provided.
    if progressive_callback is not None:
        try:
            # Send the complete results.
            progressive_callback(
                metrics_df,
                word_counts_df,
                completed_metrics,
                warning,
                True,  # Computation is complete.
            )
        except Exception as e:
            logger.warning(f"Final progressive callback error (non-critical): {e}")

    # Return the results.
    return metrics_df, word_counts_df, warning
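

if __name__ == "__main__":
    # Minimal smoke test (illustrative only). The filenames and Tibetan
    # snippets are hypothetical, and semantic similarity is disabled so no
    # embedding model is loaded. Because this module uses relative imports,
    # run it as a package module, e.g. `python -m <package>.<module>`.
    sample_data = {
        "textA.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ༈ ཆོས་ཀྱི་རྒྱ་མཚོ།",
        "textB.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ༈ ཆོས་ཀྱི་མཛོད།",
    }
    metrics, word_counts, warn = process_texts(
        text_data=sample_data,
        filenames=list(sample_data),
        enable_semantic=False,  # Skip model loading for this quick check.
        enable_fuzzy=True,
    )
    print(metrics)
    print(word_counts)
    if warn:
        print(f"Warnings: {warn}")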