import re
from typing import List, Optional, Set

from underthesea import word_tokenize

from config import Config


class VietnameseTextProcessor:
    """Vietnamese text processing utilities for legal documents"""

    def __init__(self):
        self.stopwords = self._load_stopwords()

    def _load_stopwords(self) -> Set[str]:
        """Load Vietnamese stopwords, trying UTF-8, then UTF-16, then UTF-8 with BOM"""
        for encoding in ("utf-8", "utf-16", "utf-8-sig"):
            try:
                with open(Config.STOPWORDS_PATH, "r", encoding=encoding) as f:
                    stopwords = set(line.strip() for line in f if line.strip())
                # underthesea's word_tokenize(format="text") joins compound
                # words with underscores, so normalize multi-word stopwords
                # the same way; lowercase them to match remove_stopwords,
                # which compares lowercased tokens.
                return set("_".join(word.lower().split()) for word in stopwords)
            except UnicodeDecodeError:
                continue
            except FileNotFoundError:
                print(f"Warning: Stopwords file not found at {Config.STOPWORDS_PATH}")
                return set()
            except Exception as e:
                print(f"Warning: Error loading stopwords file: {e}")
                return set()
        print(f"Warning: Unable to decode stopwords file at {Config.STOPWORDS_PATH}")
        return set()

    def clean_text(self, text: str) -> str:
        """Clean Vietnamese text for processing"""
        if not text:
            return ""
        # Collapse runs of whitespace and trim the ends
        text = re.sub(r"\s+", " ", text.strip())
        # Drop special characters, keeping word characters, basic
        # punctuation, and Vietnamese diacritics
        text = re.sub(
            r"[^\w\s\-\.\,\;\:\!\?\(\)\[\]\"\'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđĐ]",
            " ",
            text,
        )
        # Collapse any double spaces introduced by the substitution
        text = re.sub(r"\s+", " ", text.strip())
        return text

    def tokenize(self, text: str) -> List[str]:
        """Tokenize Vietnamese text using underthesea"""
        try:
            cleaned_text = self.clean_text(text)
            # format="text" joins compound words with underscores, so a
            # plain split() yields one token per (compound) word
            tokens = word_tokenize(cleaned_text, format="text").split()
            return tokens
        except Exception as e:
            print(f"Error tokenizing text: {e}")
            # Fall back to naive whitespace splitting
            return text.split()

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Remove stopwords from a token list"""
        return [token for token in tokens if token.lower() not in self.stopwords]

    def preprocess_for_search(self, text: str) -> str:
        """Preprocess text for search: tokenize and remove stopwords while preserving legal terms"""
        # First, shield important legal patterns and identifiers behind
        # placeholders so tokenization and stopword removal cannot break
        # them apart.
        preserved_patterns = []

        # Preserve legal document IDs (e.g., "47/2011/tt-bca", "159/2020/nđ-cp").
        # "đ" is listed explicitly because it falls outside the ASCII a-z range.
        legal_id_pattern = r"\d+/\d+/[a-zđ\-]+"
        # Deduplicate matches: replace() rewrites every occurrence, so a
        # repeated ID only needs one placeholder.
        legal_ids = set(re.findall(legal_id_pattern, text, re.IGNORECASE))
        for legal_id in legal_ids:
            placeholder = f"LEGALID_{len(preserved_patterns)}"
            preserved_patterns.append((placeholder, legal_id))
            text = text.replace(legal_id, placeholder)

        # Preserve important legal terms and phrases
        legal_terms = [
            r"điều\s+\d+",  # "điều 15", "điều 20"
            r"khoản\s+\d+",  # "khoản 1", "khoản 2"
            r"điểm\s+[a-z]",  # "điểm a", "điểm b"
            r"nghị\s+định",
            r"thông\s+tư",
            r"quyết\s+định",
            r"luật\s+\w+",
            r"vi\s+phạm",
            r"xử\s+phạt",
            r"mức\s+phạt",
        ]
        for pattern in legal_terms:
            matches = set(re.findall(pattern, text, re.IGNORECASE))
            for match in matches:
                placeholder = f"LEGALTERM_{len(preserved_patterns)}"
                preserved_patterns.append((placeholder, match))
                text = text.replace(match, placeholder)

        # Normal tokenization and stopword removal
        tokens = self.tokenize(text)
        filtered_tokens = self.remove_stopwords(tokens)

        # Reconstruct the text, then restore the preserved patterns
        processed_text = " ".join(filtered_tokens)
        for placeholder, original in preserved_patterns:
            processed_text = processed_text.replace(placeholder, original)
        return processed_text

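    # Illustrative round trip (hypothetical values): for the query
    # "vi phạm điều 5 nghị định 100/2019/nđ-cp", the ID and the terms
    # "vi phạm" / "điều 5" / "nghị định" are swapped for placeholders
    # such as LEGALID_0 / LEGALTERM_1, the remainder is tokenized and
    # stop-filtered, and the originals are substituted back at the end.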
    def extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
        """Extract keywords from text"""
        tokens = self.tokenize(text)
        filtered_tokens = self.remove_stopwords(tokens)
        keywords = [token for token in filtered_tokens if len(token) >= min_length]
        # dict.fromkeys deduplicates while preserving first-seen order,
        # unlike set(), which would scramble the keyword order
        return list(dict.fromkeys(keywords))

    def chunk_text(
        self, text: str, chunk_size: Optional[int] = None, overlap: Optional[int] = None
    ) -> List[str]:
        """Split text into overlapping chunks of tokens"""
        if chunk_size is None:
            chunk_size = Config.CHUNK_SIZE
        if overlap is None:
            overlap = Config.CHUNK_OVERLAP
        tokens = self.tokenize(text)
        chunks = []
        # Guard against a non-positive step: overlap >= chunk_size would
        # otherwise make range() raise ValueError
        step = max(1, chunk_size - overlap)
        for i in range(0, len(tokens), step):
            chunk_tokens = tokens[i : i + chunk_size]
            if chunk_tokens:
                chunks.append(" ".join(chunk_tokens))
        return chunks
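

if __name__ == "__main__":
    # Minimal usage sketch. Assumes config.Config provides STOPWORDS_PATH,
    # CHUNK_SIZE, and CHUNK_OVERLAP (as referenced above) and that
    # underthesea is installed; the sample query is purely illustrative.
    processor = VietnameseTextProcessor()
    sample = "Mức phạt vi phạm điều 5 nghị định 100/2019/nđ-cp là bao nhiêu?"
    print(processor.preprocess_for_search(sample))
    print(processor.extract_keywords(sample))
    print(processor.chunk_text(sample, chunk_size=20, overlap=5))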