import re
import pandas as pd
from typing import List, Optional, Set
from underthesea import word_tokenize
from config import Config


class VietnameseTextProcessor:
    """Vietnamese text processing utilities for legal documents"""

    def __init__(self):
        self.stopwords = self._load_stopwords()

    def _load_stopwords(self) -> Set[str]:
        """Load Vietnamese stopwords from file, trying common encodings"""
        # Try UTF-8 first, then UTF-16, then UTF-8 with BOM
        for encoding in ("utf-8", "utf-16", "utf-8-sig"):
            try:
                with open(Config.STOPWORDS_PATH, "r", encoding=encoding) as f:
                    stopwords = set(line.strip() for line in f if line.strip())
                # Join multi-word stopwords with underscores so they match the
                # output of underthesea's word_tokenize(format="text")
                return set("_".join(word.split()) for word in stopwords)
            except UnicodeDecodeError:
                continue
            except FileNotFoundError:
                print(f"Warning: Stopwords file not found at {Config.STOPWORDS_PATH}")
                return set()
            except Exception as e:
                print(f"Warning: Error loading stopwords file: {e}")
                return set()

        print(
            f"Warning: Unable to decode stopwords file at {Config.STOPWORDS_PATH}"
        )
        return set()

    def clean_text(self, text: str) -> str:
        """Clean Vietnamese text for processing"""
        if not text:
            return ""

        # Remove extra whitespace and normalize
        text = re.sub(r"\s+", " ", text.strip())

        # Remove special characters but keep Vietnamese characters
        text = re.sub(
            r"[^\w\s\-\.\,\;\:\!\?\(\)\[\]\"\'àáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđĐ]",
            " ",
            text,
        )

        # Remove multiple spaces
        text = re.sub(r"\s+", " ", text.strip())

        return text

    def tokenize(self, text: str) -> List[str]:
        """Tokenize Vietnamese text using underthesea"""
        try:
            cleaned_text = self.clean_text(text)
            tokens = word_tokenize(cleaned_text, format="text").split()
            return tokens
        except Exception as e:
            print(f"Error tokenizing text: {e}")
            return text.split()

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Remove stopwords from token list"""
        return [token for token in tokens if token.lower() not in self.stopwords]

    def preprocess_for_search(self, text: str) -> str:
        """Preprocess text for search - tokenize and remove stopwords with legal term preservation"""
        # First, preserve important legal patterns and identifiers
        preserved_patterns = []
        
        # Preserve legal document IDs (e.g., "47/2011/tt-bca", "159/2020/nđ-cp")
        legal_id_pattern = r'\d+/\d+/[a-z\-]+'
        legal_ids = re.findall(legal_id_pattern, text, re.IGNORECASE)
        for legal_id in legal_ids:
            placeholder = f"LEGALID_{len(preserved_patterns)}"
            preserved_patterns.append((placeholder, legal_id))
            text = text.replace(legal_id, placeholder)
        
        # Preserve important legal terms and phrases
        legal_terms = [
            r'điều\s+\d+',  # "điều 15", "điều 20"
            r'khoản\s+\d+',  # "khoản 1", "khoản 2"
            r'điểm\s+[a-z]',  # "điểm a", "điểm b"
            r'nghị\s+định',
            r'thông\s+tư',
            r'quyết\s+định',
            r'luật\s+\w+',
            r'vi\s+phạm',
            r'xử\s+phạt',
            r'mức\s+phạt',
        ]
        
        for pattern in legal_terms:
            matches = re.findall(pattern, text, re.IGNORECASE)
            for match in matches:
                placeholder = f"LEGALTERM_{len(preserved_patterns)}"
                preserved_patterns.append((placeholder, match))
                text = text.replace(match, placeholder)
        
        # Normal tokenization and stopword removal
        tokens = self.tokenize(text)
        filtered_tokens = self.remove_stopwords(tokens)
        
        # Reconstruct text
        processed_text = " ".join(filtered_tokens)
        
        # Restore preserved patterns
        for placeholder, original in preserved_patterns:
            processed_text = processed_text.replace(placeholder, original)
        
        return processed_text

    def extract_keywords(self, text: str, min_length: int = 2) -> List[str]:
        """Extract keywords from text"""
        tokens = self.tokenize(text)
        filtered_tokens = self.remove_stopwords(tokens)
        keywords = [token for token in filtered_tokens if len(token) >= min_length]
        return list(set(keywords))  # Remove duplicates

    def chunk_text(
        self, text: str, chunk_size: Optional[int] = None, overlap: Optional[int] = None
    ) -> List[str]:
        """Split text into chunks with overlap"""
        if chunk_size is None:
            chunk_size = Config.CHUNK_SIZE
        if overlap is None:
            overlap = Config.CHUNK_OVERLAP

        # Ensure a positive step so range() cannot raise when overlap >= chunk_size
        step = max(chunk_size - overlap, 1)

        tokens = self.tokenize(text)
        chunks = []

        for i in range(0, len(tokens), step):
            chunk_tokens = tokens[i : i + chunk_size]
            if chunk_tokens:
                chunks.append(" ".join(chunk_tokens))

        return chunks
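

# Minimal usage sketch (illustrative only, not part of the module API): assumes
# Config.STOPWORDS_PATH, Config.CHUNK_SIZE and Config.CHUNK_OVERLAP are set and
# that underthesea's tokenizer models are available locally.
if __name__ == "__main__":
    processor = VietnameseTextProcessor()
    sample = "Mức phạt đối với hành vi vi phạm quy định tại điều 15 là bao nhiêu?"

    print(processor.preprocess_for_search(sample))  # tokenized, stopwords removed, legal terms kept
    print(processor.extract_keywords(sample))       # deduplicated keywords of length >= 2
    print(processor.chunk_text(sample, chunk_size=16, overlap=4))  # overlapping token chunks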