Vu Anh Claude committed on
Commit 08bbb4c · 1 Parent(s): 76a11b5

Add VLSP2016 dataset support and comprehensive evaluation updates


Major Features:
- Add dataset selection parameter to train.py (--dataset vlsp2016|uts2017)
- Train VLSP2016 models: Logistic Regression (70.19%), SVC Linear (71.14%)
- Create clean.py script to remove runs without exported models
- Update technical report with dual-dataset evaluation

Technical Improvements:
- Generic load_data() function routing to specific dataset loaders (see the sketch after this list)
- Support for both UTS2017_Bank (35 aspect-sentiment classes) and VLSP2016 (3 sentiment classes)
- N-gram comparison: bigrams (1-2) vs trigrams (1-3) analysis
- Export filename includes dataset name for better organization
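A minimal sketch of what this dataset routing could look like (the loader names and return shape here are illustrative assumptions, not the actual train.py implementation):

    def load_data(dataset: str):
        """Route to the loader for the selected --dataset value."""
        loaders = {
            "vlsp2016": load_vlsp2016,      # 3 balanced sentiment classes (hypothetical name)
            "uts2017": load_uts2017_bank,   # 35 aspect-sentiment classes (hypothetical name)
        }
        if dataset not in loaders:
            raise ValueError(f"Unknown dataset: {dataset!r}")
        # Each loader is assumed to return (X_train, y_train, X_test, y_test)
        return loaders[dataset]()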

Performance Results:
- VLSP2016: 71.14% (SVC), 70.19% (LR) - balanced 3-class sentiment
- UTS2017_Bank: 71.72% (SVC), 68.18% (LR) - imbalanced 35-class aspect-sentiment
- Consistent ~71% accuracy across different complexity levels
- Training efficiency: LR faster, SVC more accurate

Documentation Updates:
- Enhanced technical report with cross-dataset performance analysis
- Added comparative insights between balanced vs imbalanced datasets
- Updated abstract, methodology, and results sections
- Added VLSP2016 dataset description and characteristics

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
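For context on where the accuracy figures above come from: the report describes a 20,000-feature TF-IDF representation feeding a linear classifier. A minimal scikit-learn sketch consistent with that configuration follows; only max_features, the n-gram ranges, and the classifier choices come from the report, while the surrounding variable names (X_train, y_train, X_test, y_test) are assumed placeholders.

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=20000, ngram_range=(1, 2))),  # or (1, 3)
        ("clf", SVC(kernel="linear")),  # or LogisticRegression() for the LR runs
    ])
    pipeline.fit(X_train, y_train)             # raw Vietnamese texts and labels
    accuracy = pipeline.score(X_test, y_test)  # reported at roughly 0.71 on both datasets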

clean.py ADDED
@@ -0,0 +1,212 @@
+ #!/usr/bin/env python3
+ """
+ Clean up script for removing training runs without exported models.
+ Removes all directories in runs/ folder that don't have a corresponding exported model file.
+ """
+
+ import argparse
+ import glob
+ import json
+ import os
+ import shutil
+ from pathlib import Path
+
+
+ def find_exported_models():
+     """Find all exported model files in the current directory."""
+     exported_models = []
+     seen_files = set()  # Track files we've already processed
+
+     # Look for pattern: *_YYYYMMDD_HHMMSS.joblib
+     # This matches any exported model with the timestamp format
+     patterns = [
+         "*_20[0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].joblib"
+     ]
+
+     for pattern in patterns:
+         for filepath in glob.glob(pattern):
+             # Skip if we've already seen this file
+             if filepath in seen_files:
+                 continue
+             seen_files.add(filepath)
+
+             # Extract timestamp from filename
+             # Format: dataset_sentiment_YYYYMMDD_HHMMSS.joblib
+             filename = os.path.basename(filepath)
+             parts = filename.replace(".joblib", "").split("_")
+             if len(parts) >= 4:
+                 # Get the last two parts, which should be date and time
+                 timestamp = "_".join(parts[-2:])
+                 exported_models.append({
+                     "file": filepath,
+                     "timestamp": timestamp,
+                 })
+
+     return exported_models
+
+
+ def find_all_runs():
+     """Find all run directories in the runs folder."""
+     runs_dir = Path("runs")
+     if not runs_dir.exists():
+         return []
+
+     runs = []
+     for run_path in runs_dir.iterdir():
+         if run_path.is_dir():
+             # Run directories are named with timestamps: YYYYMMDD_HHMMSS
+             run_name = run_path.name
+             runs.append({
+                 "path": run_path,
+                 "timestamp": run_name,
+             })
+
+     return runs
+
+
+ def clean_runs(dry_run=False, verbose=False):
+     """
+     Remove all run directories that don't have exported models.
+
+     Args:
+         dry_run: If True, only show what would be deleted without actually deleting
+         verbose: If True, show detailed information
+
+     Returns:
+         Tuple of (runs_to_keep, runs_to_delete)
+     """
+     # Find all exported models
+     exported_models = find_exported_models()
+     exported_timestamps = {model["timestamp"] for model in exported_models}
+
+     # Find all runs
+     all_runs = find_all_runs()
+
+     # Categorize runs
+     runs_to_keep = []
+     runs_to_delete = []
+
+     for run in all_runs:
+         if run["timestamp"] in exported_timestamps:
+             runs_to_keep.append(run)
+         else:
+             runs_to_delete.append(run)
+
+     # Show summary
+     print(f"Found {len(all_runs)} total runs")
+     print(f"Found {len(exported_models)} exported models")
+     print(f"Runs to keep: {len(runs_to_keep)}")
+     print(f"Runs to delete: {len(runs_to_delete)}")
+
+     if verbose and exported_models:
+         print("\nExported models found:")
+         for model in exported_models:
+             print(f"  - {model['file']} (timestamp: {model['timestamp']})")
+
+     if verbose and runs_to_keep:
+         print("\nRuns with exported models (will be kept):")
+         for run in runs_to_keep:
+             print(f"  - {run['path']}")
+
+     if runs_to_delete:
+         print("\nRuns without exported models (will be deleted):")
+         for run in runs_to_delete:
+             print(f"  - {run['path']}")
+             if verbose:
+                 # Check if metadata exists and show some info
+                 metadata_path = run["path"] / "metadata.json"
+                 if metadata_path.exists():
+                     try:
+                         with open(metadata_path) as f:
+                             metadata = json.load(f)
+                         # Guard the accuracy format: applying :.4f to the
+                         # 'N/A' fallback string would raise a TypeError
+                         acc = metadata.get("test_accuracy")
+                         acc_str = f"{acc:.4f}" if isinstance(acc, (int, float)) else "N/A"
+                         print(f"    Model: {metadata.get('model_name', 'unknown')}, "
+                               f"Dataset: {metadata.get('dataset', 'unknown')}, "
+                               f"Accuracy: {acc_str}")
+                     except (json.JSONDecodeError, KeyError):
+                         pass
+
+     # Calculate space to be freed
+     total_size = 0
+     for run in runs_to_delete:
+         total_size += sum(f.stat().st_size for f in run["path"].rglob("*") if f.is_file())
+
+     if total_size > 0:
+         size_mb = total_size / (1024 * 1024)
+         print(f"\nTotal space to be freed: {size_mb:.2f} MB")
+
+     # Perform deletion if not dry run
+     if not dry_run and runs_to_delete:
+         deleted_count = 0
+         for run in runs_to_delete:
+             try:
+                 shutil.rmtree(run["path"])
+                 deleted_count += 1
+                 if verbose:
+                     print(f"Deleted: {run['path']}")
+             except Exception as e:
+                 print(f"Error deleting {run['path']}: {e}")
+
+         print(f"\nSuccessfully deleted {deleted_count} run(s)")
+     elif dry_run and runs_to_delete:
+         print("\nDry run mode - no files were deleted")
+         print("Run without --dry-run to actually delete these directories")
+     elif not runs_to_delete:
+         print("\nNo runs to delete - all runs have exported models or no runs found")
+
+     return runs_to_keep, runs_to_delete
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Clean up training runs without exported models"
+     )
+     parser.add_argument(
+         "--dry-run",
+         action="store_true",
+         help="Show what would be deleted without actually deleting",
+     )
+     parser.add_argument(
+         "--verbose",
+         "-v",
+         action="store_true",
+         help="Show detailed information about runs",
+     )
+     parser.add_argument(
+         "--yes",
+         "-y",
+         action="store_true",
+         help="Skip confirmation prompt",
+     )
+
+     args = parser.parse_args()
+
+     # Check if runs directory exists
+     if not Path("runs").exists():
+         print("No 'runs' directory found. Nothing to clean.")
+         return
+
+     # Find runs to delete
+     print("Analyzing runs directory...\n")
+
+     # Do a dry run first to show what will be deleted
+     _, runs_to_delete = clean_runs(dry_run=True, verbose=args.verbose)
+
+     if not runs_to_delete:
+         return
+
+     # Ask for confirmation if not in dry-run mode and not auto-yes
+     if not args.dry_run and not args.yes and runs_to_delete:
+         print("\n" + "=" * 60)
+         response = input(f"Are you sure you want to delete {len(runs_to_delete)} run(s)? [y/N]: ")
+         if response.lower() != "y":
+             print("Cleanup cancelled")
+             return
+
+     # Perform actual cleanup if not dry run
+     if not args.dry_run:
+         print("\nPerforming cleanup...")
+         clean_runs(dry_run=False, verbose=args.verbose)
+
+
+ if __name__ == "__main__":
+     main()
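As a usage note: running python clean.py --dry-run --verbose previews which run directories would be removed, namely those whose timestamp matches no *_YYYYMMDD_HHMMSS.joblib export in the working directory, while python clean.py --yes deletes them without the confirmation prompt. The script always performs a dry-run pass first, so nothing is removed until the prompt (or --yes) confirms it.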
paper/pulse_core_1_technical_report.tex CHANGED
@@ -23,7 +23,7 @@
\maketitle

\begin{abstract}
- This paper presents Pulse Core 1, a Vietnamese banking aspect sentiment analysis system employing Term Frequency-Inverse Document Frequency (TF-IDF) feature extraction combined with machine learning classification algorithms. The system is evaluated on the UTS2017\_Bank aspect sentiment dataset containing 35 combined aspect-sentiment categories, achieving 68.18\% accuracy with Logistic Regression and 71.72\% accuracy with Support Vector Classification (SVC). The implementation utilizes a 20,000-dimensional TF-IDF feature space with n-gram analysis and incorporates hash-based caching for computational optimization. The model predicts combined aspect-sentiment labels in the format \texttt{<aspect>\#<sentiment>}, enabling fine-grained analysis of Vietnamese banking customer feedback across 14 banking aspects (ACCOUNT, CARD, CUSTOMER\_SUPPORT, etc.) and 3 sentiment polarities (positive, negative, neutral). These results establish baseline performance metrics for Vietnamese banking aspect sentiment analysis and demonstrate the efficacy of traditional machine learning approaches for Vietnamese financial domain natural language processing tasks.
+ This paper presents Pulse Core 1, a Vietnamese sentiment analysis system employing Term Frequency-Inverse Document Frequency (TF-IDF) feature extraction combined with machine learning classification algorithms. The system is evaluated on two Vietnamese datasets: (1) the VLSP2016 general sentiment dataset, achieving 70.19\% accuracy with Logistic Regression and 71.14\% accuracy with Support Vector Classification (SVC), and (2) the UTS2017\_Bank aspect sentiment dataset containing 35 combined aspect-sentiment categories, achieving 68.18\% accuracy with Logistic Regression and 71.72\% accuracy with SVC. The implementation utilizes a 20,000-dimensional TF-IDF feature space with configurable n-gram analysis and incorporates hash-based caching for computational optimization. For banking applications, the model predicts combined aspect-sentiment labels in the format \texttt{<aspect>\#<sentiment>}, enabling fine-grained analysis of Vietnamese banking customer feedback across 14 banking aspects and 3 sentiment polarities. These results establish baseline performance metrics for Vietnamese sentiment analysis across both general and domain-specific applications, demonstrating the efficacy of traditional machine learning approaches for Vietnamese natural language processing tasks.
\end{abstract}

\section{Introduction}
@@ -34,7 +34,7 @@ Vietnamese, spoken by approximately 95 million speakers globally, exhibits disti

Traditional machine learning approaches utilizing Term Frequency-Inverse Document Frequency (TF-IDF) vectorization with logistic regression maintain practical relevance for text classification tasks, particularly in resource-constrained computational environments \citep{pedregosa2011scikit}. These methodologies provide advantages in training efficiency, memory utilization, and model interpretability.

- This paper presents Pulse Core 1, a Vietnamese banking aspect sentiment analysis system implementing TF-IDF feature extraction with machine learning classification algorithms. The system is evaluated on the UTS2017\_Bank aspect sentiment dataset containing 1,977 Vietnamese banking documents across 35 combined aspect-sentiment categories, achieving competitive performance with traditional machine learning approaches. The system addresses the challenge of simultaneous aspect detection and sentiment classification for Vietnamese banking customer feedback, providing a computationally efficient solution for production deployment scenarios.
+ This paper presents Pulse Core 1, a Vietnamese sentiment analysis system implementing TF-IDF feature extraction with machine learning classification algorithms. The system is evaluated on two complementary datasets: (1) the VLSP2016 general sentiment dataset containing 6,150 Vietnamese reviews across 3 sentiment polarities, and (2) the UTS2017\_Bank aspect sentiment dataset containing 1,977 Vietnamese banking documents across 35 combined aspect-sentiment categories. The system achieves competitive performance with traditional machine learning approaches on both datasets, demonstrating its versatility for general sentiment analysis and specialized aspect-sentiment classification tasks. The dual-dataset evaluation provides comprehensive insights into Vietnamese sentiment analysis challenges across different domains and complexity levels.

\section{Related Work}

@@ -46,7 +46,9 @@ Initial research in Vietnamese text classification employed rule-based methodolo

\subsection{Vietnamese Text Classification Datasets}

- Contemporary Vietnamese aspect sentiment analysis research employs domain-specific datasets for banking applications:
+ Contemporary Vietnamese sentiment analysis research employs both general-purpose and domain-specific datasets:
+
+ \textbf{VLSP2016 Dataset}: A standard Vietnamese sentiment analysis corpus from the Vietnamese Language and Speech Processing workshop, containing 6,150 product and service reviews categorized into three sentiment polarities (positive, negative, neutral). The dataset provides a balanced distribution with 1,700 samples per class in the training set and 350 per class in the test set, establishing a benchmark for general Vietnamese sentiment classification tasks.

\textbf{UTS2017\_Bank Dataset}: A specialized corpus developed by the Underthesea NLP Team for Vietnamese banking aspect sentiment analysis. This dataset encompasses 14 banking aspects (ACCOUNT, CARD, CUSTOMER\_SUPPORT, DISCOUNT, INTEREST\_RATE, INTERNET\_BANKING, LOAN, MONEY\_TRANSFER, OTHER, PAYMENT, PROMOTION, SAVING, SECURITY, TRADEMARK) combined with 3 sentiment polarities (positive, negative, neutral), creating 35 unique aspect-sentiment combinations. The dataset represents specialized Vietnamese text classification challenges in the financial domain, focusing on customer feedback analysis and banking service categorization.

@@ -121,9 +123,11 @@ The system incorporates several optimization mechanisms to enhance computational

\subsection{Datasets}

- This study evaluates performance on the UTS2017\_Bank dataset, a specialized Vietnamese banking aspect sentiment analysis corpus representing the financial domain.
+ This study evaluates performance on two Vietnamese sentiment analysis datasets representing both general and domain-specific applications:
+
+ \textbf{VLSP2016 Dataset}: The VLSP2016 sentiment dataset contains 6,150 Vietnamese product and service reviews with balanced class distribution across three sentiment polarities. The dataset provides 5,100 training samples and 1,050 test samples, with each sentiment class (positive, negative, neutral) equally represented. This balanced distribution enables robust evaluation of general sentiment classification capabilities without class imbalance complications.

- The UTS2017\_Bank dataset contains 1,977 Vietnamese banking documents spanning 14 banking aspects combined with sentiment analysis, creating 35 unique aspect-sentiment categories. The dataset includes: account services, card services, customer support, discount offers, interest rates, internet banking, loans, money transfers, payments, promotions, savings, security features, trademark information, and miscellaneous services, each labeled with positive, negative, or neutral sentiment. The dataset exhibits significant class imbalance, with CUSTOMER\_SUPPORT\#negative (39\%) and TRADEMARK\#positive (35\%) categories dominating the distribution, while minority aspect-sentiment combinations have limited training examples.
+ \textbf{UTS2017\_Bank Dataset}: The UTS2017\_Bank dataset contains 1,977 Vietnamese banking documents spanning 14 banking aspects combined with sentiment analysis, creating 35 unique aspect-sentiment categories. The dataset includes: account services, card services, customer support, discount offers, interest rates, internet banking, loans, money transfers, payments, promotions, savings, security features, trademark information, and miscellaneous services, each labeled with positive, negative, or neutral sentiment. The dataset exhibits significant class imbalance, with CUSTOMER\_SUPPORT\#negative (39\%) and TRADEMARK\#positive (35\%) categories dominating the distribution, while minority aspect-sentiment combinations have limited training examples.

\begin{table}[h]
\centering
@@ -131,10 +135,11 @@ The UTS2017\_Bank dataset contains 1,977 Vietnamese banking documents spanning 1
\toprule
\textbf{Dataset} & \textbf{Classes} & \textbf{Training} & \textbf{Test} & \textbf{Domain} \\
\midrule
+ VLSP2016 & 3 & 5,100 & 1,050 & General Sentiment \\
UTS2017\_Bank & 35 & 1,581 & 396 & Banking Aspect Sentiment \\
\bottomrule
\end{tabular}
- \caption{Dataset characteristics for Vietnamese banking aspect sentiment analysis evaluation.}
+ \caption{Dataset characteristics for Vietnamese sentiment analysis evaluation.}
\label{tab:dataset_summary}
\end{table}

@@ -142,7 +147,7 @@ UTS2017\_Bank & 35 & 1,581 & 396 & Banking Aspect Sentiment \\

\subsection{Experimental Design}

- The experimental evaluation employs the UTS2017\_Bank dataset for Vietnamese banking aspect sentiment analysis. Performance assessment utilizes standard multi-class classification evaluation metrics focusing on combined aspect-sentiment prediction tasks.
+ The experimental evaluation employs both the VLSP2016 general sentiment dataset and the UTS2017\_Bank banking aspect sentiment dataset. This dual-dataset approach enables comprehensive assessment of model performance across different complexity levels and domain requirements. Performance assessment utilizes standard multi-class classification evaluation metrics for both simple sentiment classification and complex aspect-sentiment prediction tasks.

\subsubsection{Evaluation Metrics}

@@ -167,7 +172,7 @@ The experimental design incorporates comparative analysis against established ba
\end{itemize}

\textbf{Baseline Performance Establishment:}
- This work establishes the first comprehensive baseline for Vietnamese banking aspect sentiment analysis using traditional machine learning approaches. Table \ref{tab:comprehensive_comparison} presents performance results for the UTS2017\_Bank aspect sentiment analysis task:
+ This work establishes comprehensive baselines for Vietnamese sentiment analysis across both general and banking-specific domains using traditional machine learning approaches. Table \ref{tab:comprehensive_comparison} presents performance results for both datasets:

\begin{table}[h]
\centering
@@ -176,11 +181,15 @@ This work establishes the first comprehensive baseline for Vietnamese banking as
\hline
\textbf{Dataset} & \textbf{Method} & \textbf{Accuracy} \\
\hline
+ VLSP2016 (3 sentiments) & \textbf{Pulse Core 1 - SVC Linear (1-2 gram)} & \textbf{71.14\%} \\
+ VLSP2016 (3 sentiments) & \textbf{Pulse Core 1 - SVC Linear (1-3 gram)} & \textbf{70.67\%} \\
+ VLSP2016 (3 sentiments) & \textbf{Pulse Core 1 - Logistic Regression} & \textbf{70.19\%} \\
+ \hline
UTS2017\_Bank (35 aspect-sentiment) & \textbf{Pulse Core 1 - SVC with TF-IDF} & \textbf{71.72\%} \\
UTS2017\_Bank (35 aspect-sentiment) & \textbf{Pulse Core 1 - Logistic Regression with TF-IDF} & \textbf{68.18\%} \\
\hline
\end{tabular}
- \caption{Performance results for Vietnamese banking aspect sentiment analysis using TF-IDF-based traditional machine learning approaches.}
+ \caption{Performance results for Vietnamese sentiment analysis using TF-IDF-based traditional machine learning approaches across general and banking-specific datasets.}
\label{tab:comprehensive_comparison}
\end{table}

@@ -192,6 +201,22 @@ This section presents comprehensive experimental results across both Vietnamese

\subsubsection{Overall Performance Summary}

+ \textbf{VLSP2016 Dataset (General Sentiment Analysis):}
+ The system achieves strong performance on general Vietnamese sentiment classification:
+ \begin{itemize}
+ \item \textbf{Test Classification Accuracy (SVC Linear, 1-2 gram)}: 71.14\%
+ \item \textbf{Test Classification Accuracy (SVC Linear, 1-3 gram)}: 70.67\%
+ \item \textbf{Test Classification Accuracy (Logistic Regression)}: 70.19\%
+ \item \textbf{Training Latency (SVC)}: 24.95 seconds
+ \item \textbf{Training Latency (Logistic Regression)}: 0.75 seconds
+ \item \textbf{Inference Latency}: 0.76 seconds for 1,050 test samples (0.72 ms per sample)
+ \item \textbf{Training Samples}: 5,100 reviews
+ \item \textbf{Test Samples}: 1,050 reviews
+ \item \textbf{Number of Classes}: 3 sentiment polarities
+ \item \textbf{Weighted Average F1-Score (SVC)}: 0.713
+ \item \textbf{Weighted Average F1-Score (Logistic Regression)}: 0.703
+ \end{itemize}
+
\textbf{UTS2017\_Bank Dataset (Banking Aspect Sentiment Analysis):}
The system exhibits competitive performance on the banking aspect sentiment analysis task:
\begin{itemize}
@@ -208,6 +233,24 @@

\subsubsection{Detailed Per-Class Performance}

+ \textbf{VLSP2016 Dataset: Per-Class Results (SVC Linear - 71.14\% accuracy):}
+
+ \begin{table}[h]
+ \centering
+ \begin{tabular}{lcccc}
+ \toprule
+ Sentiment & Precision & Recall & F1-Score & Support \\
+ \midrule
+ Negative & 0.70 & 0.72 & 0.71 & 350 \\
+ Neutral & 0.65 & 0.69 & 0.67 & 350 \\
+ Positive & 0.80 & 0.72 & 0.76 & 350 \\
+ \midrule
+ Weighted Avg & 0.72 & 0.71 & 0.71 & 1,050 \\
+ \bottomrule
+ \end{tabular}
+ \caption{VLSP2016 per-class performance metrics for SVC Linear model}
+ \end{table}
+
\textbf{UTS2017\_Bank Dataset: Selected Per-Class Results (Logistic Regression - 68.18\% accuracy):}

The following table shows performance for the most represented aspect-sentiment categories:
@@ -346,6 +389,22 @@ Analysis of the banking aspect sentiment task reveals important insights about V
\item \textbf{Class Balance Sensitivity}: Performance correlates strongly with training data availability, with dominant categories (CUSTOMER\_SUPPORT, TRADEMARK) achieving strong results while minority aspect-sentiment combinations suffer from data sparsity.
\end{itemize}

+ \subsubsection{Cross-Dataset Performance Analysis}
+
+ \textbf{Comparative Insights:}
+ The evaluation across both VLSP2016 and UTS2017\_Bank datasets reveals important patterns:
+
+ \begin{itemize}
+ \item \textbf{Consistent SVC Performance}: SVC achieves approximately 71\% accuracy on both datasets despite vastly different complexity levels (3 classes vs. 35 classes), demonstrating robust generalization capabilities.
+ \item \textbf{N-gram Range Impact}: For VLSP2016, bigrams (1-2) outperform trigrams (1-3) by 0.47 percentage points, suggesting that local context is sufficient for general sentiment classification.
+ \item \textbf{Training Efficiency Trade-offs}:
+ \begin{itemize}
+ \item VLSP2016: Larger dataset requires more training time (24.95s for SVC vs. 0.75s for LR)
+ \item UTS2017\_Bank: Despite 35 classes, training is faster (5.3s for SVC) due to smaller dataset size
+ \end{itemize}
+ \item \textbf{Balanced vs. Imbalanced Performance}: VLSP2016's balanced distribution yields consistent per-class performance (0.67-0.76 F1), while UTS2017\_Bank's imbalanced distribution creates extreme performance variation (0.00-0.88 F1).
+ \end{itemize}
+
\section{Discussion}

\subsection{Research Implications}
@@ -421,16 +480,17 @@ The current investigation establishes several promising research trajectories fo

\section{Conclusion}

- This paper presents Pulse Core 1, a Vietnamese banking aspect sentiment analysis system that establishes the viability of systematically optimized traditional machine learning methodologies for financial domain applications. The investigation yields several significant findings:
+ This paper presents Pulse Core 1, a Vietnamese sentiment analysis system that establishes the viability of systematically optimized traditional machine learning methodologies for both general and domain-specific applications. The investigation yields several significant findings:

\begin{enumerate}
- \item Traditional machine learning approaches achieve competitive performance on Vietnamese banking aspect sentiment analysis tasks (71.72\% accuracy with SVC) while maintaining substantial computational efficiency advantages (5.3s training time).
- \item Feature engineering methodologies retain critical importance for Vietnamese banking applications, with the implemented 20,000-dimensional TF-IDF representation effectively capturing aspect-sentiment relationships across 35 combined categories.
- \item Class distribution imbalance constitutes the primary performance limitation for aspect sentiment analysis, with minority aspect-sentiment combinations achieving zero performance due to insufficient training data.
- \item The fundamental trade-off between algorithmic complexity and model interpretability substantially favors TF-IDF approaches for banking applications requiring transparency and regulatory compliance.
+ \item Traditional machine learning approaches achieve consistent performance across both general sentiment analysis (71.14\% on VLSP2016) and complex aspect-sentiment tasks (71.72\% on UTS2017\_Bank) while maintaining computational efficiency.
+ \item Feature engineering methodologies retain critical importance for Vietnamese applications, with the 20,000-dimensional TF-IDF representation effectively capturing sentiment patterns across both balanced 3-class and imbalanced 35-class scenarios.
+ \item N-gram analysis reveals that bigrams (1-2) provide optimal performance for Vietnamese sentiment classification, with trigrams offering minimal improvement while increasing computational overhead.
+ \item Class distribution significantly impacts performance, with balanced datasets (VLSP2016) yielding consistent per-class results while imbalanced datasets (UTS2017\_Bank) create extreme performance variations.
+ \item The fundamental trade-off between algorithmic complexity and model interpretability favors TF-IDF approaches for applications requiring transparency, rapid deployment, and regulatory compliance.
\end{enumerate}

- This research contributes to the Vietnamese financial NLP ecosystem by establishing the first comprehensive baseline for banking aspect sentiment analysis that optimally balances classification performance, computational efficiency, and model interpretability. The demonstrated effectiveness on combined aspect-sentiment prediction indicates substantial potential for Vietnamese banking customer feedback analysis applications.
+ This research contributes to the Vietnamese NLP ecosystem by establishing comprehensive baselines for both general sentiment analysis and banking-specific aspect-sentiment classification. The consistent performance across datasets of varying complexity demonstrates the robustness of traditional machine learning approaches for Vietnamese text classification. The demonstrated effectiveness on both simple sentiment and complex aspect-sentiment prediction indicates substantial potential for practical Vietnamese NLP applications across multiple domains.

Future research initiatives should prioritize class imbalance mitigation strategies for minority aspect-sentiment combinations, integration of banking domain-specific feature representations, and exploration of joint aspect-sentiment modeling approaches that capture the interdependence between banking aspects and customer sentiment.

@@ -478,6 +538,13 @@ Toan Pham Van and Ta Minh Thanh.

\section{Changelog}

+ \textbf{2025-09-29}
+ \begin{itemize}
+ \item Added VLSP2016 dataset evaluation and cross-dataset analysis
+ \item Updated performance metrics with multiple n-gram configurations
+ \item Enhanced comparative analysis between balanced and imbalanced datasets
+ \end{itemize}
+
\textbf{2025-09-28}
\begin{itemize}
\item Initial release of Pulse Core 1
runs/20250928_131527/metadata.json DELETED
@@ -1,1531 +0,0 @@
- {
-   "timestamp": "20250928_131527",
-   "config_name": "UTS2017_Bank_AspectSentiment_SVC_feat20k_ngram1-2",
-   "model_name": "svc_linear",
-   "classifier": "SVC",
-   "max_features": 20000,
-   "ngram_range": [1, 2],
-   "split_ratio": 0.2,
-   "n_samples": null,
-   "train_samples": 1581,
-   "test_samples": 396,
-   "unique_labels": 35,
-   "labels": [
-     "ACCOUNT#negative", "CARD#negative", "CARD#neutral", "CARD#positive",
-     "CUSTOMER_SUPPORT#negative", "CUSTOMER_SUPPORT#neutral", "CUSTOMER_SUPPORT#positive",
-     "DISCOUNT#negative", "DISCOUNT#neutral", "DISCOUNT#positive",
-     "INTEREST_RATE#negative", "INTEREST_RATE#neutral", "INTEREST_RATE#positive",
-     "INTERNET_BANKING#negative", "INTERNET_BANKING#neutral", "INTERNET_BANKING#positive",
-     "LOAN#negative", "LOAN#positive",
-     "MONEY_TRANSFER#negative", "MONEY_TRANSFER#positive",
-     "OTHER#negative", "OTHER#neutral", "OTHER#positive",
-     "PAYMENT#negative", "PAYMENT#positive",
-     "PROMOTION#negative", "PROMOTION#neutral", "PROMOTION#positive",
-     "SAVING#negative", "SAVING#neutral", "SAVING#positive",
-     "SECURITY#neutral", "SECURITY#positive",
-     "TRADEMARK#negative", "TRADEMARK#positive"
-   ],
-   "train_accuracy": 0.9430740037950665,
-   "test_accuracy": 0.7171717171717171,
-   "train_time": 7.737863779067993,
-   "prediction_time": 0.1107940673828125,
-   "classification_report": {
-     "ACCOUNT#negative": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 2.0},
-     "CARD#negative": {"precision": 1.0, "recall": 0.14285714285714285, "f1-score": 0.25, "support": 7.0},
-     "CARD#positive": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1.0},
-     "CUSTOMER_SUPPORT#negative": {"precision": 0.4931506849315068, "recall": 0.96, "f1-score": 0.6515837104072398, "support": 75.0},
-     "CUSTOMER_SUPPORT#neutral": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1.0},
-     "CUSTOMER_SUPPORT#positive": {"precision": 0.8641975308641975, "recall": 0.8974358974358975, "f1-score": 0.8805031446540881, "support": 78.0},
-     "DISCOUNT#negative": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 5.0},
-     "DISCOUNT#neutral": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1.0},
-     "DISCOUNT#positive": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 0.0},
-     "INTEREST_RATE#negative": {"precision": 0.5555555555555556, "recall": 0.5, "f1-score": 0.5263157894736842, "support": 10.0},
-     "INTERNET_BANKING#negative": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 12.0},
-     "INTERNET_BANKING#positive": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1.0},
-     "LOAN#negative": {"precision": 0.875, "recall": 0.5384615384615384, "f1-score": 0.6666666666666666, "support": 13.0},
-     "MONEY_TRANSFER#negative": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 5.0},
-     "OTHER#negative": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 10.0},
-     "OTHER#neutral": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1.0},
-     "OTHER#positive": {"precision": 0.6666666666666666, "recall": 0.5, "f1-score": 0.5714285714285714, "support": 4.0},
-     "PAYMENT#positive": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 3.0},
-     "PROMOTION#negative": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 4.0},
-     "PROMOTION#neutral": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1.0},
-     "PROMOTION#positive": {"precision": 0.3333333333333333, "recall": 0.2, "f1-score": 0.25, "support": 5.0},
-     "SAVING#negative": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1.0},
-     "SAVING#positive": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 2.0},
-     "SECURITY#negative": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1.0},
-     "SECURITY#neutral": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 1.0},
-     "TRADEMARK#negative": {"precision": 0.0, "recall": 0.0, "f1-score": 0.0, "support": 14.0},
-     "TRADEMARK#positive": {"precision": 0.8936170212765957, "recall": 0.9130434782608695, "f1-score": 0.9032258064516129, "support": 138.0},
-     "accuracy": 0.7171717171717171,
-     "macro avg": {"precision": 0.21042669602325392, "recall": 0.17228881692649806, "f1-score": 0.1740638403363653, "support": 396.0},
-     "weighted avg": {"precision": 0.6464059257634583, "recall": 0.7171717171717171, "f1-score": 0.6601230396489957, "support": 396.0}
-   },
-   "confusion_matrix": [
-     [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 1, 0, 0, 5, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
-     [0, 0, 0, 0, 72, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 6, 0, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
-     [0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 8, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
-     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
-     [0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 3],
-     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
-     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4],
-     [0, 0, 0, 0, 4, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 126]
-   ]
- }
runs/20250928_131527/models/labels.txt DELETED
@@ -1,35 +0,0 @@
- ACCOUNT#negative
- CARD#negative
- CARD#neutral
- CARD#positive
- CUSTOMER_SUPPORT#negative
- CUSTOMER_SUPPORT#neutral
- CUSTOMER_SUPPORT#positive
- DISCOUNT#negative
- DISCOUNT#neutral
- DISCOUNT#positive
- INTEREST_RATE#negative
- INTEREST_RATE#neutral
- INTEREST_RATE#positive
- INTERNET_BANKING#negative
- INTERNET_BANKING#neutral
- INTERNET_BANKING#positive
- LOAN#negative
- LOAN#positive
- MONEY_TRANSFER#negative
- MONEY_TRANSFER#positive
- OTHER#negative
- OTHER#neutral
- OTHER#positive
- PAYMENT#negative
- PAYMENT#positive
- PROMOTION#negative
- PROMOTION#neutral
- PROMOTION#positive
- SAVING#negative
- SAVING#neutral
- SAVING#positive
- SECURITY#neutral
- SECURITY#positive
- TRADEMARK#negative
- TRADEMARK#positive
runs/20250928_131527/training.log DELETED
@@ -1,62 +0,0 @@
- 2025-09-28 13:15:27,107 - INFO - Starting training run: 20250928_131527
- 2025-09-28 13:15:27,107 - INFO - Model: svc_linear
- 2025-09-28 13:15:27,107 - INFO - Max features: 20000
- 2025-09-28 13:15:27,107 - INFO - N-gram range: (1, 2)
- 2025-09-28 13:15:27,107 - INFO - Loading UTS2017_Bank aspect sentiment dataset...
- 2025-09-28 13:15:31,618 - INFO - Train samples: 1581
- 2025-09-28 13:15:31,618 - INFO - Test samples: 396
- 2025-09-28 13:15:31,618 - INFO - Unique labels: 35
- 2025-09-28 13:15:31,618 - INFO - Label distribution (train): {np.str_('ACCOUNT#negative'): np.int64(3), np.str_('CARD#negative'): np.int64(47), np.str_('CARD#neutral'): np.int64(1), np.str_('CARD#positive'): np.int64(10), np.str_('CUSTOMER_SUPPORT#negative'): np.int64(288), np.str_('CUSTOMER_SUPPORT#neutral'): np.int64(4), np.str_('CUSTOMER_SUPPORT#positive'): np.int64(328), np.str_('DISCOUNT#negative'): np.int64(13), np.str_('DISCOUNT#neutral'): np.int64(3), np.str_('DISCOUNT#positive'): np.int64(19), np.str_('INTEREST_RATE#negative'): np.int64(45), np.str_('INTEREST_RATE#neutral'): np.int64(1), np.str_('INTEREST_RATE#positive'): np.int64(4), np.str_('INTERNET_BANKING#negative'): np.int64(36), np.str_('INTERNET_BANKING#neutral'): np.int64(2), np.str_('INTERNET_BANKING#positive'): np.int64(19), np.str_('LOAN#negative'): np.int64(48), np.str_('LOAN#positive'): np.int64(13), np.str_('MONEY_TRANSFER#negative'): np.int64(24), np.str_('MONEY_TRANSFER#positive'): np.int64(5), np.str_('OTHER#negative'): np.int64(25), np.str_('OTHER#neutral'): np.int64(3), np.str_('OTHER#positive'): np.int64(26), np.str_('PAYMENT#negative'): np.int64(4), np.str_('PAYMENT#positive'): np.int64(8), np.str_('PROMOTION#negative'): np.int64(13), np.str_('PROMOTION#neutral'): np.int64(2), np.str_('PROMOTION#positive'): np.int64(28), np.str_('SAVING#negative'): np.int64(5), np.str_('SAVING#neutral'): np.int64(1), np.str_('SAVING#positive'): np.int64(4), np.str_('SECURITY#neutral'): np.int64(1), np.str_('SECURITY#positive'): np.int64(1), np.str_('TRADEMARK#negative'): np.int64(33), np.str_('TRADEMARK#positive'): np.int64(514)}
- 2025-09-28 13:15:31,618 - INFO - Label distribution (test): {np.str_('ACCOUNT#negative'): np.int64(2), np.str_('CARD#negative'): np.int64(7), np.str_('CARD#neutral'): np.int64(0), np.str_('CARD#positive'): np.int64(1), np.str_('CUSTOMER_SUPPORT#negative'): np.int64(75), np.str_('CUSTOMER_SUPPORT#neutral'): np.int64(1), np.str_('CUSTOMER_SUPPORT#positive'): np.int64(78), np.str_('DISCOUNT#negative'): np.int64(5), np.str_('DISCOUNT#neutral'): np.int64(1), np.str_('DISCOUNT#positive'): np.int64(0), np.str_('INTEREST_RATE#negative'): np.int64(10), np.str_('INTEREST_RATE#neutral'): np.int64(0), np.str_('INTEREST_RATE#positive'): np.int64(0), np.str_('INTERNET_BANKING#negative'): np.int64(12), np.str_('INTERNET_BANKING#neutral'): np.int64(0), np.str_('INTERNET_BANKING#positive'): np.int64(1), np.str_('LOAN#negative'): np.int64(13), np.str_('LOAN#positive'): np.int64(0), np.str_('MONEY_TRANSFER#negative'): np.int64(5), np.str_('MONEY_TRANSFER#positive'): np.int64(0), np.str_('OTHER#negative'): np.int64(10), np.str_('OTHER#neutral'): np.int64(1), np.str_('OTHER#positive'): np.int64(4), np.str_('PAYMENT#negative'): np.int64(0), np.str_('PAYMENT#positive'): np.int64(3), np.str_('PROMOTION#negative'): np.int64(4), np.str_('PROMOTION#neutral'): np.int64(1), np.str_('PROMOTION#positive'): np.int64(5), np.str_('SAVING#negative'): np.int64(1), np.str_('SAVING#neutral'): np.int64(0), np.str_('SAVING#positive'): np.int64(2), np.str_('SECURITY#neutral'): np.int64(1), np.str_('SECURITY#positive'): np.int64(0), np.str_('TRADEMARK#negative'): np.int64(14), np.str_('TRADEMARK#positive'): np.int64(138)}
- 2025-09-28 13:15:31,619 - INFO - Selected classifier: SVC
- 2025-09-28 13:15:31,619 - INFO - ============================================================
- 2025-09-28 13:15:31,619 - INFO - Training: UTS2017_Bank_AspectSentiment_SVC_feat20k_ngram1-2
- 2025-09-28 13:15:31,619 - INFO - ============================================================
- 2025-09-28 13:15:31,619 - INFO - Creating pipeline with max_features=20000, ngram_range=(1, 2)
- 2025-09-28 13:15:31,619 - INFO - Training model...
- 2025-09-28 13:15:39,357 - INFO - Training completed in 7.74 seconds
- 2025-09-28 13:15:39,357 - INFO - Evaluating on training set...
- 2025-09-28 13:15:39,803 - INFO - Training accuracy: 0.9431
- 2025-09-28 13:15:39,803 - INFO - Evaluating on test set...
- 2025-09-28 13:15:39,914 - INFO - Test accuracy: 0.7172
- 2025-09-28 13:15:39,914 - INFO - Prediction time: 0.11 seconds
- 2025-09-28 13:15:39,914 - INFO - Classification Report:
- 2025-09-28 13:15:39,918 - INFO - precision recall f1-score support
-
- ACCOUNT#negative 0.00 0.00 0.00 2
- CARD#negative 1.00 0.14 0.25 7
- CARD#positive 0.00 0.00 0.00 1
- CUSTOMER_SUPPORT#negative 0.49 0.96 0.65 75
- CUSTOMER_SUPPORT#neutral 0.00 0.00 0.00 1
- CUSTOMER_SUPPORT#positive 0.86 0.90 0.88 78
- DISCOUNT#negative 0.00 0.00 0.00 5
- DISCOUNT#neutral 0.00 0.00 0.00 1
- DISCOUNT#positive 0.00 0.00 0.00 0
- INTEREST_RATE#negative 0.56 0.50 0.53 10
- INTERNET_BANKING#negative 0.00 0.00 0.00 12
- INTERNET_BANKING#positive 0.00 0.00 0.00 1
- LOAN#negative 0.88 0.54 0.67 13
- MONEY_TRANSFER#negative 0.00 0.00 0.00 5
- OTHER#negative 0.00 0.00 0.00 10
- OTHER#neutral 0.00 0.00 0.00 1
- OTHER#positive 0.67 0.50 0.57 4
- PAYMENT#positive 0.00 0.00 0.00 3
- PROMOTION#negative 0.00 0.00 0.00 4
- PROMOTION#neutral 0.00 0.00 0.00 1
- PROMOTION#positive 0.33 0.20 0.25 5
- SAVING#negative 0.00 0.00 0.00 1
- SAVING#positive 0.00 0.00 0.00 2
- SECURITY#negative 0.00 0.00 0.00 1
- SECURITY#neutral 0.00 0.00 0.00 1
- TRADEMARK#negative 0.00 0.00 0.00 14
- TRADEMARK#positive 0.89 0.91 0.90 138
-
- accuracy 0.72 396
- macro avg 0.21 0.17 0.17 396
- weighted avg 0.65 0.72 0.66 396
-
- 2025-09-28 13:15:39,922 - INFO - Confusion Matrix shape: (35, 35)
- 2025-09-28 13:15:40,052 - INFO - Model saved to runs/20250928_131527/models/model.joblib
- 2025-09-28 13:15:40,181 - INFO - Model also saved as runs/20250928_131527/models/UTS2017_Bank_AspectSentiment_SVC_feat20k_ngram1-2.joblib
- 2025-09-28 13:15:40,181 - INFO - Label mapping saved to runs/20250928_131527/models/labels.txt
- 2025-09-28 13:15:40,182 - INFO - Metadata saved to runs/20250928_131527/metadata.json
runs/20250929_075333/metadata.json ADDED
@@ -0,0 +1,77 @@
+ {
+ "timestamp": "20250929_075333",
+ "dataset": "vlsp2016",
+ "dataset_name": "VLSP2016_Sentiment",
+ "config_name": "VLSP2016_Sentiment_LogisticRegression_feat20k_ngram1-2",
+ "model_name": "logistic",
+ "classifier": "LogisticRegression",
+ "max_features": 20000,
+ "ngram_range": [
+ 1,
+ 2
+ ],
+ "split_ratio": 0.2,
+ "n_samples": null,
+ "train_samples": 5100,
+ "test_samples": 1050,
+ "unique_labels": 3,
+ "labels": [
+ "negative",
+ "neutral",
+ "positive"
+ ],
+ "train_accuracy": 0.9205882352941176,
+ "test_accuracy": 0.7019047619047619,
+ "train_time": 0.7513909339904785,
+ "prediction_time": 0.03164172172546387,
+ "classification_report": {
+ "negative": {
+ "precision": 0.6843575418994413,
+ "recall": 0.7,
+ "f1-score": 0.692090395480226,
+ "support": 350.0
+ },
+ "neutral": {
+ "precision": 0.6522911051212938,
+ "recall": 0.6914285714285714,
+ "f1-score": 0.6712898751733704,
+ "support": 350.0
+ },
+ "positive": {
+ "precision": 0.778816199376947,
+ "recall": 0.7142857142857143,
+ "f1-score": 0.7451564828614009,
+ "support": 350.0
+ },
+ "accuracy": 0.7019047619047619,
+ "macro avg": {
+ "precision": 0.7051549487992274,
+ "recall": 0.7019047619047619,
+ "f1-score": 0.702845584504999,
+ "support": 1050.0
+ },
+ "weighted avg": {
+ "precision": 0.7051549487992275,
+ "recall": 0.7019047619047619,
+ "f1-score": 0.7028455845049991,
+ "support": 1050.0
+ }
+ },
+ "confusion_matrix": [
+ [
+ 245,
+ 70,
+ 35
+ ],
+ [
+ 72,
+ 242,
+ 36
+ ],
+ [
+ 41,
+ 59,
+ 250
+ ]
+ ]
+ }
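As a quick consistency check on the metadata above, the diagonal of the stored confusion_matrix reproduces the reported test_accuracy (each of the three test classes has 350 samples):

    # Sanity check: correct predictions sit on the confusion-matrix diagonal.
    cm = [[245, 70, 35], [72, 242, 36], [41, 59, 250]]
    correct = sum(cm[i][i] for i in range(3))  # 245 + 242 + 250 = 737
    total = sum(sum(row) for row in cm)        # 1050
    print(correct / total)                     # 0.7019047619047619 == test_accuracy
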
runs/{20250928_131527/models/UTS2017_Bank_AspectSentiment_SVC_feat20k_ngram1-2.joblib → 20250929_075333/models/VLSP2016_Sentiment_LogisticRegression_feat20k_ngram1-2.joblib} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fe6abcdbb83ea5ae3d75b585cb12a7ce3a054f5e269d8d2c204cb01e732e94b1
- size 2154772
+ oid sha256:5a1f7db895c7cce8e9dac4632d99f594bbaff87295008b061949ea03b7929c0d
+ size 1262400
runs/20250929_075333/models/labels.txt ADDED
@@ -0,0 +1,3 @@
+ negative
+ neutral
+ positive
runs/{20250928_131527 → 20250929_075333}/models/model.joblib RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fe6abcdbb83ea5ae3d75b585cb12a7ce3a054f5e269d8d2c204cb01e732e94b1
- size 2154772
+ oid sha256:5a1f7db895c7cce8e9dac4632d99f594bbaff87295008b061949ea03b7929c0d
+ size 1262400
runs/20250929_075333/training.log ADDED
@@ -0,0 +1,40 @@
+ 2025-09-29 07:53:33,898 - INFO - Starting training run: 20250929_075333
+ 2025-09-29 07:53:33,898 - INFO - Dataset: vlsp2016
+ 2025-09-29 07:53:33,898 - INFO - Model: logistic
+ 2025-09-29 07:53:33,898 - INFO - Max features: 20000
+ 2025-09-29 07:53:33,898 - INFO - N-gram range: (1, 2)
+ 2025-09-29 07:53:33,898 - INFO - Loading vlsp2016 dataset...
+ 2025-09-29 07:53:38,896 - INFO - Train samples: 5100
+ 2025-09-29 07:53:38,896 - INFO - Test samples: 1050
+ 2025-09-29 07:53:38,896 - INFO - Unique labels: 3
+ 2025-09-29 07:53:38,896 - INFO - Label distribution (train): {np.str_('negative'): np.int64(1700), np.str_('neutral'): np.int64(1700), np.str_('positive'): np.int64(1700)}
+ 2025-09-29 07:53:38,896 - INFO - Label distribution (test): {np.str_('negative'): np.int64(350), np.str_('neutral'): np.int64(350), np.str_('positive'): np.int64(350)}
+ 2025-09-29 07:53:38,896 - INFO - Selected classifier: LogisticRegression
+ 2025-09-29 07:53:38,897 - INFO - ============================================================
+ 2025-09-29 07:53:38,897 - INFO - Training: VLSP2016_Sentiment_LogisticRegression_feat20k_ngram1-2
+ 2025-09-29 07:53:38,897 - INFO - ============================================================
+ 2025-09-29 07:53:38,897 - INFO - Creating pipeline with max_features=20000, ngram_range=(1, 2)
+ 2025-09-29 07:53:38,897 - INFO - Training model...
+ 2025-09-29 07:53:39,648 - INFO - Training completed in 0.75 seconds
+ 2025-09-29 07:53:39,648 - INFO - Evaluating on training set...
+ 2025-09-29 07:53:39,963 - INFO - Training accuracy: 0.9206
+ 2025-09-29 07:53:39,963 - INFO - Evaluating on test set...
+ 2025-09-29 07:53:39,994 - INFO - Test accuracy: 0.7019
+ 2025-09-29 07:53:39,994 - INFO - Prediction time: 0.03 seconds
+ 2025-09-29 07:53:39,995 - INFO - Classification Report:
+ 2025-09-29 07:53:39,999 - INFO - precision recall f1-score support
+
+ negative 0.68 0.70 0.69 350
+ neutral 0.65 0.69 0.67 350
+ positive 0.78 0.71 0.75 350
+
+ accuracy 0.70 1050
+ macro avg 0.71 0.70 0.70 1050
+ weighted avg 0.71 0.70 0.70 1050
+
+ 2025-09-29 07:53:40,004 - INFO - Confusion Matrix shape: (3, 3)
+ 2025-09-29 07:53:40,165 - INFO - Model saved to runs/20250929_075333/models/model.joblib
+ 2025-09-29 07:53:40,402 - INFO - Model also saved as runs/20250929_075333/models/VLSP2016_Sentiment_LogisticRegression_feat20k_ngram1-2.joblib
+ 2025-09-29 07:53:40,751 - INFO - Model exported as ./vlsp2016_sentiment_20250929_075333.joblib
+ 2025-09-29 07:53:40,752 - INFO - Label mapping saved to runs/20250929_075333/models/labels.txt
+ 2025-09-29 07:53:40,753 - INFO - Metadata saved to runs/20250929_075333/metadata.json
runs/20250929_075529/metadata.json ADDED
@@ -0,0 +1,77 @@
+ {
+ "timestamp": "20250929_075529",
+ "dataset": "vlsp2016",
+ "dataset_name": "VLSP2016_Sentiment",
+ "config_name": "VLSP2016_Sentiment_SVC_feat20k_ngram1-2",
+ "model_name": "svc_linear",
+ "classifier": "SVC",
+ "max_features": 20000,
+ "ngram_range": [
+ 1,
+ 2
+ ],
+ "split_ratio": 0.2,
+ "n_samples": null,
+ "train_samples": 5100,
+ "test_samples": 1050,
+ "unique_labels": 3,
+ "labels": [
+ "negative",
+ "neutral",
+ "positive"
+ ],
+ "train_accuracy": 0.9456862745098039,
+ "test_accuracy": 0.7114285714285714,
+ "train_time": 24.953126907348633,
+ "prediction_time": 0.762152910232544,
+ "classification_report": {
+ "negative": {
+ "precision": 0.7030812324929971,
+ "recall": 0.7171428571428572,
+ "f1-score": 0.71004243281471,
+ "support": 350.0
+ },
+ "neutral": {
+ "precision": 0.648,
+ "recall": 0.6942857142857143,
+ "f1-score": 0.670344827586207,
+ "support": 350.0
+ },
+ "positive": {
+ "precision": 0.7955974842767296,
+ "recall": 0.7228571428571429,
+ "f1-score": 0.7574850299401198,
+ "support": 350.0
+ },
+ "accuracy": 0.7114285714285714,
+ "macro avg": {
+ "precision": 0.7155595722565756,
+ "recall": 0.7114285714285714,
+ "f1-score": 0.7126240967803456,
+ "support": 1050.0
+ },
+ "weighted avg": {
+ "precision": 0.7155595722565756,
+ "recall": 0.7114285714285714,
+ "f1-score": 0.7126240967803456,
+ "support": 1050.0
+ }
+ },
+ "confusion_matrix": [
+ [
+ 251,
+ 72,
+ 27
+ ],
+ [
+ 69,
+ 243,
+ 38
+ ],
+ [
+ 37,
+ 60,
+ 253
+ ]
+ ]
+ }
runs/20250929_075529/models/VLSP2016_Sentiment_SVC_feat20k_ngram1-2.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2ab63a5ec8c8f53581bc13d84ecbbe9912e84111c8cc163b9b8c85351a7d0b9
+ size 2947220
runs/20250929_075529/models/labels.txt ADDED
@@ -0,0 +1,3 @@
+ negative
+ neutral
+ positive
runs/20250929_075529/models/model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2ab63a5ec8c8f53581bc13d84ecbbe9912e84111c8cc163b9b8c85351a7d0b9
+ size 2947220
runs/20250929_075529/training.log ADDED
@@ -0,0 +1,40 @@
+ 2025-09-29 07:55:29,277 - INFO - Starting training run: 20250929_075529
+ 2025-09-29 07:55:29,277 - INFO - Dataset: vlsp2016
+ 2025-09-29 07:55:29,277 - INFO - Model: svc_linear
+ 2025-09-29 07:55:29,277 - INFO - Max features: 20000
+ 2025-09-29 07:55:29,277 - INFO - N-gram range: (1, 2)
+ 2025-09-29 07:55:29,277 - INFO - Loading vlsp2016 dataset...
+ 2025-09-29 07:55:34,138 - INFO - Train samples: 5100
+ 2025-09-29 07:55:34,138 - INFO - Test samples: 1050
+ 2025-09-29 07:55:34,138 - INFO - Unique labels: 3
+ 2025-09-29 07:55:34,138 - INFO - Label distribution (train): {np.str_('negative'): np.int64(1700), np.str_('neutral'): np.int64(1700), np.str_('positive'): np.int64(1700)}
+ 2025-09-29 07:55:34,138 - INFO - Label distribution (test): {np.str_('negative'): np.int64(350), np.str_('neutral'): np.int64(350), np.str_('positive'): np.int64(350)}
+ 2025-09-29 07:55:34,138 - INFO - Selected classifier: SVC
+ 2025-09-29 07:55:34,138 - INFO - ============================================================
+ 2025-09-29 07:55:34,138 - INFO - Training: VLSP2016_Sentiment_SVC_feat20k_ngram1-2
+ 2025-09-29 07:55:34,138 - INFO - ============================================================
+ 2025-09-29 07:55:34,138 - INFO - Creating pipeline with max_features=20000, ngram_range=(1, 2)
+ 2025-09-29 07:55:34,138 - INFO - Training model...
+ 2025-09-29 07:55:59,092 - INFO - Training completed in 24.95 seconds
+ 2025-09-29 07:55:59,092 - INFO - Evaluating on training set...
+ 2025-09-29 07:56:03,151 - INFO - Training accuracy: 0.9457
+ 2025-09-29 07:56:03,151 - INFO - Evaluating on test set...
+ 2025-09-29 07:56:03,913 - INFO - Test accuracy: 0.7114
+ 2025-09-29 07:56:03,913 - INFO - Prediction time: 0.76 seconds
+ 2025-09-29 07:56:03,913 - INFO - Classification Report:
+ 2025-09-29 07:56:03,918 - INFO - precision recall f1-score support
+
+ negative 0.70 0.72 0.71 350
+ neutral 0.65 0.69 0.67 350
+ positive 0.80 0.72 0.76 350
+
+ accuracy 0.71 1050
+ macro avg 0.72 0.71 0.71 1050
+ weighted avg 0.72 0.71 0.71 1050
+
+ 2025-09-29 07:56:03,923 - INFO - Confusion Matrix shape: (3, 3)
+ 2025-09-29 07:56:04,043 - INFO - Model saved to runs/20250929_075529/models/model.joblib
+ 2025-09-29 07:56:04,162 - INFO - Model also saved as runs/20250929_075529/models/VLSP2016_Sentiment_SVC_feat20k_ngram1-2.joblib
+ 2025-09-29 07:56:04,294 - INFO - Model exported as ./vlsp2016_sentiment_20250929_075529.joblib
+ 2025-09-29 07:56:04,295 - INFO - Label mapping saved to runs/20250929_075529/models/labels.txt
+ 2025-09-29 07:56:04,295 - INFO - Metadata saved to runs/20250929_075529/metadata.json
runs/20250929_075901/metadata.json ADDED
@@ -0,0 +1,77 @@
+ {
+ "timestamp": "20250929_075901",
+ "dataset": "vlsp2016",
+ "dataset_name": "VLSP2016_Sentiment",
+ "config_name": "VLSP2016_Sentiment_SVC_feat20k_ngram1-3",
+ "model_name": "svc_linear",
+ "classifier": "SVC",
+ "max_features": 20000,
+ "ngram_range": [
+ 1,
+ 3
+ ],
+ "split_ratio": 0.2,
+ "n_samples": null,
+ "train_samples": 5100,
+ "test_samples": 1050,
+ "unique_labels": 3,
+ "labels": [
+ "negative",
+ "neutral",
+ "positive"
+ ],
+ "train_accuracy": 0.9425490196078431,
+ "test_accuracy": 0.7066666666666667,
+ "train_time": 25.364684104919434,
+ "prediction_time": 0.7415199279785156,
+ "classification_report": {
+ "negative": {
+ "precision": 0.6988950276243094,
+ "recall": 0.7228571428571429,
+ "f1-score": 0.7106741573033708,
+ "support": 350.0
+ },
+ "neutral": {
+ "precision": 0.6485013623978202,
+ "recall": 0.68,
+ "f1-score": 0.6638772663877266,
+ "support": 350.0
+ },
+ "positive": {
+ "precision": 0.7819314641744548,
+ "recall": 0.7171428571428572,
+ "f1-score": 0.7481371087928465,
+ "support": 350.0
+ },
+ "accuracy": 0.7066666666666667,
+ "macro avg": {
+ "precision": 0.7097759513988615,
+ "recall": 0.7066666666666667,
+ "f1-score": 0.7075628441613145,
+ "support": 1050.0
+ },
+ "weighted avg": {
+ "precision": 0.7097759513988614,
+ "recall": 0.7066666666666667,
+ "f1-score": 0.7075628441613147,
+ "support": 1050.0
+ }
+ },
+ "confusion_matrix": [
+ [
+ 253,
+ 68,
+ 29
+ ],
+ [
+ 71,
+ 238,
+ 41
+ ],
+ [
+ 38,
+ 61,
+ 251
+ ]
+ ]
+ }
runs/20250929_075901/models/VLSP2016_Sentiment_SVC_feat20k_ngram1-3.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1743c4adf26bd8d1e118fe4de9409cea4219c5357edd7b57910cfe01afc43c20
+ size 3019140
runs/20250929_075901/models/labels.txt ADDED
@@ -0,0 +1,3 @@
+ negative
+ neutral
+ positive
runs/20250929_075901/models/model.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1743c4adf26bd8d1e118fe4de9409cea4219c5357edd7b57910cfe01afc43c20
+ size 3019140
runs/20250929_075901/training.log ADDED
@@ -0,0 +1,40 @@
+ 2025-09-29 07:59:01,513 - INFO - Starting training run: 20250929_075901
+ 2025-09-29 07:59:01,513 - INFO - Dataset: vlsp2016
+ 2025-09-29 07:59:01,513 - INFO - Model: svc_linear
+ 2025-09-29 07:59:01,513 - INFO - Max features: 20000
+ 2025-09-29 07:59:01,513 - INFO - N-gram range: (1, 3)
+ 2025-09-29 07:59:01,513 - INFO - Loading vlsp2016 dataset...
+ 2025-09-29 07:59:05,847 - INFO - Train samples: 5100
+ 2025-09-29 07:59:05,847 - INFO - Test samples: 1050
+ 2025-09-29 07:59:05,848 - INFO - Unique labels: 3
+ 2025-09-29 07:59:05,848 - INFO - Label distribution (train): {np.str_('negative'): np.int64(1700), np.str_('neutral'): np.int64(1700), np.str_('positive'): np.int64(1700)}
+ 2025-09-29 07:59:05,848 - INFO - Label distribution (test): {np.str_('negative'): np.int64(350), np.str_('neutral'): np.int64(350), np.str_('positive'): np.int64(350)}
+ 2025-09-29 07:59:05,848 - INFO - Selected classifier: SVC
+ 2025-09-29 07:59:05,848 - INFO - ============================================================
+ 2025-09-29 07:59:05,848 - INFO - Training: VLSP2016_Sentiment_SVC_feat20k_ngram1-3
+ 2025-09-29 07:59:05,848 - INFO - ============================================================
+ 2025-09-29 07:59:05,848 - INFO - Creating pipeline with max_features=20000, ngram_range=(1, 3)
+ 2025-09-29 07:59:05,849 - INFO - Training model...
+ 2025-09-29 07:59:31,214 - INFO - Training completed in 25.36 seconds
+ 2025-09-29 07:59:31,214 - INFO - Evaluating on training set...
+ 2025-09-29 07:59:35,246 - INFO - Training accuracy: 0.9425
+ 2025-09-29 07:59:35,246 - INFO - Evaluating on test set...
+ 2025-09-29 07:59:35,987 - INFO - Test accuracy: 0.7067
+ 2025-09-29 07:59:35,987 - INFO - Prediction time: 0.74 seconds
+ 2025-09-29 07:59:35,987 - INFO - Classification Report:
+ 2025-09-29 07:59:35,992 - INFO - precision recall f1-score support
+
+ negative 0.70 0.72 0.71 350
+ neutral 0.65 0.68 0.66 350
+ positive 0.78 0.72 0.75 350
+
+ accuracy 0.71 1050
+ macro avg 0.71 0.71 0.71 1050
+ weighted avg 0.71 0.71 0.71 1050
+
+ 2025-09-29 07:59:35,997 - INFO - Confusion Matrix shape: (3, 3)
+ 2025-09-29 07:59:36,117 - INFO - Model saved to runs/20250929_075901/models/model.joblib
+ 2025-09-29 07:59:36,233 - INFO - Model also saved as runs/20250929_075901/models/VLSP2016_Sentiment_SVC_feat20k_ngram1-3.joblib
+ 2025-09-29 07:59:36,351 - INFO - Model exported as ./vlsp2016_sentiment_20250929_075901.joblib
+ 2025-09-29 07:59:36,352 - INFO - Label mapping saved to runs/20250929_075901/models/labels.txt
+ 2025-09-29 07:59:36,352 - INFO - Metadata saved to runs/20250929_075901/metadata.json
train.py CHANGED
@@ -1,8 +1,8 @@
  #!/usr/bin/env python3
  """
- Training script for Vietnamese aspect sentiment classification.
- Trains TF-IDF + ML models on UTS2017_Bank aspect sentiment dataset.
- This script trains various machine learning models for Vietnamese banking aspect sentiment analysis.
+ Training script for Vietnamese sentiment classification.
+ Trains TF-IDF + ML models on VLSP2016 sentiment dataset.
+ This script trains various machine learning models for Vietnamese sentiment analysis.
  """
 
  import argparse
@@ -86,8 +86,13 @@ def load_uts2017_data(split_ratio=0.2, random_state=42, n_samples=None):
 
      # Apply sample limit if specified
      if n_samples and n_samples < len(texts):
-         texts = texts[:n_samples]
-         labels = labels[:n_samples]
+         # Shuffle before sampling to get balanced classes
+         indices = np.arange(len(texts))
+         np.random.seed(random_state)
+         np.random.shuffle(indices)
+         indices = indices[:n_samples]
+         texts = [texts[i] for i in indices]
+         labels = [labels[i] for i in indices]
 
      # Convert to numpy arrays for consistency
      X = np.array(texts)
@@ -117,6 +122,99 @@ def load_uts2017_data(split_ratio=0.2, random_state=42, n_samples=None):
      return (X_train, y_train), (X_test, y_test)
 
 
+ def load_vlsp2016_data(use_predefined_split=True, split_ratio=0.2, random_state=42, n_samples=None):
+     """Load and prepare VLSP2016 sentiment dataset
+     Args:
+         use_predefined_split: If True, use the predefined train/test split from the dataset
+         split_ratio: Ratio for train/test split (only used if use_predefined_split is False)
+         random_state: Random seed for reproducibility
+         n_samples: Optional limit on number of samples
+     Returns:
+         Tuple of (X_train, y_train), (X_test, y_test)
+     """
+     print("Loading VLSP2016 sentiment dataset from Hugging Face...")
+
+     # Load the dataset
+     dataset = load_dataset("ura-hcmut/vlsp2016")
+
+     if use_predefined_split:
+         # Use the predefined train/test split
+         train_data = dataset["train"]
+         test_data = dataset["test"]
+
+         # Extract texts and labels
+         X_train = [item["Data"] for item in train_data]
+         y_train = [item["Class"] for item in train_data]
+         X_test = [item["Data"] for item in test_data]
+         y_test = [item["Class"] for item in test_data]
+
+         # Apply sample limit if specified
+         if n_samples:
+             if n_samples < len(X_train):
+                 # Shuffle before sampling to get balanced classes
+                 indices = np.arange(len(X_train))
+                 np.random.seed(random_state)
+                 np.random.shuffle(indices)
+                 indices = indices[:n_samples]
+                 X_train = [X_train[i] for i in indices]
+                 y_train = [y_train[i] for i in indices]
+             if n_samples < len(X_test):
+                 # Proportionally reduce test set with shuffling
+                 test_samples = int(n_samples * 0.2)  # Keep similar ratio
+                 indices = np.arange(len(X_test))
+                 np.random.seed(random_state)
+                 np.random.shuffle(indices)
+                 indices = indices[:test_samples]
+                 X_test = [X_test[i] for i in indices]
+                 y_test = [y_test[i] for i in indices]
+
+         # Convert to numpy arrays
+         X_train = np.array(X_train)
+         y_train = np.array(y_train)
+         X_test = np.array(X_test)
+         y_test = np.array(y_test)
+     else:
+         # Combine train and test, then create custom split
+         all_data = list(dataset["train"]) + list(dataset["test"])
+
+         # Extract texts and labels
+         texts = [item["Data"] for item in all_data]
+         labels = [item["Class"] for item in all_data]
+
+         # Apply sample limit if specified
+         if n_samples and n_samples < len(texts):
+             texts = texts[:n_samples]
+             labels = labels[:n_samples]
+
+         # Convert to numpy arrays
+         X = np.array(texts)
+         y = np.array(labels)
+
+         # Split into train and test sets
+         # Use stratify only if we have enough samples per class (at least 2)
+         min_samples_per_class = 2
+         unique_classes, class_counts = np.unique(y, return_counts=True)
+         can_stratify = all(count >= min_samples_per_class for count in class_counts)
+
+         if can_stratify:
+             X_train, X_test, y_train, y_test = train_test_split(
+                 X, y, test_size=split_ratio, random_state=random_state, stratify=y
+             )
+         else:
+             print(
+                 f"Warning: Some classes have fewer than {min_samples_per_class} samples. Disabling stratification."
+             )
+             X_train, X_test, y_train, y_test = train_test_split(
+                 X, y, test_size=split_ratio, random_state=random_state
+             )
+
+     print(f"Dataset loaded: {len(X_train)} train samples, {len(X_test)} test samples")
+     print(f"Number of unique labels: {len(set(y_train))}")
+     print(f"Labels: {sorted(set(y_train))}")
+
+     return (X_train, y_train), (X_test, y_test)
+
+
  def get_available_models():
      """Get available classifier options"""
      return {
@@ -139,7 +237,35 @@ def get_available_models():
      }
 
 
+ def load_data(dataset_name="vlsp2016", split_ratio=0.2, random_state=42, n_samples=None):
+     """Load data from the specified dataset
+     Args:
+         dataset_name: Name of the dataset to load ('vlsp2016' or 'uts2017')
+         split_ratio: Ratio for train/test split
+         random_state: Random seed for reproducibility
+         n_samples: Optional limit on number of samples
+     Returns:
+         Tuple of (X_train, y_train), (X_test, y_test), dataset_display_name
+     """
+     if dataset_name.lower() == "vlsp2016":
+         (X_train, y_train), (X_test, y_test) = load_vlsp2016_data(
+             use_predefined_split=True, split_ratio=split_ratio,
+             random_state=random_state, n_samples=n_samples
+         )
+         display_name = "VLSP2016_Sentiment"
+     elif dataset_name.lower() == "uts2017":
+         (X_train, y_train), (X_test, y_test) = load_uts2017_data(
+             split_ratio=split_ratio, random_state=random_state, n_samples=n_samples
+         )
+         display_name = "UTS2017_Bank_AspectSentiment"
+     else:
+         raise ValueError(f"Unknown dataset: {dataset_name}. Choose 'vlsp2016' or 'uts2017'")
+
+     return (X_train, y_train), (X_test, y_test), display_name
+
+
  def train_model(
+     dataset="vlsp2016",
      model_name="logistic",
      max_features=20000,
      ngram_range=(1, 2),
@@ -149,6 +275,7 @@ def train_model(
  ):
      """Train a single model with specified parameters
      Args:
+         dataset: Name of the dataset to use ('vlsp2016' or 'uts2017')
          model_name: Name of the model to train ('logistic' or 'svc')
          max_features: Maximum number of features for TF-IDF vectorizer
          ngram_range: N-gram range for feature extraction
@@ -162,6 +289,7 @@ def train_model(
      run_dir = setup_logging(timestamp)
 
      logging.info(f"Starting training run: {timestamp}")
+     logging.info(f"Dataset: {dataset}")
      logging.info(f"Model: {model_name}")
      logging.info(f"Max features: {max_features}")
      logging.info(f"N-gram range: {ngram_range}")
@@ -173,11 +301,10 @@ def train_model(
      os.makedirs(output_folder, exist_ok=True)
 
      # Load data
-     logging.info("Loading UTS2017_Bank aspect sentiment dataset...")
-     (X_train, y_train), (X_test, y_test) = load_uts2017_data(
-         split_ratio=split_ratio, n_samples=n_samples
+     logging.info(f"Loading {dataset} dataset...")
+     (X_train, y_train), (X_test, y_test), dataset_name = load_data(
+         dataset_name=dataset, split_ratio=split_ratio, random_state=42, n_samples=n_samples
      )
-     dataset_name = "UTS2017_Bank_AspectSentiment"
 
      # Get unique labels for reporting
      unique_labels = sorted(set(y_train))
@@ -275,9 +402,10 @@ def train_model(
 
      # Export model if requested
      if export_model:
-         # Use format: uts2017_sentiment_<timestamp>.joblib
+         # Use format: <dataset>_sentiment_<timestamp>.joblib
          run_id = os.path.basename(run_dir)
-         export_filename = f"uts2017_sentiment_{run_id}.joblib"
+         dataset_prefix = dataset.lower()
+         export_filename = f"{dataset_prefix}_sentiment_{run_id}.joblib"
          export_path = os.path.join(".", export_filename)
          joblib.dump(text_clf, export_path)
          logging.info(f"Model exported as {export_path}")
@@ -293,6 +421,8 @@ def train_model(
      # Save metadata
      metadata = {
          "timestamp": timestamp,
+         "dataset": dataset,
+         "dataset_name": dataset_name,
          "config_name": config_name,
          "model_name": model_name,
          "classifier": clf_name,
@@ -334,13 +464,13 @@ def train_model(
      return metadata
 
 
- def train_all_configurations(models=None, num_rows=None):
+ def train_all_configurations(dataset="vlsp2016", models=None, num_rows=None):
      """Train multiple model configurations and compare results"""
      timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
      run_dir = setup_logging(timestamp)
 
      logging.info(f"Starting comparison run: {timestamp}")
-     logging.info("Dataset: UTS2017_Bank_AspectSentiment")
+     logging.info(f"Dataset: {dataset}")
      if num_rows:
          logging.info(f"Sample limit: {num_rows}")
 
@@ -357,6 +487,7 @@ def train_all_configurations(models=None, num_rows=None):
          if model_name in ["svc_rbf", "gradient_boost", "ada_boost", "mlp"]:
              # Use fewer features for computationally expensive models
              configurations.append({
+                 "dataset": dataset,
                  "model_name": model_name,
                  "max_features": 10000,
                  "ngram_range": (1, 2),
@@ -365,6 +496,7 @@ def train_all_configurations(models=None, num_rows=None):
          else:
              # Use more features for faster models
              configurations.append({
+                 "dataset": dataset,
                  "model_name": model_name,
                  "max_features": 20000,
                  "ngram_range": (1, 2),
@@ -416,22 +548,23 @@
      return results
 
 
- def train_notebook(model_name="logistic", max_features=20000, ngram_min=1, ngram_max=2,
+ def train_notebook(dataset="vlsp2016", model_name="logistic", max_features=20000, ngram_min=1, ngram_max=2,
                     split_ratio=0.2, n_samples=None, compare=False, export_model=False):
      """
      Convenience function for training in Jupyter/Colab notebooks without argparse.
      Example usage:
          from train import train_notebook
-         train_notebook(model_name="logistic", max_features=20000, export_model=True)
+         train_notebook(dataset="vlsp2016", model_name="logistic", max_features=20000, export_model=True)
      """
      if compare:
-         print("Training and comparing multiple configurations...")
-         return train_all_configurations()
+         print(f"Training and comparing multiple configurations on {dataset}...")
+         return train_all_configurations(dataset=dataset)
      else:
-         print(f"Training {model_name} model on UTS2017_Bank_AspectSentiment dataset...")
+         print(f"Training {model_name} model on {dataset} dataset...")
          print(f"Configuration: max_features={max_features}, ngram=({ngram_min}, {ngram_max})")
 
          return train_model(
+             dataset=dataset,
              model_name=model_name,
              max_features=max_features,
              ngram_range=(ngram_min, ngram_max),
@@ -448,7 +581,14 @@ def main():
      in_notebook = hasattr(sys, 'ps1') or 'ipykernel' in sys.modules or 'google.colab' in sys.modules
 
      parser = argparse.ArgumentParser(
-         description="Train Vietnamese aspect sentiment classification model on UTS2017_Bank dataset"
+         description="Train Vietnamese sentiment classification model on various datasets"
+     )
+     parser.add_argument(
+         "--dataset",
+         type=str,
+         choices=["vlsp2016", "uts2017"],
+         default="vlsp2016",
+         help="Dataset to use for training (default: vlsp2016)",
      )
      parser.add_argument(
          "--model",
@@ -505,23 +645,24 @@ def main():
      if args.compare or args.compare_models:
          if args.compare_models:
              print(f"Training and comparing selected models: {args.compare_models}")
-             print("Dataset: UTS2017_Bank_AspectSentiment")
+             print(f"Dataset: {args.dataset}")
              if args.num_rows:
                  print(f"Using {args.num_rows} rows")
-             train_all_configurations(models=args.compare_models, num_rows=args.num_rows)
+             train_all_configurations(dataset=args.dataset, models=args.compare_models, num_rows=args.num_rows)
          else:
              print("Training and comparing all available models...")
-             print("Dataset: UTS2017_Bank_AspectSentiment")
+             print(f"Dataset: {args.dataset}")
              if args.num_rows:
                  print(f"Using {args.num_rows} rows")
-             train_all_configurations(num_rows=args.num_rows)
+             train_all_configurations(dataset=args.dataset, num_rows=args.num_rows)
      else:
-         print(f"Training {args.model} model on UTS2017_Bank_AspectSentiment dataset...")
+         print(f"Training {args.model} model on {args.dataset} dataset...")
          print(
              f"Configuration: max_features={args.max_features}, ngram=({args.ngram_min}, {args.ngram_max})"
          )
 
          train_model(
+             dataset=args.dataset,
              model_name=args.model,
              max_features=args.max_features,
              ngram_range=(args.ngram_min, args.ngram_max),
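With the diff above applied, the dataset switch is available end to end. A minimal usage sketch that relies only on parameters visible in this diff (the metadata keys come from the metadata.json files shown earlier):

    from train import train_notebook

    # Train on the balanced 3-class VLSP2016 split and export the fitted
    # pipeline as ./vlsp2016_sentiment_<run_id>.joblib in the working
    # directory; pass dataset="uts2017" for the 35-class bank task instead.
    metadata = train_notebook(
        dataset="vlsp2016",
        model_name="logistic",  # "svc_linear" is slower but slightly more accurate
        max_features=20000,
        ngram_min=1,
        ngram_max=2,
        export_model=True,
    )
    print(metadata["test_accuracy"])
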
vlsp2016_sentiment_20250929_075333.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a1f7db895c7cce8e9dac4632d99f594bbaff87295008b061949ea03b7929c0d
+ size 1262400
vlsp2016_sentiment_20250929_075529.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2ab63a5ec8c8f53581bc13d84ecbbe9912e84111c8cc163b9b8c85351a7d0b9
+ size 2947220
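The two LFS pointers above are the exported pipelines themselves. A minimal loading sketch, assuming each export is the full scikit-learn pipeline passed to joblib.dump() in train.py; the input sentence is illustrative:

    import joblib

    # Load the exported TF-IDF + LogisticRegression pipeline; predictions
    # are the string labels listed in runs/20250929_075333/models/labels.txt.
    model = joblib.load("vlsp2016_sentiment_20250929_075333.joblib")
    print(model.predict(["Dịch vụ rất tốt"]))  # one of: negative, neutral, positive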