# -*- coding: utf-8 -*-
"""text_classification.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1D25W7EYF5v1a0FoSHKAcyVhwMMIU6yg4
"""
!pip install transformers datasets
!pip install torch
# Ultra-Simple Arabic Product Classifier with Enhanced Training
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np
from collections import Counter
# Load and preprocess your data
print("Loading and preprocessing data...")
df = pd.read_excel('/content/Copy ofمنتجات مقاهي (1).xlsx', sheet_name='products')
# Keep only the two relevant columns: 'اسم المنتج' (product name)
# and 'التصنيف المحاسبي' (accounting classification)
df = df[['اسم المنتج', 'التصنيف المحاسبي']].dropna()
# Prepare text and labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['التصنيف المحاسبي'])
texts = df['اسم المنتج'].tolist()
print(f"Loaded {len(texts)} products with {len(set(labels))} unique categories.")
print(f"Categories: {list(label_encoder.classes_)}")
# Check class distribution and handle single-sample classes
label_counts = Counter(labels)
print("Class distribution:")
for label_id, count in sorted(label_counts.items()):
    label_name = label_encoder.inverse_transform([label_id])[0]
    print(f"  {label_name}: {count} samples")
# Separate single-sample classes from multi-sample classes
single_sample_mask = np.array([label_counts[label] == 1 for label in labels])
multi_sample_mask = ~single_sample_mask
# Get indices for single and multi sample data
single_indices = np.where(single_sample_mask)[0]
multi_indices = np.where(multi_sample_mask)[0]
print(f"\nSingle-sample classes: {np.sum(single_sample_mask)} samples")
print(f"Multi-sample classes: {np.sum(multi_sample_mask)} samples")
if np.sum(multi_sample_mask) > 0:
    # Split multi-sample data with stratification
    multi_texts = [texts[i] for i in multi_indices]
    multi_labels = [labels[i] for i in multi_indices]
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        multi_texts, multi_labels, test_size=0.2, random_state=42, stratify=multi_labels
    )
    # Add single-sample data to the training set (they cannot be split)
    if np.sum(single_sample_mask) > 0:
        single_texts = [texts[i] for i in single_indices]
        single_labels = [labels[i] for i in single_indices]
        train_texts.extend(single_texts)
        train_labels.extend(single_labels)
        print(f"Added {len(single_texts)} single-sample items to the training set")
else:
    # If all classes have single samples, fall back to a simple split without stratification
    print("Warning: all or most classes have only one sample. Using a simple split.")
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )
print(f"Training set: {len(train_texts)} samples")
print(f"Validation set: {len(val_texts)} samples")
# Load Arabic BERT
model_name = "asafaya/bert-base-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels)))
# Define Enhanced Dataset class
class SimpleDataset(torch.utils.data.Dataset):
    """Tokenizes product names on the fly and pairs them with encoded labels."""

    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
# Create datasets
train_dataset = SimpleDataset(train_texts, train_labels, tokenizer)
val_dataset = SimpleDataset(val_texts, val_labels, tokenizer)
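# Quick sanity check (an addition, not in the original notebook): confirm one
# encoded sample has the expected tensor shapes before starting a long training run.
sample = train_dataset[0]
print(f"Sample input_ids shape: {sample['input_ids'].shape}, label id: {sample['labels'].item()}")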
# Define compute metrics function for evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}
# Enhanced Training setup with evaluation
# Note: older transformers versions use `evaluation_strategy` instead of `eval_strategy`.
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=50,
    per_device_train_batch_size=16,          # increased the batch size from 8 to 16
    per_device_eval_batch_size=16,           # batch size for evaluation
    eval_strategy="epoch",                   # evaluate after every epoch
    save_strategy="epoch",                   # save a checkpoint after every epoch
    logging_steps=10,                        # log more frequently
    save_total_limit=2,                      # keep at most 2 checkpoints on disk
    load_best_model_at_end=True,             # reload the best checkpoint when training ends
    metric_for_best_model="eval_accuracy",   # metric used to select the best model
    greater_is_better=True,                  # higher accuracy is better
    report_to="none",                        # "none" (not None) disables external logging
    warmup_steps=100,                        # learning-rate warmup steps
    weight_decay=0.01,                       # regularization to curb overfitting
    learning_rate=2e-5,                      # tuned learning rate
)
# Enhanced Trainer instance with evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,        # validation data for per-epoch evaluation
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # metric computation for evaluation
)
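# Optional (an assumption, not in the original notebook): since
# load_best_model_at_end and metric_for_best_model are already configured,
# transformers' EarlyStoppingCallback could cut the 50-epoch run short once
# eval accuracy stops improving. The patience value is illustrative.
# from transformers import EarlyStoppingCallback
# trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=5))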
# Start training with evaluation
print("Training started with evaluation...")
trainer.train()
# Save model, tokenizer, and label encoder
trainer.save_model('./model')
tokenizer.save_pretrained('./model')
joblib.dump(label_encoder, './model/labels.pkl')
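# Optional (an assumption, not in the original notebook): writing the label
# names into the model config makes the checkpoint self-describing, so the
# class names survive even if labels.pkl is lost.
model.config.id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
model.config.label2id = {label: i for i, label in enumerate(label_encoder.classes_)}
model.config.save_pretrained('./model')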
print("Training complete! Model saved to './model'")
# Enhanced prediction function with batch processing capability
def predict(text):
    """Predict the classification of a single product name."""
    # Note: reloading the model on every call is slow; the standalone loading
    # section further below loads everything once instead.
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_id = outputs.logits.argmax().item()
    confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
    classification = label_encoder.inverse_transform([predicted_id])[0]
    return classification, confidence
def predict_batch(texts):
    """Predict multiple products at once for faster processing."""
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
    confidences = torch.nn.functional.softmax(outputs.logits, dim=-1).max(dim=-1)[0].cpu().numpy()
    classifications = label_encoder.inverse_transform(predictions)
    return list(zip(classifications, confidences))
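# A minimal sketch (an assumption, not in the original notebook): the same
# batch inference placed on a GPU when one is available. `predict_batch_on`
# is a hypothetical helper name, not part of the original code.
def predict_batch_on(texts, device=None):
    device = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model').to(device)
    label_encoder = joblib.load('./model/labels.pkl')
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    confidences, predicted_ids = probs.max(dim=-1)
    classifications = label_encoder.inverse_transform(predicted_ids.cpu().numpy())
    return list(zip(classifications, confidences.cpu().numpy()))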
# Evaluate on validation set
print("\nEvaluating on validation set...")
val_predictions = []
val_confidences = []
for text in val_texts:
    pred, conf = predict(text)
    val_predictions.append(pred)
    val_confidences.append(conf)
# Convert back to numeric for comparison
val_pred_numeric = label_encoder.transform(val_predictions)
accuracy = accuracy_score(val_labels, val_pred_numeric)
print(f"Validation Accuracy: {accuracy:.4f}")
# Detailed classification report
val_true_labels = label_encoder.inverse_transform(val_labels)
print("\nDetailed Classification Report:")
# Passing labels= explicitly keeps the report aligned with target_names even
# when the validation set does not contain every category.
print(classification_report(
    val_true_labels, val_predictions,
    labels=list(label_encoder.classes_),
    target_names=list(label_encoder.classes_),
    zero_division=0
))
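# Optional (an addition, not in the original notebook): a confusion matrix
# shows which categories the model confuses with each other.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(val_true_labels, val_predictions, labels=list(label_encoder.classes_))
print(pd.DataFrame(cm, index=label_encoder.classes_, columns=label_encoder.classes_))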
# Test examples
test_products = [
    "نادك حليب طويل الأجل 1 لتر",  # Nadec long-life milk, 1 liter
    "قهوة عربية محمصة",            # roasted Arabic coffee
    "شاي أحمر ليبتون",             # Lipton black tea
    "عصير برتقال طبيعي"            # natural orange juice
]
print("\n" + "="*50)
print("Testing on sample products:")
print("="*50)
for product in test_products:
    result, confidence = predict(product)
    print(f"Product: {product}")
    print(f"Classification: {result}")
    print(f"Confidence: {confidence:.3f}")
    print("-" * 30)
# Batch prediction example
print("\nBatch prediction example:")
batch_results = predict_batch(test_products)
for product, (classification, confidence) in zip(test_products, batch_results):
    print(f"{product} -> {classification} ({confidence:.3f})")
print(f"\nModel training complete!")
print(f"- Single prediction: predict('product name')")
print(f"- Batch prediction: predict_batch(['product1', 'product2', ...])")
print(f"- Validation accuracy: {accuracy:.4f}")
print(f"- Model saved to: './model'")
# Using the trained model (without retraining)
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib
print("Loading trained model...")
# Load model and tools (only once)
try:
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')
    print("Model loaded successfully!")
    print(f"Number of available categories: {len(label_encoder.classes_)}")
    # Display available categories
    print("\nAvailable categories:")
    for i, category in enumerate(label_encoder.classes_, 1):
        print(f"{i:2d}. {category}")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Make sure './model' folder exists and contains required files")
    exit()
# Basic classification function
def classify_product(product_name):
    """Classify a single product"""
    try:
        # Prepare text
        inputs = tokenizer(
            product_name,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )
        # Prediction
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract result
        predicted_id = outputs.logits.argmax().item()
        confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
        classification = label_encoder.inverse_transform([predicted_id])[0]
        return {
            'product': product_name,
            'classification': classification,
            'confidence': confidence,
            'success': True
        }
    except Exception as e:
        return {
            'product': product_name,
            'classification': None,
            'confidence': 0,
            'success': False,
            'error': str(e)
        }
# Function to classify multiple products
def classify_multiple_products(product_list):
    """Classify a list of products"""
    results = []
    print(f"Classifying {len(product_list)} products...")
    for i, product in enumerate(product_list, 1):
        result = classify_product(product)
        results.append(result)
        if result['success']:
            print(f"{i:3d}. {product}")
            print(f"     → {result['classification']}")
            print(f"     → Confidence: {result['confidence']:.3f}")
        else:
            print(f"{i:3d}. {product} - Error: {result['error']}")
        print()
    return results
# Test examples
test_products = [
    "نادك حليب طويل الأجل 1 لتر",  # Nadec long-life milk, 1 liter
    "قهوة عربية محمصة",            # roasted Arabic coffee
    "شاي أحمر ليبتون",             # Lipton black tea
    "منظف أرضيات فلاش",            # Flash floor cleaner
    "سكر أبيض ناعم",               # fine white sugar
    "عصير برتقال طبيعي"            # natural orange juice
]
print("\n" + "="*60)
print("Testing model on sample products")
print("="*60)
# Classify test products
test_results = classify_multiple_products(test_products)
# Quick statistics
successful_predictions = [r for r in test_results if r['success']]
# Guard against division by zero if every prediction failed
avg_confidence = (sum(r['confidence'] for r in successful_predictions) / len(successful_predictions)
                  if successful_predictions else 0.0)
print("="*60)
print("Results summary:")
print(f"Successfully classified {len(successful_predictions)} products")
print(f"Average confidence level: {avg_confidence:.3f}")
# Display unique classifications
unique_classifications = set(r['classification'] for r in successful_predictions)
print(f"Number of categories used: {len(unique_classifications)}")
print("Categories:")
for classification in sorted(unique_classifications):
    count = sum(1 for r in successful_predictions if r['classification'] == classification)
    print(f"  • {classification} ({count} products)")
print("\n" + "="*60)
print("Model ready for use!")
print("="*60)
print("Usage:")
print("result = classify_product('product name')")
print("print(f\"Classification: {result['classification']}\")")
print("print(f\"Confidence: {result['confidence']:.3f}\")")
print("\nFor multiple products:")
print("products = ['product 1', 'product 2', 'product 3']")
print("results = classify_multiple_products(products)")
# A perfume, likely outside the café-products domain the model was trained on
test_product = 'عطر كروم ليجند للرجال او دي تواليت من ازارو 125 مل'  # Azzaro Chrome Legend men's eau de toilette, 125 ml
result, confidence = predict(test_product)
print(f"\nTest: {test_product}")
print(f"Result: {result}")
print(f"Confidence: {confidence:.3f}")
"""# Saving The model"""
# Save the model, tokenizer, and label encoder together
model.save_pretrained('/content/my_model')
tokenizer.save_pretrained('/content/my_model')
joblib.dump(label_encoder, '/content/my_model/labels.pkl')

# Later, to load the model again:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('/content/my_model')

# Zip only after everything is saved, otherwise the archive would be missing
# the tokenizer and label encoder
!zip -r my_model.zip /content/my_model/

from google.colab import files
files.download('my_model.zip')
"""# Testing"""
!ls /content/my_model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import joblib
# Define the path where files are saved
save_path = '/content/my_model'
# Load the tokenizer, model, and label encoder
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForSequenceClassification.from_pretrained(save_path)
label_encoder = joblib.load(f'{save_path}/labels.pkl')
def predict(text):
    # Preprocess the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
    # Get the predicted class ID and confidence
    predicted_id = outputs.logits.argmax().item()
    confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
    # Map the ID back to the label name
    classification = label_encoder.inverse_transform([predicted_id])[0]
    return classification, confidence
# Test a product
test_product = "نادك حليب طويل الأجل 1 لتر"
result, confidence = predict(test_product)
print(f"Test Product: {test_product}")
print(f"Predicted Category: {result}")
print(f"Confidence: {confidence:.3f}")
# Test a product
test_product = "زبادى"
result, confidence = predict(test_product)
print(f"Test Product: {test_product}")
print(f"Predicted Category: {result}")
print(f"Confidence: {confidence:.3f}")
# Test a product
test_product = "بترول"
result, confidence = predict(test_product)
print(f"Test Product: {test_product}")
print(f"Predicted Category: {result}")
print(f"Confidence: {confidence:.3f}")
# Optional Colab cell: upload files interactively
from google.colab import files
uploaded = files.upload()