"""text_classification.ipynb |
|
|
|
Automatically generated by Colab. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1D25W7EYF5v1a0FoSHKAcyVhwMMIU6yg4 |
|
""" |
|
|
|
!pip install transformers datasets
!pip install torch

import pandas as pd
import numpy as np
import torch
import joblib
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

print("Loading and preprocessing data...") |
|
df = pd.read_excel('/content/Copy ofمنتجات مقاهي (1).xlsx', sheet_name='products') |
|
df = df[['اسم المنتج', 'التصنيف المحاسبي']].dropna() |
|
|
|
|
|
label_encoder = LabelEncoder() |
|
labels = label_encoder.fit_transform(df['التصنيف المحاسبي']) |
|
texts = df['اسم المنتج'].tolist() |
|
|
|
print(f"Loaded {len(texts)} products with {len(set(labels))} unique categories.") |
|
print(f"Categories: {list(label_encoder.classes_)}") |
|
|
|
|
|
# Inspect the class distribution
label_counts = Counter(labels)
print("Class distribution:")
for label_id, count in sorted(label_counts.items()):
    label_name = label_encoder.inverse_transform([label_id])[0]
    print(f"  {label_name}: {count} samples")

# Classes with a single sample cannot be stratified, so separate them out
single_sample_mask = np.array([label_counts[label] == 1 for label in labels])
multi_sample_mask = ~single_sample_mask

single_indices = np.where(single_sample_mask)[0]
multi_indices = np.where(multi_sample_mask)[0]

print(f"\nSingle-sample classes: {np.sum(single_sample_mask)} samples")
print(f"Multi-sample classes: {np.sum(multi_sample_mask)} samples")

if np.sum(multi_sample_mask) > 0:
    # Stratified split over the classes that have at least two samples
    multi_texts = [texts[i] for i in multi_indices]
    multi_labels = [labels[i] for i in multi_indices]

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        multi_texts, multi_labels, test_size=0.2, random_state=42, stratify=multi_labels
    )

    # Single-sample classes go entirely into the training set
    if np.sum(single_sample_mask) > 0:
        single_texts = [texts[i] for i in single_indices]
        single_labels = [labels[i] for i in single_indices]

        train_texts.extend(single_texts)
        train_labels.extend(single_labels)

        print(f"Added {len(single_texts)} single-sample items to training set")
else:
    print("Warning: All or most classes have single samples. Using simple split.")
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

print(f"Training set: {len(train_texts)} samples")
print(f"Validation set: {len(val_texts)} samples")

model_name = "asafaya/bert-base-arabic" |
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
|
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels))) |
|
|
|
|
|
class SimpleDataset(torch.utils.data.Dataset):
    """Wraps the raw texts and labels, tokenizing one example at a time."""

    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


train_dataset = SimpleDataset(train_texts, train_labels, tokenizer)
val_dataset = SimpleDataset(val_texts, val_labels, tokenizer)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

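# Optional (a sketch): on an imbalanced catalogue, macro-averaged F1 is often more
# informative than plain accuracy. This variant is not wired into the Trainer below;
# swap it in for compute_metrics if desired.
from sklearn.metrics import f1_score

def compute_metrics_with_f1(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'macro_f1': f1_score(labels, predictions, average='macro'),
    }
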
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=50,                  # high epoch count; the best checkpoint is restored at the end
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    report_to="none",                     # disable wandb/tensorboard reporting
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

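# Optional (a sketch, not part of the original pipeline): with 50 epochs on a small
# dataset, early stopping can save time. transformers provides EarlyStoppingCallback,
# which requires load_best_model_at_end=True and matching eval/save strategies — both
# already set in training_args above. Flip the flag to try it.
USE_EARLY_STOPPING = False
if USE_EARLY_STOPPING:
    from transformers import EarlyStoppingCallback
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
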
print("Training started with evaluation...") |
|
trainer.train() |
|
|
|
|
|
trainer.save_model('./model') |
|
tokenizer.save_pretrained('./model') |
|
joblib.dump(label_encoder, './model/labels.pkl') |
|
|
|
print("Training complete! Model saved to './model'") |
|
|
|
|
|
def predict(text):
    """Predict the category of a single product name.

    Note: the tokenizer, model, and label encoder are reloaded from disk on every
    call, which keeps the function self-contained but makes it slow in loops.
    """
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_id = outputs.logits.argmax().item()
    confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
    classification = label_encoder.inverse_transform([predicted_id])[0]

    return classification, confidence


def predict_batch(texts):
    """Predict multiple products at once for faster processing."""
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')

    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
    confidences = torch.nn.functional.softmax(outputs.logits, dim=-1).max(dim=-1)[0].cpu().numpy()
    classifications = label_encoder.inverse_transform(predictions)

    return list(zip(classifications, confidences))

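# Optional (a sketch): predict() and predict_batch() above reload the model from disk on
# every call, which dominates runtime. make_predictor is a hypothetical helper (not part
# of the original script) that loads everything once and reuses the GPU if available.
def make_predictor(model_dir='./model'):
    tok = AutoTokenizer.from_pretrained(model_dir)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_dir)
    enc = joblib.load(f'{model_dir}/labels.pkl')
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    mdl.to(device)
    mdl.eval()

    def _predict(texts):
        # Accept a single string or a list of strings
        if isinstance(texts, str):
            texts = [texts]
        inputs = tok(texts, return_tensors='pt', truncation=True, padding=True, max_length=128).to(device)
        with torch.no_grad():
            logits = mdl(**inputs).logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
        confidences, predicted_ids = probs.max(dim=-1)
        categories = enc.inverse_transform(predicted_ids.cpu().numpy())
        return list(zip(categories, confidences.cpu().numpy()))

    return _predict
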
print("\nEvaluating on validation set...") |
|
val_predictions = [] |
|
val_confidences = [] |
|
|
|
for text in val_texts: |
|
pred, conf = predict(text) |
|
val_predictions.append(pred) |
|
val_confidences.append(conf) |
|
|
|
|
|
val_pred_numeric = label_encoder.transform(val_predictions) |
|
accuracy = accuracy_score(val_labels, val_pred_numeric) |
|
print(f"Validation Accuracy: {accuracy:.4f}") |
|
|
|
|
|
val_true_labels = label_encoder.inverse_transform(val_labels) |
|
print("\nDetailed Classification Report:") |
|
print(classification_report(val_true_labels, val_predictions, target_names=label_encoder.classes_)) |
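# Optional (a sketch): a confusion matrix shows which categories get mixed up with each
# other, which the aggregate accuracy number hides.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(val_true_labels, val_predictions, labels=list(label_encoder.classes_))
cm_df = pd.DataFrame(cm, index=label_encoder.classes_, columns=label_encoder.classes_)
print("\nConfusion matrix (rows = true, columns = predicted):")
print(cm_df)
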
test_products = [
    "نادك حليب طويل الأجل 1 لتر",
    "قهوة عربية محمصة",
    "شاي أحمر ليبتون",
    "عصير برتقال طبيعي"
]

print("\n" + "="*50)
print("Testing on sample products:")
print("="*50)

for product in test_products:
    result, confidence = predict(product)
    print(f"Product: {product}")
    print(f"Classification: {result}")
    print(f"Confidence: {confidence:.3f}")
    print("-" * 30)

print("\nBatch prediction example:")
batch_results = predict_batch(test_products)
for product, (classification, confidence) in zip(test_products, batch_results):
    print(f"{product} -> {classification} ({confidence:.3f})")

print("\nModel training complete!")
print("- Single prediction: predict('product name')")
print("- Batch prediction: predict_batch(['product1', 'product2', ...])")
print(f"- Validation accuracy: {accuracy:.4f}")
print("- Model saved to: './model'")

# Standalone inference cell: reload the saved artifacts from './model'
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib

print("Loading trained model...")

try:
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')
    print("Model loaded successfully!")
    print(f"Number of available categories: {len(label_encoder.classes_)}")

    print("\nAvailable categories:")
    for i, category in enumerate(label_encoder.classes_, 1):
        print(f"{i:2d}. {category}")

except Exception as e:
    print(f"Error loading model: {e}")
    print("Make sure './model' folder exists and contains required files")
    exit()

def classify_product(product_name):
    """Classify a single product"""
    try:
        inputs = tokenizer(
            product_name,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )

        with torch.no_grad():
            outputs = model(**inputs)

        predicted_id = outputs.logits.argmax().item()
        confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
        classification = label_encoder.inverse_transform([predicted_id])[0]

        return {
            'product': product_name,
            'classification': classification,
            'confidence': confidence,
            'success': True
        }

    except Exception as e:
        return {
            'product': product_name,
            'classification': None,
            'confidence': 0,
            'success': False,
            'error': str(e)
        }

def classify_multiple_products(product_list):
    """Classify a list of products"""
    results = []

    print(f"Classifying {len(product_list)} products...")

    for i, product in enumerate(product_list, 1):
        result = classify_product(product)
        results.append(result)

        if result['success']:
            print(f"{i:3d}. {product}")
            print(f"     → {result['classification']}")
            print(f"     → Confidence: {result['confidence']:.3f}")
        else:
            print(f"{i:3d}. {product} - Error: {result['error']}")
        print()

    return results

test_products = [
    "نادك حليب طويل الأجل 1 لتر",
    "قهوة عربية محمصة",
    "شاي أحمر ليبتون",
    "منظف أرضيات فلاش",
    "سكر أبيض ناعم",
    "عصير برتقال طبيعي"
]

print("\n" + "="*60)
print("Testing model on sample products")
print("="*60)

test_results = classify_multiple_products(test_products)

successful_predictions = [r for r in test_results if r['success']]
# Guard against division by zero if every prediction failed
avg_confidence = (sum(r['confidence'] for r in successful_predictions) / len(successful_predictions)
                  if successful_predictions else 0.0)

print("="*60)
print("Results summary:")
print(f"Successfully classified {len(successful_predictions)} products")
print(f"Average confidence level: {avg_confidence:.3f}")

unique_classifications = set(r['classification'] for r in successful_predictions)
print(f"Number of categories used: {len(unique_classifications)}")
print("Categories:")
for classification in sorted(unique_classifications):
    count = sum(1 for r in successful_predictions if r['classification'] == classification)
    print(f"  • {classification} ({count} products)")

print("\n" + "="*60) |
|
print("Model ready for use!") |
|
print("="*60) |
|
print("Usage:") |
|
print("result = classify_product('product name')") |
|
print("print(f\"Classification: {result['classification']}\")") |
|
print("print(f\"Confidence: {result['confidence']:.3f}\")") |
|
|
|
print("\nFor multiple products:") |
|
print("products = ['product 1', 'product 2', 'product 3']") |
|
print("results = classify_multiple_products(products)") |
|
|
|
# Test on a product outside the café catalogue (a men's perfume)
test_product = 'عطر كروم ليجند للرجال او دي تواليت من ازارو 125 مل'
result, confidence = predict(test_product)

print(f"\nTest: {test_product}")
print(f"Result: {result}")
print(f"Confidence: {confidence:.3f}")

"""# Saving The model""" |
|
|
|
|
|
model.save_pretrained('/content/my_model/') |
|
|
|
|
|
from transformers import BertForSequenceClassification |
|
model = BertForSequenceClassification.from_pretrained('/content/my_model/') |
|
|
|
!zip -r my_model.zip /content/my_model/ |
|
|
|
tokenizer.save_pretrained('/content/my_model') |
|
model.save_pretrained('/content/my_model') |
|
import joblib |
|
joblib.dump(label_encoder, '/content/my_model/labels.pkl') |
|
|
|
from google.colab import files |
|
files.download('my_model.zip') |
|
|
|
"""# Testing""" |
|
|
|
!ls /content/my_model |
|
|
|
|
|
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import joblib

save_path = '/content/my_model'

tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForSequenceClassification.from_pretrained(save_path)
label_encoder = joblib.load(f'{save_path}/labels.pkl')

def predict(text):
    """Predict the category and confidence for a single product name."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    with torch.no_grad():
        outputs = model(**inputs)

    predicted_id = outputs.logits.argmax().item()
    confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
    classification = label_encoder.inverse_transform([predicted_id])[0]

    return classification, confidence

test_product = "نادك حليب طويل الأجل 1 لتر" |
|
result, confidence = predict(test_product) |
|
|
|
print(f"Test Product: {test_product}") |
|
print(f"Predicted Category: {result}") |
|
print(f"Confidence: {confidence:.3f}") |
|
|
|
|
|
test_product = "زبادى" |
|
result, confidence = predict(test_product) |
|
|
|
print(f"Test Product: {test_product}") |
|
print(f"Predicted Category: {result}") |
|
print(f"Confidence: {confidence:.3f}") |
|
|
|
|
|
test_product = "بترول" |
|
result, confidence = predict(test_product) |
|
|
|
print(f"Test Product: {test_product}") |
|
print(f"Predicted Category: {result}") |
|
print(f"Confidence: {confidence:.3f}") |
|
|
|
# Optionally upload another file (e.g., a spreadsheet of new product names) for further testing
from google.colab import files
uploaded = files.upload()

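# Optional (a sketch): if the uploaded file is another Excel sheet of product names, it
# can be classified with the predict() helper above. classify_uploaded_excel is a
# hypothetical helper, and the column name 'اسم المنتج' is an assumption about the
# uploaded file's layout.
import pandas as pd

def classify_uploaded_excel(filename, column='اسم المنتج'):
    new_df = pd.read_excel(filename).dropna(subset=[column])
    for name in new_df[column].astype(str):
        category, confidence = predict(name)
        print(f"{name} -> {category} ({confidence:.3f})")

# Example usage once a file has been uploaded:
# for filename in uploaded:
#     classify_uploaded_excel(filename)
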