import pickle

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

OFFLINE_MODEL_PATH = "all-MiniLM-L6-v2"


# ==============================================================================
# STEP 1: DEFINE THE MODEL ARCHITECTURE
# This MUST be the exact same class definition you used for training.
# ==============================================================================
class ImprovedMultiTaskClassifier(nn.Module):
    """Transformer encoder with two independent classification heads.

    The encoder output is mean-pooled over non-masked tokens and fed to a
    keyword head and a group head. Both heads emit raw logits; callers apply
    the sigmoid themselves.

    Args:
        model_name: Name or local path passed to ``AutoModel.from_pretrained``.
        num_keywords: Number of outputs of the keyword head.
        num_groups: Number of outputs of the group head.
        dropout_rate: Dropout probability used inside both heads.
    """

    def __init__(self, model_name, num_keywords, num_groups, dropout_rate=0.1):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        hidden_size = self.transformer.config.hidden_size

        # Attribute names and the layer order inside each head determine the
        # state_dict keys, so they must stay exactly as they were at training
        # time. The keyword head is built first, as before.
        self.keyword_classifier = self._build_head(
            hidden_size, num_keywords, dropout_rate
        )
        self.group_classifier = self._build_head(
            hidden_size, num_groups, dropout_rate
        )

    @staticmethod
    def _build_head(hidden_size, num_outputs, dropout_rate):
        """Build one head: two Linear/LayerNorm/ReLU/Dropout stages + output layer."""
        half_size = hidden_size // 2
        return nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, half_size),
            nn.LayerNorm(half_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(half_size, num_outputs),
        )

    def forward(self, input_ids, attention_mask):
        """Return ``(keyword_logits, group_logits)`` for a tokenized batch.

        Args:
            input_ids: Token ids forwarded to the transformer.
            attention_mask: Mask forwarded to the transformer and reused as
                the weights for mean pooling (masked positions contribute 0).
        """
        outputs = self.transformer(
            input_ids=input_ids, attention_mask=attention_mask
        )
        token_embeddings = outputs.last_hidden_state

        # Broadcast the mask to the embedding shape so masked tokens are
        # zeroed out before summing over dim 1.
        attention_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        sum_embeddings = torch.sum(token_embeddings * attention_mask_expanded, 1)
        # Clamp guards against division by zero for an all-masked row.
        sum_mask = torch.clamp(attention_mask_expanded.sum(1), min=1e-9)
        pooled_output = sum_embeddings / sum_mask

        keyword_logits = self.keyword_classifier(pooled_output)
        group_logits = self.group_classifier(pooled_output)
        return keyword_logits, group_logits


# ==============================================================================
# STEP 2: LOAD ALL SAVED COMPONENTS
# ==============================================================================
print("Loading all components for inference...")

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load config
# NOTE: pickle.load can execute arbitrary code; only load artifacts you trust.
with open('minilm_keyword_classifier_gemini/inference_config.pkl', 'rb') as f:
    config = pickle.load(f)

# *** IMPORTANT: Override the model_name to use the local path ***
config['model_name'] = OFFLINE_MODEL_PATH

# Load tokenizer.
# The script used to load a tokenizer from OFFLINE_MODEL_PATH first and then
# immediately overwrite it with this one; only the saved inference tokenizer
# was ever used, so the redundant first load has been dropped.
tokenizer = AutoTokenizer.from_pretrained('minilm_keyword_classifier_gemini/inference_tokenizer')

# Load label encoders
with open('minilm_keyword_classifier_gemini/inference_mlb_keywords.pkl', 'rb') as f:
    mlb_keywords = pickle.load(f)
with open('minilm_keyword_classifier_gemini/inference_mlb_groups.pkl', 'rb') as f:
    mlb_groups = pickle.load(f)

# Instantiate the model architecture
num_keywords = len(mlb_keywords.classes_)
num_groups = len(mlb_groups.classes_)
model = ImprovedMultiTaskClassifier(config['model_name'], num_keywords, num_groups).to(device)

# Load the trained weights
# NOTE: torch.load unpickles the checkpoint; only load checkpoints you trust.
model.load_state_dict(torch.load('minilm_keyword_classifier_gemini/inference_model.pth', map_location=device))

# Set model to evaluation mode (very important!)
model.eval()

print("✅ All components loaded and model is ready for inference.")


# ==============================================================================
# STEP 3: CREATE THE PREDICTION FUNCTION (MODIFIED TO INCLUDE SCORES)
# ==============================================================================
def _labels_above_threshold(classes, probs, threshold):
    """Return ``(label, score)`` pairs with ``score > threshold``, highest first."""
    selected = [(classes[i], probs[i]) for i in np.where(probs > threshold)[0]]
    selected.sort(key=lambda pair: pair[1], reverse=True)
    return selected


def predict_on_text(text: str):
    """Predict keywords and groups for ``text`` together with their scores.

    Args:
        text: Input text; it is truncated/padded to 512 tokens by the tokenizer.

    Returns:
        A dict with two keys, ``'predicted_keywords_with_scores'`` and
        ``'predicted_groups_with_scores'``. Each value is a list of
        ``(label, probability)`` tuples containing only labels whose sigmoid
        probability is strictly greater than the corresponding threshold in
        ``config`` (``'optimal_keyword_threshold'`` /
        ``'optimal_group_threshold'``), sorted by probability descending.
    """
    with torch.no_grad():
        # NOTE(review): padding='max_length' pads every input to 512 tokens;
        # kept as-is so the numerics match the previous behaviour.
        encoding = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors='pt',
        )
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        keyword_logits, group_logits = model(input_ids, attention_mask)

        # Independent sigmoid per label; [0] selects the single
        # input text's row from the batch.
        keyword_probs = torch.sigmoid(keyword_logits).cpu().numpy()[0]
        group_probs = torch.sigmoid(group_logits).cpu().numpy()[0]

    return {
        'predicted_keywords_with_scores': _labels_above_threshold(
            mlb_keywords.classes_, keyword_probs, config['optimal_keyword_threshold']
        ),
        'predicted_groups_with_scores': _labels_above_threshold(
            mlb_groups.classes_, group_probs, config['optimal_group_threshold']
        ),
    }


# TODO: loop through all CSV files in the automarked\todo folder, read the
# 'content' column and run every row's content through predict_on_text.
# Sketch (requires `import glob` and `import csv`):
# for file in glob.glob('automarked\\todo\\*.csv'):
#     with open(file, 'r', newline='', encoding='utf-8', errors='ignore') as f:
#         reader = csv.DictReader(f)
#         for row in reader:
#             text = row['content']

text = (
    "I want you to understand, people think there are many problems in the world. "
    "There are no many problems in the world. "
    "There's only one problem in the world – human being. "
    "What other problem, I'm asking"
)

dpred = predict_on_text(text)
for label, score in dpred['predicted_groups_with_scores']:
    # Third column is a quick visual check against a fixed 0.5 cut-off.
    print(label, score, score > 0.5)