"""Helpers for comparing NER model predictions against DNRTI gold labels.

The module converts character-span predictions into per-token tags, maps
model-specific label names onto the DNRTI label names, and times inference.
"""
import pickle
import time

import numpy as np

try:
    from tqdm import tqdm
except ImportError:  # tqdm only draws a progress bar; keep working without it
    def tqdm(iterable, *args, **kwargs):
        """Fallback for a missing tqdm: hand back the iterable unchanged."""
        return iterable


def predictions_to_iob(sentences_tokens, predictions, max_span_tokens=5):
    """Project character-span predictions onto whitespace-joined tokens.

    Character offsets are interpreted relative to ``" ".join(tokens)``, i.e.
    every token is followed by exactly one space.

    Args:
        sentences_tokens: Iterable of token lists, one list per sentence.
        predictions: Iterable parallel to ``sentences_tokens``. Each item is
            an iterable of dicts providing ``'start'``, ``'end'`` (character
            offsets, end exclusive) and ``'entity'`` (the tag to assign).
        max_span_tokens: Maximum number of tokens, counted from the token
            containing ``start``, that a single prediction may tag. ``None``
            removes the limit. Defaults to 5, the previously hard-coded value.

    Returns:
        A list with one tag list per sentence. Tokens not covered by any
        prediction are tagged ``'O'``; covered tokens receive the prediction's
        ``'entity'`` value verbatim. Later predictions overwrite earlier ones.
    """
    pred_sequences = []
    for tokens, preds in zip(sentences_tokens, predictions):
        pred_tags = ['O'] * len(tokens)
        if not tokens:
            # Nothing to tag; also avoids indexing into an empty offset table.
            pred_sequences.append(pred_tags)
            continue

        # Start offset of every token, computed once per sentence.
        token_starts = []
        current_char = 0
        for token in tokens:
            token_starts.append(current_char)
            current_char += len(token) + 1  # +1 for the joining space

        for ent in preds:
            start_char = ent['start']
            end_char = ent['end']
            label = ent['entity']

            # Last token whose start offset is <= start_char. This covers both
            # an exact hit on a token boundary and a span starting mid-token.
            first_idx = int(
                np.searchsorted(token_starts, start_char, side='right')
            ) - 1
            first_idx = max(first_idx, 0)

            if max_span_tokens is None:
                stop_idx = len(tokens)
            else:
                stop_idx = min(len(tokens), first_idx + max_span_tokens)

            # Mark all tokens that overlap [start_char, end_char).
            for i in range(first_idx, stop_idx):
                token_start = token_starts[i]
                token_end = token_start + len(tokens[i])
                if token_start >= end_char:
                    break
                if token_end > start_char and token_start < end_char:
                    pred_tags[i] = label

        pred_sequences.append(pred_tags)
    return pred_sequences


# Map DNRTI labels to SecureBERT label space.
# Each DNRTI label maps to a tuple of every SecureBERT label that counts as a
# match for it. (A dict literal with repeated keys silently keeps only the
# last value, so one-to-many mappings must be spelled as a collection.)
dnrti_to_securebert = {
    "HackOrg": ("APT",),
    "SecTeam": ("SECTEAM",),
    "Idus": ("IDTY",),
    "Org": ("IDTY",),
    "OffAct": ("ACT", "OS", "TOOL"),
    "Way": ("ACT", "OS", "TOOL"),
    "Exp": ("VULID", "VULNAME"),
    "Tool": ("MAL",),
    "SamFile": ("File",),
    "O": ("DOM", "ENCR", "IP", "URL", "MD5", "PROT", "EMAIL", "SHA1", "SHA2"),
    "Time": ("TIME",),
    "Area": ("LOC",),
    "Purp": ("O",),
    "Features": ("O",),
}

# Map DNRTI labels to the CyNER label space (one model label per DNRTI label).
dnrti_to_cyner = {
    "HackOrg": "Organization",
    "SecTeam": "Organization",
    "Idus": "Indicator",
    "Org": "Indicator",
    "OffAct": "System",
    "Way": "System",
    "Exp": "Vulnerability",
    "Tool": "Malware",
    "SamFile": "System",
    "Time": "Date",
    "Area": "O",
    "Purp": "O",
    "Features": "O",
}


def remove_prefix(label):
    """Return ``label`` without its leading ``<prefix>-`` part.

    Only the first hyphen is treated as the separator, so a label type that
    itself contains a hyphen is preserved. Labels without a hyphen (such as
    ``'O'``) are returned unchanged.
    """
    _prefix, sep, rest = label.partition('-')
    return rest if sep else label


def _accepted_predictions(mapping, true_type):
    """Return the collection of model labels that ``mapping`` accepts for ``true_type``."""
    accepted = mapping.get(true_type, ())
    if isinstance(accepted, str):
        return (accepted,)
    return accepted


def _translate_label(pred_label, true_label):
    """Rewrite ``pred_label`` so that it names the entity type of ``true_label``."""
    true_type = remove_prefix(true_label)
    if true_type == 'O':
        # The gold data treats this token as a non-entity; 'O' takes no prefix.
        return 'O'
    prefix, sep, _pred_type = pred_label.partition('-')
    if not sep:
        # The prediction has no prefix of its own (e.g. 'O'): reuse the gold tag.
        return true_label
    return f"{prefix}-{true_type}"


def map_predicted_to_true(predicted_labels, true_labels, mapping):
    """Translate predicted tags into the gold label space where they agree.

    For each aligned (predicted, gold) tag pair, if ``mapping`` lists the
    predicted entity type as a match for the gold entity type, the predicted
    tag is rewritten to use the gold entity type; otherwise the predicted tag
    is kept as is.

    Args:
        predicted_labels: List of per-sentence lists of predicted tags.
        true_labels: List of per-sentence lists of gold tags, aligned with
            ``predicted_labels``.
        mapping: Dict from gold entity type to either a single model entity
            type (``str``) or a collection of model entity types.

    Returns:
        A list of per-sentence tag lists in the same layout as
        ``predicted_labels``.
    """
    mapped_predicted_labels = []
    for pred_sent, true_sent in zip(predicted_labels, true_labels):
        mapped_pred_sent = []
        for pred_label, true_label in zip(pred_sent, true_sent):
            pred_no_prefix = remove_prefix(pred_label)
            true_no_prefix = remove_prefix(true_label)
            if pred_no_prefix in _accepted_predictions(mapping, true_no_prefix):
                mapped_pred_sent.append(_translate_label(pred_label, true_label))
            else:
                mapped_pred_sent.append(pred_label)
        mapped_predicted_labels.append(mapped_pred_sent)
    return mapped_predicted_labels


# Use a pipeline as a high-level helper
def apply_model(sentences_tokens, ner_pipeline):
    """Run ``ner_pipeline`` on every sentence and measure the average latency.

    Args:
        sentences_tokens: List of token lists; each is joined with single
            spaces before being passed to the pipeline.
        ner_pipeline: Callable that takes one sentence string and returns its
            predictions.

    Returns:
        A tuple ``(all_predictions, latency_per_sentence)``. A sentence whose
        pipeline call raises contributes an empty list, so the output stays
        aligned with the input. The latency is ``0.0`` for empty input.
    """
    print(f"Running inference on {len(sentences_tokens)} sentences...")
    start_time = time.time()

    all_predictions = []
    for tokens in tqdm(sentences_tokens):
        sentence = " ".join(tokens)
        try:
            result = ner_pipeline(sentence)
        except Exception as e:
            # Deliberate best effort: report the failure and keep alignment.
            print(f"Error processing sentence: {e}")
            result = []
        all_predictions.append(result)

    inference_time = time.time() - start_time
    num_sentences = len(sentences_tokens)
    # Guard against dividing by zero when there is nothing to process.
    latency_per_sentence = inference_time / num_sentences if num_sentences else 0.0

    print(f"Total inference time: {inference_time:.2f}s")
    print(f"Latency per sentence: {latency_per_sentence:.3f}s")
    return all_predictions, latency_per_sentence


# if __name__ == "__main__":
#     data = {"sentences_tokens": sentences_tokens, "predictions": predictions}
#     with open('predictions.pkl', 'wb') as f:
#         pickle.dump(data, f)

if __name__ == "__main__":
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only load pickles that were produced locally by a trusted run.
    with open('predictions_cyner.pkl', 'rb') as f:
        data = pickle.load(f)
    sentences_tokens = data['sentences_tokens']
    predictions = data['predictions']
    predicted_iob_tags = predictions_to_iob(sentences_tokens, predictions)

    with open('dataset.pkl', 'rb') as f:
        loaded_data = pickle.load(f)
    sentences_tokens = loaded_data['sentences_tokens']
    true_labels = loaded_data['true_labels']

    # The predictions loaded above come from the CyNER pickle, so use the
    # CyNER mapping.
    result = map_predicted_to_true(predicted_iob_tags, true_labels, dnrti_to_cyner)
    print(result)

    # Distinct tags seen in the gold data and in the raw model output.
    all_dnrti_labels = sorted({label for x in true_labels for label in x})
    all_predicted_labels = sorted(
        {label["entity"] for x in predictions for label in x}
    )