# create_dataset.py (cyber-ner): download the DNRTI corpus and pickle its test split for NER.
import os
import requests
# -------------------------------
# 1. DOWNLOAD AND EXTRACT DNRTI.RAR
# -------------------------------
DNRTI_URL = "https://github.com/SCreaMxp/DNRTI-A-Large-scale-Dataset-for-Named-Entity-Recognition-in-Threat-Intelligence/raw/master/DNRTI.rar"
OUTPUT_RAR = "DNRTI.rar"
EXTRACTED_DIR = "DNRTI_dataset"
if not os.path.exists(EXTRACTED_DIR):
    os.makedirs(EXTRACTED_DIR)
if not os.path.exists(OUTPUT_RAR):
    print("Downloading DNRTI.rar...")
    response = requests.get(DNRTI_URL, stream=True)
    response.raise_for_status()  # Fail early on HTTP errors instead of writing an error page to disk
    with open(OUTPUT_RAR, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print("Download complete.")
# Extract the RAR archive (requires the `rarfile` package and the system `unrar` tool)
try:
    import rarfile

    rarfile.UNRAR_TOOL = "unrar"  # Ensure unrar is installed: `sudo apt install unrar`
    with rarfile.RarFile(OUTPUT_RAR) as rf:
        rf.extractall(EXTRACTED_DIR)
    print(f"Extracted {OUTPUT_RAR} to {EXTRACTED_DIR}")
except Exception:
    print("Error: failed to extract RAR. Install 'rarfile' (`pip install rarfile`) "
          "and the 'unrar' binary (`sudo apt install unrar`).")
    raise
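# The archive's internal layout is an assumption here (the script expects test.txt at the
# top level of EXTRACTED_DIR); listing what was actually extracted makes a mismatch easy to spot.
for root, _dirs, files in os.walk(EXTRACTED_DIR):
    for name in files:
        print(os.path.join(root, name))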
TEST_FILE = os.path.join(EXTRACTED_DIR, "test.txt")
if not os.path.exists(TEST_FILE):
    raise FileNotFoundError(f"{TEST_FILE} not found after extraction.")
# -------------------------------
# 2. LOAD TEST DATA (whitespace-separated "token label" lines)
# -------------------------------
def load_conll_format(file_path):
    """Read a CoNLL-style file: one "token label" pair per line; blank lines separate sentences."""
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line marks a sentence boundary
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                parts = line.split()
                if len(parts) >= 2:
                    token, label = parts[0], parts[1]
                    sentence.append((token, label))
    # Flush the last sentence if the file does not end with a blank line
    if sentence:
        sentences.append(sentence)
    return sentences
print("Loading test.txt...")
test_sentences = load_conll_format(TEST_FILE)
print(f"Loaded {len(test_sentences)} sentences from test.txt")
# Extract tokens and true labels
sentences_tokens = [[token for token, _ in sent] for sent in test_sentences]
true_labels = [[label for _, label in sent] for sent in test_sentences]
# DNRTI label list (extract all unique labels)
all_dnrti_labels = sorted(set(label for sent in test_sentences for _, label in sent))
print(f"DNRTI Labels: {all_dnrti_labels}")
if __name__ == "__main__":
    import pickle

    data = {"sentences_tokens": sentences_tokens, "true_labels": true_labels}
    with open('dataset.pkl', 'wb') as f:
        pickle.dump(data, f)
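    # Quick round-trip check (a sketch, not required by the pipeline): reload the
    # pickle and confirm the sentence count matches what was written.
    with open('dataset.pkl', 'rb') as f:
        restored = pickle.load(f)
    assert len(restored["sentences_tokens"]) == len(sentences_tokens)
    print(f"Wrote {len(sentences_tokens)} sentences to dataset.pkl")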