"""Discover and label the ``.txt`` files of the weathered-microplastics dataset.

Files are labelled from their filename prefix: ``sta-`` => 0, ``wea-`` => 1.
Running this file as a script prints a short summary of what was found.
"""

import os
import sys
from typing import List, Optional

# Append the parent of this file's directory to the import path
# (presumably so project-level packages resolve when run as a script).
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))


def list_txt_files(root_dir: str) -> List[str]:
    """Recursively list all ``.txt`` files under *root_dir*.

    The extension check is case-insensitive (``.TXT`` also matches), in line
    with the case-insensitive prefix check done by ``label_file``. The result
    is sorted so that the output is the same on every filesystem.

    Args:
        root_dir: Directory to search, including all of its subdirectories.

    Returns:
        Sorted list of paths, each formed by joining the walked directory
        with the filename. Empty when *root_dir* does not exist or holds no
        matching files (``os.walk`` ignores listing errors by default).
    """
    txt_files = []
    for dirpath, _, filenames in os.walk(root_dir):
        txt_files.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if name.lower().endswith(".txt")
        )
    # os.walk yields entries in arbitrary order; sort for reproducibility.
    return sorted(txt_files)


def label_file(filepath: str) -> Optional[int]:
    """Return the class label encoded in the filename prefix of *filepath*.

    Only the basename is inspected and the comparison is case-insensitive:

    - ``sta-`` => 0 (pristine)
    - ``wea-`` => 1 (weathered)

    Args:
        filepath: Path (or bare filename) of the file.

    Returns:
        0 or 1 for a recognised prefix, ``None`` if the prefix is unknown.
    """
    filename = os.path.basename(filepath).lower()
    if filename.startswith("sta-"):
        return 0
    if filename.startswith("wea-"):
        return 1
    return None  # Unknown or irrelevant


def main() -> None:
    """Scan the dataset directory and print a summary of labelled files."""
    # NOTE: this path is relative to the current working directory.
    dataset_dir = os.path.join(
        "datasets",
        "rdwp",
        "A Raman database of microplastics weathered under natural environments",
    )
    if not os.path.isdir(dataset_dir):
        # A missing directory would otherwise just report "Found 0" silently.
        print(
            f"Warning: dataset directory not found: {dataset_dir}",
            file=sys.stderr,
        )

    txt_paths = list_txt_files(dataset_dir)
    print(f"Found {len(txt_paths)} .txt files.")
    print("Sample Files: ")
    for path in txt_paths[:5]:
        print(" -", path)

    # Keep only files whose prefix maps to a known label.
    labeled_files = []
    for path in txt_paths:
        label = label_file(path)
        if label is not None:
            labeled_files.append((path, label))

    print(f"\nLabeled {len(labeled_files)} files:")
    for path, label in labeled_files[:5]:
        print(f" - {os.path.basename(path)} => Label: {label}")


if __name__ == "__main__":
    main()