File size: 2,247 Bytes
e484a46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
list_spectra.py

This script provides functionality to recursively list all `.txt` files 
within a specified directory. It is designed to assist in managing and 
exploring datasets, particularly for Raman spectrum data stored in text files.

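Functions:
-   list_txt_files(root_dir): Recursively finds and returns a list of all `.txt`
    files in the given directory.
-   label_file(filepath): Maps a file's name prefix to a class label
    (`sta-` => 0, pristine; `wea-` => 1, weathered); returns None for any
    other prefix.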

Usage:
-   The script can be executed directly to list `.txt` files in a predefined 
    dataset directory and print a summary, including the total count and a 
    sample of file paths.

Example:
    $ python list_spectra.py
    Found 100 .txt files.
    Sample Files:
     - datasets/rdwp/.../file1.txt
     - datasets/rdwp/.../file2.txt
"""
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))


def list_txt_files(root_dir):
    """Recursively list all ``.txt`` files under a directory.

    The result is returned in a deterministic order: directories are
    visited top-down in sorted order, and the files inside each directory
    are listed in sorted order. This keeps the output reproducible across
    runs and filesystems.

    Args:
        root_dir: Path of the directory to search.

    Returns:
        A list of paths (``root_dir`` joined with each file's relative
        location) for every file whose name ends with ``".txt"``
        (case-sensitive match). The list is empty when no such file is
        found, including when ``root_dir`` does not exist, because
        ``os.walk`` ignores listing errors by default.
    """
    txt_files = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # os.walk yields entries in arbitrary, filesystem-dependent order.
        # Sorting dirnames in place (top-down walk) fixes the traversal
        # order of subdirectories; sorting filenames fixes the order
        # within each directory.
        dirnames.sort()
        txt_files.extend(
            os.path.join(dirpath, name)
            for name in sorted(filenames)
            if name.endswith(".txt")
        )
    return txt_files


def label_file(filepath):
    """
    Derive a class label from the prefix of a file's base name.

    The comparison is case-insensitive and looks only at the final path
    component:
    - 'sta-' => 0 (pristine)
    - 'wea-' => 1 (weathered)

    Returns None when the name starts with neither prefix.
    """
    # Known prefixes and the label each one maps to.
    prefix_labels = (("sta-", 0), ("wea-", 1))

    base_name = os.path.basename(filepath).lower()
    for prefix, label in prefix_labels:
        if base_name.startswith(prefix):
            return label
    # Unknown or irrelevant file.
    return None


if __name__ == "__main__":
    # Dataset location, relative to the current working directory.
    dataset_dir = os.path.join(
        "datasets",
        "rdwp",
        "A Raman database of microplastics weathered under natural environments",
    )

    # Step 1: collect every .txt file and show a short preview.
    txt_paths = list_txt_files(dataset_dir)
    print(f"Found {len(txt_paths)} .txt files.")
    print("Sample Files: ")
    for sample_path in txt_paths[:5]:
        print(" -", sample_path)

    # Step 2: pair each file with its label, dropping files whose prefix
    # is not recognised (label_file returns None for those).
    path_label_pairs = ((p, label_file(p)) for p in txt_paths)
    labeled_files = [
        (p, lbl) for p, lbl in path_label_pairs if lbl is not None
    ]

    print(f"\nLabeled {len(labeled_files)} files:")
    for labeled_path, lbl in labeled_files[:5]:
        print(f" - {os.path.basename(labeled_path)} => Label: {lbl}")