File size: 4,128 Bytes
5048713
 
 
 
c54822c
 
6446f53
5048713
 
 
 
 
 
 
 
 
 
94eae63
c54822c
5048713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6446f53
 
 
 
5048713
 
 
6446f53
5048713
 
 
 
 
c54822c
5048713
 
 
 
c54822c
5048713
 
 
 
 
 
 
 
 
 
c54822c
5048713
 
5d63385
c54822c
5048713
 
 
 
c54822c
5048713
 
 
c54822c
5048713
c54822c
 
 
 
 
5048713
c54822c
6446f53
 
 
 
 
 
 
 
5048713
 
c54822c
5048713
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import json
import os
import string

from src.logger import get_logger

# Dataset sizes. Regular problem IDs are validated against [0, DATASET_SIZE); warmup problem IDs against
# [DATASET_SIZE, DATASET_SIZE + WARMUP_DATASET_SIZE).
WARMUP_DATASET_SIZE = 100
DATASET_SIZE = 120

# Default inclusive length bounds used when validating an input string.
MIN_INPUT_LENGTH = 2
MAX_INPUT_LENGTH = 20

# Limits applied to a submission file: total size on disk, length of one line, and number of lines.
MIN_SUBMISSION_SIZE = 1
MAX_SUBMISSION_SIZE = 120 * 1024 * 1024  # 120 MB.
MAX_SINGLE_SUBMISSION_SIZE = 1 << 20  # 1MB.
MAX_SUBMISSION_LINES = 1 + DATASET_SIZE  # Allow empty line.

# Module-level logger, obtained from the project's get_logger helper using this module's name.
logger = get_logger(__name__)


# Characters permitted in sanitised strings, built once at import time for O(1) membership tests.
# Very important: We delimit using underscores. So these _CANNOT_ be allowed in sanitised strings.
_ALLOWED_CHARACTERS = frozenset(string.ascii_lowercase + string.ascii_uppercase + string.digits + " .-")


def is_valid(
    s: str,
    min_length: int = MIN_INPUT_LENGTH,
    max_length: int = MAX_INPUT_LENGTH,
) -> bool:
    """
    @brief Checks whether the given string is valid.
    @param s The string to validate.
    @param min_length The minimum allowed number of characters (inclusive).
    @param max_length The maximum allowed number of characters (inclusive).
    @return True iff all characters are in [a-zA-Z0-9], spaces, or '.' and '-', and the length is between
            min length and max length.
    """

    # len() on a str counts characters (code points), so no intermediate list is needed.
    if not min_length <= len(s) <= max_length:
        return False

    # Any character outside the allow-list (including any non-ASCII character) invalidates the string.
    return all(c in _ALLOWED_CHARACTERS for c in s)


def is_submission_file_valid(
    submission_path: str,
    is_warmup_dataset: bool,
) -> bool:
    """
    @brief Checks whether the given submission file is valid.
    @param submission_path The path to the submission file.
    @param is_warmup_dataset Whether we are working on the regular or the warmup dataset.
    @return True iff the path is an existing file within the size constraints, and every line is a JSON
            object with exactly the keys "problem_id" (an int, or a string convertible to int) and
            "solution" (a string), is no longer than the fixed maximum bound, and carries an in-range
            problem ID that has not appeared on an earlier line. Every violation is logged as a warning
            and yields False rather than raising.
    """

    # isfile (rather than exists) so that a directory is rejected here instead of failing in open().
    if not os.path.isfile(submission_path):
        logger.warning(f"Could not find submission file {submission_path=}")
        return False

    submission_size = os.stat(submission_path).st_size
    if not MIN_SUBMISSION_SIZE <= submission_size <= MAX_SUBMISSION_SIZE:
        logger.warning(
            f"Submission size was {submission_size}, exceeding [{MIN_SUBMISSION_SIZE}, {MAX_SUBMISSION_SIZE}]"
        )
        return False

    # Half-open range [first_id, end_id) of problem IDs accepted for the selected dataset.
    if is_warmup_dataset:
        first_id, end_id = DATASET_SIZE, DATASET_SIZE + WARMUP_DATASET_SIZE
    else:
        first_id, end_id = 0, DATASET_SIZE

    expected_keys = {"problem_id", "solution"}
    seen_ids = set()
    n_lines = 0

    # JSON text is UTF-8; decode explicitly so the result does not depend on the platform locale.
    with open(submission_path, "r", encoding="utf-8") as f:

        # Not using readlines() to avoid consuming a large buffer at once.
        while True:
            try:
                # Read one character more than the bound so that an over-long line is detected
                # explicitly instead of being silently split into several chunks.
                line = f.readline(MAX_SINGLE_SUBMISSION_SIZE + 1)
            except UnicodeDecodeError:
                logger.warning("Submission is not valid UTF-8 text")
                return False
            if not line:
                break  # End of file.

            n_lines += 1
            if n_lines > MAX_SUBMISSION_LINES:
                logger.warning(f"Got submission with more than {MAX_SUBMISSION_LINES} lines")
                return False

            if len(line) > MAX_SINGLE_SUBMISSION_SIZE:
                logger.warning(f"Submission has line longer than {MAX_SINGLE_SUBMISSION_SIZE} characters")
                return False

            if not (line.startswith("{") and line.endswith(("}", "}\n"))):
                logger.warning("Submission has line that does not appear to be a JSONL")
                return False

            try:
                d = json.loads(line)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError; RecursionError guards against absurd nesting.
                logger.warning("Submission has line that is not valid JSON")
                return False

            if not isinstance(d, dict) or set(d) != expected_keys:
                logger.warning("Found unexpected keys")
                return False

            raw_problem_id = d["problem_id"]
            # bool is a subclass of int, so it has to be excluded explicitly.
            problem_id_type_ok = isinstance(raw_problem_id, (str, int)) and not isinstance(raw_problem_id, bool)
            if not (problem_id_type_ok and isinstance(d["solution"], str)):
                logger.warning("Found unexpected types")
                return False

            try:
                problem_id = int(raw_problem_id)
            except ValueError:
                logger.warning("Could not convert problem ID to int")
                return False

            if problem_id < first_id or problem_id >= end_id:
                logger.warning(f"Problem ID {problem_id} is beyond allowed bounds")
                return False

            if problem_id in seen_ids:
                logger.warning(f"Got duplicate submission -- ID {problem_id} appears twice")
                return False  # Duplicate submission.
            seen_ids.add(problem_id)

    return True