Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 4,128 Bytes
5048713 c54822c 6446f53 5048713 94eae63 c54822c 5048713 6446f53 5048713 6446f53 5048713 c54822c 5048713 c54822c 5048713 c54822c 5048713 5d63385 c54822c 5048713 c54822c 5048713 c54822c 5048713 c54822c 5048713 c54822c 6446f53 5048713 c54822c 5048713 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import json
import os
import string
from src.logger import get_logger
# Number of problems in the warmup dataset; its IDs follow the main dataset's range.
WARMUP_DATASET_SIZE = 100
# Number of problems in the main dataset; valid main-dataset IDs are [0, DATASET_SIZE).
DATASET_SIZE = 120
# Inclusive length bounds (in characters) enforced by is_valid().
MIN_INPUT_LENGTH = 2
MAX_INPUT_LENGTH = 20
# Total submission file size bounds, in bytes.
MIN_SUBMISSION_SIZE = 1
MAX_SUBMISSION_SIZE = 1024 * 1024 * 120 # 120 MB.
# Cap passed to readline() so one pathological line cannot exhaust memory.
MAX_SINGLE_SUBMISSION_SIZE = 1024 * 1024 # 1MB.
MAX_SUBMISSION_LINES = DATASET_SIZE + 1 # Allow empty line.
# Module-level logger, named after this module.
logger = get_logger(__name__)
def is_valid(
    s: str,
    min_length: int = MIN_INPUT_LENGTH,
    max_length: int = MAX_INPUT_LENGTH,
) -> bool:
    """
    @brief Checks whether the given string is valid.
    @param s The string to validate.
    @param min_length Minimum allowed number of characters (inclusive).
    @param max_length Maximum allowed number of characters (inclusive).
    @return True iff all characters are in [a-zA-Z0-9], spaces, or '.' and '-', and the length is between
    min length and max length.
    """
    # len(s) counts every code point exactly once, identical to iterating over the
    # string, so no intermediate character list is needed.
    if not (min_length <= len(s) <= max_length):
        return False
    # Very important: We delimit using underscores. So these _CANNOT_ be allowed in sanitised strings.
    # A set gives O(1) membership tests instead of scanning a list for every character.
    allowed = set(string.ascii_letters + string.digits + " .-")
    return all(c in allowed for c in s)
def is_submission_file_valid(
    submission_path: str,
    is_warmup_dataset: bool,
) -> bool:
    """
    @brief Checks whether the given submission file is valid.
    @param submission_path The path to the submission file.
    @param is_warmup_dataset Whether we are working on the regular or the warmup dataset.
    @return True iff the file is within the size constraints, a JSONL, and every line is no longer than
    the fixed maximum bound.
    """
    if not os.path.exists(submission_path):
        logger.warning(f"Could not find submission file {submission_path=}")
        return False
    submission_size = os.stat(submission_path).st_size
    if submission_size < MIN_SUBMISSION_SIZE or submission_size > MAX_SUBMISSION_SIZE:
        logger.warning(f"Submission size was {submission_size}, exceeding [{MIN_SUBMISSION_SIZE, MAX_SUBMISSION_SIZE}]")
        return False
    with open(submission_path, "r") as f:
        # Not using readlines() to avoid consuming a large buffer at once. The capped
        # readline() also truncates pathologically long lines; truncated lines then fail
        # the structural / JSON checks below.
        n_lines = 0
        seen_ids = set()
        while len(line := f.readline(MAX_SINGLE_SUBMISSION_SIZE)) > 0:
            n_lines += 1
            if n_lines > MAX_SUBMISSION_LINES:
                logger.warning(f"Got submission with more than {MAX_SUBMISSION_LINES} lines")
                return False
            # Cheap structural pre-check before attempting a full JSON parse.
            if not (line.startswith("{") and line.endswith(("}", "}\n"))):
                logger.warning("Submission has line that does not appear to be a JSONL")
                return False
            try:
                d = json.loads(line)
            except json.JSONDecodeError:
                # BUG FIX: previously a malformed line (e.g. one truncated by the capped
                # readline that still ended in '}') raised an uncaught exception instead
                # of the validator rejecting the file.
                logger.warning("Submission has line that does not appear to be a JSONL")
                return False
            if set(d.keys()) != set(["problem_id", "solution"]):
                logger.warning("Found unexpected keys")
                return False
            # Deliberately strict type() checks (not isinstance) so bool is not accepted for int.
            # BUG FIX: the original wrote `type(d["solution"] is str)`, which takes the type of
            # a boolean and is always truthy -- the solution's type was never actually checked.
            if not ((type(d["problem_id"]) is str or type(d["problem_id"]) is int) and type(d["solution"]) is str):
                logger.warning("Found unexpected types")
                return False
            try:
                problem_id = int(d["problem_id"])
            except Exception:
                logger.warning("Could not convert problem ID to int")
                return False
            if is_warmup_dataset:
                # Warmup IDs occupy [DATASET_SIZE, DATASET_SIZE + WARMUP_DATASET_SIZE).
                if problem_id < DATASET_SIZE or problem_id >= DATASET_SIZE + WARMUP_DATASET_SIZE:
                    logger.warning(f"Problem ID {problem_id} is beyond allowed bounds")
                    return False
            else:
                # Main-dataset IDs occupy [0, DATASET_SIZE).
                if problem_id < 0 or problem_id >= DATASET_SIZE:
                    logger.warning(f"Problem ID {problem_id} is beyond allowed bounds")
                    return False
            if problem_id in seen_ids:
                logger.warning(f"Got duplicate submission -- ID {problem_id} appears twice")
                return False # Duplicate submission.
            seen_ids.add(problem_id)
    return True
|