"""Schema-validated dataclass for one raw paper record stored as JSON."""

import json
import os
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any

from jsonschema import validate

# TODO: load from file
# JSON Schema that ``RawPaper.validate`` checks a serialized record against.
# It declares no ``required`` list and does not forbid extra properties, so
# only the types of the listed properties that are present get checked.
schema: dict[str, Any] = {
    "title": "RawPaper",
    "type": "object",
    "properties": {
        "paper_uuid": {"type": "string"},
        "name": {"type": "string"},
        "collection_id": {"type": "string"},
        "collection_acronym": {"type": "string"},
        "volume_id": {"type": "string"},
        "booktitle": {"type": "string"},
        "paper_id": {"type": "integer"},
        "year": {"type": ["integer", "null"]},
        "paper_title": {"type": "string"},
        "authors": {
            "type": "array",
            "items": {
                "type": "object",
                # "properties" is the keyword that constrains an object's
                # members; "items" only applies to arrays, so listing the
                # name parts under it left them unvalidated.
                "properties": {
                    "first": {"type": ["string", "null"]},
                    "last": {"type": ["string", "null"]},
                },
            },
        },
        "abstract": {"type": ["string", "null"]},
        "url": {"type": "string"},
        "bibkey": {"type": ["string", "null"]},
        "doi": {"type": ["string", "null"]},
        "fulltext": {
            "type": ["object", "null"],
            # Every key of the object must map to a list of strings.
            "patternProperties": {
                "^.*$": {"type": "array", "items": {"type": "string"}},
            },
        },
    },
}


@dataclass
class RawPaper:
    """One paper record, loadable from and savable to a JSON file.

    The field names mirror the property names of the module-level ``schema``;
    ``validate`` checks the field values against that schema.
    """

    paper_uuid: str
    # Also used as the stem of the output file name (see ``get_fname``).
    name: str
    collection_id: str
    collection_acronym: str
    volume_id: str
    booktitle: str
    paper_id: int
    year: int | None
    paper_title: str
    # Each author is a mapping; the schema types its "first" and "last" keys.
    authors: list[dict[str, str | None]]
    abstract: str | None
    # NOTE(review): annotated as optional here, but ``schema`` only accepts a
    # string for "url", so ``url=None`` fails ``validate()`` -- confirm which
    # side is intended.
    url: str | None
    # NOTE(review): annotated as non-optional here, but ``schema`` also
    # accepts null for "bibkey" -- confirm which side is intended.
    bibkey: str
    doi: str | None
    fulltext: dict[str, list[str]] | None

    @classmethod
    def load_from_json(cls, fpath: str | Path) -> "RawPaper":
        """Build an instance from the JSON object stored in *fpath*.

        The top-level keys of the JSON object are passed to the constructor
        as keyword arguments. The loaded data is not schema-validated here.

        Args:
            fpath: Path of the JSON file to read.

        Returns:
            The instance built from the file contents.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            TypeError: If the JSON keys do not match the dataclass fields.
        """
        # ``open`` accepts both ``str`` and ``Path``; the encoding is pinned
        # so that reading does not depend on the platform locale.
        with open(fpath, "r", encoding="utf-8") as f:
            return cls(**json.load(f))

    def get_fname(self) -> str:
        """Return the file name used by ``save``: ``"<name>.json"``."""
        return f"{self.name}.json"

    def dumps(self) -> dict[str, Any]:
        """Return the record as a plain dict (not a JSON string)."""
        return asdict(self)

    def validate(self) -> None:
        """Check the record against the module-level ``schema``.

        Raises:
            jsonschema.exceptions.ValidationError: If a field value does not
                conform to the schema.
        """
        # Inside the method body the bare name resolves to the module-level
        # ``jsonschema.validate`` import, not to this method.
        validate(self.dumps(), schema=schema)

    def save(self, odir: str | Path) -> None:
        """Validate the record and write it as JSON into directory *odir*.

        The directory is created if it does not exist. The file is named by
        ``get_fname`` and written with an indent of 2.

        Args:
            odir: Output directory.

        Raises:
            jsonschema.exceptions.ValidationError: If validation fails; in
                that case nothing is created or written.
            OSError: If the directory or the file cannot be created.
        """
        self.validate()
        # ``exist_ok=True`` already tolerates an existing directory, so no
        # separate (race-prone) existence check is needed.
        os.makedirs(odir, exist_ok=True)
        opath = os.path.join(odir, self.get_fname())
        with open(opath, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.dumps(), indent=2))