import pyrootutils

# NOTE: the project-root setup must run before the `src.*` imports below, so it
# deliberately precedes them (pythonpath=True makes the repo root importable).
root = pyrootutils.setup_root(
    search_from=__file__,
    indicator=[".project-root"],
    pythonpath=True,
    dotenv=True,
)

import os
from argparse import ArgumentParser, RawTextHelpFormatter
from dataclasses import dataclass, field
from pathlib import Path

from acl_anthology import Anthology
from tqdm import tqdm

from src.utils.pdf_utils.acl_anthology_utils import XML2RawPapers
from src.utils.pdf_utils.process_pdf import (
    FulltextExtractor,
    GrobidFulltextExtractor,
    PDFDownloader,
)

HELP_MSG = """
Generate paper json files from an ACL Anthology collection, with fulltext extraction.

Iterate over entries in the ACL Anthology metadata, and for each entry:
1. extract relevant paper info from the xml entry
2. download pdf file
3. extract fulltext
4. format a json file and save

pre-requisites:
- Install the requirements: pip install acl-anthology-py>=0.4.3 bs4 jsonschema
- Get the meta data from ACL Anthology: git clone git@github.com:acl-org/acl-anthology.git
- Start Grobid Docker container: docker run --rm --init --ulimit core=0 -p 8070:8070 lfoppiano/grobid:0.8.0
"""


@dataclass
class XML2Jsons:
    """Pipeline that converts ACL Anthology XML entries into per-paper JSON files.

    For each paper found by ``xml2raw_papers``, the PDF is downloaded, its
    fulltext is extracted via Grobid, and the enriched paper record is saved
    under ``base_output_dir / <volume_id>``.

    Attributes:
        base_output_dir: Directory where the paper json files are written
            (one sub-directory per volume).
        pdf_output_dir: Directory where the downloaded pdf files are stored.
        xml2raw_papers: Callable that yields raw paper records from the
            ACL Anthology metadata.
        pdf_downloader: Downloads a pdf from a paper URL to a local path.
        fulltext_extractor: Extracts plaintext/sections from a pdf file.
        show_progress: If True, wrap the paper iterator in a tqdm progress bar.
    """

    base_output_dir: Path
    pdf_output_dir: Path
    xml2raw_papers: XML2RawPapers
    pdf_downloader: PDFDownloader = field(default_factory=PDFDownloader)
    fulltext_extractor: FulltextExtractor = field(default_factory=GrobidFulltextExtractor)
    show_progress: bool = True

    @classmethod
    def from_cli(cls) -> "XML2Jsons":
        """Build an :class:`XML2Jsons` instance from command-line arguments.

        Returns:
            A fully configured pipeline instance.
        """
        parser = ArgumentParser(description=HELP_MSG, formatter_class=RawTextHelpFormatter)
        # The three path arguments are required: without them the constructor
        # would fail later with an opaque TypeError (Path(None) / datadir=None).
        parser.add_argument(
            "--base-output-dir",
            type=str,
            required=True,
            help="Directory to save all the paper json files",
        )
        parser.add_argument(
            "--pdf-output-dir",
            type=str,
            required=True,
            help="Directory to save all the downloaded pdf files",
        )
        parser.add_argument(
            "--anthology-data-dir",
            type=str,
            required=True,
            help="Path to ACL Anthology metadata directory, e.g., /path/to/acl-anthology-repo/data. "
            "You can obtain the data via: git clone git@github.com:acl-org/acl-anthology.git",
        )
        parser.add_argument(
            "--collection-id-filters",
            nargs="+",
            type=str,
            default=None,
            help="If provided, only papers from the collections whose id (Anthology ID) contains the "
            "specified strings will be processed.",
        )
        parser.add_argument(
            "--venue-id-whitelist",
            nargs="+",
            type=str,
            default=None,
            help="If provided, only papers from the specified venues will be processed. See here for "
            "the list of venues: https://aclanthology.org/venues",
        )
        args = parser.parse_args()
        return cls(
            base_output_dir=Path(args.base_output_dir),
            pdf_output_dir=Path(args.pdf_output_dir),
            xml2raw_papers=XML2RawPapers(
                anthology=Anthology(datadir=args.anthology_data_dir),
                collection_id_filters=args.collection_id_filters,
                venue_id_whitelist=args.venue_id_whitelist,
            ),
        )

    def run(self) -> None:
        """Download PDFs, extract fulltext, and save one json file per paper.

        Papers without a URL are still saved, just without fulltext. If
        fulltext extraction fails (falsy result), the paper is saved with
        whatever metadata the XML provided.
        """
        os.makedirs(self.pdf_output_dir, exist_ok=True)
        papers = self.xml2raw_papers()
        if self.show_progress:
            # Materialize the iterator so tqdm can show a total count.
            papers = tqdm(list(papers), desc="extracting fulltext")
        for paper in papers:
            volume_dir = self.base_output_dir / paper.volume_id
            if paper.url is not None:
                pdf_save_path = self.pdf_downloader.download(
                    paper.url, opath=self.pdf_output_dir / f"{paper.name}.pdf"
                )
                fulltext_extraction_output = self.fulltext_extractor(pdf_save_path)
                if fulltext_extraction_output:
                    # First element (plain text) is unused here; only the
                    # structured extraction data is kept.
                    _, extraction_data = fulltext_extraction_output
                    paper.fulltext = extraction_data.get("sections")
                    # Prefer the abstract from the XML metadata; only fall
                    # back to the Grobid-extracted one when it is missing.
                    if not paper.abstract:
                        paper.abstract = extraction_data.get("abstract")
            # NOTE(review): volume_dir is computed before the url check, so
            # saving is assumed to happen for every paper (with or without
            # fulltext) — confirm against the original, un-mangled source.
            paper.save(str(volume_dir))


if __name__ == "__main__":
    xml2jsons = XML2Jsons.from_cli()
    xml2jsons.run()