Spaces:
Runtime error
Runtime error
| # python 3.8.5 | |
| """ | |
| Given an Arxiv url, downloads the Tex files from the e-print URL, | |
| opens the directory that was downloaded, and concatenates all the .tex files together | |
| """ | |
import argparse
import glob
import gzip
import logging
import os
import re
import shutil
import subprocess
import sys
import tarfile
import time

import requests
# Module-scoped logger used by the functions in this file.
logger = logging.getLogger(__name__)
# Configure the root logger so INFO-level records are emitted.
logging.basicConfig(level=logging.INFO)
def download_arxiv(url, output_dir):
    """Download the e-print source archive for an arXiv abstract URL.

    For example, the url https://arxiv.org/abs/2206.13947 is fetched from
    https://arxiv.org/e-print/2206.13947 and saved as ``2206.13947.gz``
    inside ``output_dir``.

    Args:
        url: An arXiv abstract URL containing ``arxiv.org/abs/<id>``.
        output_dir: Directory to save the archive in; created if missing.

    Returns:
        Path of the downloaded ``.gz`` file.

    Raises:
        ValueError: If ``url`` does not contain an arXiv abstract id.
        requests.HTTPError: If the e-print endpoint answers with an error
            status.
    """
    # Stop at '?', '#' or whitespace so query strings and fragments do not
    # end up in the id. Validate before doing any work.
    match = re.search(r"arxiv\.org/abs/([^?#\s]+)", url)
    if match is None:
        raise ValueError(f"Not an arXiv abstract URL: {url!r}")
    arxiv_id = match.group(1).rstrip("/")

    logger.info("Downloading %s", url)
    os.makedirs(output_dir or ".", exist_ok=True)

    # Old-style ids (e.g. "hep-th/9901001") contain a slash, which would
    # otherwise be interpreted as a sub-directory of output_dir.
    filename = os.path.join(output_dir, f"{arxiv_id.replace('/', '_')}.gz")

    with requests.get(
        f"https://arxiv.org/e-print/{arxiv_id}", stream=True, timeout=60
    ) as r:
        # Fail loudly instead of saving an HTML error page as the archive.
        r.raise_for_status()
        with open(filename, "wb") as f:
            # r.raw is copied as-is so the bytes on disk are exactly what the
            # server sent (no content decoding is applied by default).
            shutil.copyfileobj(r.raw, f)
    return filename
def concat_tex_files_in_archive(archive_path, output_dir):
    """Concatenate the .tex sources of a downloaded archive into one file.

    The archive is expected to be either a (gzip-compressed) tar archive
    holding several files, or a single gzip-compressed TeX file. The result
    is written to ``concat.tex`` inside ``output_dir``.

    For tar archives, every regular member whose name ends in ``.tex`` (at
    any depth) is included, ordered by member name so the output is
    deterministic. Members are read directly from the archive; nothing is
    extracted to disk, and the archive itself is left in place.

    Args:
        archive_path: Path of the downloaded archive.
        output_dir: Directory to write ``concat.tex`` to; created if missing.

    Raises:
        OSError: If the archive is neither a tar archive nor gzip data.
    """
    if tarfile.is_tarfile(archive_path):
        sources = []
        # "r:*" lets tarfile detect the compression transparently.
        with tarfile.open(archive_path, "r:*") as tar:
            tex_members = sorted(
                (m for m in tar.getmembers() if m.isfile() and m.name.endswith(".tex")),
                key=lambda m: m.name,
            )
            for member in tex_members:
                with tar.extractfile(member) as f_in:
                    sources.append(f_in.read())
    else:
        # Not a tar archive: treat it as one gzip-compressed TeX file.
        with gzip.open(archive_path, "rb") as f_in:
            sources = [f_in.read()]

    os.makedirs(output_dir or ".", exist_ok=True)
    with open(os.path.join(output_dir, "concat.tex"), "w", encoding="utf-8") as f_out:
        for raw in sources:
            # Older submissions are not always UTF-8; replace undecodable
            # bytes rather than aborting the whole concatenation.
            text = raw.decode("utf-8", errors="replace")
            f_out.write(text)
            # Keep the last line of one file from fusing with the first line
            # of the next.
            if text and not text.endswith("\n"):
                f_out.write("\n")
def main():
    """Command-line entry point: download an arXiv source archive and merge its .tex files.

    Reads a positional ``url`` and an optional ``--output_dir`` (default ".")
    from the command line, downloads the archive into that directory, and
    then concatenates the archive's .tex files there.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("url", type=str, help="Arxiv URL")
    cli.add_argument("--output_dir", type=str, default=".", help="Output directory")
    options = cli.parse_args()

    target_dir = options.output_dir
    downloaded = download_arxiv(options.url, target_dir)
    concat_tex_files_in_archive(downloaded, target_dir)


if __name__ == "__main__":
    main()