OCRFlux / ocrflux /jsonl_to_markdown.py
mirnaresearch's picture
Initial commit for HF Space (no images)
ca5b08e
import os
import json
import argparse
def _write_document(record, tgt_dir, show_page_result):
    """Write the markdown output for one parsed JSONL record.

    Args:
        record: Dict parsed from one JSONL line. Must contain
            ``orig_path`` and ``document_text``; ``page_texts`` (a mapping
            of page number to text) is read only when *show_page_result*
            is true.
        tgt_dir: Directory under which a per-document folder is created.
        show_page_result: When true, also write one markdown file per page.

    Raises:
        KeyError: If a required key is missing from *record*.
    """
    markdown_text = record["document_text"]
    # splitext removes only the final extension, so "report.v2.pdf" maps to
    # "report.v2" rather than collapsing to "report" and colliding with
    # other documents that share the same prefix.
    file_name = os.path.splitext(os.path.basename(record["orig_path"]))[0]
    file_dir = os.path.join(tgt_dir, file_name)
    os.makedirs(file_dir, exist_ok=True)

    # Explicit UTF-8 so the output does not depend on the platform locale.
    with open(os.path.join(file_dir, file_name + ".md"), "w", encoding="utf-8") as md_file:
        md_file.write(markdown_text)

    if show_page_result:
        for page_num, page_text in record["page_texts"].items():
            page_path = os.path.join(file_dir, file_name + "_" + str(page_num) + ".md")
            with open(page_path, "w", encoding="utf-8") as page_file:
                page_file.write(page_text)


def main():
    """Convert every ``*.jsonl`` file in ``<workspace>/results`` to markdown.

    For each JSON line, the document text is written to
    ``<workspace>/markdowns/<name>/<name>.md`` where ``<name>`` is the base
    name of the record's ``orig_path`` without its extension. With
    ``--show_page_result`` each entry of ``page_texts`` is additionally
    written to ``<name>_<page_num>.md`` in the same folder.

    Command-line arguments are read from ``sys.argv`` via argparse.

    Raises:
        FileNotFoundError: If ``<workspace>/results`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks a required key.
    """
    parser = argparse.ArgumentParser(
        description="Convert the JSONL results in a workspace to markdown files"
    )
    parser.add_argument(
        "workspace",
        help="The filesystem path where work will be stored, can be a local folder",
    )
    parser.add_argument(
        "--show_page_result",
        action="store_true",
        help="Whether to show the markdown of each page",
    )
    args = parser.parse_args()

    src_dir = os.path.join(args.workspace, "results")
    tgt_dir = os.path.join(args.workspace, "markdowns")
    os.makedirs(tgt_dir, exist_ok=True)

    for jsonl_file in os.listdir(src_dir):
        if not jsonl_file.endswith(".jsonl"):
            continue
        with open(os.path.join(src_dir, jsonl_file), "r", encoding="utf-8") as jsonl_handle:
            for line in jsonl_handle:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                _write_document(json.loads(line), tgt_dir, args.show_page_result)
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()