File size: 1,677 Bytes
ca5b08e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import os
import json
import argparse
def main():
    """Export markdown files from the JSONL results stored in a workspace.

    Command-line arguments:
        workspace: Directory that contains a ``results`` sub-directory with
            ``*.jsonl`` files. Output goes to ``<workspace>/markdowns``.
        --show_page_result: When given, also write one markdown file per
            entry of each record's ``page_texts`` mapping.

    Every non-blank line of every ``*.jsonl`` file is parsed as a JSON object
    that must provide ``document_text`` and ``orig_path`` (and ``page_texts``
    when ``--show_page_result`` is set). For a record whose ``orig_path`` has
    the base name ``<name>.<ext>`` the document text is written to
    ``markdowns/<name>/<name>.md`` and each page to
    ``markdowns/<name>/<name>_<page>.md``.

    Raises:
        FileNotFoundError: If ``<workspace>/results`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    parser = argparse.ArgumentParser(description="Evaluate page_to_markdown task")
    parser.add_argument(
        "workspace",
        help="The filesystem path where work will be stored, can be a local folder",
    )
    parser.add_argument(
        "--show_page_result",
        action="store_true",
        help="Whether to show the markdown of each page",
    )
    args = parser.parse_args()

    src_dir = os.path.join(args.workspace, "results")
    tgt_dir = os.path.join(args.workspace, "markdowns")
    os.makedirs(tgt_dir, exist_ok=True)

    for jsonl_file in os.listdir(src_dir):
        if not jsonl_file.endswith(".jsonl"):
            continue
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(os.path.join(src_dir, jsonl_file), "r", encoding="utf-8") as records:
            for line in records:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                data = json.loads(line)
                markdown_text = data["document_text"]
                # splitext strips only the final extension, so "a.v1.pdf" and
                # "a.v2.pdf" no longer collapse onto the same output name.
                file_name = os.path.splitext(os.path.basename(data["orig_path"]))[0]
                file_dir = os.path.join(tgt_dir, file_name)
                os.makedirs(file_dir, exist_ok=True)

                doc_path = os.path.join(file_dir, file_name + ".md")
                with open(doc_path, "w", encoding="utf-8") as doc_out:
                    doc_out.write(markdown_text)

                if args.show_page_result:
                    for page_num, page_text in data["page_texts"].items():
                        page_path = os.path.join(
                            file_dir, file_name + "_" + str(page_num) + ".md"
                        )
                        with open(page_path, "w", encoding="utf-8") as page_out:
                            page_out.write(page_text)

# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()