File size: 1,181 Bytes
ca5b08e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os
import json
import argparse

def main():
    """Split each record of ``<workspace>/data.jsonl`` into its own JSON file.

    The single positional command-line argument is the workspace directory.
    Every non-blank line of ``data.jsonl`` is parsed as a JSON object with the
    keys ``pdf_name_1``, ``pdf_name_2``, ``md_elem_list_1`` and
    ``md_elem_list_2``.  Each name is cut at its first ``.`` and the remaining
    stem is split on its LAST underscore into ``<pdf_name>_<page>``.

    For every record, ``<workspace>/jsons/<pdf_name>_<page_1>_<page_2>.json``
    is written, holding ``{"page_1": ..., "page_2": ...}`` where each value is
    the corresponding element list joined with blank lines.

    Raises:
        ValueError: if a name stem contains no underscore.
        KeyError: if a record lacks one of the expected keys.
    """
    parser = argparse.ArgumentParser(description="Evaluate element_merge_detect task")
    parser.add_argument(
        "workspace",
        help="The filesystem path where work will be stored, can be a local folder",
    )
    args = parser.parse_args()

    json_dir = os.path.join(args.workspace, 'jsons')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(json_dir, exist_ok=True)

    jsonl_file = os.path.join(args.workspace, "data.jsonl")
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(jsonl_file, "r", encoding="utf-8") as src:
        for line in src:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            record = json.loads(line)
            stem_1 = record['pdf_name_1'].split(".")[0]
            stem_2 = record['pdf_name_2'].split(".")[0]

            # Split on the last underscore only, so PDF names that contain
            # underscores themselves no longer break the two-way unpacking.
            _, page_1 = stem_1.rsplit('_', 1)
            # The output file name uses the PDF name taken from the second entry.
            pdf_name, page_2 = stem_2.rsplit('_', 1)

            json_name = os.path.join(json_dir, pdf_name + '_' + page_1 + '_' + page_2 + '.json')
            payload = {
                "page_1": "\n\n".join(record['md_elem_list_1']),
                "page_2": "\n\n".join(record['md_elem_list_2']),
            }
            # Use a distinct handle name so the input file handle is not shadowed.
            with open(json_name, 'w', encoding="utf-8") as out:
                json.dump(payload, out)

# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()