import json
import re
import numpy as np

def all_normalize(obj):
    """Min-max normalize every (char_index, score) span in-place, across all docs."""
    all_values = []
    for output_sent_result in obj:
        for each_doc in output_sent_result:
            for each_span in each_doc:
                all_values.append(each_span[1])
    max_val = max(all_values)
    min_val = min(all_values)
    val_range = max_val - min_val or 1.0  # guard against all-equal scores
    for output_sent_result in obj:
        for i, each_doc in enumerate(output_sent_result):
            for j, each_span in enumerate(each_doc):
                output_sent_result[i][j] = (each_span[0], (each_span[1] - min_val) / val_range)
    return obj
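
# Illustrative example (assumed shape): `obj` is a list of output-sentence
# results, each a list of documents, each a list of (char_index, score) tuples.
#   obj = [[[(0, 2.0), (5, 4.0)], [(0, 6.0)]]]
#   all_normalize(obj)  ->  [[[(0, 0.0), (5, 0.5)], [(0, 1.0)]]]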
    

def load_json(file_path):
    """Load a .json file, or a .jsonl file of one JSON object per line."""
    with open(file_path, 'r') as file:
        data = file.read()
    if file_path.endswith('.jsonl'):
        # Rejoin the objects around the "}\n{" boundaries into one JSON array.
        data = '[' + '},{'.join(data.split('}\n{')) + ']'
    objects = json.loads(data)
    return objects
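
# Example: a .jsonl file containing the two lines
#   {"a": 1}
#   {"a": 2}
# is parsed as [{"a": 1}, {"a": 2}]. Note this assumes each object ends its
# line with "}" and the next begins with "{" (no pretty-printing).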

def ma(text):
    """Return the end index of the "Document [n](Title: ...):" header, or 0."""
    match = re.search(r"Document \[\d+\]\(Title:[^)]+\):", text)
    return match.end() if match else 0
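
# Example: ma("Document [1](Title: Eiffel Tower): The tower ...") returns 34,
# the index just past the header's trailing colon, so header characters can be
# filtered out of the attribution spans.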

def write_json(file_path, data):
    with open(file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

def split_by_docs(scores, docs_text, doc_tokens):
    """Split per-token scores into one list of (char_index, score) spans per document.

    Documents in `docs_text` are separated by blank lines, which the Llama-2
    tokenizer emits as two consecutive "<0x0A>" tokens.
    """
    assert len(scores) == len(doc_tokens)
    docs = docs_text.strip().split('\n\n')

    # Token indices at which each document ends: a "\n\n" separator appears as
    # two consecutive "<0x0A>" tokens.
    last_tokens = [0]
    for i, token in enumerate(doc_tokens):
        next_token = doc_tokens[i + 1] if i + 1 < len(doc_tokens) else None
        if token == "<0x0A>" and next_token == "<0x0A>":  # FOR LLAMA2 ONLY
            last_tokens.append(i + 1)
    doc_num = len(last_tokens) - 1

    # Walk the tokens, tracking each token's character offset within the
    # current document, and collect (char_index, score) pairs per document.
    scores_per_doc = [[] for _ in range(doc_num)]
    curr_doc_idx = 0
    skip = False
    curr_char_idx = -2  # magic number: offsets the separator tokens
    for i, (score, token) in enumerate(zip(scores, doc_tokens)):
        if skip:
            skip = False
            continue
        if i == 0:
            token = token[1:]  # remove the first space
        if token == "<0x0A>":
            curr_doc_idx += 1
            curr_char_idx = -2
            skip = True  # skip the second "<0x0A>" of the separator
            continue
        scores_per_doc[curr_doc_idx].append((curr_char_idx, score))
        curr_char_idx += len(token)

    # Drop spans that fall inside each "Document [n](Title: ...):" header.
    for i, doc in enumerate(docs):
        start = ma(doc) - 2
        scores_per_doc[i] = [s for s in scores_per_doc[i] if s[0] >= start]

    # Min-max normalize the surviving scores across all documents.
    all_values = []
    for doc_scores in scores_per_doc:
        all_values.extend(s[1] for s in doc_scores)
    max_val = max(all_values)
    min_val = min(all_values)
    val_range = max_val - min_val or 1.0  # guard against all-equal scores
    for doc_scores in scores_per_doc:
        for i, s in enumerate(doc_scores):
            doc_scores[i] = (s[0], (s[1] - min_val) / val_range)

    return scores_per_doc
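
# Illustrative call (assumed inputs): with two blank-line-separated documents,
#   split_by_docs(token_scores, "Document [1](Title: A): ...\n\nDocument [2](Title: B): ...", tokens)
# returns a two-element list, each element holding that document's
# (char_index, normalized_score) spans.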

def span_to_doc(results):
    """Derive doc-level scores by averaging each document's span scores."""
    for res in results:
        span_level = res['span_level']
        doc_level = []
        for output_sent_result in span_level:
            doc_level.append([np.mean([span[1] for span in doc]) for doc in output_sent_result])
        res['doc_level'] = doc_level
    return results
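
# Example (assumed shape): a result whose span_level is
#   [[[(0, 0.2), (4, 0.4)], [(0, 0.9)]]]
# gains doc_level [[0.3, 0.9]]: one mean score per document per output sentence.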




def word_level_attribute(raw, _i):
    """Attach word-level and doc-level attribution scores to raw[_i] in-place."""
    res = load_json(f'MIRAGE/internal_res/res_attr_dict-{_i}.json')

    input_text = res["input_context"]
    input_tokens = res["input_context_tokens"]
    output_tokens = res["output_current_tokens"]
    token_lens = [len(x) for x in output_tokens]
    cci_scores = res["cci_scores"]
    split_output = raw[_i]["output"]
    all_lens = [len(x) for x in split_output]

    # Map each output sentence to the output tokens it covers by comparing
    # cumulative character lengths of tokens and sentences.
    end_token_idx = [sum(token_lens[:i + 1]) for i in range(len(token_lens))]
    end_idx = [sum(all_lens[:i + 1]) for i in range(len(all_lens))]
    end_idx = [len([t for t in end_token_idx if t < idx]) for idx in end_idx]

    # Bucket each token's CCI record into the sentence it belongs to.
    belong_sents = [[] for _ in range(len(split_output))]
    for token_cci in cci_scores:
        token_idx = token_cci['cti_idx']
        for i, idx in enumerate(end_idx):
            if token_idx < idx:
                belong_sents[i].append(token_cci)
                break

    # Per sentence: weight each token's input-context scores by its CTI score
    # and sum over the sentence's tokens.
    scores = []
    for sent in belong_sents:
        weighted_scores = [token_cci["cti_score"] * np.array(token_cci["input_context_scores"])
                           for token_cci in sent]
        scores.append(np.sum(weighted_scores, axis=0))

    # Split each sentence's input-context scores by document and normalize.
    finals = [split_by_docs(score, input_text, input_tokens) for score in scores]

    # Doc-level score: the sum of a document's normalized span scores.
    doc_finals = []
    for output_sent_result in finals:
        doc_finals.append([sum(s[1] for s in doc) for doc in output_sent_result])

    raw[_i]["word_level"] = finals
    raw[_i]["doc_level"] = doc_finals

if __name__ == '__main__':
    raw = load_json('results.json')
    for i in range(len(raw)):
        word_level_attribute(raw, i)
    write_json('result_.json', raw)