# code.py
import re
import torch
import pandas as pd
import anthropic
import os
from dotenv import load_dotenv
from config import model, tokenizer, label_mapping, big5_dimensions, emotion_big5_priors

# Load environment variables from the .env file
load_dotenv()

def parse_speaker_text(text):
    """λŒ€ν™” ν…μŠ€νŠΈλ₯Ό νŒŒμ‹±ν•˜μ—¬ ν™”μžλ³„ λ°œν™”λ₯Ό μΆ”μΆœν•˜λŠ” ν•¨μˆ˜"""
    speaker_dict = {}
    lines = text.strip().split('\n')
    
    print(f"πŸ“ μž…λ ₯된 총 쀄 수: {len(lines)}")  # λ””λ²„κ·Έμš©
    
    for i, line in enumerate(lines):
        line = line.strip()  # strip leading/trailing whitespace
        if not line:  # skip empty lines
            continue
            
        print(f"πŸ” 처리 쀑인 쀄 {i+1}: '{line}'")  # λ””λ²„κ·Έμš©
        
        # Supported line formats
        patterns = [
            r'^(\d+)\s*:\s*(.+)',  # "1: utterance"
            r'^(\d+)\s*\.\s*(.+)',  # "1. utterance"
            r'^(\d+)\s+(.+)',      # "1 utterance"
            r'^ν™”μž\s*(\d+)\s*:\s*(.+)',  # "ν™”μž1: utterance" (ν™”μž = "speaker")
        ]
        
        matched = False
        for pattern in patterns:
            match = re.match(pattern, line)
            if match:
                speaker_id = int(match.group(1))
                utterance = match.group(2).strip()
                
                if speaker_id not in speaker_dict:
                    speaker_dict[speaker_id] = []
                
                speaker_dict[speaker_id].append(utterance)
                print(f"βœ… λ§€μΉ­ 성곡: ν™”μž{speaker_id} -> '{utterance}'")  # λ””λ²„κ·Έμš©
                matched = True
                break
        
        if not matched:
            print(f"❌ λ§€μΉ­ μ‹€νŒ¨: '{line}'")  # λ””λ²„κ·Έμš©
    
    print(f"🎯 μ΅œμ’… κ²°κ³Ό: {len(speaker_dict)}λͺ…μ˜ ν™”μž 발견")  # λ””λ²„κ·Έμš©
    for speaker_id, utterances in speaker_dict.items():
        print(f"   ν™”μž{speaker_id}: {len(utterances)}개 λ°œν™”")
    
    return speaker_dict
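
# A minimal usage sketch for parse_speaker_text; the sample dialogue below is
# hypothetical, but any of the "N: utterance" formats listed above will parse:
#
#   sample = "1: Nice to meet you.\n2: Likewise!\n1: Shall we start?"
#   speakers = parse_speaker_text(sample)
#   # -> {1: ['Nice to meet you.', 'Shall we start?'], 2: ['Likewise!']}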

def analyze_emotions(utterances, model, tokenizer, label_mapping):
    """λ°œν™” λ¦¬μŠ€νŠΈμ— λŒ€ν•΄ 감정 뢄석을 μˆ˜ν–‰ν•˜λŠ” ν•¨μˆ˜"""
    results = {}
    
    for idx, text in enumerate(utterances):
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        emotions = torch.softmax(outputs.logits, dim=-1)
        
        values = emotions.cpu().detach().numpy()
        # One column per emotion label; assumes label_mapping covers indices 0..len-1
        df = pd.DataFrame(values, columns=[label_mapping[i] for i in range(len(label_mapping))])
        df = df.T.reset_index()
        df.columns = ['Emotion', 'Probability']
        df = df.sort_values(by='Probability', ascending=False).head(5)
        
        results[f"utterance_{idx+1}"] = df
    
    # Merge column-wise: one column per utterance, indexed by emotion
    merged_df = None
    for key, df in results.items():
        df = df.set_index("Emotion")
        df.columns = [key]
        if merged_df is None:
            merged_df = df
        else:
            merged_df = merged_df.join(df, how='outer')
    
    return merged_df
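
# The merged frame has emotions as the index and one column per utterance,
# holding each utterance's top-5 probabilities, NaN elsewhere. Illustrative
# only; the real emotion names come from label_mapping:
#
#             utterance_1  utterance_2
#   Emotion
#   joy              0.41          NaN
#   surprise         0.18         0.33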

def calculate_probabilistic_mapping(merged_df, emotion_big5_priors):
    """ν™•λ₯ μ  맀핑을 톡해 각 λ°œν™”λ³„ Big5 μ„±ν–₯ 점수λ₯Ό κ³„μ‚°ν•˜λŠ” ν•¨μˆ˜"""
    big5_results = {dim: [] for dim in big5_dimensions}
    utterance_names = []

    for utterance in merged_df.columns:
        utterance_names.append(utterance)
        observed_emotions = merged_df[utterance].dropna()

        if len(observed_emotions) == 0:
            for dim in big5_dimensions:
                big5_results[dim].append(0.0)
            continue

        big5_scores_utterance = {}
        total_weight = sum(observed_emotions.values)

        for dim in big5_dimensions:
            weighted_sum = 0.0
            for emotion, intensity in observed_emotions.items():
                if emotion in emotion_big5_priors:
                    weighted_sum += emotion_big5_priors[emotion][dim] * intensity

            big5_scores_utterance[dim] = weighted_sum / total_weight if total_weight > 0 else 0.0
            big5_results[dim].append(big5_scores_utterance[dim])

    big5_df = pd.DataFrame(big5_results, index=utterance_names)
    return big5_df
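
# Worked example with hypothetical numbers: if an utterance's observed emotions
# are {joy: 0.6, anger: 0.2} and the priors give joy -> Extraversion 0.8 and
# anger -> Extraversion 0.1, then
#   Extraversion = (0.8 * 0.6 + 0.1 * 0.2) / (0.6 + 0.2) = 0.625
# i.e. each trait score is an intensity-weighted average of per-emotion priors.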

def analyze_emotion_patterns(big5_df):
    """감정 νŒ¨ν„΄ 뢄석"""
    display_df = big5_df.round(3)
    return display_df

def run_probabilistic_mapping(merged_df):
    """ν™•λ₯ μ  λ§€ν•‘ 전체 ν”„λ‘œμ„ΈμŠ€ μ‹€ν–‰"""
    big5_df = calculate_probabilistic_mapping(merged_df, emotion_big5_priors)
    result_summary = analyze_emotion_patterns(big5_df)
    return big5_df, result_summary

def calculate_big5_averages(df):
    """Big5 μ„±κ²©νŠΉμ„± λ°μ΄ν„°ν”„λ ˆμž„μ„ μž…λ ₯λ°›μ•„ 각 νŠΉμ„±μ˜ 평균을 κ³„μ‚°ν•˜λŠ” ν•¨μˆ˜"""
    averages = {}
    for column in df.columns:
        averages[column] = df[column].mean()
    return averages
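
# Note: for numeric columns this is equivalent to df.mean().to_dict().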

def analyze_all_speakers(speaker_dict, model, tokenizer, label_mapping):
    """λͺ¨λ“  ν™”μžμ— λŒ€ν•΄ Big5 뢄석을 μˆ˜ν–‰ν•˜λŠ” ν•¨μˆ˜"""
    all_results = {}

    for speaker_id, utterances in speaker_dict.items():
        emotion_results = analyze_emotions(utterances, model, tokenizer, label_mapping)
        big5_scores, summary = run_probabilistic_mapping(emotion_results)
        big5_avg = calculate_big5_averages(big5_scores)
        all_results[speaker_id] = big5_avg

    return all_results

def stream_response(user_content: str, api_key: str):
    """Anthropic Claude APIλ₯Ό μ‚¬μš©ν•˜μ—¬ μ‹œλ‚˜λ¦¬μ˜€λ₯Ό μƒμ„±ν•˜λŠ” ν•¨μˆ˜"""
    if not api_key or not api_key.strip():
        return "❌ API ν‚€λ₯Ό μž…λ ₯ν•΄μ£Όμ„Έμš”."
    
    if not api_key.startswith('sk-ant-'):
        return "❌ μ˜¬λ°”λ₯Έ Anthropic API ν‚€ ν˜•μ‹μ΄ μ•„λ‹™λ‹ˆλ‹€. 'sk-ant-'둜 μ‹œμž‘ν•΄μ•Ό ν•©λ‹ˆλ‹€."
    
    try:
        client = anthropic.Anthropic(api_key=api_key.strip())
        
        stream = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=3000,
            system="당신은 λͺ°μž…감 λ„˜μΉ˜λŠ” λ“œλΌλ§ˆν‹±ν•œ μ‹œλ‚˜λ¦¬μ˜€λ₯Ό λ§Œλ“œλŠ” μ „λ¬Έ μž‘κ°€μž…λ‹ˆλ‹€. 심리학적 성격 뢄석을 λ°”νƒ•μœΌλ‘œ 인물 κ°„μ˜ κ°ˆλ“±κ³Ό ν™”ν•™μž‘μš©μ΄ μƒμƒν•˜κ²Œ λŠκ»΄μ§€λŠ” μž₯면을 μ°½μ‘°ν•˜μ„Έμš”.",
            messages=[
                {"role": "user", "content": user_content}
            ],
            stream=True
        )

        result = ""
        for event in stream:
            if event.type == "content_block_delta":
                result += event.delta.text
        
        return result
        
    except Exception as e:
        return f"❌ API 호좜 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"