Spaces:
Sleeping
Sleeping
File size: 6,564 Bytes
b30b7da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
# code.py
import re
import torch
import pandas as pd
import anthropic
import os
from dotenv import load_dotenv
from config import model, tokenizer, label_mapping, big5_dimensions, emotion_big5_priors
# Load environment variables from the .env file
load_dotenv()
# Line formats accepted by parse_speaker_text. Each pattern captures
# (speaker number, utterance). Compiled once at import time instead of being
# rebuilt for every input line.
# NOTE(review): the Korean literal below and the messages printed by
# parse_speaker_text were restored from a mis-encoded copy of this file;
# confirm them against the canonical source.
_SPEAKER_LINE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'^(\d+)\s*:\s*(.+)',         # "1: hello"
        r'^(\d+)\s*\.\s*(.+)',        # "1. hello"
        r'^(\d+)\s+(.+)',             # "1 hello"
        r'^화자\s*(\d+)\s*:\s*(.+)',   # "화자1: hello" ("화자" means "speaker")
    )
)


def parse_speaker_text(text):
    """Parse a conversation transcript into per-speaker utterance lists.

    Each non-blank line is tried against the supported formats in order
    ("N: text", "N. text", "N text", "화자N: text"); the first format that
    matches wins. Lines that match no format are reported and skipped.

    Progress and match results are printed to stdout for debugging.

    Args:
        text: The transcript, one utterance per line.

    Returns:
        A dict mapping the integer speaker number to the list of that
        speaker's utterances, in the order they appear in the transcript.
    """
    speaker_dict = {}
    lines = text.strip().split('\n')
    print(f"📝 입력된 총 줄 수: {len(lines)}")  # debug output

    # Line numbers in the debug output count blank lines too.
    for line_no, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line:
            continue

        print(f"🔍 처리 중인 줄 {line_no}: '{line}'")  # debug output

        for pattern in _SPEAKER_LINE_PATTERNS:
            match = pattern.match(line)
            if match:
                speaker_id = int(match.group(1))
                utterance = match.group(2).strip()
                speaker_dict.setdefault(speaker_id, []).append(utterance)
                print(f"✅ 매칭 성공: 화자{speaker_id} -> '{utterance}'")  # debug output
                break
        else:
            # No format matched this line.
            print(f"❌ 매칭 실패: '{line}'")  # debug output

    print(f"🎯 최종 결과: {len(speaker_dict)}명의 화자 발견")  # debug output
    for speaker_id, utterances in speaker_dict.items():
        print(f" 화자{speaker_id}: {len(utterances)}개 발화")

    return speaker_dict
def analyze_emotions(utterances, model, tokenizer, label_mapping):
    """Classify each utterance's emotions and tabulate the top five per utterance.

    Args:
        utterances: Iterable of utterance strings.
        model: Callable invoked as ``model(**inputs)``; its result must expose
            a ``logits`` tensor.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result is unpacked into ``model``.
        label_mapping: Maps each class index (a column position in the
            logits) to its emotion label.

    Returns:
        A DataFrame indexed by emotion label with one column per utterance
        ("utterance_1", "utterance_2", ...). Each column holds the softmax
        probability of that utterance's five most probable emotions and NaN
        for emotions outside its top five. Returns None when ``utterances``
        is empty.
    """
    merged_df = None

    for idx, text in enumerate(utterances, start=1):
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)

        probabilities = torch.softmax(outputs.logits, dim=-1)
        values = probabilities.cpu().detach().numpy()

        # Take the label count from the model output rather than assuming a
        # fixed number of classes.
        num_labels = values.shape[-1]
        labels = [label_mapping[i] for i in range(num_labels)]

        # One row of probabilities -> two-column (Emotion, Probability) table.
        ranked = pd.DataFrame(values, columns=labels).T.reset_index()
        ranked.columns = ['Emotion', 'Probability']
        top5 = ranked.sort_values(by='Probability', ascending=False).head(5)

        # Merge column-wise: one column per utterance, aligned on emotion.
        column = top5.set_index("Emotion")
        column.columns = [f"utterance_{idx}"]
        if merged_df is None:
            merged_df = column
        else:
            merged_df = merged_df.join(column, how='outer')

    return merged_df
def calculate_probabilistic_mapping(merged_df, emotion_big5_priors):
    """Convert per-utterance emotion probabilities into Big5 trait scores.

    For every utterance column, each trait score is the sum of
    ``prior[emotion][trait] * probability`` over the observed (non-NaN)
    emotions, divided by the total observed probability. Emotions absent
    from ``emotion_big5_priors`` add nothing to the numerator but still
    count toward the denominator. An utterance with no observed emotions,
    or a non-positive total, scores 0.0 on every trait.

    Args:
        merged_df: DataFrame indexed by emotion with one column per
            utterance, or None when nothing was analysed.
        emotion_big5_priors: Maps emotion label to a mapping of Big5
            dimension to prior weight.

    Returns:
        A DataFrame with one row per utterance and one column per entry of
        the module-level ``big5_dimensions``. It has no rows when
        ``merged_df`` is None or has no columns.
    """
    big5_results = {dim: [] for dim in big5_dimensions}
    utterance_names = []

    # analyze_emotions returns None for an empty utterance list; treat that
    # as "no utterances" instead of failing on `.columns`.
    columns = [] if merged_df is None else merged_df.columns

    for utterance in columns:
        utterance_names.append(utterance)
        observed = merged_df[utterance].dropna()
        total_weight = observed.sum()

        # Filter once per utterance rather than once per dimension.
        known = [
            (emotion, intensity)
            for emotion, intensity in observed.items()
            if emotion in emotion_big5_priors
        ]

        for dim in big5_dimensions:
            if total_weight > 0:
                weighted_sum = sum(
                    (emotion_big5_priors[emotion][dim] * intensity
                     for emotion, intensity in known),
                    0.0,
                )
                score = weighted_sum / total_weight
            else:
                # Covers both "nothing observed" and a zero total.
                score = 0.0
            big5_results[dim].append(score)

    return pd.DataFrame(big5_results, index=utterance_names)
def analyze_emotion_patterns(big5_df):
    """Return the Big5 score table rounded to three decimal places."""
    return big5_df.round(3)
def run_probabilistic_mapping(merged_df):
    """Run the full probabilistic mapping step for one emotion table.

    Scores the table against the module-level ``emotion_big5_priors`` and
    also produces the summary from ``analyze_emotion_patterns``.

    Returns:
        A ``(big5_df, result_summary)`` tuple: the Big5 scores and the
        summary derived from them.
    """
    scores = calculate_probabilistic_mapping(merged_df, emotion_big5_priors)
    return scores, analyze_emotion_patterns(scores)
def calculate_big5_averages(df):
    """Compute the mean of every column of a Big5 trait DataFrame.

    Returns:
        A dict mapping each column name to the mean of that column.
    """
    return {trait: df[trait].mean() for trait in df.columns}
def analyze_all_speakers(speaker_dict, model, tokenizer, label_mapping):
    """Compute the average Big5 scores for every speaker.

    Args:
        speaker_dict: Maps speaker id to that speaker's list of utterances.
        model, tokenizer, label_mapping: Passed through unchanged to
            ``analyze_emotions``.

    Returns:
        A dict mapping each speaker id to the result of
        ``calculate_big5_averages`` for that speaker's utterances.
    """
    def _speaker_profile(utterances):
        # emotions -> per-utterance Big5 scores -> per-trait averages
        emotions = analyze_emotions(utterances, model, tokenizer, label_mapping)
        per_utterance_scores, _ = run_probabilistic_mapping(emotions)
        return calculate_big5_averages(per_utterance_scores)

    return {
        speaker_id: _speaker_profile(utterances)
        for speaker_id, utterances in speaker_dict.items()
    }
def stream_response(user_content: str, api_key: str) -> str:
    """Generate a scenario with the Anthropic Claude API and return its text.

    The request is made in streaming mode; the text of the streamed
    ``content_block_delta`` events is collected and returned as one string.
    Failures are reported as a user-facing message string rather than raised.

    NOTE(review): the Korean messages and system prompt below were restored
    from a mis-encoded copy of this file; confirm them against the canonical
    source.

    Args:
        user_content: The user message sent to the model.
        api_key: Anthropic API key; surrounding whitespace is ignored.

    Returns:
        The generated text, or a message starting with "❌" when the key is
        missing, has the wrong prefix, or the API call fails.
    """
    # Strip once so validation and the client see the same value; before,
    # a valid key with surrounding whitespace failed the prefix check.
    key = api_key.strip() if api_key else ""
    if not key:
        return "❌ API 키를 입력해주세요."
    if not key.startswith('sk-ant-'):
        return "❌ 올바른 Anthropic API 키 형식이 아닙니다. 'sk-ant-'로 시작해야 합니다."

    try:
        client = anthropic.Anthropic(api_key=key)
        stream = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=3000,
            system="당신은 몰입감 넘치는 드라마틱한 시나리오를 만드는 전문 작가입니다. 심리학적 성격 분석을 바탕으로 인물 간의 갈등과 화학작용이 생생하게 느껴지는 장면을 창조하세요.",
            messages=[
                {"role": "user", "content": user_content}
            ],
            stream=True
        )
        chunks = []
        for event in stream:
            if event.type == "content_block_delta":
                # A delta without a `text` attribute is skipped instead of
                # aborting the whole response.
                chunks.append(getattr(event.delta, "text", ""))
        return "".join(chunks)
    except Exception as e:
        # UI boundary: surface any failure as a message instead of raising.
        return f"❌ API 호출 중 오류가 발생했습니다: {str(e)}"