# code.py
import re
import torch
import pandas as pd
import anthropic
import os
from dotenv import load_dotenv
from config import model, tokenizer, label_mapping, big5_dimensions, emotion_big5_priors

# Load environment variables from the .env file
load_dotenv()

def parse_speaker_text(text):
    """Parse dialogue text and extract each speaker's utterances."""
    speaker_dict = {}
    lines = text.strip().split('\n')
    print(f"Total input lines: {len(lines)}")  # debug
    for i, line in enumerate(lines):
        line = line.strip()  # strip leading/trailing whitespace
        if not line:  # skip empty lines
            continue
        print(f"Processing line {i+1}: '{line}'")  # debug
        # Support several input formats
        patterns = [
            r'^(\d+)\s*:\s*(.+)',         # "1: Hello"
            r'^(\d+)\s*\.\s*(.+)',        # "1. Hello"
            r'^(\d+)\s+(.+)',             # "1 Hello"
            r'^화자\s*(\d+)\s*:\s*(.+)',  # "화자1: Hello" (화자 = "speaker" in Korean)
        ]
        matched = False
        for pattern in patterns:
            match = re.match(pattern, line)
            if match:
                speaker_id = int(match.group(1))
                utterance = match.group(2).strip()
                if speaker_id not in speaker_dict:
                    speaker_dict[speaker_id] = []
                speaker_dict[speaker_id].append(utterance)
                print(f"Matched: speaker {speaker_id} -> '{utterance}'")  # debug
                matched = True
                break
        if not matched:
            print(f"No match: '{line}'")  # debug
    print(f"Final result: {len(speaker_dict)} speakers found")  # debug
    for speaker_id, utterances in speaker_dict.items():
        print(f"  Speaker {speaker_id}: {len(utterances)} utterances")
    return speaker_dict
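
# A minimal usage sketch for parse_speaker_text; the sample dialogue below is
# hypothetical and only illustrates the supported line formats:
#
#   sample = "1: Hello, nice to meet you.\n2. Hi! How are you?\n1 Doing well."
#   speakers = parse_speaker_text(sample)
#   # -> {1: ['Hello, nice to meet you.', 'Doing well.'],
#   #     2: ['Hi! How are you?']}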

def analyze_emotions(utterances, model, tokenizer, label_mapping):
    """Run emotion analysis over a list of utterances."""
    results = {}
    for idx, text in enumerate(utterances):
        # Truncate to the model's max length to avoid errors on long inputs
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            emotions = torch.softmax(outputs.logits, dim=-1)
        values = emotions.cpu().numpy()
        # One column per emotion label; derive the count from the logits
        # instead of hard-coding it
        df = pd.DataFrame(values, columns=[label_mapping[i] for i in range(values.shape[1])])
        df = df.T.reset_index()
        df.columns = ['Emotion', 'Probability']
        # Keep only the five most probable emotions for this utterance
        df = df.sort_values(by='Probability', ascending=False).head(5)
        results[f"utterance_{idx+1}"] = df
    # Merge column-wise: one column per utterance, indexed by emotion
    merged_df = None
    for key, df in results.items():
        df = df.set_index("Emotion")
        df.columns = [key]
        if merged_df is None:
            merged_df = df
        else:
            merged_df = merged_df.join(df, how='outer')
    return merged_df
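
# Shape sketch of the merged frame returned above (values illustrative; actual
# emotion labels come from label_mapping in config). Each utterance contributes
# one column, indexed by emotion name:
#
#            utterance_1  utterance_2
#   Emotion
#   joy             0.41          NaN
#   trust           0.18         0.33
#
# NaN marks emotions outside that utterance's top 5, which is why the mapping
# step below drops NaNs before weighting.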

def calculate_probabilistic_mapping(merged_df, emotion_big5_priors):
    """Compute a Big5 trait score per utterance via probabilistic mapping."""
    big5_results = {dim: [] for dim in big5_dimensions}
    utterance_names = []
    for utterance in merged_df.columns:
        utterance_names.append(utterance)
        observed_emotions = merged_df[utterance].dropna()
        if len(observed_emotions) == 0:
            for dim in big5_dimensions:
                big5_results[dim].append(0.0)
            continue
        big5_scores_utterance = {}
        total_weight = sum(observed_emotions.values)
        for dim in big5_dimensions:
            # Intensity-weighted average of the priors for the observed emotions
            weighted_sum = 0.0
            for emotion, intensity in observed_emotions.items():
                if emotion in emotion_big5_priors:
                    weighted_sum += emotion_big5_priors[emotion][dim] * intensity
            big5_scores_utterance[dim] = weighted_sum / total_weight if total_weight > 0 else 0.0
            big5_results[dim].append(big5_scores_utterance[dim])
    big5_df = pd.DataFrame(big5_results, index=utterance_names)
    return big5_df
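
# Worked example of the weighted average above, with made-up numbers: if an
# utterance's observed emotions are {joy: 0.6, trust: 0.3} and the priors give
# joy -> Extraversion 0.9 and trust -> Extraversion 0.7, then
#
#   Extraversion = (0.9 * 0.6 + 0.7 * 0.3) / (0.6 + 0.3) = 0.75 / 0.9 ≈ 0.833
#
# so each trait score is the intensity-weighted mean of the priors for the
# emotions actually observed.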

def analyze_emotion_patterns(big5_df):
    """Analyze emotion patterns: round Big5 scores for display."""
    display_df = big5_df.round(3)
    return display_df

def run_probabilistic_mapping(merged_df):
    """Run the full probabilistic mapping pipeline."""
    big5_df = calculate_probabilistic_mapping(merged_df, emotion_big5_priors)
    result_summary = analyze_emotion_patterns(big5_df)
    return big5_df, result_summary

def calculate_big5_averages(df):
    """Given a Big5 trait DataFrame, compute the mean of each trait."""
    averages = {}
    for column in df.columns:
        averages[column] = df[column].mean()
    return averages

def analyze_all_speakers(speaker_dict, model, tokenizer, label_mapping):
    """Run the Big5 analysis for every speaker."""
    all_results = {}
    for speaker_id, utterances in speaker_dict.items():
        emotion_results = analyze_emotions(utterances, model, tokenizer, label_mapping)
        big5_scores, summary = run_probabilistic_mapping(emotion_results)
        big5_avg = calculate_big5_averages(big5_scores)
        all_results[speaker_id] = big5_avg
    return all_results
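
# End-to-end sketch, assuming model, tokenizer, and label_mapping are the ones
# imported from config above (output values are illustrative):
#
#   speakers = parse_speaker_text(dialogue_text)
#   results = analyze_all_speakers(speakers, model, tokenizer, label_mapping)
#   # -> {1: {'Openness': 0.62, 'Agreeableness': 0.41, ...}, 2: {...}}
#
# Each inner dict maps a Big5 dimension to that speaker's mean score across
# their utterances.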

def stream_response(user_content: str, api_key: str):
    """Generate a scenario with the Anthropic Claude API."""
    if not api_key or not api_key.strip():
        return "Error: please enter an API key."
    if not api_key.startswith('sk-ant-'):
        return "Error: invalid Anthropic API key format; it must start with 'sk-ant-'."
    try:
        client = anthropic.Anthropic(api_key=api_key.strip())
        stream = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=3000,
            system=(
                "You are a professional writer who creates immersive, dramatic "
                "scenarios. Based on psychological personality analysis, craft "
                "scenes in which the conflicts and interactions between "
                "characters feel vivid and alive."
            ),
            messages=[
                {"role": "user", "content": user_content}
            ],
            stream=True
        )
result = "" | |
for event in stream: | |
if event.type == "content_block_delta": | |
result += event.delta.text | |
return result | |
except Exception as e: | |
return f"β API νΈμΆ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(e)}" |