# ScriptFromChat / code.py
# cksleigen's picture
# Create code.py
# b30b7da verified
# code.py
import re
import torch
import pandas as pd
import anthropic
import os
from dotenv import load_dotenv
from config import model, tokenizer, label_mapping, big5_dimensions, emotion_big5_priors
# Load environment variables from a .env file (e.g. API keys)
load_dotenv()
def parse_speaker_text(text):
    """Parse a dialogue transcript into per-speaker utterance lists.

    Each non-empty line is expected to begin with a speaker marker such as
    "1:", "1.", "1 " or "ν™”μž1:". Lines that match none of the supported
    patterns are reported and skipped.

    Args:
        text: Raw multi-line transcript string.

    Returns:
        dict mapping int speaker id -> list of that speaker's utterances,
        in order of appearance. Empty dict when nothing matches.
    """
    # Supported line formats. Compiled once here instead of rebuilding the
    # pattern list (and re-resolving each regex) on every input line.
    patterns = [
        re.compile(r'^(\d+)\s*:\s*(.+)'),        # "1: μ•ˆλ…•ν•˜μ„Έμš”"
        re.compile(r'^(\d+)\s*\.\s*(.+)'),       # "1. μ•ˆλ…•ν•˜μ„Έμš”"
        re.compile(r'^(\d+)\s+(.+)'),            # "1 μ•ˆλ…•ν•˜μ„Έμš”"
        re.compile(r'^ν™”μž\s*(\d+)\s*:\s*(.+)'),  # "ν™”μž1: μ•ˆλ…•ν•˜μ„Έμš”"
    ]
    speaker_dict = {}
    lines = text.strip().split('\n')
    print(f"πŸ“ μž…λ ₯된 총 쀄 수: {len(lines)}")  # debug
    for i, line in enumerate(lines):
        line = line.strip()  # strip surrounding whitespace
        if not line:  # skip blank lines
            continue
        print(f"πŸ” 처리 쀑인 쀄 {i+1}: '{line}'")  # debug
        matched = False
        for pattern in patterns:
            match = pattern.match(line)
            if match:
                speaker_id = int(match.group(1))
                utterance = match.group(2).strip()
                # First pattern wins; later patterns are not tried.
                speaker_dict.setdefault(speaker_id, []).append(utterance)
                print(f"βœ… λ§€μΉ­ 성곡: ν™”μž{speaker_id} -> '{utterance}'")  # debug
                matched = True
                break
        if not matched:
            print(f"❌ λ§€μΉ­ μ‹€νŒ¨: '{line}'")  # debug
    print(f"🎯 μ΅œμ’… κ²°κ³Ό: {len(speaker_dict)}λͺ…μ˜ ν™”μž 발견")  # debug
    for speaker_id, utterances in speaker_dict.items():
        print(f" ν™”μž{speaker_id}: {len(utterances)}개 λ°œν™”")
    return speaker_dict
def analyze_emotions(utterances, model, tokenizer, label_mapping):
    """Run the emotion classifier on each utterance and keep the top-5 emotions.

    Args:
        utterances: list of utterance strings.
        model: classifier; calling it returns an object with a ``logits`` tensor.
        tokenizer: callable producing model inputs from a string
            (called with ``return_tensors="pt"``).
        label_mapping: dict mapping class index -> emotion label name.

    Returns:
        DataFrame with emotion labels as index and one column per utterance
        (``utterance_1``, ``utterance_2``, ...); cells hold softmax
        probabilities, NaN where an emotion was not in that utterance's
        top 5. Returns None when ``utterances`` is empty.
    """
    # Was hard-coded to 60 classes; derive from the mapping so any label
    # set of matching size works (backward-compatible for 60-label models).
    num_labels = len(label_mapping)
    results = {}
    for idx, text in enumerate(utterances):
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():  # inference only; no autograd graph needed
            outputs = model(**inputs)
        emotions = torch.softmax(outputs.logits, dim=-1)
        values = emotions.cpu().detach().numpy()
        df = pd.DataFrame(values, columns=[label_mapping[i] for i in range(num_labels)])
        df = df.T.reset_index()
        df.columns = ['Emotion', 'Probability']
        df = df.sort_values(by='Probability', ascending=False).head(5)
        results[f"utterance_{idx+1}"] = df
    # Merge the per-utterance top-5 frames column-wise on the emotion label.
    merged_df = None
    for key, df in results.items():
        df = df.set_index("Emotion")
        df.columns = [key]
        merged_df = df if merged_df is None else merged_df.join(df, how='outer')
    return merged_df
def calculate_probabilistic_mapping(merged_df, emotion_big5_priors, dimensions=None):
    """Map per-utterance emotion probabilities to Big5 trait scores.

    For each utterance column, a trait score is the intensity-weighted
    average of the prior trait values of its observed emotions. Emotions
    absent from ``emotion_big5_priors`` still contribute to the weight
    denominator but add nothing to the numerator, pulling scores toward 0.

    Args:
        merged_df: DataFrame with emotions as index and one column per
            utterance (NaN where an emotion was not observed).
        emotion_big5_priors: dict mapping emotion -> {trait: prior score}.
        dimensions: optional list of trait names; defaults to the
            module-level ``big5_dimensions`` from config (the previous
            hidden-global behavior, kept for backward compatibility).

    Returns:
        DataFrame indexed by utterance name with one column per trait.
    """
    dims = big5_dimensions if dimensions is None else dimensions
    big5_results = {dim: [] for dim in dims}
    utterance_names = []
    for utterance in merged_df.columns:
        utterance_names.append(utterance)
        observed_emotions = merged_df[utterance].dropna()
        if len(observed_emotions) == 0:
            # No observed emotions at all: neutral 0.0 for every trait.
            for dim in dims:
                big5_results[dim].append(0.0)
            continue
        total_weight = sum(observed_emotions.values)
        for dim in dims:
            weighted_sum = 0.0
            for emotion, intensity in observed_emotions.items():
                if emotion in emotion_big5_priors:
                    weighted_sum += emotion_big5_priors[emotion][dim] * intensity
            score = weighted_sum / total_weight if total_weight > 0 else 0.0
            big5_results[dim].append(score)
    return pd.DataFrame(big5_results, index=utterance_names)
def analyze_emotion_patterns(big5_df):
    """Return a display-friendly view of the Big5 scores (3-decimal rounding)."""
    return big5_df.round(3)
def run_probabilistic_mapping(merged_df):
    """Run the full probabilistic Big5 mapping pipeline.

    Returns a (scores, display_summary) pair: the raw Big5 DataFrame and
    its rounded presentation view.
    """
    scores = calculate_probabilistic_mapping(merged_df, emotion_big5_priors)
    return scores, analyze_emotion_patterns(scores)
def calculate_big5_averages(df):
    """Return the per-trait mean of a Big5 score DataFrame as a plain dict."""
    return {trait: df[trait].mean() for trait in df.columns}
def analyze_all_speakers(speaker_dict, model, tokenizer, label_mapping):
    """Run the emotion -> Big5 pipeline for every speaker.

    Returns a dict mapping speaker id -> {trait: average score}.
    """
    all_results = {}
    for speaker_id, utterances in speaker_dict.items():
        merged = analyze_emotions(utterances, model, tokenizer, label_mapping)
        scores, _summary = run_probabilistic_mapping(merged)
        all_results[speaker_id] = calculate_big5_averages(scores)
    return all_results
def stream_response(user_content: str, api_key: str):
    """Generate a dramatic scenario with the Anthropic Claude streaming API.

    Validates the key, streams the completion, and returns the full text.
    On a missing/malformed key or any API failure, returns a user-facing
    error string instead of raising.
    """
    if not api_key or not api_key.strip():
        return "❌ API ν‚€λ₯Ό μž…λ ₯ν•΄μ£Όμ„Έμš”."
    # NOTE: format check runs on the raw (unstripped) key, as before.
    if not api_key.startswith('sk-ant-'):
        return "❌ μ˜¬λ°”λ₯Έ Anthropic API ν‚€ ν˜•μ‹μ΄ μ•„λ‹™λ‹ˆλ‹€. 'sk-ant-'둜 μ‹œμž‘ν•΄μ•Ό ν•©λ‹ˆλ‹€."
    try:
        client = anthropic.Anthropic(api_key=api_key.strip())
        event_stream = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=3000,
            system="당신은 λͺ°μž…감 λ„˜μΉ˜λŠ” λ“œλΌλ§ˆν‹±ν•œ μ‹œλ‚˜λ¦¬μ˜€λ₯Ό λ§Œλ“œλŠ” μ „λ¬Έ μž‘κ°€μž…λ‹ˆλ‹€. 심리학적 성격 뢄석을 λ°”νƒ•μœΌλ‘œ 인물 κ°„μ˜ κ°ˆλ“±κ³Ό ν™”ν•™μž‘μš©μ΄ μƒμƒν•˜κ²Œ λŠκ»΄μ§€λŠ” μž₯면을 μ°½μ‘°ν•˜μ„Έμš”.",
            messages=[
                {"role": "user", "content": user_content}
            ],
            stream=True
        )
        # Collect text deltas and join once instead of repeated concatenation.
        chunks = []
        for event in event_stream:
            if event.type == "content_block_delta":
                chunks.append(event.delta.text)
        return "".join(chunks)
    except Exception as e:
        return f"❌ API 호좜 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"