File size: 4,319 Bytes
7c447a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Optional

def rank_sample(
    df: pd.DataFrame,
    name_col: str = "name",
    category_col: str = "category",
    sentiment_col: str = "sentiment_score",
    groups: Optional[List[str]] = None,
    num_samples: int = 1000,
    temp: float = 1.0,
    target_value: float = 0.5,
) -> pd.DataFrame:
    """Select one row per name so category mean sentiments are balanced.

    Repeatedly draws one row per unique ``name_col`` value — weighted toward
    rows whose ``sentiment_col`` is close to ``target_value`` via a softmax
    over within-name deviation ranks — and keeps the draw that minimizes the
    maximum pairwise difference between per-category mean sentiments.

    Args:
        df: Input data; must contain the three named columns.
        name_col: Column identifying the entity to pick one row for.
        category_col: Column whose per-group means are balanced.
        sentiment_col: Numeric score column.
        groups: Optional subset of categories to restrict to; ignored with a
            warning when fewer than 2 of them occur in the data.
        num_samples: Number of random draws to evaluate.
        temp: Softmax temperature; lower values concentrate sampling on the
            row closest to ``target_value``.
        target_value: Score the sampled rows should be close to.

    Returns:
        The best-balanced subset (one row per name). Falls back to the first
        row per name when fewer than 2 categories exist or no draw is valid.

    Raises:
        ValueError: If a required column is missing from ``df``.
    """
    df = df.copy()

    for col in [name_col, category_col, sentiment_col]:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")

    df = df.dropna(subset=[name_col, category_col, sentiment_col])

    if groups:
        available_groups = df[category_col].unique()
        valid_groups = [g for g in groups if g in available_groups]
        if len(valid_groups) < 2:
            print(f"Warning: Only {len(valid_groups)} groups available from {groups}")
            groups = None
        else:
            groups = valid_groups
            df = df[df[category_col].isin(groups)].copy()

    final_groups = df[category_col].unique()
    if len(final_groups) < 2:
        print(f"Error: Only {len(final_groups)} groups in data, need at least 2")
        return df.groupby(name_col).first().reset_index()

    print(f"Sampling with groups: {sorted(final_groups)}")
    print(f"Target value for deviation calculation: {target_value}")

    # Rank rows within each name by closeness to the target value.
    df["sentiment_deviation"] = (df[sentiment_col] - target_value).abs()
    df["sentiment_rank"] = df.groupby(name_col)["sentiment_deviation"].rank(method="first", ascending=True)

    def softmax_weights(ranks: np.ndarray, t: float) -> np.ndarray:
        """Softmax over negated ranks: lower rank -> higher weight."""
        t = float(t) if t and t > 1e-8 else 1e-8  # guard against t == 0 / None
        x = -ranks / t
        x = x - np.max(x)  # shift for numerical stability
        exps = np.exp(x)
        s = exps.sum()
        # Fall back to uniform weights if the softmax degenerates.
        return exps / s if np.isfinite(s) and s > 0 else np.ones_like(exps) / len(exps)

    def objective_max_pairwise_diff(frame: pd.DataFrame) -> float:
        """Largest absolute difference between any two category means."""
        g = frame.groupby(category_col)[sentiment_col].mean().dropna()
        if len(g) < 2:
            return np.inf
        vals = g.values
        diffs = np.abs(vals[:, None] - vals[None, :])
        return float(np.max(diffs))

    # Candidate indices and sampling weights per name are invariant across
    # draws, so compute them once instead of re-grouping on every iteration.
    # The np.random.choice calls below happen in the same order as before,
    # so results are unchanged for a given RNG state.
    name_choices = []
    for _, group in df.groupby(name_col):
        ranks = group["sentiment_rank"].to_numpy(dtype=float)
        if len(ranks) == 0:
            continue
        name_choices.append((group.index.to_numpy(), softmax_weights(ranks, t=temp)))

    best_subset = None
    best_obj = np.inf
    valid_samples = 0

    unique_names = df[name_col].nunique()
    print(f"Total unique names: {unique_names}")

    for i in tqdm(range(num_samples), desc="Sampling"):
        try:
            # Draw one row index per name.
            chosen = [np.random.choice(idxs, p=w) for idxs, w in name_choices]
            if not chosen:
                continue

            # .loc selection preserves column dtypes, unlike building a
            # DataFrame from a list of row Series (which upcasts to object).
            subset = df.loc[chosen]

            if len(subset[category_col].unique()) < 2:
                continue

            obj = objective_max_pairwise_diff(subset)

            if np.isfinite(obj):
                valid_samples += 1
                if obj < best_obj:
                    best_obj = obj
                    best_subset = subset.copy()

                    # Occasionally report improvements for long runs.
                    if valid_samples % 100 == 0 or valid_samples <= 10:
                        group_means = subset.groupby(category_col)[sentiment_col].mean()
                        print(f"Sample {valid_samples}: obj={obj:.4f}, groups={dict(group_means)}")

        except Exception as e:
            # Best-effort search: one failed draw should not abort the run.
            print(f"Error in sample {i}: {e}")
            continue

    print(f"Valid samples: {valid_samples}/{num_samples}")
    print(f"Best objective: {best_obj:.4f}")

    if best_subset is None or len(best_subset) == 0:
        print("Warning: No valid samples found, returning fallback subset")
        best_subset = df.groupby(name_col).first().reset_index()

    final_group_counts = best_subset[category_col].value_counts()
    print(f"Final subset group distribution: {dict(final_group_counts)}")

    return best_subset