Spaces:

ltg
/

fluency-annotation

Running

File size: 26,091 Bytes

from __future__ import annotations

import os
import gradio as gr
import json
import random
from datetime import datetime
from typing import Dict, List, Tuple
import hashlib
import itertools
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import HfApi, create_repo, repo_exists, Repository
from huggingface_hub import HfFolder
import shutil
import threading
import json

from collections.abc import Iterable

from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes

HF_TOKEN = os.environ.get("HF_TOKEN")
os.environ['HF_AUTH'] = HF_TOKEN
HfApi(token=HF_TOKEN)

USER_IDS = set(json.loads(os.environ.get("USER_IDS")) + json.loads(os.environ.get("USER_IDS_2")))


class Soft(Base):
    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.indigo,
        secondary_hue: colors.Color | str = colors.indigo,
        neutral_hue: colors.Color | str = colors.gray,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_md,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            # fonts.LocalFont("Montserrat"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            # fonts.LocalFont("IBM Plex Mono"),
            "ui-monospace",
            "Consolas",
            "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        self.name = "soft"
        super().set(
            # Colors
            background_fill_primary="*neutral_50",
            slider_color="*primary_500",
            slider_color_dark="*primary_600",
            # Shadows
            shadow_drop="0 1px 4px 0 rgb(0 0 0 / 0.1)",
            shadow_drop_lg="0 2px 5px 0 rgb(0 0 0 / 0.2)",
            # Block Labels
            block_background_fill="white",
            block_label_padding="*spacing_sm *spacing_md",
            block_label_background_fill="*primary_100",
            block_label_background_fill_dark="*primary_600",
            block_label_radius="*radius_md",
            block_label_text_size="*text_md",
            block_label_text_weight="600",
            block_label_text_color="*primary_500",
            block_label_text_color_dark="white",
            block_title_radius="*block_label_radius",
            block_title_padding="*block_label_padding",
            block_title_background_fill="*block_label_background_fill",
            block_title_text_weight="600",
            block_title_text_color="*primary_500",
            block_title_text_color_dark="white",
            block_label_margin="*spacing_md",

            # Inputs
            input_background_fill="white",
            input_border_color="*neutral_100",
            input_shadow="*shadow_drop",
            input_shadow_focus="*shadow_drop_lg",
            checkbox_shadow="none",
            # Buttons
            shadow_spread="6px",
            button_primary_shadow="*shadow_drop_lg",
            button_primary_shadow_hover="*shadow_drop_lg",
            button_primary_shadow_active="*shadow_inset",
            button_secondary_shadow="*shadow_drop_lg",
            button_secondary_shadow_hover="*shadow_drop_lg",
            button_secondary_shadow_active="*shadow_inset",
            checkbox_label_shadow="*shadow_drop_lg",
            button_primary_background_fill="*primary_500",
            button_primary_background_fill_hover="*primary_400",
            button_primary_background_fill_hover_dark="*primary_500",
            button_primary_text_color="white",
            button_secondary_background_fill="white",
            button_secondary_background_fill_hover="*neutral_100",
            button_secondary_background_fill_hover_dark="*primary_500",
            button_secondary_text_color="*neutral_800",
            button_cancel_background_fill="*button_secondary_background_fill",
            button_cancel_background_fill_hover="*button_secondary_background_fill_hover",
            button_cancel_background_fill_hover_dark="*button_secondary_background_fill_hover",
            button_cancel_text_color="*button_secondary_text_color",
            checkbox_label_background_fill_selected="*primary_500",
            checkbox_label_background_fill_selected_dark="*primary_600",
            checkbox_border_width="1px",
            checkbox_border_color="*neutral_100",
            checkbox_border_color_dark="*neutral_600",
            checkbox_background_color_selected="*primary_600",
            checkbox_background_color_selected_dark="*primary_700",
            checkbox_border_color_focus="*primary_500",
            checkbox_border_color_focus_dark="*primary_600",
            checkbox_border_color_selected="*primary_600",
            checkbox_border_color_selected_dark="*primary_700",
            checkbox_label_text_color_selected="white",
            # Borders
            block_border_width="0px",
            panel_border_width="0px",
        )


guideline = open("guidelines.md").read().strip()

# Configuration for the output dataset
ANNOTATIONS_REPO = "ltg/fluency-annotations"  # Change to your repo name
DATA_DIR = "annotation_data"
ANNOTATIONS_FILE = os.path.join(DATA_DIR, "train.jsonl")

# Model names for the three responses
MODEL_NAMES = ["mistral-Nemo", "translated-SFT", "on-policy-RL"]

# Create all pairwise comparisons
MODEL_PAIRS = list(itertools.combinations(MODEL_NAMES, 2))

# Initialize repository
def init_repository():
    """Initialize or clone the repository"""
    try:
        repo = Repository(
            local_dir=DATA_DIR, 
            clone_from=ANNOTATIONS_REPO, 
            use_auth_token=HF_TOKEN,
            repo_type="dataset"
        )
        repo.git_pull()
        return repo
    except Exception as e:
        print(f"Error initializing repository: {e}")
        # Create local directory if repo doesn't exist
        os.makedirs(DATA_DIR, exist_ok=True)
        return None

# Initialize on startup
annotation_repo = init_repository()

def load_existing_annotations():
    """Load existing annotations from the jsonl file"""
    annotations = {}
    
    if os.path.exists(ANNOTATIONS_FILE):
        try:
            with open(ANNOTATIONS_FILE, "r") as f:
                for line in f:
                    if line.strip():
                        ann = json.loads(line)
                        user_id = ann.get("user_id")
                        if user_id:
                            if user_id not in annotations:
                                annotations[user_id] = []
                            annotations[user_id].append(ann)
            print(f"Loaded {sum(len(v) for v in annotations.values())} existing annotations")
        except Exception as e:
            print(f"Error loading annotations: {e}")
    
    return annotations

def save_annotation_to_file(annotation_data):
    """Save a single annotation to the jsonl file and push to hub"""
    global annotation_repo
    
    try:
        # Pull latest changes
        if annotation_repo:
            annotation_repo.git_pull()
        
        # Append to jsonl file
        with open(ANNOTATIONS_FILE, "a") as f:
            line = json.dumps(annotation_data, ensure_ascii=False)
            f.write(f"{line}\n")
        
        # Push to hub asynchronously
        if annotation_repo:
            annotation_repo.push_to_hub(blocking=False)
            
    except Exception as e:
        print(f"Error saving annotation: {e}")
        # Try to reinitialize repository
        try:
            shutil.rmtree(DATA_DIR)
            annotation_repo = init_repository()
            
            # Retry saving
            with open(ANNOTATIONS_FILE, "a") as f:
                line = json.dumps(annotation_data, ensure_ascii=False)
                f.write(f"{line}\n")
            
            if annotation_repo:
                annotation_repo.push_to_hub(blocking=False)
        except Exception as e2:
            print(f"Failed to save annotation after retry: {e2}")

def load_dataset_samples():
    """Load and prepare dataset samples with pairwise comparisons"""
    try:
        # Load the private dataset (requires authentication)
        dataset = load_dataset("ltg/fluency-generations", split="train", token=HF_TOKEN)
        
        # Transform dataset into pairwise comparison format
        pairwise_samples = []
        
        for item in dataset:
            sample_id = item["sample_id"]
            prompt = item["prompt"]
            responses = item["responses"]
            
            # Create pairwise comparisons for this sample
            for model_a, model_b in MODEL_PAIRS:
                pairwise_samples.append({
                    "id": f"{sample_id}_{model_a}_vs_{model_b}",
                    "original_id": sample_id,
                    "prompt": prompt,
                    "response_a": responses[model_a],
                    "response_b": responses[model_b],
                    "model_a": model_a,
                    "model_b": model_b,
                    "dataset": "NTNU"
                })

        extra_dataset = load_dataset("ltg/fluency-generations", split="test", token=HF_TOKEN)
        extra_pairwise_samples = []
        for i, item in enumerate(extra_dataset):
            sample_id = item["sample_id"]
            prompt = item["prompt"]
            responses = item["responses"]
            model_a, model_b = MODEL_PAIRS[i % len(MODEL_PAIRS)]
            model_a, model_b = (model_a, model_b) if i % 2 == 0 else (model_b, model_a)
            extra_pairwise_samples.append({
                "id": f"{sample_id}_{model_a}_vs_{model_b}",
                "original_id": sample_id,
                "prompt": prompt,
                "response_a": responses[model_a],
                "response_b": responses[model_b],
                "model_a": model_a,
                "model_b": model_b,
                "dataset": "training_examples"
            })
        
        return pairwise_samples, extra_pairwise_samples
    
    except Exception as e:
        print(f"Error loading dataset: {e}")
        print("Using dummy data for testing...")
        # Fallback to dummy data for testing
        return [
            {
                "id": "dummy_001_modelA_vs_modelB",
                "original_id": "dummy_001",
                "prompt": "Test prompt for development",
                "response_a": "This is response A for testing.",
                "response_b": "This is response B for testing.",
                "model_a": "modelA",
                "model_b": "modelB",
                "dataset": "test"
            }
        ], []


def swap_sample(sample):
    return {
        "id": str(sample["original_id"]) + '_' + sample["model_b"] + '_vs_' + sample["model_a"],
        "original_id": sample["original_id"],
        "prompt": sample["prompt"],
        "response_a": sample["response_b"],
        "response_b": sample["response_a"],
        "model_a": sample["model_b"],
        "model_b": sample["model_a"],
        "dataset": sample["dataset"]
    }

# Load dataset on startup
DATASET_SAMPLES, EXTRA_DATASET_SAMPLES = load_dataset_samples()

class AnnotationManager:
    def __init__(self):
        # Load existing annotations from file
        self.annotations = load_existing_annotations()
        self.user_states = {}
        
        # Rebuild user states from loaded annotations
        for user_id, user_annotations in self.annotations.items():
            annotated_ids = [ann["sample_id"] for ann in user_annotations]
            self.user_states[user_id] = {
                "current_index": 0,
                "annotations": annotated_ids
            }
    
    def get_user_seed(self, user_id: str) -> int:
        """Generate consistent seed for user"""
        return int(hashlib.md5(user_id.encode()).hexdigest(), 16)
    
    def get_user_samples(self, user_id: str) -> List[Dict]:
        """Get shuffled samples for user based on their ID"""
        seed = self.get_user_seed(user_id)
        samples = DATASET_SAMPLES.copy()
        random.Random(seed).shuffle(samples)
        samples = [
            sample if random.Random(seed + i).randint(0, 1) == 0 else swap_sample(sample)
            for i, sample in enumerate(samples)
        ]
        samples = EXTRA_DATASET_SAMPLES.copy() + samples
        return samples
    
    def get_next_sample(self, user_id: str) -> Tuple[Dict, int, int]:
        """Get next unannotated sample for user"""
        if user_id not in self.user_states:
            # Check if user has existing annotations
            if user_id in self.annotations:
                annotated_ids = [ann["sample_id"] for ann in self.annotations[user_id]]
                self.user_states[user_id] = {
                    "current_index": 0,
                    "annotations": annotated_ids
                }
            else:
                self.user_states[user_id] = {
                    "current_index": 0,
                    "annotations": []
                }
        
        samples = self.get_user_samples(user_id)
        state = self.user_states[user_id]
        
        # Count total annotations for this user
        total_annotated = len(state["annotations"])
        
        # Find next unannotated sample
        for idx, sample in enumerate(samples):
            if not self.is_annotated(user_id, sample["id"]):
                return sample, total_annotated + 1, len(samples)
        
        # All samples annotated
        return None, len(samples), len(samples)
    
    def is_annotated(self, user_id: str, sample_id: str) -> bool:
        """Check if user has annotated this sample"""
        if user_id not in self.annotations:
            return False
        return any(ann["sample_id"] == sample_id for ann in self.annotations[user_id])
    
    def save_annotation(self, user_id: str, sample_id: str, choice: str, 
                        model_a: str = None, model_b: str = None, 
                        original_id: str = None, dataset_name: str = None):
        """Save user's annotation and persist to file"""
        if user_id not in self.annotations:
            self.annotations[user_id] = []
        
        annotation = {
            "user_id": user_id,
            "sample_id": sample_id,
            "original_sample_id": original_id,
            "dataset": dataset_name,
            "model_a": model_a,
            "model_b": model_b,
            "choice": choice,
            "timestamp": datetime.now().isoformat()
        }
        
        # Save to memory
        self.annotations[user_id].append(annotation)
        
        # Update user state
        if user_id in self.user_states:
            self.user_states[user_id]["annotations"].append(sample_id)
        else:
            self.user_states[user_id] = {
                "current_index": 0,
                "annotations": [sample_id]
            }
        
        # Save to file asynchronously
        threading.Thread(
            target=save_annotation_to_file, 
            args=(annotation,)
        ).start()
        
        print(f"Saved annotation: {annotation}")
    
    def get_user_progress(self, user_id: str) -> Dict:
        """Get user's annotation progress"""
        if user_id not in self.annotations:
            return {"completed": 0, "total": len(DATASET_SAMPLES)}
        
        completed = len(self.annotations[user_id])
        return {"completed": completed, "total": len(DATASET_SAMPLES)}


# Initialize manager
manager = AnnotationManager()

def login(user_id: str) -> Tuple:
    """Handle user login"""
    if not user_id or user_id.strip() == "" or user_id.strip() not in USER_IDS:
        return (
            gr.update(visible=True),  # login_interface
            gr.update(visible=False),  # annotation_interface
            "",  # user_state
            gr.update(value="Please enter a valid ID"),  # login_status
            gr.update(),  # prompt
            gr.update(),  # response_a
            gr.update(),  # response_b
            gr.update()   # progress
        )
    
    user_id = user_id.strip()
    sample, current, total = manager.get_next_sample(user_id)
    
    if sample is None:
        return (
            gr.update(visible=True),  # login_interface
            gr.update(visible=False),  # annotation_interface
            user_id,  # user_state
            gr.update(value=f"All {total} samples completed for user: {user_id}! 🎉"),  # login_status
            gr.update(),  # prompt
            gr.update(),  # response_a
            gr.update(),  # response_b
            gr.update()   # progress
        )
    
    # Show which models are being compared
    model_info = f" | Comparing: {sample.get('model_a', 'A')} vs {sample.get('model_b', 'B')}"
    
    return (
        gr.update(visible=False),  # login_interface
        gr.update(visible=True),   # annotation_interface
        user_id,  # user_state
        gr.update(value=""),  # login_status
        gr.update(value=sample["prompt"]),  # prompt
        gr.update(value=sample["response_a"]),  # response_a
        gr.update(value=sample["response_b"]),  # response_b
        gr.update(value=f"Progress: {current}/{total}")  # progress
    )

def annotate(choice: str, user_id: str) -> Tuple:
    """Handle annotation submission"""
    if not user_id:
        return (
            gr.update(),  # prompt
            gr.update(),  # response_a
            gr.update(),  # response_b
            gr.update(),  # progress
            gr.update(value="Error: No user logged in", visible=True)  # status
        )
    
    # Get current sample to save annotation
    sample, _, _ = manager.get_next_sample(user_id)
    if sample:
        # Map button choice to annotation value
        choice_map = {
            "a_better": "A is more fluent",
            "b_better": "B is more fluent",
            "equal": "Equally fluent"
        }
        # Save with all metadata
        manager.save_annotation(
            user_id=user_id,
            sample_id=sample["id"],
            choice=choice_map[choice],
            model_a=sample.get("model_a"),
            model_b=sample.get("model_b"),
            original_id=sample.get("original_id"),
            dataset_name=sample.get("dataset")
        )
    
    # Get next sample
    next_sample, current, total = manager.get_next_sample(user_id)
    
    if next_sample is None:
        return (
            gr.update(value="All samples completed! Thank you for your annotations."),  # prompt
            gr.update(value=""),  # response_a
            gr.update(value=""),  # response_b
            gr.update(value=f"Progress: {total}/{total} - Complete!"),  # progress
            gr.update(value="All annotations complete!", visible=True)  # status
        )
    
    # Show which models are being compared
    model_info = f" | Comparing: {next_sample.get('model_a', 'A')} vs {next_sample.get('model_b', 'B')}"
    
    return (
        gr.update(value=next_sample["prompt"]),  # prompt
        gr.update(value=next_sample["response_a"]),  # response_a
        gr.update(value=next_sample["response_b"]),  # response_b
        gr.update(value=f"Progress: {current}/{total}"),  # progress
        gr.update(value="Annotation saved!", visible=True)  # status
    )

def logout() -> Tuple:
    """Handle user logout"""
    return (
        gr.update(visible=True),   # login_interface
        gr.update(visible=False),  # annotation_interface
        "",  # user_state
        gr.update(value=""),  # login_status
        gr.update(value=""),  # prompt
        gr.update(value=""),  # response_a
        gr.update(value=""),  # response_b
        gr.update(value="")   # progress
    )

# Create Gradio interface
custom_css = """
    #login-group {
        background-color: white !important;
    }
    #login-group > * {
        background-color: white !important;
    }
    #login-group .gr-group {
        background-color: white !important;
    }
    #login-group .gr-form {
        background-color: white !important;
    }
    .light-shadow {
        box-shadow: 0 1px 4px 0 rgb(0 0 0 / 0.1) !important;
    }
    /* Target the textbox container */
    .no-style-textbox {
        border: none !important;
        box-shadow: none !important;
    }
    
    /* Target both input and textarea elements */
    .no-style-textbox input,
    .no-style-textbox textarea {
        border: none !important;
        box-shadow: none !important;
        padding: 0 !important;
        outline: none !important;
    }
    
    /* Target the Gradio textbox wrapper */
    .no-style-textbox .gr-textbox {
        border: none !important;
        box-shadow: none !important;
    }
    
    /* Target focus states */
    .no-style-textbox input:focus,
    .no-style-textbox textarea:focus {
        border: none !important;
        box-shadow: none !important;
        outline: none !important;
    }
    
    /* Additional targeting for stubborn Gradio elements */
    .no-style-textbox .gr-form,
    .no-style-textbox .gr-input {
        border: none !important;
        box-shadow: none !important;
    }
"""

# Create Gradio interface
with gr.Blocks(theme=Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial"]), title="Dataset Annotation Tool", css=custom_css) as app:
    gr.Markdown("# Norwegian Fluency Annotation")
    with gr.Accordion("Click here to see the full annotation guidelines:", open=False, elem_classes="light-shadow"):
        gr.Markdown(guideline, padding=True)

    user_state = gr.State("")
    
    # Login Interface
    with gr.Column(visible=True) as login_interface:
        with gr.Column(variant="panel", elem_id="login-group", elem_classes="light-shadow"):
            gr.Markdown("## Log in", padding=True)
            user_id_input = gr.Textbox(
                label="Enter your unique annotator ID to begin",
                placeholder="Annotator ID"
            )
        with gr.Row():
            login_btn = gr.Button("Login", variant="primary", scale=0.2, min_width=100)
            gr.HTML("")
        login_status = gr.Markdown("", padding=True)
    
    # Annotation Interface
    with gr.Column(visible=False, elem_id="annotation-group") as annotation_interface:
        progress_label = gr.Markdown("")
        
        # Row 1: Prompt
        with gr.Row(elem_classes="light-shadow"):
            prompt_display = gr.Textbox(
                label="Prompt",
                interactive=False,
                lines=1,
                elem_classes="no-style-textbox",
                autoscroll=False
            )
        
        # Row 2: Responses
        with gr.Row(elem_classes="light-shadow"):
            response_a_display = gr.Textbox(
                label="Response A",
                interactive=False,
                lines=1,
                scale=1,
                elem_classes="no-style-textbox",
                autoscroll=False,
                max_lines=100
            )
            response_b_display = gr.Textbox(
                label="Response B",
                interactive=False,
                lines=1,
                scale=1,
                elem_classes="no-style-textbox",
                autoscroll=False,
                max_lines=100
            )
        
        # Row 3: Buttons
        with gr.Row():
            btn_a = gr.Button("A is more fluent", variant="primary")
            btn_equal = gr.Button("Equally fluent", variant="primary")
            btn_b = gr.Button("B is more fluent", variant="primary")
        
        status_message = gr.Markdown("", visible=False)
        
        with gr.Row(visible=False):
            logout_btn = gr.Button("Logout", variant="stop", size="sm")
    
    # Event handlers
    login_btn.click(
        fn=login,
        inputs=[user_id_input],
        outputs=[
            login_interface, 
            annotation_interface, 
            user_state, 
            login_status,
            prompt_display,
            response_a_display,
            response_b_display,
            progress_label
        ]
    )
    
    user_id_input.submit(
        fn=login,
        inputs=[user_id_input],
        outputs=[
            login_interface, 
            annotation_interface, 
            user_state, 
            login_status,
            prompt_display,
            response_a_display,
            response_b_display,
            progress_label
        ]
    )
    
    btn_a.click(
        fn=lambda user_id: annotate("a_better", user_id),
        inputs=[user_state],
        outputs=[
            prompt_display,
            response_a_display,
            response_b_display,
            progress_label,
            status_message
        ]
    )
    
    btn_b.click(
        fn=lambda user_id: annotate("b_better", user_id),
        inputs=[user_state],
        outputs=[
            prompt_display,
            response_a_display,
            response_b_display,
            progress_label,
            status_message
        ]
    )
    
    btn_equal.click(
        fn=lambda user_id: annotate("equal", user_id),
        inputs=[user_state],
        outputs=[
            prompt_display,
            response_a_display,
            response_b_display,
            progress_label,
            status_message
        ]
    )
    
    logout_btn.click(
        fn=logout,
        inputs=[],
        outputs=[
            login_interface,
            annotation_interface,
            user_state,
            login_status,
            prompt_display,
            response_a_display,
            response_b_display,
            progress_label
        ]
    )

if __name__ == "__main__":
    app.launch()