add-push-functionality #2
by burtenshaw (HF Staff) · opened

Files changed:
- .python-version (+1, -0)
- README.md (+57, -1)
- app.py (+270, -79)
- pyproject.toml (+15, -0)
- requirements.txt (+5, -2)
- uv.lock (+0, -0)
.python-version (ADDED)

@@ -0,0 +1 @@
+3.11
README.md (CHANGED)

@@ -9,6 +9,62 @@ app_file: app.py
 pinned: false
 license: mit
 short_description: Deduplicate HuggingFace datasets in seconds
+hf_oauth: true
+hf_oauth_scopes:
+  - write-repos
+  - manage-repos
 ---

+# Semantic Text Deduplication Using SemHash
+
+This Gradio application performs **semantic deduplication** on HuggingFace datasets using [SemHash](https://github.com/MinishLab/semhash) with [Model2Vec](https://github.com/MinishLab/model2vec) embeddings.
+
+## Features
+
+- **Two deduplication modes**:
+  - **Single dataset**: Find and remove duplicates within one dataset
+  - **Cross-dataset**: Remove entries from Dataset 2 that are similar to entries in Dataset 1
+
+- **Customizable similarity threshold**: Control how strict the deduplication should be (0.0 = very loose, 1.0 = exact matches only)
+
+- **Detailed results**: View statistics and examples of found duplicates, with word-level differences highlighted
+
+- **Hub Integration**: 🆕 **Push deduplicated datasets directly to the Hugging Face Hub** after logging in
+
+## How to Use
+
+### 1. Choose Deduplication Type
+- **Cross-dataset**: Useful for removing training-data contamination from test sets
+- **Single dataset**: Clean up duplicate entries within a single dataset
+
+### 2. Configure Datasets
+- Enter the HuggingFace dataset names (e.g., `SetFit/amazon_massive_scenario_en-US`)
+- Specify the dataset splits (e.g., `train`, `test`, `validation`)
+- Set the text column name (usually `text`, `sentence`, or `content`)
+
+### 3. Set Similarity Threshold
+- **0.9** (default): A good balance between precision and recall
+- **Higher values** (0.95-0.99): More conservative; only removes very similar texts
+- **Lower values** (0.7-0.85): More aggressive; may also remove texts that are semantically similar but not true duplicates
+
+### 4. Run Deduplication
+Click **"Deduplicate"** to start the process. You'll see:
+- Loading progress for datasets
+- Deduplication progress
+- Results with statistics and example duplicates
+
+### 5. Push to Hub (New!)
+After deduplication completes:
+1. **Log in** with your Hugging Face account using the login button
+2. Enter a **dataset name** for your cleaned dataset
+3. Click **"Push to Hub"** to upload the deduplicated dataset
+
+The dataset will be saved as `your-username/dataset-name` and will be publicly available.
+
+## Notes
+
+- The app preserves all original columns from the datasets
+- Only text similarity is used for deduplication decisions
+- Deduplicated datasets maintain the same structure as the originals
+- OAuth login is required only for pushing to the Hub, not for deduplication
app.py (CHANGED)

@@ -1,6 +1,7 @@
 import gradio as gr
-from datasets import load_dataset
+from datasets import load_dataset, Dataset
 from difflib import ndiff
+import pandas as pd

 from semhash import SemHash
 from semhash.datamodels import DeduplicationResult
@@ -28,21 +29,30 @@ def display_word_differences(x: str, y: str) -> str:
     return f"```\n{formatted_diff}\n```"


+def load_dataset_texts(
+    dataset_name: str, dataset_split: str, text_column: str
+) -> tuple[list[str], Dataset]:
     """Load texts from a specified dataset split."""
     ds = load_dataset(dataset_name, split=dataset_split)
-    return [example[text_column] for example in ds]
+    return [example[text_column] for example in ds], ds


+def deduplicate_single_dataset(
+    texts: list[str], threshold: float
+) -> DeduplicationResult:
+    """
+    Deduplicate within a single dataset using SemHash, treating each text
+    as a raw string record.
+    """
     # Build a SemHash index from the raw texts
     semhash = SemHash.from_records(records=texts, model=model)
     # Deduplicate the entire dataset
     return semhash.self_deduplicate(threshold=threshold)


+def deduplicate_two_datasets(
+    texts1: list[str], texts2: list[str], threshold: float
+) -> DeduplicationResult:
     """Deduplicate dataset2 against dataset1, both as raw strings, using SemHash."""
     # Build SemHash index on dataset1
     semhash = SemHash.from_records(records=texts1, model=model)
@@ -50,6 +60,22 @@ def deduplicate_two_datasets(texts1: list[str], texts2: list[str], threshold: fl
     return semhash.deduplicate(records=texts2, threshold=threshold)


+def create_deduplicated_dataset(
+    original_dataset: Dataset, deduplicated_texts: list[str], text_column: str
+) -> Dataset:
+    """Create a new dataset with only the deduplicated texts."""
+    # Create a mapping from text to original row
+    text_to_row = {row[text_column]: row for row in original_dataset}
+
+    # Build new dataset with deduplicated texts
+    deduplicated_rows = []
+    for text in deduplicated_texts:
+        if text in text_to_row:
+            deduplicated_rows.append(text_to_row[text])
+
+    return Dataset.from_list(deduplicated_rows)
+
+
 def perform_deduplication(
     deduplication_type: str,
     dataset1_name: str,
@@ -59,7 +85,7 @@ def perform_deduplication(
     dataset2_split: str = "",
     dataset2_text_column: str = "",
     threshold: float = default_threshold,
-    progress: gr.Progress = gr.Progress(track_tqdm=True)
+    progress: gr.Progress = gr.Progress(track_tqdm=True),
 ):
     """
     Perform deduplication on one or two datasets using SemHash. This function
@@ -69,117 +95,225 @@ def perform_deduplication(
         threshold = float(threshold)

         # Load Dataset 1
+        texts1, dataset1 = load_dataset_texts(
+            dataset1_name, dataset1_split, dataset1_text_column
+        )

         if deduplication_type == "Single dataset":
             # Single-dataset deduplication
-            yield "Deduplicating within Dataset 1 (SemHash)...", ""
             result = deduplicate_single_dataset(texts1, threshold=threshold)

-            # Sort all duplicates
+            # Sort all duplicates by score (ascending for least similar)
             for duprec in result.duplicates:
+                duprec.duplicates.sort(key=lambda x: x[1])
+
+            # Create deduplicated dataset
+            deduplicated_dataset = create_deduplicated_dataset(
+                dataset1, result.deduplicated, dataset1_text_column
+            )

             # Summarize results
             num_duplicates = len(result.duplicates)
             deduplicated_count = len(result.deduplicated)
             total_docs = len(texts1)
-            result_text = (
-                f"**Total documents (Dataset 1):** {total_docs}\n\n"
-                f"**Duplicates found:** {num_duplicates}\n\n"
-                f"**Unique documents after deduplication:** {deduplicated_count}\n\n"
-                + "-" * 50 + "\n\n"
-            )

+            # Create examples table
+            examples_table = None
             if num_duplicates > 0:
-                result_text += "**Example duplicates:**\n\n"
                 # Only show duplicates that actually have near-duplicate records
+                duplicates_with_data = [
+                    duprec for duprec in result.duplicates if duprec.duplicates
+                ]
+
+                # sort duplicates by score (ascending for least similar)
+                for duprec in result.duplicates:
+                    duprec.duplicates.sort(key=lambda x: x[1])
+
                 if duplicates_with_data:
+                    # Create table data for the 5 least similar examples
+                    table_data = []
                     for duprec in duplicates_with_data[:5]:
                         dup_text = duprec.record
                         orig_text, score = duprec.duplicates[0]
+                        table_data.append(
+                            [
+                                orig_text[:200] + "..."
+                                if len(orig_text) > 200
+                                else orig_text,
+                                dup_text[:200] + "..."
+                                if len(dup_text) > 200
+                                else dup_text,
+                                f"{score:.4f}",
+                            ]
                         )
-                else:
-                    result_text += "No near-duplicate details available.\n\n"
-            else:
-                result_text += "No duplicates found."

+                    examples_table = pd.DataFrame(
+                        table_data,
+                        columns=["Original Text", "Duplicate Text", "Similarity Score"],
+                    )
+
+            # Show success info with stats
+            gr.Info(
+                f"Deduplication completed! Found {num_duplicates} duplicates. "
+                f"Dataset reduced from {total_docs} to {deduplicated_count} unique documents."
+            )
+
+            # Return table with visibility update
+            if examples_table is not None and not examples_table.empty:
+                return deduplicated_dataset, gr.update(
+                    visible=True, value=examples_table
+                )
+            else:
+                return deduplicated_dataset, gr.update(visible=False)

         else:
             # Cross-dataset deduplication
+            texts2, dataset2 = load_dataset_texts(
+                dataset2_name, dataset2_split, dataset2_text_column
+            )

-            yield "Deduplicating Dataset 2 against Dataset 1 (SemHash)...", ""
             result = deduplicate_two_datasets(texts1, texts2, threshold=threshold)

-            # Sort duplicates
+            # Sort duplicates by score (ascending for least similar)
             for duprec in result.duplicates:
+                duprec.duplicates.sort(key=lambda x: x[1])
+
+            # Create deduplicated dataset from dataset2
+            deduplicated_dataset = create_deduplicated_dataset(
+                dataset2, result.deduplicated, dataset2_text_column
+            )

             num_duplicates = len(result.duplicates)
             total_docs2 = len(texts2)
             deduplicated_count = len(result.deduplicated)
-                f"**Duplicates found in Dataset 2:** {num_duplicates}\n\n"
-                f"**Unique documents after deduplication:** {deduplicated_count}\n\n"
-                + "-" * 50 + "\n\n"
-            )

+            # Create examples table
+            examples_table = None
             if num_duplicates > 0:
+                # Again, only show duplicates that have records
+                duplicates_with_data = [
+                    duprec for duprec in result.duplicates if duprec.duplicates
+                ]
                 if duplicates_with_data:
+                    # Create table data for the 5 least similar examples
+                    table_data = []
                     for duprec in duplicates_with_data[:5]:
+                        dup_text = duprec.record
                         orig_text, score = duprec.duplicates[0]
+                        table_data.append(
+                            [
+                                orig_text[:200] + "..."
+                                if len(orig_text) > 200
+                                else orig_text,
+                                dup_text[:200] + "..."
+                                if len(dup_text) > 200
+                                else dup_text,
+                                f"{score:.4f}",
+                            ]
                         )
+
+                    examples_table = pd.DataFrame(
+                        table_data,
+                        columns=[
+                            "Original Text (Dataset 1)",
+                            "Duplicate Text (Dataset 2)",
+                            "Similarity Score",
+                        ],
+                    )
+
+            # Show success info with stats
+            gr.Info(
+                f"Deduplication completed! Found {num_duplicates} duplicates in Dataset 2. "
+                f"Dataset reduced from {total_docs2} to {deduplicated_count} unique documents."
+            )
+
+            # Return table with visibility update
+            if examples_table is not None and not examples_table.empty:
+                return deduplicated_dataset, gr.update(
+                    visible=True, value=examples_table
+                )
             else:
+                return deduplicated_dataset, gr.update(visible=False)
+
+    except Exception as e:
+        gr.Error(f"An error occurred during deduplication: {str(e)}")
+        return None, gr.update(visible=False)
+
+
+def push_to_hub(
+    deduplicated_dataset: Dataset,
+    output_dataset_name: str,
+    oauth_profile: gr.OAuthProfile | None,
+    oauth_token: gr.OAuthToken | None,
+    progress: gr.Progress = gr.Progress(),
+) -> str:
+    """Push the deduplicated dataset to Hugging Face Hub."""
+    if oauth_token is None:
+        raise gr.Error("Please log in with Hugging Face to push datasets to the Hub.")
+
+    if not output_dataset_name.strip():
+        raise gr.Error("Please provide a dataset name.")

+    if deduplicated_dataset is None:
+        raise gr.Error(
+            "No deduplicated dataset available. Please run deduplication first."
+        )
+
+    try:
+        progress(0.1, desc="Preparing dataset...")
+
+        # Determine the full dataset name (username/dataset_name)
+        username = oauth_profile.username if oauth_profile else None
+        if "/" not in output_dataset_name and username:
+            full_dataset_name = f"{username}/{output_dataset_name}"
+        else:
+            full_dataset_name = output_dataset_name
+
+        progress(0.3, desc="Pushing to Hub...")
+
+        # Push to hub using the OAuth token
+        deduplicated_dataset.push_to_hub(
+            full_dataset_name, token=oauth_token.token, private=False
+        )
+
+        progress(1.0, desc="Complete!")
+
+        gr.Info(
+            f"Successfully pushed deduplicated dataset with {len(deduplicated_dataset)} rows to the Hub!"
+        )
+
+        return (
+            f"✅ **Dataset published:** [{full_dataset_name}]"
+            f"(https://huggingface.co/datasets/{full_dataset_name})"
+        )

     except Exception as e:
+        raise gr.Error(f"Failed to push dataset to Hub: {str(e)}")
+
+
+def get_user_info(oauth_profile: gr.OAuthProfile | None) -> str:
+    """Display user login status."""
+    if oauth_profile is None:
+        return "Not logged in. Please log in to push datasets to the Hub."
+    return f"Logged in as: **{oauth_profile.username}**"
+
+
+def update_push_button_state(oauth_profile: gr.OAuthProfile | None):
+    """Update the push button state based on login status."""
+    is_logged_in = oauth_profile is not None
+    return gr.update(interactive=is_logged_in)


 # --- Gradio App ---
+with gr.Blocks(
+    theme=gr.themes.Ocean(), css="#status_output { height: 50px; overflow: auto; }"
+) as demo:
+    gr.Markdown("# SemDedup-My-Dataset: Semantic Text Deduplication Using SemHash")
     gr.Markdown("""
     This demo showcases **semantic deduplication** using [SemHash](https://github.com/MinishLab/semhash) for HuggingFace datasets, using a [Model2Vec](https://github.com/MinishLab/model2vec) encoder.
     It can be used to identify duplicate texts within a **single dataset** or across **two datasets**.
     You can adjust the similarity threshold to control the strictness of the deduplication.

-    **NOTE**: This demo runs on a free CPU backend, so it may be slow for large datasets.
-    For faster results, please run the code locally.
     """)
@@ -190,28 +324,76 @@ with gr.Blocks(theme=gr.themes.Ocean(), css="#status_output { height: 50px; over

     with gr.Row():
         dataset1_name = gr.Textbox(value=default_dataset_name, label="Dataset 1 Name")
+        dataset1_split = gr.Textbox(
+            value=default_dataset1_split, label="Dataset 1 Split"
+        )
+        dataset1_text_column = gr.Textbox(
+            value=default_text_column, label="Text Column Name"
+        )

     dataset2_inputs = gr.Column(visible=True)
     with dataset2_inputs:
         with gr.Row():
+            dataset2_name = gr.Textbox(
+                value=default_dataset_name, label="Dataset 2 Name"
+            )
+            dataset2_split = gr.Textbox(
+                value=default_dataset2_split, label="Dataset 2 Split"
+            )
+            dataset2_text_column = gr.Textbox(
+                value=default_text_column, label="Text Column Name"
+            )

+    threshold = gr.Slider(
+        0.0, 1.0, value=default_threshold, label="Similarity Threshold"
+    )

     with gr.Row():
+        compute_button = gr.Button("Deduplicate", variant="primary")

     status_output = gr.Markdown(elem_id="status_output")
+
+    # Examples table
+    examples_table = gr.Dataframe(
+        headers=["Original Text", "Duplicate Text", "Similarity Score"],
+        datatype=["str", "str", "str"],
+    )
+
+    # Hidden state to store the deduplicated dataset
+    deduplicated_dataset_state = gr.State()
+
+    # Output dataset configuration
+    gr.Markdown("## Push Deduplicated Dataset to Hub")
+    with gr.Row():
+        with gr.Column():
+            output_dataset_name = gr.Textbox(
+                label="Output Dataset Name",
+                placeholder="my-deduplicated-dataset",
+                info="Will be saved as username/dataset-name",
+            )
+        with gr.Column():
+            push_button = gr.Button(
+                "Push to Hub", variant="secondary", interactive=False
+            )
+            login_button = gr.LoginButton()
+
+    # Login section - moved below push to hub
+    with gr.Row():
+        user_info = gr.Markdown()
+        push_output = gr.Markdown()

     def update_visibility(choice: str):
         return gr.update(visible=(choice == "Cross-dataset"))

+    deduplication_type.change(
+        update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
+    )
+
+    # Update user info and button state when page loads or login status changes
+    demo.load(get_user_info, inputs=None, outputs=user_info)
+    demo.load(update_push_button_state, inputs=None, outputs=push_button)
+    login_button.click(get_user_info, inputs=None, outputs=user_info)
+    login_button.click(update_push_button_state, inputs=None, outputs=push_button)

     compute_button.click(
         fn=perform_deduplication,
@@ -225,7 +407,16 @@ with gr.Blocks(theme=gr.themes.Ocean(), css="#status_output { height: 50px; over
             dataset2_text_column,
             threshold,
         ],
+        outputs=[deduplicated_dataset_state, examples_table],
+    )
+
+    push_button.click(
+        fn=push_to_hub,
+        inputs=[
+            deduplicated_dataset_state,
+            output_dataset_name,
+        ],
+        outputs=push_output,
     )

 demo.launch()
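Outside the OAuth flow, the push step this PR adds reduces to the standard `datasets` API. A minimal sketch, assuming a write-scoped personal token in the `HF_TOKEN` environment variable in place of the `gr.OAuthToken` the Space receives; the repo id below is hypothetical:

```python
# Minimal sketch of the push step without Gradio OAuth.
# Assumptions: HF_TOKEN holds a write-scoped personal token, and the
# repo id below is hypothetical.
import os
from datasets import Dataset

deduplicated_dataset = Dataset.from_list(
    [{"text": "first unique example"}, {"text": "second unique example"}]
)

# Same call push_to_hub() in app.py makes, with a personal token standing
# in for oauth_token.token.
deduplicated_dataset.push_to_hub(
    "your-username/my-deduplicated-dataset",
    token=os.environ["HF_TOKEN"],
    private=False,
)
```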
pyproject.toml (ADDED)

@@ -0,0 +1,15 @@
+[project]
+name = "semantic-deduplication"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "datasets>=3.6.0",
+    "gradio[oauth]>=5.32.1",
+    "huggingface-hub>=0.32.3",
+    "model2vec>=0.5.0",
+    "numpy>=2.2.6",
+    "semhash>=0.3.0",
+    "tqdm>=4.67.1",
+]
requirements.txt (CHANGED)

@@ -1,5 +1,8 @@
-numpy
+gradio
 datasets
+semhash
+model2vec
+huggingface_hub
+numpy
 tqdm
uv.lock (ADDED)

The diff for this file is too large to render. See raw diff.