add-push-functionality

#2
by burtenshaw (HF Staff) - opened
Files changed (6)
  1. .python-version +1 -0
  2. README.md +57 -1
  3. app.py +270 -79
  4. pyproject.toml +15 -0
  5. requirements.txt +5 -2
  6. uv.lock +0 -0
.python-version ADDED
@@ -0,0 +1 @@
+ 3.11
README.md CHANGED
@@ -9,6 +9,62 @@ app_file: app.py
  pinned: false
  license: mit
  short_description: Deduplicate HuggingFace datasets in seconds
+ hf_oauth: true
+ hf_oauth_scopes:
+ - write-repos
+ - manage-repos
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Semantic Text Deduplication Using SemHash
+
+ This Gradio application performs **semantic deduplication** on HuggingFace datasets using [SemHash](https://github.com/MinishLab/semhash) with [Model2Vec](https://github.com/MinishLab/model2vec) embeddings.
+
+ ## Features
+
+ - **Two deduplication modes**:
+ - **Single dataset**: Find and remove duplicates within one dataset
+ - **Cross-dataset**: Remove entries from Dataset 2 that are similar to entries in Dataset 1
+
+ - **Customizable similarity threshold**: Control how strict the deduplication should be (0.0 = very loose, 1.0 = exact matches only)
+
+ - **Detailed results**: View statistics and examples of found duplicates with word-level differences highlighted
+
+ - **Hub Integration**: 🆕 **Push deduplicated datasets directly to the Hugging Face Hub** after logging in
+
+ ## How to Use
+
+ ### 1. Choose Deduplication Type
+ - **Cross-dataset**: Useful for removing training data contamination from test sets
+ - **Single dataset**: Clean up duplicate entries within a single dataset
+
+ ### 2. Configure Datasets
+ - Enter the HuggingFace dataset names (e.g., `SetFit/amazon_massive_scenario_en-US`)
+ - Specify the dataset splits (e.g., `train`, `test`, `validation`)
+ - Set the text column name (usually `text`, `sentence`, or `content`)
+
+ ### 3. Set Similarity Threshold
+ - **0.9** (default): Good balance between precision and recall
+ - **Higher values** (0.95-0.99): More conservative, only removes very similar texts
+ - **Lower values** (0.7-0.85): More aggressive, may remove semantically similar but different texts
+
+ ### 4. Run Deduplication
+ Click **"Deduplicate"** to start the process. You'll see:
+ - Loading progress for datasets
+ - Deduplication progress
+ - Results with statistics and example duplicates
+
+ ### 5. Push to Hub (New!)
+ After deduplication completes:
+ 1. **Log in** with your Hugging Face account using the login button
+ 2. Enter a **dataset name** for your cleaned dataset
+ 3. Click **"Push to Hub"** to upload the deduplicated dataset
+
+ The dataset will be saved as `your-username/dataset-name` and be publicly available.
+
+ ## Notes
+
+ - The app preserves all original columns from the datasets
+ - Only the text similarity is used for deduplication decisions
+ - Deduplicated datasets maintain the same structure as the original
+ - OAuth login is required only for pushing to the Hub, not for deduplication
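To make the README's workflow concrete, here is a minimal sketch of the same single-dataset deduplication run locally with the libraries this PR depends on. The dataset, split, text column, and the `minishlab/potion-base-8M` encoder are illustrative choices taken from the README defaults, not values this PR pins:

```python
from datasets import load_dataset
from model2vec import StaticModel  # assumption: any Model2Vec encoder can be passed to SemHash
from semhash import SemHash

# Example inputs mirroring the README defaults (dataset, split, and text column are illustrative).
ds = load_dataset("SetFit/amazon_massive_scenario_en-US", split="train")
texts = [row["text"] for row in ds]

# Build the SemHash index and deduplicate within the dataset at the default 0.9 threshold.
model = StaticModel.from_pretrained("minishlab/potion-base-8M")  # example encoder, not required by this PR
semhash = SemHash.from_records(records=texts, model=model)
result = semhash.self_deduplicate(threshold=0.9)

# result.deduplicated holds the surviving texts; result.duplicates holds the near-duplicate records.
print(f"{len(texts)} documents -> {len(result.deduplicated)} unique, {len(result.duplicates)} duplicates")
```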
app.py CHANGED
@@ -1,6 +1,7 @@
  import gradio as gr
- from datasets import load_dataset
  from difflib import ndiff

  from semhash import SemHash
  from semhash.datamodels import DeduplicationResult
@@ -28,21 +29,30 @@ def display_word_differences(x: str, y: str) -> str:
  return f"```\n{formatted_diff}\n```"


- def load_dataset_texts(dataset_name: str, dataset_split: str, text_column: str) -> list[str]:
  """Load texts from a specified dataset split."""
  ds = load_dataset(dataset_name, split=dataset_split)
- return [example[text_column] for example in ds]


- def deduplicate_single_dataset(texts: list[str], threshold: float) -> DeduplicationResult:
- """Deduplicate within a single dataset using SemHash, treating each text as a raw string record."""
  # Build a SemHash index from the raw texts
  semhash = SemHash.from_records(records=texts, model=model)
  # Deduplicate the entire dataset
  return semhash.self_deduplicate(threshold=threshold)


- def deduplicate_two_datasets(texts1: list[str], texts2: list[str], threshold: float) -> DeduplicationResult:
  """Deduplicate dataset2 against dataset1, both as raw strings, using SemHash."""
  # Build SemHash index on dataset1
  semhash = SemHash.from_records(records=texts1, model=model)
@@ -50,6 +60,22 @@ def deduplicate_two_datasets(texts1: list[str], texts2: list[str], threshold: fl
  return semhash.deduplicate(records=texts2, threshold=threshold)


  def perform_deduplication(
  deduplication_type: str,
  dataset1_name: str,
@@ -59,7 +85,7 @@ def perform_deduplication(
  dataset2_split: str = "",
  dataset2_text_column: str = "",
  threshold: float = default_threshold,
- progress: gr.Progress = gr.Progress(track_tqdm=True)
  ):
  """
  Perform deduplication on one or two datasets using SemHash. This function
@@ -69,117 +95,225 @@ def perform_deduplication(
  threshold = float(threshold)

  # Load Dataset 1
- yield "Loading Dataset 1...", ""
- texts1 = load_dataset_texts(dataset1_name, dataset1_split, dataset1_text_column)

  if deduplication_type == "Single dataset":
  # Single-dataset deduplication
- yield "Deduplicating within Dataset 1 (SemHash)...", ""
  result = deduplicate_single_dataset(texts1, threshold=threshold)

- # Sort all duplicates in descending order of their highest score
  for duprec in result.duplicates:
- duprec.duplicates.sort(key=lambda x: x[1], reverse=True)

  # Summarize results
  num_duplicates = len(result.duplicates)
  deduplicated_count = len(result.deduplicated)
  total_docs = len(texts1)
-
- result_text = (
- f"**Total documents (Dataset 1):** {total_docs}\n\n"
- f"**Duplicates found:** {num_duplicates}\n\n"
- f"**Unique documents after deduplication:** {deduplicated_count}\n\n"
- + "-" * 50 + "\n\n"
- )

- # Show example duplicates
  if num_duplicates > 0:
- result_text += "**Example duplicates:**\n\n"
-
  # Only show duplicates that actually have near-duplicate records
- duplicates_with_data = [duprec for duprec in result.duplicates if duprec.duplicates]
  if duplicates_with_data:
  for duprec in duplicates_with_data[:5]:
  dup_text = duprec.record
  orig_text, score = duprec.duplicates[0]
- differences = display_word_differences(orig_text, dup_text)
- result_text += (
- f"**Original:**\n{orig_text}\n\n"
- f"**Duplicate:**\n{dup_text}\n\n"
- f"**Similarity Score:** {score:.4f}\n"
- f"**Differences:**\n{differences}\n"
- + "-" * 50 + "\n\n"
  )
- else:
- result_text += "No near-duplicate details available.\n\n"
- else:
- result_text += "No duplicates found."

- yield "Deduplication completed.", result_text

  else:
  # Cross-dataset deduplication
- yield "Loading Dataset 2...", ""
- texts2 = load_dataset_texts(dataset2_name, dataset2_split, dataset2_text_column)

- yield "Deduplicating Dataset 2 against Dataset 1 (SemHash)...", ""
  result = deduplicate_two_datasets(texts1, texts2, threshold=threshold)

- # Sort duplicates in descending order of their highest score
  for duprec in result.duplicates:
- duprec.duplicates.sort(key=lambda x: x[1], reverse=True)

  num_duplicates = len(result.duplicates)
  total_docs2 = len(texts2)
  deduplicated_count = len(result.deduplicated)

- result_text = (
- f"**Total documents in {dataset2_name}/{dataset2_split}:** {total_docs2}\n\n"
- f"**Duplicates found in Dataset 2:** {num_duplicates}\n\n"
- f"**Unique documents after deduplication:** {deduplicated_count}\n\n"
- + "-" * 50 + "\n\n"
- )
-
  if num_duplicates > 0:
- result_text += "**Example duplicates from Dataset 2:**\n\n"
-
- # Again, only show duplicates that actually have near-duplicate records
- duplicates_with_data = [duprec for duprec in result.duplicates if duprec.duplicates]
  if duplicates_with_data:
  for duprec in duplicates_with_data[:5]:
- dup_text = duprec.record # The "duplicate" text from dataset2
  orig_text, score = duprec.duplicates[0]
- differences = display_word_differences(orig_text, dup_text)
- result_text += (
- f"**Original (Dataset 1):**\n{orig_text}\n\n"
- f"**Duplicate (Dataset 2):**\n{dup_text}\n\n"
- f"**Similarity Score:** {score:.4f}\n"
- f"**Differences:**\n{differences}\n"
- + "-" * 50 + "\n\n"
  )
- else:
- result_text += "No near-duplicate details available.\n\n"
  else:
- result_text += "No duplicates found."

- yield "Deduplication completed.", result_text

  except Exception as e:
- yield f"An error occurred: {e}", ""
- raise e


  # --- Gradio App ---
- with gr.Blocks(theme=gr.themes.Ocean(), css="#status_output { height: 50px; overflow: auto; }") as demo:
- gr.Markdown("# Semantic Text Deduplication Using SemHash")
  gr.Markdown("""
  This demo showcases **semantic deduplication** using [SemHash](https://github.com/MinishLab/semhash) for HuggingFace datasets, using a [Model2Vec](https://github.com/MinishLab/model2vec) encoder.
  It can be used to identify duplicate texts within a **single dataset** or across **two datasets**.
  You can adjust the similarity threshold to control the strictness of the deduplication.

- **NOTE**: This demo runs on a free CPU backend, so it may be slow for large datasets.
- For faster results, please run the code locally.
  """)

  deduplication_type = gr.Radio(
@@ -190,28 +324,76 @@ with gr.Blocks(theme=gr.themes.Ocean(), css="#status_output { height: 50px; over

  with gr.Row():
  dataset1_name = gr.Textbox(value=default_dataset_name, label="Dataset 1 Name")
- dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
- dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")

  dataset2_inputs = gr.Column(visible=True)
  with dataset2_inputs:
  with gr.Row():
- dataset2_name = gr.Textbox(value=default_dataset_name, label="Dataset 2 Name")
- dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
- dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")

- threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")

  with gr.Row():
- compute_button = gr.Button("Deduplicate")

  status_output = gr.Markdown(elem_id="status_output")
- result_output = gr.Markdown()

  def update_visibility(choice: str):
  return gr.update(visible=(choice == "Cross-dataset"))

- deduplication_type.change(update_visibility, inputs=deduplication_type, outputs=dataset2_inputs)

  compute_button.click(
  fn=perform_deduplication,
@@ -225,7 +407,16 @@ with gr.Blocks(theme=gr.themes.Ocean(), css="#status_output { height: 50px; over
  dataset2_text_column,
  threshold,
  ],
- outputs=[status_output, result_output],
  )

  demo.launch()
 
  import gradio as gr
+ from datasets import load_dataset, Dataset
  from difflib import ndiff
+ import pandas as pd

  from semhash import SemHash
  from semhash.datamodels import DeduplicationResult

  return f"```\n{formatted_diff}\n```"


+ def load_dataset_texts(
+ dataset_name: str, dataset_split: str, text_column: str
+ ) -> tuple[list[str], Dataset]:
  """Load texts from a specified dataset split."""
  ds = load_dataset(dataset_name, split=dataset_split)
+ return [example[text_column] for example in ds], ds


+ def deduplicate_single_dataset(
+ texts: list[str], threshold: float
+ ) -> DeduplicationResult:
+ """
+ Deduplicate within a single dataset using SemHash, treating each text
+ as a raw string record.
+ """
  # Build a SemHash index from the raw texts
  semhash = SemHash.from_records(records=texts, model=model)
  # Deduplicate the entire dataset
  return semhash.self_deduplicate(threshold=threshold)


+ def deduplicate_two_datasets(
+ texts1: list[str], texts2: list[str], threshold: float
+ ) -> DeduplicationResult:
  """Deduplicate dataset2 against dataset1, both as raw strings, using SemHash."""
  # Build SemHash index on dataset1
  semhash = SemHash.from_records(records=texts1, model=model)

  return semhash.deduplicate(records=texts2, threshold=threshold)


+ def create_deduplicated_dataset(
+ original_dataset: Dataset, deduplicated_texts: list[str], text_column: str
+ ) -> Dataset:
+ """Create a new dataset with only the deduplicated texts."""
+ # Create a mapping from text to original row
+ text_to_row = {row[text_column]: row for row in original_dataset}
+
+ # Build new dataset with deduplicated texts
+ deduplicated_rows = []
+ for text in deduplicated_texts:
+ if text in text_to_row:
+ deduplicated_rows.append(text_to_row[text])
+
+ return Dataset.from_list(deduplicated_rows)
+
+
  def perform_deduplication(
  deduplication_type: str,
  dataset1_name: str,
 
  dataset2_split: str = "",
  dataset2_text_column: str = "",
  threshold: float = default_threshold,
+ progress: gr.Progress = gr.Progress(track_tqdm=True),
  ):
  """
  Perform deduplication on one or two datasets using SemHash. This function

  threshold = float(threshold)

  # Load Dataset 1
+ texts1, dataset1 = load_dataset_texts(
+ dataset1_name, dataset1_split, dataset1_text_column
+ )

  if deduplication_type == "Single dataset":
  # Single-dataset deduplication
  result = deduplicate_single_dataset(texts1, threshold=threshold)

+ # Sort all duplicates by score (ascending for least similar)
  for duprec in result.duplicates:
+ duprec.duplicates.sort(key=lambda x: x[1])
+
+ # Create deduplicated dataset
+ deduplicated_dataset = create_deduplicated_dataset(
+ dataset1, result.deduplicated, dataset1_text_column
+ )

  # Summarize results
  num_duplicates = len(result.duplicates)
  deduplicated_count = len(result.deduplicated)
  total_docs = len(texts1)

+ # Create examples table
+ examples_table = None
  if num_duplicates > 0:
  # Only show duplicates that actually have near-duplicate records
+ duplicates_with_data = [
+ duprec for duprec in result.duplicates if duprec.duplicates
+ ]
+
+ # sort duplicates by score (ascending for least similar)
+ for duprec in result.duplicates:
+ duprec.duplicates.sort(key=lambda x: x[1])
+
  if duplicates_with_data:
+ # Create table data for the 5 least similar examples
+ table_data = []
  for duprec in duplicates_with_data[:5]:
  dup_text = duprec.record
  orig_text, score = duprec.duplicates[0]
+ table_data.append(
+ [
+ orig_text[:200] + "..."
+ if len(orig_text) > 200
+ else orig_text,
+ dup_text[:200] + "..."
+ if len(dup_text) > 200
+ else dup_text,
+ f"{score:.4f}",
+ ]
  )

+ examples_table = pd.DataFrame(
+ table_data,
+ columns=["Original Text", "Duplicate Text", "Similarity Score"],
+ )
+
+ # Show success info with stats
+ gr.Info(
+ f"Deduplication completed! Found {num_duplicates} duplicates. "
+ f"Dataset reduced from {total_docs} to {deduplicated_count} unique documents."
+ )
+
+ # Return table with visibility update
+ if examples_table is not None and not examples_table.empty:
+ return deduplicated_dataset, gr.update(
+ visible=True, value=examples_table
+ )
+ else:
+ return deduplicated_dataset, gr.update(visible=False)

  else:
  # Cross-dataset deduplication
+ texts2, dataset2 = load_dataset_texts(
+ dataset2_name, dataset2_split, dataset2_text_column
+ )

  result = deduplicate_two_datasets(texts1, texts2, threshold=threshold)

+ # Sort duplicates by score (ascending for least similar)
  for duprec in result.duplicates:
+ duprec.duplicates.sort(key=lambda x: x[1])
+
+ # Create deduplicated dataset from dataset2
+ deduplicated_dataset = create_deduplicated_dataset(
+ dataset2, result.deduplicated, dataset2_text_column
+ )

  num_duplicates = len(result.duplicates)
  total_docs2 = len(texts2)
  deduplicated_count = len(result.deduplicated)

+ # Create examples table
+ examples_table = None
  if num_duplicates > 0:
+ # Again, only show duplicates that have records
+ duplicates_with_data = [
+ duprec for duprec in result.duplicates if duprec.duplicates
+ ]
  if duplicates_with_data:
+ # Create table data for the 5 least similar examples
+ table_data = []
  for duprec in duplicates_with_data[:5]:
+ dup_text = duprec.record
  orig_text, score = duprec.duplicates[0]
+ table_data.append(
+ [
+ orig_text[:200] + "..."
+ if len(orig_text) > 200
+ else orig_text,
+ dup_text[:200] + "..."
+ if len(dup_text) > 200
+ else dup_text,
+ f"{score:.4f}",
+ ]
  )
+
+ examples_table = pd.DataFrame(
+ table_data,
+ columns=[
+ "Original Text (Dataset 1)",
+ "Duplicate Text (Dataset 2)",
+ "Similarity Score",
+ ],
+ )
+
+ # Show success info with stats
+ gr.Info(
+ f"Deduplication completed! Found {num_duplicates} duplicates in Dataset 2. "
+ f"Dataset reduced from {total_docs2} to {deduplicated_count} unique documents."
+ )
+
+ # Return table with visibility update
+ if examples_table is not None and not examples_table.empty:
+ return deduplicated_dataset, gr.update(
+ visible=True, value=examples_table
+ )
  else:
+ return deduplicated_dataset, gr.update(visible=False)
+
+ except Exception as e:
+ gr.Error(f"An error occurred during deduplication: {str(e)}")
+ return None, gr.update(visible=False)
+
+
+ def push_to_hub(
+ deduplicated_dataset: Dataset,
+ output_dataset_name: str,
+ oauth_profile: gr.OAuthProfile | None,
+ oauth_token: gr.OAuthToken | None,
+ progress: gr.Progress = gr.Progress(),
+ ) -> str:
+ """Push the deduplicated dataset to Hugging Face Hub."""
+ if oauth_token is None:
+ raise gr.Error("Please log in with Hugging Face to push datasets to the Hub.")
+
+ if not output_dataset_name.strip():
+ raise gr.Error("Please provide a dataset name.")

+ if deduplicated_dataset is None:
+ raise gr.Error(
+ "No deduplicated dataset available. Please run deduplication first."
+ )
+
+ try:
+ progress(0.1, desc="Preparing dataset...")
+
+ # Determine the full dataset name (username/dataset_name)
+ username = oauth_profile.username if oauth_profile else None
+ if "/" not in output_dataset_name and username:
+ full_dataset_name = f"{username}/{output_dataset_name}"
+ else:
+ full_dataset_name = output_dataset_name
+
+ progress(0.3, desc="Pushing to Hub...")
+
+ # Push to hub using the OAuth token
+ deduplicated_dataset.push_to_hub(
+ full_dataset_name, token=oauth_token.token, private=False
+ )
+
+ progress(1.0, desc="Complete!")
+
+ gr.Info(
+ f"Successfully pushed deduplicated dataset with {len(deduplicated_dataset)} rows to the Hub!"
+ )
+
+ return (
+ f"✅ **Dataset published:** [{full_dataset_name}]"
+ f"(https://huggingface.co/datasets/{full_dataset_name})"
+ )

  except Exception as e:
+ raise gr.Error(f"Failed to push dataset to Hub: {str(e)}")
+
+
+ def get_user_info(oauth_profile: gr.OAuthProfile | None) -> str:
+ """Display user login status."""
+ if oauth_profile is None:
+ return "Not logged in. Please log in to push datasets to the Hub."
+ return f"Logged in as: **{oauth_profile.username}**"
+
+
+ def update_push_button_state(oauth_profile: gr.OAuthProfile | None):
+ """Update the push button state based on login status."""
+ is_logged_in = oauth_profile is not None
+ return gr.update(interactive=is_logged_in)


  # --- Gradio App ---
+ with gr.Blocks(
+ theme=gr.themes.Ocean(), css="#status_output { height: 50px; overflow: auto; }"
+ ) as demo:
+ gr.Markdown("# SemDedup-My-Dataset: Semantic Text Deduplication Using SemHash")
  gr.Markdown("""
  This demo showcases **semantic deduplication** using [SemHash](https://github.com/MinishLab/semhash) for HuggingFace datasets, using a [Model2Vec](https://github.com/MinishLab/model2vec) encoder.
  It can be used to identify duplicate texts within a **single dataset** or across **two datasets**.
  You can adjust the similarity threshold to control the strictness of the deduplication.

  """)

  deduplication_type = gr.Radio(

  with gr.Row():
  dataset1_name = gr.Textbox(value=default_dataset_name, label="Dataset 1 Name")
+ dataset1_split = gr.Textbox(
+ value=default_dataset1_split, label="Dataset 1 Split"
+ )
+ dataset1_text_column = gr.Textbox(
+ value=default_text_column, label="Text Column Name"
+ )

  dataset2_inputs = gr.Column(visible=True)
  with dataset2_inputs:
  with gr.Row():
+ dataset2_name = gr.Textbox(
+ value=default_dataset_name, label="Dataset 2 Name"
+ )
+ dataset2_split = gr.Textbox(
+ value=default_dataset2_split, label="Dataset 2 Split"
+ )
+ dataset2_text_column = gr.Textbox(
+ value=default_text_column, label="Text Column Name"
+ )

+ threshold = gr.Slider(
+ 0.0, 1.0, value=default_threshold, label="Similarity Threshold"
+ )

  with gr.Row():
+ compute_button = gr.Button("Deduplicate", variant="primary")

  status_output = gr.Markdown(elem_id="status_output")
+
+ # Examples table
+ examples_table = gr.Dataframe(
+ headers=["Original Text", "Duplicate Text", "Similarity Score"],
+ datatype=["str", "str", "str"],
+ )
+
+ # Hidden state to store the deduplicated dataset
+ deduplicated_dataset_state = gr.State()
+
+ # Output dataset configuration
+ gr.Markdown("## Push Deduplicated Dataset to Hub")
+ with gr.Row():
+ with gr.Column():
+ output_dataset_name = gr.Textbox(
+ label="Output Dataset Name",
+ placeholder="my-deduplicated-dataset",
+ info="Will be saved as username/dataset-name",
+ )
+ with gr.Column():
+ push_button = gr.Button(
+ "Push to Hub", variant="secondary", interactive=False
+ )
+ login_button = gr.LoginButton()
+
+ # Login section - moved below push to hub
+ with gr.Row():
+ user_info = gr.Markdown()
+ push_output = gr.Markdown()

  def update_visibility(choice: str):
  return gr.update(visible=(choice == "Cross-dataset"))

+ deduplication_type.change(
+ update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
+ )
+
+ # Update user info and button state when page loads or login status changes
+ demo.load(get_user_info, inputs=None, outputs=user_info)
+ demo.load(update_push_button_state, inputs=None, outputs=push_button)
+ login_button.click(get_user_info, inputs=None, outputs=user_info)
+ login_button.click(update_push_button_state, inputs=None, outputs=push_button)

  compute_button.click(
  fn=perform_deduplication,

  dataset2_text_column,
  threshold,
  ],
+ outputs=[deduplicated_dataset_state, examples_table],
+ )
+
+ push_button.click(
+ fn=push_to_hub,
+ inputs=[
+ deduplicated_dataset_state,
+ output_dataset_name,
+ ],
+ outputs=push_output,
+ )

  demo.launch()
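Outside the Space, the push step added above reduces to the same `datasets` calls used in `push_to_hub()`. A hedged sketch without the Gradio/OAuth wiring; the repo id, token, and sample rows below are placeholders, not values from this PR:

```python
from datasets import Dataset

# Suppose the deduplicated rows were already rebuilt, as create_deduplicated_dataset() does in app.py.
deduplicated_rows = [{"text": "example text a"}, {"text": "example text b"}]
deduplicated_dataset = Dataset.from_list(deduplicated_rows)

# Without OAuth, a write-scoped user token is passed explicitly (or picked up from a prior
# `huggingface-cli login`); "username/my-deduplicated-dataset" is a placeholder repo id.
deduplicated_dataset.push_to_hub(
    "username/my-deduplicated-dataset",
    token="hf_...",  # placeholder; the Space uses oauth_token.token instead
    private=False,
)
```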
pyproject.toml ADDED
@@ -0,0 +1,15 @@
+ [project]
+ name = "semantic-deduplication"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.11"
+ dependencies = [
+ "datasets>=3.6.0",
+ "gradio[oauth]>=5.32.1",
+ "huggingface-hub>=0.32.3",
+ "model2vec>=0.5.0",
+ "numpy>=2.2.6",
+ "semhash>=0.3.0",
+ "tqdm>=4.67.1",
+ ]
requirements.txt CHANGED
@@ -1,5 +1,8 @@
- semhash>=0.2.0
- numpy
+ gradio
  datasets
+ semhash
+ model2vec
+ huggingface_hub
+ numpy
  tqdm

uv.lock ADDED
The diff for this file is too large to render. See raw diff