Switched default dataset
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ from semhash.datamodels import DeduplicationResult
|
|
| 8 |
from model2vec import StaticModel
|
| 9 |
|
| 10 |
# Default parameters
|
| 11 |
-
default_dataset_name = "
|
| 12 |
default_dataset1_split = "train"
|
| 13 |
default_dataset2_split = "test"
|
| 14 |
default_text_column = "text"
|
|
@@ -96,9 +96,12 @@ def perform_deduplication(
|
|
| 96 |
# Show example duplicates
|
| 97 |
if num_duplicates > 0:
|
| 98 |
result_text += "**Example duplicates:**\n\n"
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
| 102 |
orig_text, score = duprec.duplicates[0]
|
| 103 |
differences = display_word_differences(orig_text, dup_text)
|
| 104 |
result_text += (
|
|
@@ -108,13 +111,8 @@ def perform_deduplication(
|
|
| 108 |
f"**Differences:**\n{differences}\n"
|
| 109 |
+ "-" * 50 + "\n\n"
|
| 110 |
)
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
result_text += (
|
| 114 |
-
f"**Duplicate:**\n{dup_text}\n\n"
|
| 115 |
-
"No near-duplicate details available.\n"
|
| 116 |
-
+ "-" * 50 + "\n\n"
|
| 117 |
-
)
|
| 118 |
else:
|
| 119 |
result_text += "No duplicates found."
|
| 120 |
|
|
@@ -145,9 +143,12 @@ def perform_deduplication(
|
|
| 145 |
|
| 146 |
if num_duplicates > 0:
|
| 147 |
result_text += "**Example duplicates from Dataset 2:**\n\n"
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
| 151 |
orig_text, score = duprec.duplicates[0]
|
| 152 |
differences = display_word_differences(orig_text, dup_text)
|
| 153 |
result_text += (
|
|
@@ -157,12 +158,8 @@ def perform_deduplication(
|
|
| 157 |
f"**Differences:**\n{differences}\n"
|
| 158 |
+ "-" * 50 + "\n\n"
|
| 159 |
)
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
f"**Potential Duplicate (Dataset 2):**\n{dup_text}\n\n"
|
| 163 |
-
"No near-duplicate details available.\n"
|
| 164 |
-
+ "-" * 50 + "\n\n"
|
| 165 |
-
)
|
| 166 |
else:
|
| 167 |
result_text += "No duplicates found."
|
| 168 |
|
|
@@ -232,4 +229,3 @@ with gr.Blocks(theme=gr.themes.Ocean(), css="#status_output { height: 50px; over
|
|
| 232 |
)
|
| 233 |
|
| 234 |
demo.launch()
|
| 235 |
-
|
|
|
|
| 8 |
from model2vec import StaticModel
|
| 9 |
|
| 10 |
# Default parameters
|
| 11 |
+
default_dataset_name = "SetFit/amazon_massive_scenario_en-US"
|
| 12 |
default_dataset1_split = "train"
|
| 13 |
default_dataset2_split = "test"
|
| 14 |
default_text_column = "text"
|
|
|
|
| 96 |
# Show example duplicates
|
| 97 |
if num_duplicates > 0:
|
| 98 |
result_text += "**Example duplicates:**\n\n"
|
| 99 |
+
|
| 100 |
+
# Only show duplicates that actually have near-duplicate records
|
| 101 |
+
duplicates_with_data = [duprec for duprec in result.duplicates if duprec.duplicates]
|
| 102 |
+
if duplicates_with_data:
|
| 103 |
+
for duprec in duplicates_with_data[:5]:
|
| 104 |
+
dup_text = duprec.record
|
| 105 |
orig_text, score = duprec.duplicates[0]
|
| 106 |
differences = display_word_differences(orig_text, dup_text)
|
| 107 |
result_text += (
|
|
|
|
| 111 |
f"**Differences:**\n{differences}\n"
|
| 112 |
+ "-" * 50 + "\n\n"
|
| 113 |
)
|
| 114 |
+
else:
|
| 115 |
+
result_text += "No near-duplicate details available.\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
else:
|
| 117 |
result_text += "No duplicates found."
|
| 118 |
|
|
|
|
| 143 |
|
| 144 |
if num_duplicates > 0:
|
| 145 |
result_text += "**Example duplicates from Dataset 2:**\n\n"
|
| 146 |
+
|
| 147 |
+
# Again, only show duplicates that actually have near-duplicate records
|
| 148 |
+
duplicates_with_data = [duprec for duprec in result.duplicates if duprec.duplicates]
|
| 149 |
+
if duplicates_with_data:
|
| 150 |
+
for duprec in duplicates_with_data[:5]:
|
| 151 |
+
dup_text = duprec.record # The "duplicate" text from dataset2
|
| 152 |
orig_text, score = duprec.duplicates[0]
|
| 153 |
differences = display_word_differences(orig_text, dup_text)
|
| 154 |
result_text += (
|
|
|
|
| 158 |
f"**Differences:**\n{differences}\n"
|
| 159 |
+ "-" * 50 + "\n\n"
|
| 160 |
)
|
| 161 |
+
else:
|
| 162 |
+
result_text += "No near-duplicate details available.\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
else:
|
| 164 |
result_text += "No duplicates found."
|
| 165 |
|
|
|
|
| 229 |
)
|
| 230 |
|
| 231 |
demo.launch()
|
|
|