Spaces:

SzegedAI
/

AI_Detector

Running

App Files Files Community

mihalykiss committed 9 days ago

Commit

be9e2ba

verified ·

1 Parent(s): c967ac9

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -20

app.py CHANGED Viewed

@@ -1,23 +1,32 @@
 import gradio as gr
-from transformers import DebertaTokenizer, DebertaForSequenceClassification, get_linear_schedule_with_warmup
-import torch
 import re
 from tokenizers import normalizers
 from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
-from tokenizers import Regex
-from transformers import DebertaTokenizerFast
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-tokenizer = DebertaTokenizerFast.from_pretrained(
-    "microsoft/deberta-base",
-    add_prefix_space=True
-)
-model_2 = DebertaForSequenceClassification.from_pretrained("mihalykiss/best_merged_41_2", num_labels=41)
 model_2.to(device).eval()
 label_mapping = {
     0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
@@ -38,14 +47,14 @@ def clean_text(text: str) -> str:
     return text
-newline_to_space  = Replace(Regex(r"\s*\n\s*"), " ")
-join_hyphen_break = Replace(Regex(r"(\w+)-\s*\n\s*(\w+)"), r"\1\2")
 tokenizer.backend_tokenizer.normalizer = Sequence([
-        NFKC(),
-        join_hyphen_break,
-        newline_to_space,
-        Strip()
 ])
 def classify_text(text):
@@ -58,13 +67,16 @@ def classify_text(text):
     inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True).to(device)
-    with torch.no_grad():
         logits_2 = model_2(**inputs).logits
         softmax_2 = torch.softmax(logits_2, dim=1)
-        averaged_probabilities = softmax_2
         probabilities = averaged_probabilities[0]
     ai_probs = probabilities.clone()
@@ -96,8 +108,7 @@ title = "AI Text Detector"
 description = """
-This tool uses a merged <b>DeBERTa</b> model to identify whether a given text was written by a human or generated by artificial intelligence (AI).
-<br>
 <div style="line-height: 1.8;">
 ✅ <b>Human Verification:</b> Human-written content is clearly marked.<br>

 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
 import re
 from tokenizers import normalizers
 from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
+from tokenizers import Regex
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model1_path = "modernbert.bin"
+model2_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
+model3_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
+model_1 = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
+model_1.load_state_dict(torch.load(model1_path, map_location=device))
+model_1.to(device).eval()
+model_2 = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
+model_2.load_state_dict(torch.hub.load_state_dict_from_url(model2_path, map_location=device))
 model_2.to(device).eval()
+model_3 = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
+model_3.load_state_dict(torch.hub.load_state_dict_from_url(model3_path, map_location=device))
+model_3.to(device).eval()
 label_mapping = {
     0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
     return text
+newline_to_space  = Replace(Regex(r'\s*\n\s*'), " ")
+join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
 tokenizer.backend_tokenizer.normalizer = Sequence([
+    tokenizer.backend_tokenizer.normalizer,
+    join_hyphen_break,
+    newline_to_space,
+    Strip()
 ])
 def classify_text(text):
     inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True).to(device)
+    with torch.no_grad():
+        logits_1 = model_1(**inputs).logits
         logits_2 = model_2(**inputs).logits
+        logits_3 = model_3(**inputs).logits
+        softmax_1 = torch.softmax(logits_1, dim=1)
         softmax_2 = torch.softmax(logits_2, dim=1)
+        softmax_3 = torch.softmax(logits_3, dim=1)
+        averaged_probabilities = (softmax_1 + softmax_2 + softmax_3) / 3
         probabilities = averaged_probabilities[0]
     ai_probs = probabilities.clone()
 description = """
+This tool uses the <b>ModernBERT</b> model to identify whether a given text was written by a human or generated by artificial intelligence (AI). It works with a soft voting ensemble using <b>three</b> models, combining their outputs to improve the accuracy.<br>
 <div style="line-height: 1.8;">
 ✅ <b>Human Verification:</b> Human-written content is clearly marked.<br>