mihalykiss committed on
Commit
be9e2ba
·
verified ·
1 Parent(s): c967ac9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -20
app.py CHANGED
@@ -1,23 +1,32 @@
1
  import gradio as gr
2
- from transformers import DebertaTokenizer, DebertaForSequenceClassification, get_linear_schedule_with_warmup
3
- import torch
4
  import re
5
  from tokenizers import normalizers
6
  from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
7
- from tokenizers import Regex
8
 
9
- from transformers import DebertaTokenizerFast
10
 
 
 
 
11
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
12
 
13
- tokenizer = DebertaTokenizerFast.from_pretrained(
14
- "microsoft/deberta-base",
15
- add_prefix_space=True
16
- )
 
17
 
18
- model_2 = DebertaForSequenceClassification.from_pretrained("mihalykiss/best_merged_41_2", num_labels=41)
 
19
  model_2.to(device).eval()
20
 
 
 
 
 
 
21
 
22
  label_mapping = {
23
  0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
@@ -38,14 +47,14 @@ def clean_text(text: str) -> str:
38
  return text
39
 
40
 
41
- newline_to_space = Replace(Regex(r"\s*\n\s*"), " ")
42
- join_hyphen_break = Replace(Regex(r"(\w+)-\s*\n\s*(\w+)"), r"\1\2")
43
 
44
  tokenizer.backend_tokenizer.normalizer = Sequence([
45
- NFKC(),
46
- join_hyphen_break,
47
- newline_to_space,
48
- Strip()
49
  ])
50
 
51
  def classify_text(text):
@@ -58,13 +67,16 @@ def classify_text(text):
58
 
59
  inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True).to(device)
60
 
61
- with torch.no_grad():
62
-
63
  logits_2 = model_2(**inputs).logits
 
64
 
 
65
  softmax_2 = torch.softmax(logits_2, dim=1)
 
66
 
67
- averaged_probabilities = softmax_2
68
  probabilities = averaged_probabilities[0]
69
 
70
  ai_probs = probabilities.clone()
@@ -96,8 +108,7 @@ title = "AI Text Detector"
96
  description = """
97
 
98
 
99
- This tool uses a merged <b>DeBERTa</b> model to identify whether a given text was written by a human or generated by artificial intelligence (AI).
100
- <br>
101
 
102
  <div style="line-height: 1.8;">
103
  ✅ <b>Human Verification:</b> Human-written content is clearly marked.<br>
 
1
  import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch
 
3
  import re
4
  from tokenizers import normalizers
5
  from tokenizers.normalizers import Sequence, Replace, Strip, NFKC
6
+ from tokenizers import Regex
7
 
8
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
9
 
10
+ model1_path = "modernbert.bin"
11
+ model2_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
12
+ model3_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
13
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
14
 
15
+ tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
16
+
17
+ model_1 = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
18
+ model_1.load_state_dict(torch.load(model1_path, map_location=device))
19
+ model_1.to(device).eval()
20
 
21
+ model_2 = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
22
+ model_2.load_state_dict(torch.hub.load_state_dict_from_url(model2_path, map_location=device))
23
  model_2.to(device).eval()
24
 
25
+ model_3 = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
26
+ model_3.load_state_dict(torch.hub.load_state_dict_from_url(model3_path, map_location=device))
27
+ model_3.to(device).eval()
28
+
29
+
30
 
31
  label_mapping = {
32
  0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
 
47
  return text
48
 
49
 
50
+ newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
51
+ join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
52
 
53
  tokenizer.backend_tokenizer.normalizer = Sequence([
54
+ tokenizer.backend_tokenizer.normalizer,
55
+ join_hyphen_break,
56
+ newline_to_space,
57
+ Strip()
58
  ])
59
 
60
  def classify_text(text):
 
67
 
68
  inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True).to(device)
69
 
70
+ with torch.no_grad():
71
+ logits_1 = model_1(**inputs).logits
72
  logits_2 = model_2(**inputs).logits
73
+ logits_3 = model_3(**inputs).logits
74
 
75
+ softmax_1 = torch.softmax(logits_1, dim=1)
76
  softmax_2 = torch.softmax(logits_2, dim=1)
77
+ softmax_3 = torch.softmax(logits_3, dim=1)
78
 
79
+ averaged_probabilities = (softmax_1 + softmax_2 + softmax_3) / 3
80
  probabilities = averaged_probabilities[0]
81
 
82
  ai_probs = probabilities.clone()
 
108
  description = """
109
 
110
 
111
+ This tool uses the <b>ModernBERT</b> model to identify whether a given text was written by a human or generated by artificial intelligence (AI). It works with a soft voting ensemble using <b>three</b> models, combining their outputs to improve the accuracy.<br>
 
112
 
113
  <div style="line-height: 1.8;">
114
  ✅ <b>Human Verification:</b> Human-written content is clearly marked.<br>