test / app.py
mihalykiss's picture
Update app.py
cd7ef38 verified
raw
history blame
21.8 kB
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
from tokenizers import normalizers # For isinstance check
from tokenizers.normalizers import Sequence, Replace, Strip
from tokenizers import Regex # For the Regex class
import os
# --- Model & Tokenizer Configuration ---
# Weight files for the three ensemble members. Each one is downloaded and
# loaded into its own ModernBERT-base sequence classifier below.
model1_path = "https://huggingface.co/spaces/SzegedAI/AI_Detector/resolve/main/modernbert.bin"
model2_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
model3_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


def _load_ensemble_member(weights_url):
    """Build one 41-class ModernBERT classifier and load its weights from ``weights_url``.

    The state dict is fetched with ``torch.hub.load_state_dict_from_url`` and
    mapped onto ``device``; the model is then moved to ``device`` and switched
    to eval mode.

    Returns:
        The ready-to-use model.

    Raises:
        Whatever ``from_pretrained``, the download, or ``load_state_dict``
        raises; the caller below converts any failure into "models unavailable".
    """
    model = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
    model.load_state_dict(torch.hub.load_state_dict_from_url(weights_url, map_location=device, progress=True))
    model.to(device).eval()
    return model


try:
    tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
    model_1 = _load_ensemble_member(model1_path)
    model_2 = _load_ensemble_member(model2_path)
    model_3 = _load_ensemble_member(model3_path)
except Exception as e:
    # Deliberate best effort: keep the module importable so the UI can start
    # and report the failure instead of crashing at import time. Everything is
    # reset together so a partially loaded ensemble is never used.
    print(f"Error during model loading: {e}")
    tokenizer = None
    model_1, model_2, model_3 = None, None, None
# Class index -> source name. The names are listed in class-index order, so
# enumerate() assigns 0..40; 'human' (index 24) is the only non-model entry.
label_mapping = dict(enumerate((
    '13B', '30B', '65B', '7B', 'GLM130B', 'bloom_7b',
    'bloomz', 'cohere', 'davinci', 'dolly', 'dolly-v2-12b',
    'flan_t5_base', 'flan_t5_large', 'flan_t5_small',
    'flan_t5_xl', 'flan_t5_xxl', 'gemma-7b-it', 'gemma2-9b-it',
    'gpt-3.5-turbo', 'gpt-35', 'gpt4', 'gpt4o',
    'gpt_j', 'gpt_neox', 'human', 'llama3-70b', 'llama3-8b',
    'mixtral-8x7b', 'opt_1.3b', 'opt_125m', 'opt_13b',
    'opt_2.7b', 'opt_30b', 'opt_350m', 'opt_6.7b',
    'opt_iml_30b', 'opt_iml_max_1.3b', 't0_11b', 't0_3b',
    'text-davinci-002', 'text-davinci-003',
)))
def clean_text(text: str) -> str:
    """Tidy the whitespace of ``text`` before it is tokenized.

    Two substitutions are applied in order: every run of two or more
    whitespace characters becomes a single space, then any whitespace that
    directly precedes one of ``, . ; : ? !`` is dropped.
    """
    substitutions = (
        (r'\s{2,}', ' '),          # collapse whitespace runs
        (r'\s+([,.;:?!])', r'\1'),  # no space before punctuation
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
if tokenizer:
    # Extra normalization steps run by the tokenizer itself, in this order.
    custom_normalizers_to_add = [
        Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2"),  # join_hyphen_break
        Replace(Regex(r'\s*\n\s*'), " "),  # newline_to_space
        Strip()
    ]
    # Whatever normalizer the pretrained tokenizer already ships with (may be None).
    current_backend_normalizer = tokenizer.backend_tokenizer.normalizer
    if current_backend_normalizer is None:
        # Nothing to preserve: the custom steps are the whole pipeline.
        tokenizer.backend_tokenizer.normalizer = Sequence(custom_normalizers_to_add)
    else:
        # Run the existing normalizer first, then the custom steps. A Sequence
        # is itself a normalizer, so wrapping it handles both the "single
        # normalizer" and the "already a Sequence" cases. This avoids mutating
        # the list returned by `.normalizers`, which is not guaranteed to
        # write through to the underlying object (the additions could be lost).
        tokenizer.backend_tokenizer.normalizer = Sequence(
            [current_backend_normalizer] + custom_normalizers_to_add
        )
# --- End Model & Tokenizer Configuration ---
# Page heading (HTML rendered through gr.Markdown).
title_md = """
<h1 style="text-align: center; margin-bottom: 5px;">AI Text Detector</h1>
<p style="text-align: center; font-size: 0.9em; color: var(--text-secondary); margin-top: 0; margin-bottom: 20px;">Developed by SzegedAI</p>
"""
# Intro panel shown under the heading. The feature icons are real emoji; they
# were previously stored as mojibake (UTF-8 bytes decoded with the wrong codec).
description = """
<div class="app-description">
<p>This tool utilizes the <b>ModernBERT</b> model to decide whether a given text is human-authored or AI-generated. It employs a soft voting ensemble of <b>three</b> models to improve detection accuracy.</p>
<ul class="features-list">
<li><span class="icon">✅</span> <strong>Human Verification: </strong> Clearly identifies human-written content.</li>
<li><span class="icon">🔍</span> <strong>Model Detection: </strong> Capable of identifying content from over 40 AI models.</li>
<li><span class="icon">📈</span> <strong>Accuracy: </strong> Performs optimally with more extensive text inputs.</li>
<li><span class="icon">📄</span> <strong>Read more: </strong> Our methodology is detailed in our research paper: &nbsp;
<a href="https://aclanthology.org/2025.genaidetect-1.15/" target="_blank" class="learn-more-link"> <b> LINK </b></a>.
</li>
</ul>
<p class="instruction-text">Paste your text into the field below to analyze its origin.</p>
</div>
"""
# Footer line.
bottom_text = "<p class='footer-text'>SzegedAI - Mihaly Kiss</p>"
# Sample inputs offered in the "AI Text Examples" accordion.
AI_texts = [
    "Camels are remarkable desert animals known for their unique adaptations to harsh, arid environments. Native to the Middle East, North Africa, and parts of Asia, camels have been essential to human life for centuries, serving as a mode of transportation, a source of food, and even a symbol of endurance and survival. There are two primary species of camels: the dromedary camel, which has a single hump and is commonly found in the Middle East and North Africa, and the Bactrian camel, which has two humps and is native to Central Asia. Their humps store fat, not water, as commonly believed, allowing them to survive long periods without food by metabolizing the stored fat for energy. Camels are highly adapted to desert life. They can go for weeks without water, and when they do drink, they can consume up to 40 gallons in one sitting. Their thick eyelashes, sealable nostrils, and wide, padded feet protect them from sand and help them walk easily on loose desert terrain.",
]
# Sample inputs offered in the "Human Text Examples" accordion. The curly
# quotes were previously mojibake as well.
Human_texts = [
    "To make BERT handle a variety of down-stream tasks, our input representation is able to unambiguously represent both a single sentence and a pair of sentences (e.g., h Question, Answeri) in one token sequence. Throughout this work, a “sentence” can be an arbitrary span of contiguous text, rather than an actual linguistic sentence. A “sequence” refers to the input token sequence to BERT, which may be a single sentence or two sentences packed together. We use WordPiece embeddings (Wu et al., 2016) with a 30,000 token vocabulary. The first token of every sequence is always a special classification token ([CLS]). The final hidden state corresponding to this token is used as the aggregate sequence representation for classification tasks. Sentence pairs are packed together into a single sequence."
]
def classify_text_interface(text):
    """Classify ``text`` with the three-model ensemble and return an HTML verdict.

    The text is cleaned with ``clean_text``, tokenized (truncated to 512
    tokens) and scored by ``model_1``, ``model_2`` and ``model_3``. Their
    softmax outputs are averaged (soft voting), and the averaged probability
    of the ``'human'`` class is compared with the summed probability of all
    other classes.

    Args:
        text: Raw text from the input box. ``None`` is treated as blank input.

    Returns:
        An HTML snippet: an error notice when the tokenizer or any model is
        unavailable, a prompt when the input is blank, otherwise the verdict.
        An AI verdict also names the highest-scoring AI class together with
        that class's averaged probability (it is not renormalized over the AI
        classes only).
    """
    if not all([tokenizer, model_1, model_2, model_3]):
        return "<p style='text-align: center; color: var(--ai-color);'><strong>Error: Models not loaded. Please check the console.</strong></p>"

    # A cleared/absent input may arrive as None; re.sub would raise on it.
    cleaned_text = clean_text(text or "")
    if not cleaned_text.strip():
        return "<p style='text-align: center; color: var(--text-secondary);'>Please enter some text to analyze.</p>"

    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        softmax_1 = torch.softmax(model_1(**inputs).logits, dim=1)
        softmax_2 = torch.softmax(model_2(**inputs).logits, dim=1)
        softmax_3 = torch.softmax(model_3(**inputs).logits, dim=1)

    # Soft voting: average the per-model distributions; [0] selects the single input row.
    probabilities = ((softmax_1 + softmax_2 + softmax_3) / 3)[0]

    # Locate the 'human' class; -1 means the mapping has no such label.
    human_label_index = next(
        (index for index, name in label_mapping.items() if name.lower() == 'human'),
        -1,
    )

    # Work on a copy so zeroing the human slot leaves `probabilities` intact.
    ai_probs = probabilities.clone()
    if human_label_index != -1:
        ai_probs[human_label_index] = 0
        human_prob_value = probabilities[human_label_index].item() * 100
    else:
        human_prob_value = 0
        print("Warning: 'human' label not found in label_mapping.")

    ai_total_prob = ai_probs.sum().item() * 100
    ai_argmax_index = torch.argmax(ai_probs).item()
    ai_argmax_model = label_mapping.get(ai_argmax_index, "Unknown AI")

    # Ties go to the AI verdict (strict comparison).
    if human_prob_value > ai_total_prob:
        return (
            f"<p><strong>The text is</strong> <span class='highlight-human'><strong>{human_prob_value:.2f}%</strong> likely <b>Human written</b>.</span></p>"
        )

    top_ai_confidence = probabilities[ai_argmax_index].item() * 100
    return (
        f"<p><strong>The text is</strong> <span class='highlight-ai'><strong>{ai_total_prob:.2f}%</strong> likely <b>AI generated</b>.</span></p>"
        f"<p style='margin-top: 10px; font-size: 0.95em;'><strong>Most Likely AI Source:</strong> {ai_argmax_model} (with {top_ai_confidence:.2f}% confidence among AI models)</p>"
    )
# Custom stylesheet handed to gr.Blocks(css=...). Colors, radii and widths are
# CSS variables declared on :root (light theme); the
# `prefers-color-scheme: dark` media block overrides them for dark mode.
# Selectors such as #text_input_box, #result_output_box and #bottom_text match
# the elem_id values given to the Gradio components; .highlight-human and
# .highlight-ai style the verdict spans emitted by classify_text_interface.
modern_css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
/* Light Theme (Default) */
:root {
--primary-bg: #F4F7FC; /* Lighter body background */
--app-bg: #FFFFFF;
--text-primary: #2C3E50;
--text-secondary: #7F8C8D;
--accent-color: #1ABC9C;
--accent-color-darker: #16A085;
--border-color: #E0E6ED; /* Slightly softer border */
--input-bg: #FFFFFF;
--input-focus-border: var(--accent-color);
--human-color: #2ECC71;
--human-bg: rgba(46, 204, 113, 0.1);
--ai-color: #E74C3C;
--ai-bg: rgba(231, 76, 60, 0.1);
--shadow-color: rgba(44, 62, 80, 0.1);
--container-max-width: 800px;
--border-radius-md: 8px;
--border-radius-lg: 12px;
--examples-bg: #F8F9FA; /* Light background for examples area */
--placeholder-color: #B0BEC5;
--accordion-label-color: var(--text-primary);
--accordion-bg: var(--app-bg); /* Accordion bg same as app-bg for seamless look */
--accordion-border: var(--border-color);
--sample-textbox-bg: var(--input-bg);
}
/* Dark Theme Overrides */
@media (prefers-color-scheme: dark) {
:root {
--primary-bg: #1A1D2E; /* Dark Blue/Charcoal for body */
--app-bg: #252936; /* Slightly lighter dark for main container */
--text-primary: #EAEAF1; /* Off-white for primary text */
--text-secondary: #A0A3AF; /* Lighter gray for secondary text */
--accent-color: #1DE9B6; /* Brighter Teal for dark mode */
--accent-color-darker: #00BFA5;
--border-color: #3A3E4F; /* Darker, less prominent border */
--input-bg: #2C3040; /* Darker input background */
/* --input-focus-border: var(--accent-color); /* Can remain the same or use a brighter version */
--human-color: #69F0AE; /* Brighter Green */
--human-bg: rgba(105, 240, 174, 0.15); /* Slightly more opaque for dark bg */
--ai-color: #FF8A80; /* Brighter Red */
--ai-bg: rgba(255, 138, 128, 0.15); /* Slightly more opaque for dark bg */
--shadow-color: rgba(0, 0, 0, 0.25); /* Shadow on dark bg */
--examples-bg: #2A2E3B; /* Dark background for examples area */
--placeholder-color: #7A7D8A;
--accordion-label-color: var(--text-primary);
--accordion-bg: var(--app-bg); /* Keep accordion bg consistent with app */
--accordion-border: var(--border-color);
--sample-textbox-bg: var(--input-bg);
}
body { /* Ensure body background changes for dark mode */
background: var(--primary-bg);
}
/* Specific component adjustments for dark mode if needed */
#text_input_box textarea::placeholder {
color: var(--placeholder-color);
}
.gr-accordion > .label-wrap button {
color: var(--accordion-label-color) !important; /* Ensure it uses the dark mode text color */
}
.learn-more-link, .learn-more-link b { /* Ensure link color adapts */
color: var(--accent-color) !important;
}
.learn-more-link:hover, .learn-more-link:hover b {
color: var(--accent-color-darker) !important;
}
}
.features-list strong::after {
content: " ";
display: inline-block;
width: 0.2em; /* Adds a small space after the bolded part like "Verification: " */
}
body {
font-family: 'Inter', sans-serif;
background: var(--primary-bg); /* Use CSS variable */
color: var(--text-primary);
margin: 0;
padding: 20px;
display: flex;
justify-content: center;
align-items: flex-start;
min-height: 100vh;
box-sizing: border-box;
overflow-y: auto;
transition: background-color 0.3s ease, color 0.3s ease; /* Smooth theme transition */
}
.gradio-container {
background-color: var(--app-bg);
border-radius: var(--border-radius-lg);
padding: clamp(25px, 5vw, 40px);
box-shadow: 0 8px 25px var(--shadow-color);
max-width: var(--container-max-width);
width: 100%;
margin: 20px auto;
border: none; /* Gradio might add its own border, ensure it's controlled or removed */
transition: background-color 0.3s ease, box-shadow 0.3s ease;
}
.form.svelte-633qhp, .block.svelte-11xb1hd, .gradio-html .block {
background: none !important;
border: none !important;
box-shadow: none !important;
padding: 0 !important;
}
/* Title and subtitle are handled by title_md */
h1 { /* Fallback or for other h1s if any */
color: var(--text-primary);
font-size: clamp(24px, 5vw, 30px);
font-weight: 700;
text-align: center;
margin-bottom: 20px;
letter-spacing: -0.5px;
}
.app-description p {
color: var(--text-secondary);
font-size: clamp(14px, 2.5vw, 16px);
line-height: 1.7;
margin-bottom: 15px;
}
.app-description .instruction-text {
font-weight: 500;
color: var(--text-primary);
margin-top: 20px;
text-align: center;
}
.features-list {
list-style: none;
padding-left: 0;
margin: 20px 0;
}
.features-list li {
display: flex;
align-items: center; /* Align icon with the first line of text */
font-size: clamp(14px, 2.5vw, 16px);
color: var(--text-secondary);
margin-bottom: 12px;
line-height: 1.6;
}
.features-list .icon {
margin-right: 12px;
font-size: 1.2em;
color: var(--accent-color);
flex-shrink: 0; /* Prevent icon from shrinking */
}
.learn-more-link, .learn-more-link b {
color: var(--accent-color) !important;
text-decoration: none;
font-weight: 600;
}
.learn-more-link:hover, .learn-more-link:hover b {
color: var(--accent-color-darker) !important;
text-decoration: underline;
}
#text_input_box textarea {
background-color: var(--input-bg);
border: 1px solid var(--border-color);
border-radius: var(--border-radius-md);
font-size: clamp(15px, 2.5vw, 16px);
padding: 15px;
width: 100%;
box-sizing: border-box;
color: var(--text-primary);
transition: background-color 0.3s ease, border-color 0.3s ease, box-shadow 0.3s ease, color 0.3s ease;
min-height: 120px;
box-shadow: 0 2px 4px rgba(0,0,0,0.05); /* Subtle shadow for light mode */
}
#text_input_box textarea::placeholder {
color: var(--placeholder-color);
transition: color 0.3s ease;
}
#text_input_box textarea:focus {
border-color: var(--input-focus-border);
box-shadow: 0 0 0 3px color-mix(in srgb, var(--input-focus-border) 25%, transparent); /* Use color-mix for focus shadow */
outline: none;
}
#result_output_box {
background-color: var(--input-bg);
border: 1px solid var(--border-color);
border-radius: var(--border-radius-md);
padding: 20px;
margin-top: 25px;
width: 100%;
box-sizing: border-box;
text-align: center;
font-size: clamp(16px, 3vw, 17px);
box-shadow: 0 4px 8px rgba(0,0,0,0.05); /* Subtle shadow for light mode */
min-height: 80px;
display: flex;
flex-direction: column;
justify-content: center;
transition: background-color 0.3s ease, border-color 0.3s ease, color 0.3s ease;
}
#result_output_box p {
margin-bottom: 8px;
line-height: 1.6;
color: var(--text-primary); /* Ensure text inside result box also adapts */
}
#result_output_box p:last-child {
margin-bottom: 0;
}
#result_output_box strong { /* Ensure bold text also adapts color */
color: var(--text-primary);
}
.highlight-human, .highlight-ai { /* These are spans, color is set by their own var */
font-weight: 600;
padding: 5px 10px;
border-radius: var(--border-radius-md);
display: inline-block;
font-size: 1.05em;
transition: background-color 0.3s ease, color 0.3s ease;
}
.highlight-human {
color: var(--human-color);
background-color: var(--human-bg);
}
.highlight-ai {
color: var(--ai-color);
background-color: var(--ai-bg);
}
/* Gradio specific Tab styling (if you were to use Tabs) */
.tabs > div:first-child button {
background-color: transparent !important;
color: var(--text-secondary) !important;
border: none !important;
border-bottom: 2px solid transparent !important;
border-radius: 0 !important;
padding: 10px 15px !important;
font-weight: 500 !important;
transition: color 0.3s ease, border-bottom-color 0.3s ease !important;
}
.tabs > div:first-child button.selected {
color: var(--accent-color) !important;
border-bottom-color: var(--accent-color) !important;
font-weight: 600 !important;
}
/* Accordion and Examples Styling */
.gr-accordion {
border: 1px solid var(--accordion-border) !important;
border-radius: var(--border-radius-lg) !important;
box-shadow: none !important;
padding: 0 15px 15px 15px !important;
margin-bottom: 20px !important; /* Increased space below each accordion */
background-color: var(--accordion-bg) !important; /* Use variable for accordion background */
transition: background-color 0.3s ease, border-color 0.3s ease;
}
.gr-accordion > .label-wrap button {
font-weight: 600 !important;
color: var(--accordion-label-color) !important;
padding: 15px 0px !important; /* More padding for label */
font-size: 1.05em !important;
transition: color 0.3s ease;
}
.gr-accordion > .label-wrap { /* Remove default Gradio accordion label border */
border-bottom: none !important;
}
.gr-examples { /* Wrapper for the examples content */
padding: 15px 0px 0px 0px !important; /* Adjust padding, top padding handled by accordion */
border: none !important; /* Border is on accordion now */
border-radius: 0 !important; /* Rounded corners on accordion */
background-color: transparent !important; /* Examples area transparent, accordion has bg */
margin-top: 0px !important; /* No extra margin, accordion handles it */
}
.gr-sample-textbox { /* Individual example textboxes */
border: 1px solid var(--border-color) !important;
border-radius: var(--border-radius-md) !important;
font-size: 14px !important;
background-color: var(--sample-textbox-bg) !important;
color: var(--text-primary) !important; /* Text color for example text */
transition: background-color 0.3s ease, border-color 0.3s ease, color 0.3s ease;
}
.gr-sample-textbox:hover {
border-color: var(--accent-color) !important; /* Highlight on hover */
}
.footer-text, #bottom_text {
text-align: center;
margin-top: 40px; /* Keep space above footer */
font-size: clamp(13px, 2vw, 14px);
color: var(--text-secondary);
}
#bottom_text p {
margin: 0;
}
@media (max-width: 768px) {
body {
padding: 10px;
align-items: flex-start;
}
.gradio-container {
padding: 20px;
margin: 10px;
}
h1 { font-size: 22px; }
.app-description p, .features-list li { font-size: 14px; }
#text_input_box textarea { font-size: 15px; min-height: 100px; }
#result_output_box { font-size: 15px; padding: 15px; }
.gr-accordion > .label-wrap button { padding: 12px 0 !important; }
}
"""
# Assemble the page: heading, intro, input box, result panel, two collapsible
# example sections and the footer, in that order.
with gr.Blocks(
    css=modern_css,
    theme=gr.themes.Base(font=[gr.themes.GoogleFont("Inter"), "sans-serif"]),
) as iface:
    gr.Markdown(title_md)
    gr.Markdown(description)

    text_input = gr.Textbox(
        label="",
        placeholder="Type or paste your content here...",
        elem_id="text_input_box",
        lines=10,
    )
    result_output = gr.HTML(elem_id="result_output_box")

    models_ready = all([tokenizer, model_1, model_2, model_3])
    if not models_ready:
        # Loading failed at import time: show a static notice instead of wiring the classifier.
        gr.HTML("<div id='result_output_box'><p style='color: var(--ai-color); text-align: center;'><strong>Application Error: Models could not be loaded. Please check the server console for details.</strong></p></div>")
    else:
        # Re-classify whenever the textbox content changes.
        text_input.change(classify_text_interface, inputs=text_input, outputs=result_output)

    # One collapsed accordion per sample set; picking a sample fills the textbox.
    example_sections = (
        ("AI Text Examples", AI_texts),
        ("Human Text Examples", Human_texts),
    )
    for section_title, section_samples in example_sections:
        with gr.Accordion(section_title, open=False):
            gr.Examples(
                examples=section_samples,
                inputs=text_input,
                label="",
                elem_classes="gr-examples",  # hook for the .gr-examples CSS rule
            )

    gr.Markdown(bottom_text, elem_id="bottom_text")

if __name__ == "__main__":
    iface.launch(share=False)