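"""AI Humanizer: a Gradio app that paraphrases AI-generated text with Pegasus and
T5, post-processes it to sound more human, and scores the result with a local
AI detector."""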
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import gradio.themes as grthemes
import random
import re

# ----------------------
# Paraphrasing Model Setup (Pegasus + T5)
# ----------------------
PEGASUS_MODEL_NAME = "tuner007/pegasus_paraphrase"
T5_MODEL_NAME = "Vamsi/T5_Paraphrase_Paws"
pegasus_tokenizer = AutoTokenizer.from_pretrained(PEGASUS_MODEL_NAME)
pegasus_model = AutoModelForSeq2SeqLM.from_pretrained(PEGASUS_MODEL_NAME)
t5_tokenizer = AutoTokenizer.from_pretrained(T5_MODEL_NAME)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(T5_MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pegasus_model = pegasus_model.to(device)
t5_model = t5_model.to(device)

# ----------------------
# Semantic Similarity Model
# ----------------------
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

# ----------------------
# Local AI Detector (roberta-base-openai-detector)
# ----------------------
AI_DETECTOR_MODEL = "roberta-base-openai-detector"
ai_detector = pipeline("text-classification", model=AI_DETECTOR_MODEL, device=0 if torch.cuda.is_available() else -1)

# ----------------------
# Prompt Variations for Humanization
# ----------------------
PEGASUS_PROMPTS = [
    "Paraphrase this naturally:",
    "Rewrite as if explaining to a friend:",
    "Make this sound like a real conversation:",
    "Express this in a casual, human way:",
    "Reword this with natural flow:",
    "Make this sound less robotic:",
    "Rewrite in a friendly, informal tone:",
    "Paraphrase in a way a student would say it:",
]
# Keyed by the UI Tone Selector values so the chosen tone drives the prompt.
T5_PROMPTS = {
    "Academic": "Paraphrase the following text in a formal, academic tone:",
    "Casual": "Paraphrase the following text in a casual, conversational tone:",
    "Friendly": "Paraphrase the following text in a friendly, approachable tone:",
    "Stealth": "Paraphrase the following text to bypass AI detectors and sound as human as possible:",
}

# ----------------------
# Sentence Splitter
# ----------------------
def split_sentences(text):
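    """Split text into sentences on end punctuation (., !, ?) followed by whitespace."""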
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [s for s in sentences if s]

# ----------------------
# Aggressive Post-Processing
# ----------------------
def postprocess_text(text):
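    """Add human-style quirks: contractions, an occasional idiom or transition
    word, and (rarely) a lower-cased word to mimic human typing."""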
    contractions = {
        "do not": "don't", "cannot": "can't", "will not": "won't", "I am": "I'm",
        "is not": "isn't", "are not": "aren't", "did not": "didn't", "it is": "it's",
        "does not": "doesn't", "have not": "haven't", "has not": "hasn't"
    }
    for k, v in contractions.items():
        text = re.sub(rf'\b{k}\b', v, text, flags=re.IGNORECASE)
    idioms = [
        "at the end of the day", "to be honest", "as a matter of fact", "for what it's worth",
        "in a nutshell", "the bottom line is", "all things considered"
    ]
    transitions = [
        "Interestingly,", "In fact,", "To be clear,", "As a result,", "For example,", "On the other hand,", "In other words,"
    ]
    if random.random() < 0.3:
        text += " " + random.choice(idioms) + "."
    if random.random() < 0.3:
        text = random.choice(transitions) + " " + text
    # Randomly lower-case a word to mimic human error
    if random.random() < 0.2:
        words = text.split()
        if len(words) > 3:
            idx = random.randint(1, len(words)-2)
            words[idx] = words[idx].lower()
            text = ' '.join(words)
    return text

# ----------------------
# Multi-Model, Multi-Pass Paraphrasing
# ----------------------
def pegasus_paraphrase(sentence):
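    """Paraphrase a single sentence with Pegasus, prefixed by a random prompt."""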
    prompt = random.choice(PEGASUS_PROMPTS)
    full_prompt = f"{prompt} {sentence}"
    batch = pegasus_tokenizer([full_prompt], truncation=True, padding='longest', max_length=60, return_tensors="pt").to(device)
    outputs = pegasus_model.generate(
        **batch,
        max_length=60,
        num_beams=5,
        num_return_sequences=1  # temperature omitted: it is ignored under beam search
    )
    tgt_text = pegasus_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return tgt_text[0] if tgt_text else sentence

def t5_paraphrase(sentence, tone="Stealth"):
    """Paraphrase one sentence with T5, steered by the selected tone."""
    # Use the prompt matching the user's tone instead of a random one,
    # so the Tone Selector in the UI actually affects the output.
    prompt = T5_PROMPTS.get(tone, T5_PROMPTS["Stealth"]) + " " + sentence
    input_ids = t5_tokenizer.encode(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
    outputs = t5_model.generate(
        input_ids,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        temperature=0.7,
        repetition_penalty=1.2,
        max_length=256,
        num_return_sequences=1
    )
    paraphrased = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return paraphrased

# ----------------------
# Feedback Loop with AI Detector
# ----------------------
def check_ai_score(text):
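    """Return (probability that the text is AI-generated, error message or None)."""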
    try:
        # Truncate long inputs to the detector's 512-token limit to avoid errors.
        result = ai_detector(text, truncation=True)
        for r in result:
            if r['label'] in ['LABEL_1', 'Fake']:
                return r['score'], None
            elif r['label'] in ['LABEL_0', 'Real']:
                return 1.0 - r['score'], None
        return 0.5, None
    except Exception as e:
        return None, f"AI detection error: {str(e)}"

# ----------------------
# Main Humanizer Pipeline
# ----------------------
def humanize_pipeline(text, tone, max_feedback_loops=2):
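    """Rewrite text sentence-by-sentence (Pegasus, then T5), post-process it, and
    repeat up to max_feedback_loops times while the detector still flags it as AI."""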
    sentences = split_sentences(text)
    paraphrased = []
    for sent in sentences:
        # First pass: Pegasus
        peg = pegasus_paraphrase(sent)
        # Second pass: T5
        t5 = t5_paraphrase(peg, tone)
        paraphrased.append(t5)
    joined = ' '.join(paraphrased)
    processed = postprocess_text(joined)
    # Feedback loop: while the detector still flags the text as AI, re-paraphrase all sentences
    for _ in range(max_feedback_loops):
        ai_prob, _ = check_ai_score(processed)
        if ai_prob is not None and ai_prob < 0.5:
            break  # Considered human
        # Re-paraphrase all sentences again
        sentences = split_sentences(processed)
        paraphrased = []
        for sent in sentences:
            peg = pegasus_paraphrase(sent)
            t5 = t5_paraphrase(peg, tone)
            paraphrased.append(t5)
        joined = ' '.join(paraphrased)
        processed = postprocess_text(joined)
    return processed

# ----------------------
# Semantic Similarity Function
# ----------------------
def semantic_similarity(text1, text2):
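    """Cosine similarity between the sentence embeddings of the two texts."""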
    emb1 = similarity_model.encode(text1, convert_to_tensor=True)
    emb2 = similarity_model.encode(text2, convert_to_tensor=True)
    sim = util.pytorch_cos_sim(emb1, emb2).item()
    return sim

# ----------------------
# Humanization Score & Rating
# ----------------------
def humanization_score(sim, ai_prob):
    # Equal-weight blend of how much the text changed (1 - similarity)
    # and how human the detector rates it (1 - AI probability).
    return (1.0 - sim) * 0.5 + (1.0 - ai_prob) * 0.5

def humanization_rating(score):
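    """Map the 0-1 humanization score to a coarse label (higher is better)."""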
    if score < 0.7:
        return f"⚠️ Still AI-like ({score:.2f})"
    elif score < 0.85:
        return f"👍 Acceptable ({score:.2f})"
    else:
        return f"✅ Highly Humanized ({score:.2f})"

# ----------------------
# Main Processing Function
# ----------------------
def process(text, tone):
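    """Generate three humanized candidates, keep the one the detector rates most
    human, and return it alongside detector scores, similarity, and a rating."""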
    if not text.strip():
        return "", "", 0.0, "", 0.0, ""
    pre_ai_prob, pre_err = check_ai_score(text)
    if pre_ai_prob is None:
        return "", f"AI Detection Error: {pre_err}", 0.0, "", 0.0, ""
    try:
        # Generate three candidate rewrites; the most human-sounding one is chosen below
        outputs = [humanize_pipeline(text, tone) for _ in range(3)]
    except Exception as e:
        return f"[Paraphrasing error: {str(e)}]", "", 0.0, "", 0.0, ""
    # Pick the most human-like version (lowest ai_prob)
    best = None
    best_score = -1
    best_ai_prob = 1.0
    for out in outputs:
        post_ai_prob, _ = check_ai_score(out)
        sim = semantic_similarity(text, out)
        score = humanization_score(sim, post_ai_prob if post_ai_prob is not None else 1.0)
        if post_ai_prob is not None and post_ai_prob < best_ai_prob:
            best = out
            best_score = score
            best_ai_prob = post_ai_prob
    if best is None:
        best = outputs[0]
        best_score = 0.0
        best_ai_prob = 1.0
    sim = semantic_similarity(text, best)
    rating = humanization_rating(best_score)
    ai_score_str = f"Pre: {100*(1-pre_ai_prob):.1f}% human | Post: {100*(1-best_ai_prob):.1f}% human"
    return (
        best,
        ai_score_str,
        sim,
        rating,
        best_score * 100,
        ""
    )

# ----------------------
# Gradio UI
# ----------------------
custom_theme = grthemes.Base(
    primary_hue="blue",
    secondary_hue="blue",
    neutral_hue="slate"
)

with gr.Blocks(theme=custom_theme, title="AI Humanizer - Made by Taha") as demo:
    gr.Markdown("""
    # 🧠 AI Humanizer
    <div style='display:flex;justify-content:space-between;align-items:center;'>
        <span style='font-size:1.2em;color:#7bb1ff;'>Rewrite AI text to sound 100% human</span>
        <span style='font-weight:bold;color:#7bb1ff;'>Made by Taha</span>
    </div>
    """, elem_id="header")
    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(label="Paste AI-generated text here", lines=8, placeholder="Paste your text...", elem_id="input-box")
            tone = gr.Dropdown(["Academic", "Casual", "Friendly", "Stealth"], value="Stealth", label="Tone Selector")
            btn = gr.Button("Humanize", elem_id="humanize-btn")
        with gr.Column():
            text_out = gr.Textbox(label="Humanized Output", lines=8, interactive=False, elem_id="output-box")
            ai_scores = gr.Markdown("", elem_id="ai-scores")
            sim_score = gr.Number(label="Similarity (0=very different, 1=very similar)", interactive=False)
            rating = gr.Markdown("", elem_id="rating")
            human_score = gr.Number(label="Humanization Score (%)", interactive=False)
    btn.click(
        process,
        inputs=[text_in, tone],
        outputs=[text_out, ai_scores, sim_score, rating, human_score],
        api_name="humanize"
    )
    gr.Markdown("""
    <div style='text-align:center;color:#7bb1ff;margin-top:2em;'>
        <b>Made by Taha</b> | Free for unlimited use | Optimized for students and creators
    </div>
    """, elem_id="footer")

if __name__ == "__main__":
    demo.launch()