huamnifierWithSimpleGrammer

Running

App Files Files

sashdev committed on Aug 31, 2024

Commit

30196dc

verified ·

1 Parent(s): 670df0e

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -112

app.py CHANGED Viewed

@@ -3,18 +3,14 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
 import torch
 import nltk
-import random
-import string
 import spacy
-import subprocess  # Import subprocess for downloading spaCy models
-# Ensure nltk data is correctly downloaded
-try:
-    nltk.download('punkt', quiet=True)
-    nltk.download('stopwords', quiet=True)
-    nltk.download('wordnet', quiet=True)
-except Exception as e:
-    print(f"Error downloading NLTK data: {e}")
 # Download spaCy model if not already installed
 try:
@@ -34,97 +30,39 @@ model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-unca
 paraphrase_tokenizer = T5Tokenizer.from_pretrained("SRDdev/Paraphrase")
 paraphrase_model = T5ForConditionalGeneration.from_pretrained("SRDdev/Paraphrase").to(device)
-# AI detection function using DistilBERT with batch processing
-def detect_ai_generated(texts):
-    inputs = tokenizer(texts, return_tensors="pt", truncation=True, max_length=512, padding=True).to(device)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    probabilities = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().tolist()  # List of AI-generated probabilities
-    return probabilities
-# Synonym replacement using spaCy
-def replace_with_synonyms(text, probability=0.3):
     doc = nlp(text)
-    new_text = []
     for token in doc:
-        if random.random() < probability and token.pos_ in ("NOUN", "VERB", "ADJ", "ADV"):
-            synonyms = [synonym.lemma_ for synonym in token.vocab if synonym.is_lower == token.is_lower]
-            if synonyms:
-                new_word = random.choice(synonyms)
-                new_text.append(new_word)
-            else:
-                new_text.append(token.text)
         else:
-            new_text.append(token.text)
-    return " ".join(new_text)
-# Random text transformations to simulate human-like errors
-def random_capitalize(word):
-    if word.isalpha() and random.random() < 0.1:
-        return word.capitalize()
-    return word
-def random_remove_punctuation(text):
-    if random.random() < 0.2:
-        text = list(text)
-        indices = [i for i, c in enumerate(text) if c in string.punctuation]
-        if indices:
-            remove_indices = random.sample(indices, min(3, len(indices)))
-            for idx in sorted(remove_indices, reverse=True):
-                text.pop(idx)
-        return ''.join(text)
-    return text
-def random_double_period(text):
-    if random.random() < 0.2:
-        text = text.replace('.', '..', 3)
-    return text
-def random_double_space(text):
-    if random.random() < 0.2:
-        words = text.split()
-        for _ in range(min(3, len(words) - 1)):
-            idx = random.randint(0, len(words) - 2)
-            words[idx] += '  '
-        return ' '.join(words)
-    return text
-def random_replace_comma_space(text, period_replace_percentage=0.33):
-    comma_occurrences = text.count(", ")
-    period_occurrences = text.count(". ")
-    replace_count_comma = max(1, comma_occurrences // 3)
-    replace_count_period = max(1, period_occurrences // 3)
-    comma_indices = [i for i in range(len(text)) if text.startswith(", ", i)]
-    period_indices = [i for i in range(len(text)) if text.startswith(". ", i)]
-    replace_indices_comma = random.sample(comma_indices, min(replace_count_comma, len(comma_indices)))
-    replace_indices_period = random.sample(period_indices, min(replace_count_period, len(period_indices)))
-    for idx in sorted(replace_indices_comma + replace_indices_period, reverse=True):
-        if text.startswith(", ", idx):
-            text = text[:idx] + " ," + text[idx + 2:]
-        if text.startswith(". ", idx):
-            text = text[:idx] + " ." + text[idx + 2:]
-    return text
-def transform_paragraph(paragraph):
-    words = paragraph.split()
-    if len(words) > 12:
-        words = [random_capitalize(word) for word in words]
-        transformed_paragraph = ' '.join(words)
-        transformed_paragraph = random_remove_punctuation(transformed_paragraph)
-        transformed_paragraph = random_double_period(transformed_paragraph)
-        transformed_paragraph = random_double_space(transformed_paragraph)
-        transformed_paragraph = random_replace_comma_space(transformed_paragraph)
-        transformed_paragraph = replace_with_synonyms(transformed_paragraph)  # Use spaCy for synonyms
-    else:
-        transformed_paragraph = paragraph
-    return transformed_paragraph
-def transform_text(text):
-    paragraphs = text.split('\n')
-    transformed_paragraphs = [transform_paragraph(paragraph) for paragraph in paragraphs]
-    return '\n'.join(transformed_paragraphs)
-# Humanize the AI-detected text using the SRDdev Paraphrase model with optimized parameters
 def humanize_text(AI_text):
     paragraphs = AI_text.split("\n")
     paraphrased_paragraphs = []
@@ -133,38 +71,36 @@ def humanize_text(AI_text):
             inputs = paraphrase_tokenizer(paragraph, return_tensors="pt", max_length=512, truncation=True).to(device)
             paraphrased_ids = paraphrase_model.generate(
                 inputs['input_ids'],
-                max_length=inputs['input_ids'].shape[-1] + 20,
-                num_beams=2,  # Reduced beam size for speed
                 early_stopping=True,
-                length_penalty=0.8,  # Lower penalty to generate faster
-                no_repeat_ngram_size=2,  # Reduced for performance
-                do_sample=True,  # Enable sampling to add randomness
-                top_k=50,  # Top-k sampling
-                top_p=0.95,  # Top-p (nucleus) sampling
             )
             paraphrased_text = paraphrase_tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
             paraphrased_paragraphs.append(paraphrased_text)
     return "\n\n".join(paraphrased_paragraphs)
-# Main function to handle the overall process with batch processing
 def main_function(AI_text):
-    sentences = nltk.sent_tokenize(AI_text)
-    ai_probabilities = detect_ai_generated(sentences)
-    ai_generated_percentage = sum([1 for prob in ai_probabilities if prob > 0.5]) / len(ai_probabilities) * 100
-    # Transform AI text to make it more human-like
-    humanized_text = humanize_text(AI_text)
-    humanized_text = transform_text(humanized_text)  # Add randomness to simulate human errors
-    return f"AI-Generated Content: {ai_generated_percentage:.2f}%\n\nHumanized Text:\n{humanized_text}"
 # Gradio interface definition
 interface = gr.Interface(
     fn=main_function,
     inputs="textbox",
     outputs="textbox",
-    title="AI Text Humanizer",
-    description="Enter AI-generated text and get a human-written version. This space uses models from Hugging Face directly."
 )
 # Launch the Gradio app

 from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
 import torch
 import nltk
 import spacy
+from nltk.corpus import wordnet
+import subprocess
+# Download NLTK data (if not already downloaded)
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('wordnet')  # Download WordNet
 # Download spaCy model if not already installed
 try:
 paraphrase_tokenizer = T5Tokenizer.from_pretrained("SRDdev/Paraphrase")
 paraphrase_model = T5ForConditionalGeneration.from_pretrained("SRDdev/Paraphrase").to(device)
+# Function to find synonyms using WordNet via NLTK
def get_synonyms(word):
    """Return the unique WordNet lemma names for ``word`` in a stable order.

    Args:
        word: The word to look up; passed unchanged to ``wordnet.synsets``.

    Returns:
        A list of lemma names with duplicates removed. Names appear in the
        order they are first encountered while walking the synsets and, within
        each synset, its lemmas. The list is empty when WordNet has no synsets
        for ``word``. The looked-up word itself may be among the results.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. Going
    # through a set instead makes the order (and therefore whatever a caller
    # picks with [0]) depend on string hashing, which varies between runs.
    return list(
        dict.fromkeys(
            lemma.name()
            for syn in wordnet.synsets(word)
            for lemma in syn.lemmas()
        )
    )
+# Replace words with synonyms using spaCy and WordNet
def replace_with_synonyms(text):
    """Swap nouns, verbs, adjectives and adverbs in ``text`` for a synonym.

    The text is parsed with the module-level spaCy pipeline ``nlp``. For each
    token tagged NOUN, VERB, ADJ or ADV, the first entry from
    ``get_synonyms`` that differs from the token (case-insensitively) is used
    as the replacement. All other tokens, and tokens with no usable synonym,
    are kept verbatim.

    Args:
        text: The input string to rewrite.

    Returns:
        The rewritten string. Each token keeps the trailing whitespace it had
        in the input, so punctuation spacing and line breaks are preserved.
    """
    replaceable_pos = {"NOUN", "VERB", "ADJ", "ADV"}
    doc = nlp(text)
    pieces = []
    for token in doc:
        replacement = token.text
        # Check the part of speech first so WordNet is only queried for
        # tokens that can actually be replaced.
        if token.pos_ in replaceable_pos:
            lowered = token.text.lower()
            for candidate in get_synonyms(lowered):
                # WordNet joins multi-word lemmas with underscores.
                candidate = candidate.replace("_", " ")
                # Skip entries that are just the word itself.
                if candidate.lower() != lowered:
                    if token.is_title:
                        candidate = candidate.capitalize()
                    replacement = candidate
                    break
        # whitespace_ is the token's own trailing whitespace; re-attaching it
        # avoids inserting spaces before punctuation.
        pieces.append(replacement + token.whitespace_)
    return "".join(pieces)
+# AI detection function using DistilBERT
def detect_ai_generated(text):
    """Score ``text`` with the module-level sequence-classification model.

    The text is tokenized (truncated to at most 512 tokens), moved to
    ``device`` and run through ``model`` with gradient tracking disabled.
    The logits are soft-maxed across dimension 1.

    Args:
        text: The string to classify.

    Returns:
        A Python float: the soft-max value at class index 1 for the first
        (only) row of the batch.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    class_probs = torch.softmax(logits, dim=1)
    # NOTE(review): class index 1 is treated as "AI-generated" here; confirm
    # this against the loaded model's label mapping.
    return class_probs[0][1].item()
+# Humanize the AI-detected text using the SRDdev Paraphrase model
 def humanize_text(AI_text):
     paragraphs = AI_text.split("\n")
     paraphrased_paragraphs = []
             inputs = paraphrase_tokenizer(paragraph, return_tensors="pt", max_length=512, truncation=True).to(device)
             paraphrased_ids = paraphrase_model.generate(
                 inputs['input_ids'],
+                max_length=inputs['input_ids'].shape[-1] + 20,  # Slightly more than the original input length
+                num_beams=4,
                 early_stopping=True,
+                length_penalty=1.0,
+                no_repeat_ngram_size=3,
             )
             paraphrased_text = paraphrase_tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
             paraphrased_paragraphs.append(paraphrased_text)
     return "\n\n".join(paraphrased_paragraphs)
+# Main function to handle the overall process
 def main_function(AI_text):
+    # Replace words with synonyms
+    text_with_synonyms = replace_with_synonyms(AI_text)
+    # Detect AI-generated content
+    ai_probability = detect_ai_generated(text_with_synonyms)
+    # Humanize AI text
+    humanized_text = humanize_text(text_with_synonyms)
+    return f"AI-Generated Content: {ai_probability:.2f}%\n\nHumanized Text:\n{humanized_text}"
 # Gradio interface definition
 interface = gr.Interface(
     fn=main_function,
     inputs="textbox",
     outputs="textbox",
+    title="AI Text Humanizer with Synonym Replacement",
+    description="Enter AI-generated text and get a human-written version, with synonyms replaced for more natural output. This space uses models from Hugging Face directly."
 )
 # Launch the Gradio app