Spaces:

sashtech
/

aihumanifierandgrmoform

Sleeping

App Files Files Community

sashtech committed on Sep 25, 2024

Commit

fdbab88

verified ·

1 Parent(s): 7b071b0

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -103

app.py CHANGED Viewed

@@ -3,56 +3,30 @@ import gradio as gr
 from transformers import pipeline
 import spacy
 import subprocess
-import json
 import nltk
-from nltk.corpus import wordnet, stopwords
 from spellchecker import SpellChecker
 import re
-import random
 import string
-# Ensure necessary NLTK data is downloaded
-def download_nltk_resources():
-    try:
-        nltk.download('punkt')
-        nltk.download('stopwords')
-        nltk.download('averaged_perceptron_tagger')
-        nltk.download('averaged_perceptron_tagger_eng')
-        nltk.download('wordnet')
-        nltk.download('omw-1.4')
-        nltk.download('punkt_tab')
-    except Exception as e:
-        print(f"Error downloading NLTK resources: {e}")
-# Call the download function
-download_nltk_resources()
-top_words = set(stopwords.words("english"))
-# Path to the thesaurus file
-thesaurus_file_path = 'en_thesaurus.jsonl'  # Ensure the file path is correct
-# Function to load the thesaurus into a dictionary
-def load_thesaurus(file_path):
-    thesaurus_dict = {}
-    try:
-        with open(file_path, 'r', encoding='utf-8') as file:
-            for line in file:
-                entry = json.loads(line.strip())
-                word = entry.get("word")
-                synonyms = entry.get("synonyms", [])
-                if word:
-                    thesaurus_dict[word] = synonyms
-    except Exception as e:
-        print(f"Error loading thesaurus: {e}")
-    return thesaurus_dict
-# Load the thesaurus
-synonym_dict = load_thesaurus(thesaurus_file_path)
-# Words and POS tags we don't want to replace
 exclude_tags = {'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD', 'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC'}
 exclude_words = {'is', 'am', 'are', 'was', 'were', 'have', 'has', 'do', 'does', 'did', 'will', 'shall', 'should', 'would', 'could', 'can', 'may', 'might'}
@@ -69,44 +43,59 @@ except OSError:
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
-# Function to predict the label and score for English text (AI Detection)
-def predict_en(text):
-    try:
-        res = pipeline_en(text)[0]
-        return res['label'], res['score']
-    except Exception as e:
-        return f"Error during AI detection: {e}"
-# Function to remove plagiarism
-def plagiarism_remover(word):
-    if word.lower() in top_words or word.lower() in exclude_words or word in string.punctuation:
-        return word
-    # Check for synonyms in the custom thesaurus
-    synonyms = synonym_dict.get(word.lower(), set())
-    # If no synonyms found in the custom thesaurus, use WordNet
-    if not synonyms:
         for syn in wordnet.synsets(word):
             for lemma in syn.lemmas():
                 if "_" not in lemma.name() and lemma.name().isalpha() and lemma.name().lower() != word.lower():
                     synonyms.add(lemma.name())
-    pos_tag_word = nltk.pos_tag([word])[0]
-    if pos_tag_word[1] in exclude_tags:
-        return word
-    filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
-    if not filtered_synonyms:
-        return word
-    synonym_choice = random.choice(filtered_synonyms)
-    if word.istitle():
-        return synonym_choice.title()
-    return synonym_choice
 # Function to remove redundant and meaningless words
 def remove_redundant_words(text):
@@ -117,6 +106,7 @@ def remove_redundant_words(text):
 # Function to fix spacing before punctuation
 def fix_punctuation_spacing(text):
     words = text.split(' ')
     cleaned_words = []
     punctuation_marks = {',', '.', "'", '!', '?', ':'}
@@ -132,7 +122,8 @@ def fix_punctuation_spacing(text):
 # Function to fix possessives like "Earth's"
 def fix_possessives(text):
-    return re.sub(r'(\w)\s\'\s?s', r"\1's", text)
 # Function to capitalize the first letter of sentences and proper nouns
 def capitalize_sentences_and_nouns(text):
@@ -214,38 +205,43 @@ def correct_spelling(text):
     corrected_words = []
     for word in words:
         corrected_word = spell.correction(word)
-        corrected_words.append(corrected_word if corrected_word is not None else word)
     return ' '.join(corrected_words)
-# Main processing function for paraphrasing and grammar correction
 def paraphrase_and_correct(text):
     cleaned_text = remove_redundant_words(text)
-    cleaned_text = fix_punctuation_spacing(cleaned_text)
-    cleaned_text = fix_possessives(cleaned_text)
-    cleaned_text = capitalize_sentences_and_nouns(cleaned_text)
-    cleaned_text = force_first_letter_capital(cleaned_text)
-    cleaned_text = correct_tense_errors(cleaned_text)
-    cleaned_text = correct_article_errors(cleaned_text)
-    cleaned_text = ensure_subject_verb_agreement(cleaned_text)
-    cleaned_text = correct_spelling(cleaned_text)
-    plag_removed = plagiarism_remover(cleaned_text)
-    return plag_removed
-# Create the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# AI Text Processor")
     with gr.Tab("AI Detection"):
-        t1 = gr.Textbox(lines=5, label='Input Text')
-        btn1 = gr.Button("Detect AI")
-        out1 = gr.Textbox(label='Prediction', interactive=False)
-        out2 = gr.Textbox(label='Confidence', interactive=False)
-        btn1.click(fn=predict_en, inputs=t1, outputs=[out1, out2])
-    with gr.Tab("Paraphrasing and Grammar Correction"):
-        t2 = gr.Textbox(lines=5, label='Input Text')
-        btn2 = gr.Button("Process Text")
-        out3 = gr.Textbox(label='Processed Text', interactive=False)
-        btn2.click(fn=paraphrase_and_correct, inputs=t2, outputs=out3)
-demo.launch()

 from transformers import pipeline
 import spacy
 import subprocess
 import nltk
+from nltk.corpus import wordnet
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
 from spellchecker import SpellChecker
 import re
 import string
+import random
# Fetch each NLTK resource in turn; the order matches the original calls.
for _nltk_resource in (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
    'punkt_tab',
):
    nltk.download(_nltk_resource)
del _nltk_resource  # keep the loop variable out of the module namespace

# English stopwords, held in a set for constant-time membership tests
stop_words = {*stopwords.words("english")}

# POS tags whose words are never swapped for a synonym
exclude_tags = {
    'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD',
    'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC',
}

# Individual words that are never swapped for a synonym
exclude_words = {
    'is', 'am', 'are', 'was', 'were', 'have',
    'has', 'do', 'does', 'did', 'will', 'shall',
    'should', 'would', 'could', 'can', 'may', 'might',
}
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
def plagiarism_removal(text):
    """Swap eligible words in *text* for a randomly chosen WordNet synonym.

    The text is split with ``word_tokenize``, every token is passed through
    the nested ``plagiarism_remover`` helper, and the tokens are re-joined
    with single spaces while punctuation is glued back onto its neighbour.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string. The result is random because the replacement
        is picked with ``random.choice``.
    """
    # An opening bracket belongs to the word that FOLLOWS it; every other
    # punctuation character belongs to the word that PRECEDES it.
    opening_marks = frozenset("([{")
    trailing_marks = frozenset(string.punctuation) - opening_marks

    def plagiarism_remover(word):
        """Return a synonym of *word* with the same POS tag, or *word* itself."""
        lowered = word.lower()
        # Stopwords, excluded words and punctuation are returned untouched
        if lowered in stop_words or lowered in exclude_words or word in string.punctuation:
            return word

        # Check the POS tag first so that excluded tags skip the WordNet
        # lookup entirely (the outcome is the same, the work is not wasted).
        word_tag = nltk.pos_tag([word])[0][1]
        if word_tag in exclude_tags:
            return word

        # Collect single-word, purely alphabetic lemmas that differ from the input
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                name = lemma.name()
                if "_" not in name and name.isalpha() and name.lower() != lowered:
                    synonyms.add(name)

        # Keep only candidates tagged with the same POS as the original word.
        # Sorting gives random.choice a stable candidate order, so a seeded
        # run is reproducible regardless of set iteration order.
        candidates = sorted(
            candidate for candidate in synonyms
            if nltk.pos_tag([candidate])[0][1] == word_tag
        )
        if not candidates:
            return word

        replacement = random.choice(candidates)
        # Retain the capitalisation of a title-cased original
        return replacement.title() if word.istitle() else replacement

    replaced = [plagiarism_remover(token) for token in word_tokenize(text)]

    # Re-join tokens, attaching punctuation to the correct neighbour
    pieces = []
    pending_open = ""  # opening brackets waiting for the next token
    for token in replaced:
        if token in opening_marks:
            pending_open += token
        elif token in trailing_marks and pieces and not pending_open:
            pieces[-1] += token
        else:
            pieces.append(pending_open + token)
            pending_open = ""
    if pending_open:
        # Text ended on an unmatched opening bracket; keep it rather than drop it
        pieces.append(pending_open)
    return " ".join(pieces)
# AI detection: label and score for a piece of English text
def predict_en(text):
    """Return ``(label, score)`` taken from the first result of ``pipeline_en``.

    ``pipeline_en`` is defined elsewhere in the file; presumably it is a
    transformers text-classification pipeline (confirm against its setup).
    """
    top_result = pipeline_en(text)[0]
    label = top_result['label']
    score = top_result['score']
    return label, score
 # Function to remove redundant and meaningless words
 def remove_redundant_words(text):
 # Function to fix spacing before punctuation
 def fix_punctuation_spacing(text):
+    # Split the text into words and punctuation
     words = text.split(' ')
     cleaned_words = []
     punctuation_marks = {',', '.', "'", '!', '?', ':'}
 # Function to fix possessives like "Earth's"
 def fix_possessives(text):
+    text = re.sub(r'(\w)\s\'\s?s', r"\1's", text)
+    return text
 # Function to capitalize the first letter of sentences and proper nouns
 def capitalize_sentences_and_nouns(text):
     corrected_words = []
     for word in words:
         corrected_word = spell.correction(word)
+        if corrected_word is not None:
+            corrected_words.append(corrected_word)
+        else:
+            corrected_words.append(word)
     return ' '.join(corrected_words)
# Main function for paraphrasing and grammar correction
def paraphrase_and_correct(text):
    """Run the full paraphrasing and grammar-correction chain on *text*.

    Redundant words are removed first, then synonyms are substituted, and
    finally the correction helpers are applied in a fixed order.

    Args:
        text: The raw input string (may be empty or ``None``).

    Returns:
        The processed string, or ``""`` when there is nothing to process.
    """
    # Nothing to do for empty, whitespace-only or missing input; returning
    # early also avoids running the NLP helpers on an empty string.
    if not text or not text.strip():
        return ""

    cleaned_text = remove_redundant_words(text)
    paraphrased_text = plagiarism_removal(cleaned_text)

    # Order matters: punctuation spacing is repaired last, after the steps
    # that rebuild the text from individual tokens.
    correction_steps = (
        capitalize_sentences_and_nouns,
        force_first_letter_capital,
        correct_article_errors,
        correct_tense_errors,
        ensure_subject_verb_agreement,
        fix_possessives,
        correct_spelling,
        fix_punctuation_spacing,
    )
    for step in correction_steps:
        paraphrased_text = step(paraphrased_text)
    return paraphrased_text
# Gradio UI: one tab per feature, each wiring a button to its handler
with gr.Blocks() as demo:
    # Tab 1: AI-detection (text in, label and score out)
    with gr.Tab("AI Detection"):
        t1 = gr.Textbox(label='Text', lines=5)
        button1 = gr.Button("🤖 Predict!")
        label1 = gr.Textbox(label='Predicted Label 🎃', lines=1)
        score1 = gr.Textbox(label='Prob', lines=1)
        button1.click(predict_en, t1, [label1, score1])

    # Tab 2: paraphrasing and grammar correction (text in, text out)
    with gr.Tab("Paraphrasing & Grammar Correction"):
        t2 = gr.Textbox(
            label='Enter text for paraphrasing and grammar correction',
            lines=5,
        )
        button2 = gr.Button("🔄 Paraphrase and Correct")
        result2 = gr.Textbox(label='Corrected Text', lines=5)
        button2.click(paraphrase_and_correct, t2, result2)

# Start the app when the module is executed
demo.launch(share=True)