Spaces:

sashtech
/

aihumanifierandgrmoform

Sleeping

App Files Files Community

sashtech committed on Sep 25, 2024

Commit

051de31

verified ·

1 Parent(s): 5834cac

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -1

app.py CHANGED Viewed

@@ -8,6 +8,73 @@ from nltk.corpus import wordnet
 from spellchecker import SpellChecker
 import re
 # Initialize the English text classification pipeline for AI detection
 pipeline_en = pipeline(task="text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
@@ -151,7 +218,8 @@ def correct_spelling(text):
 def paraphrase_and_correct(text):
      # Add synonym replacement here
     cleaned_text = remove_redundant_words(text)
-    paraphrased_text = capitalize_sentences_and_nouns(cleaned_text)
     paraphrased_text = force_first_letter_capital(paraphrased_text)
     paraphrased_text = correct_article_errors(paraphrased_text)
     paraphrased_text = correct_tense_errors(paraphrased_text)

 from spellchecker import SpellChecker
 import re
+# Fetch the NLTK resources this module relies on: tokenizer models, the
+# stopword list, the POS tagger and WordNet.
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('wordnet')
+# Must be bound as `stop_words`: plagiarism_removal() looks the set up under
+# that exact name (it was previously misspelled `top_words`).
+stop_words = set(stopwords.words("english"))  # More efficient as a set
+def plagiarism_removal(text):
+    """Rewrite ``text`` by swapping eligible words for random WordNet synonyms.
+
+    The text is tokenized with ``word_tokenize``, every token is passed
+    through ``plagiarism_remover``, and the results are re-joined with single
+    spaces, gluing punctuation back onto the neighbouring word.
+
+    Args:
+        text: The input string to rewrite.
+
+    Returns:
+        The rewritten string. Synonyms are picked with ``random.choice``, so
+        the output varies between calls unless the RNG is seeded.
+    """
+    # Brackets that belong to the token that follows them, not the one before.
+    opening_brackets = {"(", "[", "{"}
+
+    def plagiarism_remover(word):
+        """Return a same-POS synonym for ``word``, or ``word`` itself if none fits."""
+        lowered = word.lower()
+        # Handle stopwords, punctuation, and excluded words
+        if lowered in stop_words or lowered in exclude_words or word in string.punctuation:
+            return word
+        # Tag the word first: if its part of speech is excluded there is no
+        # point in querying WordNet at all.
+        word_tag = nltk.pos_tag([word])[0][1]
+        if word_tag in exclude_tags:
+            return word
+        # Collect single-word, purely alphabetic synonyms (isalpha() also
+        # rejects multi-word lemmas, which WordNet joins with underscores).
+        synonyms = {
+            lemma.name()
+            for syn in wordnet.synsets(word)
+            for lemma in syn.lemmas()
+            if lemma.name().isalpha() and lemma.name().lower() != lowered
+        }
+        # Keep only synonyms tagged with the same part of speech. Sorting makes
+        # the candidate order stable, so a seeded RNG gives repeatable output.
+        filtered_synonyms = sorted(
+            candidate for candidate in synonyms
+            if nltk.pos_tag([candidate])[0][1] == word_tag
+        )
+        # Return original word if no appropriate synonyms found
+        if not filtered_synonyms:
+            return word
+        synonym_choice = random.choice(filtered_synonyms)
+        # Retain original capitalization
+        return synonym_choice.title() if word.istitle() else synonym_choice
+
+    # Tokenize and replace words
+    replaced_tokens = [plagiarism_remover(token) for token in word_tokenize(text)]
+
+    # Re-join tokens, handling spacing around punctuation:
+    #   - opening brackets are held back and prefixed to the next token;
+    #   - any other punctuation is appended to the previous piece.
+    pieces = []
+    pending_prefix = ""
+    for token in replaced_tokens:
+        if token in opening_brackets:
+            pending_prefix += token
+        elif token in string.punctuation and pieces and not pending_prefix:
+            pieces[-1] += token
+        else:
+            pieces.append(pending_prefix + token)
+            pending_prefix = ""
+    if pending_prefix:
+        # Text ended on an opening bracket; keep it rather than drop it.
+        pieces.append(pending_prefix)
+    return " ".join(pieces)
+# POS tags we don't want to replace: a word whose nltk.pos_tag() tag is in this
+# set is returned unchanged by plagiarism_remover.
+exclude_tags = {'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD', 'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC'}
+# Words we don't want to replace (auxiliary and modal verbs); matched against
+# the lowercased token.
+exclude_words = {'is', 'am', 'are', 'was', 'were', 'have', 'has', 'do', 'does', 'did', 'will', 'shall', 'should', 'would', 'could', 'can', 'may', 'might'}
 # Initialize the English text classification pipeline for AI detection
 pipeline_en = pipeline(task="text-classification", model="Hello-SimpleAI/chatgpt-detector-roberta")
 def paraphrase_and_correct(text):
      # Add synonym replacement here
     cleaned_text = remove_redundant_words(text)
+    plag_removed=plagiarism_removal(cleaned_text)
+    paraphrased_text = capitalize_sentences_and_nouns(plag_removed)
     paraphrased_text = force_first_letter_capital(paraphrased_text)
     paraphrased_text = correct_article_errors(paraphrased_text)
     paraphrased_text = correct_tense_errors(paraphrased_text)