Spaces:

sashtech
/

aihumanifierandgrmoform

Sleeping

App Files Files Community

sashtech committed on Sep 25, 2024

Commit

2bc5696

verified ·

1 Parent(s): 1bedf23

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -40

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gradio as gr
 from transformers import pipeline
 import spacy
 import subprocess
 import nltk
 from nltk.corpus import wordnet, stopwords  # Import stopwords here
 from spellchecker import SpellChecker
@@ -26,54 +27,70 @@ download_nltk_resources()
 top_words = set(stopwords.words("english"))  # More efficient as a set
-def plagiarism_removal(text):
-    def plagiarism_remover(word):
-        # Handle stopwords, punctuation, and excluded words
-        if word.lower() in top_words or word.lower() in exclude_words or word in string.punctuation:
-            return word
-        # Find synonyms
-        synonyms = set()
         for syn in wordnet.synsets(word):
             for lemma in syn.lemmas():
                 # Exclude overly technical synonyms or words with underscores
                 if "_" not in lemma.name() and lemma.name().isalpha() and lemma.name().lower() != word.lower():
                     synonyms.add(lemma.name())
-        # Get part of speech for word and filter synonyms with the same POS
-        pos_tag_word = nltk.pos_tag([word])[0]
-        # Avoid replacing certain parts of speech
-        if pos_tag_word[1] in exclude_tags:
-            return word
-        filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
-        # Return original word if no appropriate synonyms found
-        if not filtered_synonyms:
-            return word
-        # Select a random synonym from the filtered list
-        synonym_choice = random.choice(filtered_synonyms)
-        # Retain original capitalization
-        if word.istitle():
-            return synonym_choice.title()
-        return synonym_choice
-    # Tokenize, replace words, and join them back
-    para_split = nltk.word_tokenize(text)
-    final_text = [plagiarism_remover(word) for word in para_split]
-    # Handle spacing around punctuation correctly
-    corrected_text = []
-    for i in range(len(final_text)):
-        if final_text[i] in string.punctuation and i > 0:
-            corrected_text[-1] += final_text[i]  # Append punctuation to previous word
-        else:
-            corrected_text.append(final_text[i])
-    return " ".join(corrected_text)
 # Words we don't want to replace
 exclude_tags = {'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD', 'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC'}

 from transformers import pipeline
 import spacy
 import subprocess
+import json
 import nltk
 from nltk.corpus import wordnet, stopwords  # Import stopwords here
 from spellchecker import SpellChecker
 top_words = set(stopwords.words("english"))  # More efficient as a set
+import os
+import json
+# Path to the thesaurus file (JSON Lines: one JSON object per line)
+thesaurus_file_path = 'en_thesaurus.jsonl'  # Ensure the file path is correct
+# Function to load the thesaurus into a dictionary
+def load_thesaurus(file_path):
+    """Load a JSON Lines thesaurus into a ``{word: [synonyms]}`` dictionary.
+    Each non-blank line is parsed as a JSON object; its "word" value becomes
+    the key and its "synonyms" value (default: empty list) becomes the value.
+    Loading is best-effort: blank lines, malformed lines and entries of the
+    wrong shape are skipped individually so one bad line does not discard
+    the rest of the file. If the file cannot be opened, read or decoded, an
+    error is printed and whatever was loaded so far is returned.
+    """
+    thesaurus_dict = {}
+    try:
+        with open(file_path, 'r', encoding='utf-8') as file:
+            for line_number, line in enumerate(file, start=1):
+                line = line.strip()
+                # Blank lines (e.g. a trailing newline) are not valid JSON; skip them
+                if not line:
+                    continue
+                # Parse each line as a JSON object, skipping only the bad ones
+                try:
+                    entry = json.loads(line)
+                except json.JSONDecodeError as e:
+                    print(f"Skipping malformed thesaurus line {line_number}: {e}")
+                    continue
+                # A valid JSON value that is not an object has no "word"/"synonyms"
+                if not isinstance(entry, dict):
+                    continue
+                word = entry.get("word")
+                synonyms = entry.get("synonyms", [])
+                # Normalise a null/non-list "synonyms" so consumers always get a list
+                if not isinstance(synonyms, list):
+                    synonyms = []
+                if isinstance(word, str) and word:
+                    thesaurus_dict[word] = synonyms
+    except (OSError, UnicodeDecodeError) as e:
+        print(f"Error loading thesaurus: {e}")
+    return thesaurus_dict
+# Load the thesaurus
+synonym_dict = load_thesaurus(thesaurus_file_path)
+# Modified plagiarism_remover function to use the loaded thesaurus
+def plagiarism_remover(word):
+    """Return a synonym for ``word`` with the same POS tag, or ``word`` itself.
+    The word is returned unchanged when it is a stopword, an excluded word,
+    punctuation, has a POS tag listed in ``exclude_tags``, or when no synonym
+    with a matching POS tag is found. Synonyms come from the custom thesaurus
+    (``synonym_dict``) first, with WordNet as the fallback. A title-cased
+    input yields a title-cased replacement.
+    """
+    lowered = word.lower()
+    # Handle stopwords, punctuation, and excluded words
+    if lowered in top_words or lowered in exclude_words or word in string.punctuation:
+        return word
+    # Get part of speech for word; checked before any synonym lookup so that
+    # protected parts of speech skip the thesaurus/WordNet work entirely
+    pos_tag_word = nltk.pos_tag([word])[0]
+    # Avoid replacing certain parts of speech
+    if pos_tag_word[1] in exclude_tags:
+        return word
+    # Check for synonyms in the custom thesaurus. The entry is copied into a
+    # fresh set: the thesaurus stores lists (no .add), and the shared
+    # dictionary must not be mutated by the WordNet fallback below.
+    custom_synonyms = synonym_dict.get(lowered)
+    if isinstance(custom_synonyms, (list, tuple, set, frozenset)):
+        synonyms = {
+            syn for syn in custom_synonyms
+            if isinstance(syn, str) and syn and syn.lower() != lowered
+        }
+    else:
+        synonyms = set()
+    # If no synonyms found in the custom thesaurus, use WordNet
+    if not synonyms:
+        for syn in wordnet.synsets(word):
+            for lemma in syn.lemmas():
+                name = lemma.name()
+                # Exclude overly technical synonyms or words with underscores
+                if "_" not in name and name.isalpha() and name.lower() != lowered:
+                    synonyms.add(name)
+    # Filter synonyms with the same POS as the original word
+    filtered_synonyms = [syn for syn in synonyms if nltk.pos_tag([syn])[0][1] == pos_tag_word[1]]
+    # Return original word if no appropriate synonyms found
+    if not filtered_synonyms:
+        return word
+    # Select a random synonym from the filtered list
+    synonym_choice = random.choice(filtered_synonyms)
+    # Retain original capitalization
+    if word.istitle():
+        return synonym_choice.title()
+    return synonym_choice
 # Words we don't want to replace
 exclude_tags = {'PRP', 'PRP$', 'MD', 'VBZ', 'VBP', 'VBD', 'VBG', 'VBN', 'TO', 'IN', 'DT', 'CC'}