nevisende committed
Commit b5cc7ad · 1 Parent(s): e599e54

Feat: create initial files

Files changed (4)
  1. .env.exam +1 -0
  2. .gitignore +2 -0
  3. app.py +166 -0
  4. requirements.txt +7 -0
.env.exam ADDED
@@ -0,0 +1 @@
+ HF_TOKEN=
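
The single entry marks `.env.exam` as a template: copied to `.env` (which the next file tells git to ignore) and filled with a real token, it is what `load_dotenv()` in app.py reads at startup. A minimal sketch of that lookup, assuming python-dotenv's default behavior of reading `.env` from the working directory:

    # Sketch: run after copying .env.exam to .env and filling in HF_TOKEN
    import os
    from dotenv import load_dotenv

    load_dotenv()                        # loads key=value pairs from ./.env if present
    print(bool(os.getenv('HF_TOKEN')))   # False until a token is actually set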
.gitignore ADDED
@@ -0,0 +1,2 @@
+ flagged
+ .env
app.py ADDED
@@ -0,0 +1,166 @@
+ import os
+ from dotenv import load_dotenv
+ import logging
+ import json
+ import hashlib
+ from collections import defaultdict
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+
+ import spacy
+ import nltk
+ from nltk.corpus import wordnet as wn
+ from nltk.tokenize import word_tokenize
+ from nltk.tag import pos_tag
+ import gradio as gr
+
+
+ load_dotenv()
+ # Configuration
+ CONFIG = {
+     'HF_TOKEN': os.getenv('HF_TOKEN'),
+     'SPACY_MODEL': 'en_core_web_sm',
+     'LOG_LEVEL': logging.INFO,
+ }
+
+ # Setup logging
+ logging.basicConfig(level=CONFIG['LOG_LEVEL'], format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Export the token for downstream libraries, but only if it is actually set
+ if CONFIG['HF_TOKEN']: os.environ['HF_TOKEN'] = CONFIG['HF_TOKEN']
+
+ # Download required NLTK data
+ nltk.download('wordnet', quiet=True)
+ nltk.download('averaged_perceptron_tagger', quiet=True)
+ nltk.download('punkt', quiet=True)
+
+ # Load spaCy model
+ try:
+     nlp = spacy.load(CONFIG['SPACY_MODEL'])
+ except IOError:
+     logger.info("Downloading spaCy model...")
+     spacy.cli.download(CONFIG['SPACY_MODEL'])
+     nlp = spacy.load(CONFIG['SPACY_MODEL'])
+
+ def get_wordnet_pos(treebank_tag):
+     """Map POS tag to first character used by WordNet."""
+     tag_map = {
+         'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV
+     }
+     return tag_map.get(treebank_tag[0], None)
+
+ def lesk_algorithm(word, sentence, pos=None):
+     """Implement the Lesk algorithm for word sense disambiguation."""
+     word = word.lower()
+     context = set(word_tokenize(sentence.lower()))
+     best_sense = None
+     max_overlap = 0
+
+     for synset in wn.synsets(word):
+         if pos and synset.pos() != pos:
+             continue
+         signature = set(word_tokenize(synset.definition().lower()))
+         for example in synset.examples():
+             signature.update(set(word_tokenize(example.lower())))
+         overlap = len(signature.intersection(context))
+         if overlap > max_overlap:
+             max_overlap = overlap
+             best_sense = synset
+
+     return best_sense
+
+ def create_unique_index(word, meaning, sentence):
+     """Create a unique index for each word-meaning pair."""
+     combined = f"{word}_{meaning}_{sentence}".encode('utf-8')
+     return hashlib.md5(combined).hexdigest()
+
+ def is_meaningful_word(token):
+     """Check if a word is meaningful and should be included in the analysis."""
+     return (token.has_vector and  # This ensures the word is in spaCy's vocabulary
+             not token.is_stop and  # Exclude stop words
+             token.pos_ not in ['PUNCT', 'SYM', 'X'] and  # Exclude punctuation, symbols, and other
+             len(token.text) > 1)  # Exclude single-character tokens
+
+ def process_sentence(sent):
+     """Process a single sentence and return word information."""
+     word_info = defaultdict(lambda: {"lemma": "", "meanings": []})
+     doc = nlp(sent)
+
+     for token in doc:
+         if is_meaningful_word(token):
+             word = token.text.lower()
+             wordnet_pos = get_wordnet_pos(token.tag_)
+
+             if not word_info[word]["lemma"]:
+                 word_info[word]["lemma"] = token.lemma_
+
+             best_sense = lesk_algorithm(word, sent, wordnet_pos)
+
+             if best_sense:
+                 definition = best_sense.definition()
+                 pos = best_sense.pos()
+
+                 unique_index = create_unique_index(word, definition, sent)
+
+                 new_meaning = {
+                     "index": unique_index,
+                     "meaning": definition,
+                     "POS": pos,
+                     "sentence": sent
+                 }
+
+                 if not any(m['meaning'] == definition for m in word_info[word]["meanings"]):
+                     word_info[word]["meanings"].append(new_meaning)
+
+     return dict(word_info)
+
+ def get_word_info(text):
+     """Get word information for all sentences in the text."""
+     sentences = nltk.sent_tokenize(text)
+     word_info = defaultdict(lambda: {"lemma": "", "meanings": []})
+
+     with ProcessPoolExecutor() as executor:
+         future_to_sentence = {executor.submit(process_sentence, sent): sent for sent in sentences}
+         for future in as_completed(future_to_sentence):
+             sentence_info = future.result()
+             for word, info in sentence_info.items():
+                 word_info[word]["lemma"] = info["lemma"]
+                 word_info[word]["meanings"].extend(info["meanings"])
+
+     # If a word has no meanings, try to get a default definition
+     for word, info in word_info.items():
+         if not info["meanings"]:
+             synsets = wn.synsets(word)
+             if synsets:
+                 definition = synsets[0].definition()
+                 pos = synsets[0].pos()
+                 info["meanings"].append({
+                     "index": create_unique_index(word, definition, ""),
+                     "meaning": definition,
+                     "POS": pos,
+                     "sentence": "Default definition"
+                 })
+
+     return dict(word_info)
+
+ def process_text(text):
+     """Process the input text and return JSON results."""
+     try:
+         word_info = get_word_info(text)
+         return json.dumps(word_info, indent=2)
+     except Exception as e:
+         logger.error(f"Error processing text: {str(e)}")
+         return json.dumps({"error": "An error occurred while processing the text."})
+
+
+ # Gradio Interface
+ iface = gr.Interface(
+     fn=process_text,
+     inputs=gr.Textbox(lines=5, label="Enter your text here"),
+     outputs=gr.JSON(label="Results"),
+     title="Improved Word Sense Disambiguation API",
+     description="This API performs word sense disambiguation using the Lesk algorithm and returns the results in JSON format."
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
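
Two usage notes on app.py. `get_word_info` fans sentences out to a `ProcessPoolExecutor`; on platforms that use the 'spawn' start method each worker re-imports the module (reloading the spaCy model), so the first request can be noticeably slow. To exercise the Lesk core on its own, a sketch along these lines should work once the NLTK corpora have downloaded; the sentence is invented for illustration, and the import assumes app.py is on the Python path:

    # Hypothetical smoke test for the Lesk-based disambiguation
    from nltk.corpus import wordnet as wn
    from app import lesk_algorithm   # importing app also runs its model/corpora setup

    sentence = "I deposited the cheque at the bank before noon."
    sense = lesk_algorithm("bank", sentence, pos=wn.NOUN)
    if sense is not None:
        print(sense.name(), "->", sense.definition())
    else:
        print("no overlapping sense found")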
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ torch
+ transformers
+ nltk
+ gradio
+ spacy
+ python-dotenv
+ # https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0.tar.gz
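
Two observations on the dependency list: torch and transformers are never imported by app.py, so they appear to be reserved for later work, and the commented model URL is inert as written, leaving the try/except in app.py to download en_core_web_sm on first run. A quick environment check, restricted to the packages app.py actually imports (python-dotenv's import name is `dotenv`):

    # Minimal sketch: confirm the runtime imports resolve before launching the app
    import importlib.util

    for pkg in ("spacy", "nltk", "gradio", "dotenv"):
        status = "ok" if importlib.util.find_spec(pkg) else "MISSING"
        print(f"{pkg}: {status}")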