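"""Word sense disambiguation demo.

Tags each token of the input text with its most likely WordNet sense using a
simplified Lesk algorithm (spaCy for tokenization and POS tagging, NLTK WordNet
for senses) and serves the results as JSON through a Gradio interface.
"""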
import os
from dotenv import load_dotenv
import logging
import json
import hashlib
import spacy
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import gradio as gr

load_dotenv()
# Configuration
CONFIG = {
    'HF_TOKEN': os.getenv('HF_TOKEN'),
    'SPACY_MODEL': 'en_core_web_sm',
    'LOG_LEVEL': logging.INFO,
}
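# The token is read from the environment (or from a local .env file picked up
# by load_dotenv). Illustrative .env entry -- the actual value is whatever
# token your Hugging Face account issues:
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxx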
# Setup logging
logging.basicConfig(level=CONFIG['LOG_LEVEL'], format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Export the token only if it is actually set; assigning None to os.environ raises a TypeError
if CONFIG['HF_TOKEN']:
    os.environ['HF_TOKEN'] = CONFIG['HF_TOKEN']
# Download required NLTK data
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # needed by word_tokenize on newer NLTK releases
# Load spaCy model, downloading it on first run if it is not installed
try:
    nlp = spacy.load(CONFIG['SPACY_MODEL'])
except OSError:
    logger.info("Downloading spaCy model...")
    spacy.cli.download(CONFIG['SPACY_MODEL'])
    nlp = spacy.load(CONFIG['SPACY_MODEL'])
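# Note: downloading the model at startup works, but on hosted platforms it is
# usually more reliable to pin en_core_web_sm in requirements.txt (spaCy
# publishes model wheels on the explosion/spacy-models GitHub releases page),
# so the build does not depend on a runtime download.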
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant."""
    tag_map = {
        'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV
    }
    return tag_map.get(treebank_tag[0], None)
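# For example, get_wordnet_pos('VBZ') returns wn.VERB and get_wordnet_pos('NN')
# returns wn.NOUN, while tags such as 'DT' (determiner) map to None.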
def lesk_algorithm(word, sentence, pos=None):
    """Implement the (simplified) Lesk algorithm for word sense disambiguation."""
    word = word.lower()
    context = set(word_tokenize(sentence.lower()))
    best_sense = None
    max_overlap = 0
    for synset in wn.synsets(word):
        if pos and synset.pos() != pos:
            continue
        # Signature = gloss tokens plus example-sentence tokens for this sense
        signature = set(word_tokenize(synset.definition().lower()))
        for example in synset.examples():
            signature.update(word_tokenize(example.lower()))
        overlap = len(signature.intersection(context))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = synset
    return best_sense
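# Illustrative call (the winning sense depends on the WordNet glosses/examples):
#   lesk_algorithm('season', 'The chef will season the steak with salt.', wn.VERB)
# is expected to pick a verb sense of "season" along the lines of "add flavor",
# because that gloss and its examples overlap most with the sentence context.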
def create_unique_index(word, meaning):
    """Create a unique index for each word-meaning pair."""
    combined = f"{word}_{meaning}".encode('utf-8')
    return hashlib.md5(combined).hexdigest()
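# The index is a deterministic 32-character MD5 hex digest, so the same word
# paired with the same WordNet definition always maps to the same identifier.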
def is_meaningful_word(token):
    """Check if a token is meaningful and should be included in the analysis.

    Note: provided as a reusable filter; process_sentence currently keeps every
    token and labels punctuation/whitespace rather than dropping it.
    """
    return (not token.is_stop and                        # exclude stop words
            token.pos_ not in ['PUNCT', 'SYM', 'X'] and  # exclude punctuation, symbols, other
            len(token.text) > 1)                         # exclude single-character tokens
def process_sentence(sent):
    """Process a single sentence and return word information in order."""
    word_info = []
    doc = nlp(sent)
    for token in doc:
        if token.is_punct:
            word_info.append({
                "original": token.text,
                "type": "punctuation"
            })
        elif token.is_space:
            word_info.append({
                "original": token.text,
                "type": "space"
            })
        else:
            word = token.text.lower()
            wordnet_pos = get_wordnet_pos(token.tag_)
            best_sense = lesk_algorithm(word, sent, wordnet_pos)
            if best_sense:
                definition = best_sense.definition()
                pos = best_sense.pos()
                unique_index = create_unique_index(word, definition)
                word_info.append({
                    "original": token.text,
                    "lemma": token.lemma_,
                    "index": unique_index,
                    "meaning": definition,
                    "POS": pos
                })
            else:
                # No WordNet sense found (e.g. names, numbers, rare words)
                word_info.append({
                    "original": token.text,
                    "type": "unknown"
                })
    return word_info
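# Each sentence becomes a list of per-token dicts, e.g. (values illustrative):
#   [{"original": "chef", "lemma": "chef", "index": "<md5 hex>",
#     "meaning": "a professional cook", "POS": "n"},
#    {"original": ".", "type": "punctuation"}]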
def get_word_info(text):
    """Get word information for all sentences in the text, preserving sentence structure."""
    sentences = nltk.sent_tokenize(text)
    all_word_info = []
    for sent in sentences:
        sentence_info = process_sentence(sent)
        all_word_info.append(sentence_info)
    return all_word_info
def process_text(selected_text, user_text):
    """Process the input text and return JSON results."""
    # Prefer the free-text box; fall back to the dropdown selection
    text = (user_text or "").strip() or (selected_text or "")
    if not text:
        return json.dumps({"error": "No text provided."})
    try:
        word_info = get_word_info(text)
        return json.dumps(word_info, indent=2)
    except Exception as e:
        logger.error(f"Error processing text: {str(e)}")
        return json.dumps({"error": "An error occurred while processing the text."})
# Sample texts
examples = [
    "The chef will season the steak with salt and pepper before grilling. Pumpkin spice lattes usually season the arrival of autumn.",
    "The gardener will plant tulips in the spring. Cherry blossoms signify the beginning of warmer weather.",
    "The artist will paint the sunset over the mountains. Bright colors often capture the vibrancy of summer."
]
# Gradio Interface
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Dropdown(choices=examples, label="Select a sample text"),
        gr.Textbox(lines=5, label="Or enter your own text here", placeholder="Enter your text here...")
    ],
    outputs=gr.JSON(label="Results"),
    title="Improved Word Sense Disambiguation API",
    description="Performs word sense disambiguation on the input text and returns per-token WordNet senses as JSON; the sample texts highlight ambiguous words such as 'season'."
)
if __name__ == "__main__":
    iface.launch()
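# A minimal sketch of exercising the running app programmatically, assuming the
# gradio_client package and Gradio's default "/predict" endpoint for a single
# Interface (adjust the URL/endpoint to your deployment):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict(examples[0], "", api_name="/predict")
#   print(result)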