nevisende committed
Commit b5cc7ad · 1 Parent(s): e599e54

Feat: create initial files

Files changed (4)
  1. .env.exam +1 -0
  2. .gitignore +2 -0
  3. app.py +166 -0
  4. requirements.txt +7 -0
.env.exam ADDED
@@ -0,0 +1 @@
+ HF_TOKEN=
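
The single entry marks `.env.exam` as a template: copied to `.env` (which the next file tells git to ignore) and filled with a real token, it is what `load_dotenv()` in app.py reads at startup. A minimal sketch of that lookup, assuming python-dotenv's default behavior of reading `.env` from the working directory:

    # Sketch: run after copying .env.exam to .env and filling in HF_TOKEN
    import os
    from dotenv import load_dotenv

    load_dotenv()                        # loads key=value pairs from ./.env if present
    print(bool(os.getenv('HF_TOKEN')))   # False until a token is actually set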
.gitignore ADDED
@@ -0,0 +1,2 @@
+ flagged
+ .env
app.py ADDED
@@ -0,0 +1,166 @@
+ import os
+ from dotenv import load_dotenv
+ import logging
+ import json
+ import hashlib
+ from collections import defaultdict
+ from concurrent.futures import ProcessPoolExecutor, as_completed
+
+ import spacy
+ import nltk
+ from nltk.corpus import wordnet as wn
+ from nltk.tokenize import word_tokenize
+ from nltk.tag import pos_tag
+ import gradio as gr
+
+
+ load_dotenv()
+ # Configuration
+ CONFIG = {
+     'HF_TOKEN': os.getenv('HF_TOKEN'),
+     'SPACY_MODEL': 'en_core_web_sm',
+     'LOG_LEVEL': logging.INFO,
+ }
+
+ # Setup logging
+ logging.basicConfig(level=CONFIG['LOG_LEVEL'], format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # Export the token for downstream libraries, but only if it is actually set
+ if CONFIG['HF_TOKEN']: os.environ['HF_TOKEN'] = CONFIG['HF_TOKEN']
+
+ # Download required NLTK data
+ nltk.download('wordnet', quiet=True)
+ nltk.download('averaged_perceptron_tagger', quiet=True)
+ nltk.download('punkt', quiet=True)
+
+ # Load spaCy model
+ try:
+     nlp = spacy.load(CONFIG['SPACY_MODEL'])
+ except IOError:
+     logger.info("Downloading spaCy model...")
+     spacy.cli.download(CONFIG['SPACY_MODEL'])
+     nlp = spacy.load(CONFIG['SPACY_MODEL'])
+
+ def get_wordnet_pos(treebank_tag):
+     """Map POS tag to first character used by WordNet."""
+     tag_map = {
+         'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV
+     }
+     return tag_map.get(treebank_tag[0], None)
+
+ def lesk_algorithm(word, sentence, pos=None):
+     """Implement the Lesk algorithm for word sense disambiguation."""
+     word = word.lower()
+     context = set(word_tokenize(sentence.lower()))
+     best_sense = None
+     max_overlap = 0
+
+     for synset in wn.synsets(word):
+         if pos and synset.pos() != pos:
+             continue
+         signature = set(word_tokenize(synset.definition().lower()))
+         for example in synset.examples():
+             signature.update(set(word_tokenize(example.lower())))
+         overlap = len(signature.intersection(context))
+         if overlap > max_overlap:
+             max_overlap = overlap
+             best_sense = synset
+
+     return best_sense
+
+ def create_unique_index(word, meaning, sentence):
+     """Create a unique index for each word-meaning pair."""
+     combined = f"{word}_{meaning}_{sentence}".encode('utf-8')
+     return hashlib.md5(combined).hexdigest()
+
+ def is_meaningful_word(token):
+     """Check if a word is meaningful and should be included in the analysis."""
+     return (token.has_vector and  # This ensures the word is in spaCy's vocabulary
+             not token.is_stop and  # Exclude stop words
+             token.pos_ not in ['PUNCT', 'SYM', 'X'] and  # Exclude punctuation, symbols, and other
+             len(token.text) > 1)  # Exclude single-character tokens
+
+ def process_sentence(sent):
+     """Process a single sentence and return word information."""
+     word_info = defaultdict(lambda: {"lemma": "", "meanings": []})
+     doc = nlp(sent)
+
+     for token in doc:
+         if is_meaningful_word(token):
+             word = token.text.lower()
+             wordnet_pos = get_wordnet_pos(token.tag_)
+
+             if not word_info[word]["lemma"]:
+                 word_info[word]["lemma"] = token.lemma_
+
+             best_sense = lesk_algorithm(word, sent, wordnet_pos)
+
+             if best_sense:
+                 definition = best_sense.definition()
+                 pos = best_sense.pos()
+
+                 unique_index = create_unique_index(word, definition, sent)
+
+                 new_meaning = {
+                     "index": unique_index,
+                     "meaning": definition,
+                     "POS": pos,
+                     "sentence": sent
+                 }
+
+                 if not any(m['meaning'] == definition for m in word_info[word]["meanings"]):
+                     word_info[word]["meanings"].append(new_meaning)
+
+     return dict(word_info)
+
+ def get_word_info(text):
+     """Get word information for all sentences in the text."""
+     sentences = nltk.sent_tokenize(text)
+     word_info = defaultdict(lambda: {"lemma": "", "meanings": []})
+
+     with ProcessPoolExecutor() as executor:
+         future_to_sentence = {executor.submit(process_sentence, sent): sent for sent in sentences}
+         for future in as_completed(future_to_sentence):
+             sentence_info = future.result()
+             for word, info in sentence_info.items():
+                 word_info[word]["lemma"] = info["lemma"]
+                 word_info[word]["meanings"].extend(info["meanings"])
+
+     # If a word has no meanings, try to get a default definition
+     for word, info in word_info.items():
+         if not info["meanings"]:
+             synsets = wn.synsets(word)
+             if synsets:
+                 definition = synsets[0].definition()
+                 pos = synsets[0].pos()
+                 info["meanings"].append({
+                     "index": create_unique_index(word, definition, ""),
+                     "meaning": definition,
+                     "POS": pos,
+                     "sentence": "Default definition"
+                 })
+
+     return dict(word_info)
+
+ def process_text(text):
+     """Process the input text and return JSON results."""
+     try:
+         word_info = get_word_info(text)
+         return json.dumps(word_info, indent=2)
+     except Exception as e:
+         logger.error(f"Error processing text: {str(e)}")
+         return json.dumps({"error": "An error occurred while processing the text."})
+
+
+ # Gradio Interface
+ iface = gr.Interface(
+     fn=process_text,
+     inputs=gr.Textbox(lines=5, label="Enter your text here"),
+     outputs=gr.JSON(label="Results"),
+     title="Improved Word Sense Disambiguation API",
+     description="This API performs word sense disambiguation using the Lesk algorithm and returns the results in JSON format."
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
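
Two usage notes on app.py. `get_word_info` fans sentences out to a `ProcessPoolExecutor`; on platforms that use the 'spawn' start method each worker re-imports the module (reloading the spaCy model), so the first request can be noticeably slow. To exercise the Lesk core on its own, a sketch along these lines should work once the NLTK corpora have downloaded; the sentence is invented for illustration, and the import assumes app.py is on the Python path:

    # Hypothetical smoke test for the Lesk-based disambiguation
    from nltk.corpus import wordnet as wn
    from app import lesk_algorithm   # importing app also runs its model/corpora setup

    sentence = "I deposited the cheque at the bank before noon."
    sense = lesk_algorithm("bank", sentence, pos=wn.NOUN)
    if sense is not None:
        print(sense.name(), "->", sense.definition())
    else:
        print("no overlapping sense found")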
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ torch
+ transformers
+ nltk
+ gradio
+ spacy
+ python-dotenv
+ # https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0.tar.gz
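
Two observations on the dependency list: torch and transformers are never imported by app.py, so they appear to be reserved for later work, and the commented model URL is inert as written, leaving the try/except in app.py to download en_core_web_sm on first run. A quick environment check, restricted to the packages app.py actually imports (python-dotenv's import name is `dotenv`):

    # Minimal sketch: confirm the runtime imports resolve before launching the app
    import importlib.util

    for pkg in ("spacy", "nltk", "gradio", "dotenv"):
        status = "ok" if importlib.util.find_spec(pkg) else "MISSING"
        print(f"{pkg}: {status}")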