from __future__ import annotations

import os
import re
import string
from importlib import resources
from typing import Dict, List, Optional

import streamlit as st

# Minimal built-ins used if the external file is missing or too small
FALLBACK_WORDS: Dict[int, List[str]] = {
    4: [
        "TREE", "BOAT", "WIND", "FROG", "LION", "MOON", "FORK", "GLOW", "GAME", "CODE",
        "DATA", "BLUE", "GOLD", "ROAD", "STAR",
    ],
    5: [
        "APPLE", "RIVER", "STONE", "PLANT", "MOUSE", "BOARD", "CHAIR", "SCALE", "SMILE", "CLOUD",
    ],
    6: [
        "ORANGE", "PYTHON", "STREAM", "MARKET", "FOREST", "THRIVE", "LOGGER", "BREATH", "DOMAIN", "GALAXY",
    ],
}

# Minimum entries required per word length before file contents are used;
# any length below this threshold falls back to FALLBACK_WORDS for that length.
MIN_REQUIRED = 25


def get_wordlist_files() -> list[str]:
    words_dir = os.path.join(os.path.dirname(__file__), "words")
    if not os.path.isdir(words_dir):
        return []
    files = [f for f in os.listdir(words_dir) if f.lower().endswith(".txt")]
    return sorted(files)
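

# Assumed on-disk layout (illustrative; only the words/ directory and the
# default file name are confirmed by this module):
#   battlewords/
#     words/
#       wordlist.txt   # default list: one word per line, '#' starts a comment
#       extra.txt      # any other .txt file is offered by get_wordlist_files()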


def load_word_list(selected_file: Optional[str] = None) -> Dict[int, List[str]]:
    """
    Load a word list, filter to uppercase A–Z, lengths in {4, 5, 6}, and dedupe
    while preserving order.

    If `selected_file` is provided, load battlewords/words/<selected_file>.
    Otherwise, try the on-disk default battlewords/words/wordlist.txt; if that
    is unavailable, try the packaged resource.

    If fewer than MIN_REQUIRED entries exist for any required length, fall back
    to the built-ins for that length (per specs).

    NOTE: To ensure the cache updates when the user picks a different file,
    always pass the `selected_file` argument from the UI/generator.
    """
    words_by_len: Dict[int, List[str]] = {4: [], 5: [], 6: []}
    used_source = "fallback"

    def _finalize(wbl: Dict[int, List[str]], source: str) -> Dict[int, List[str]]:
        # Record provenance for the UI; ignore failures when no Streamlit
        # session is active (e.g. during tests).
        try:
            st.session_state.wordlist_source = source
            st.session_state.wordlist_selected = selected_file or "wordlist.txt"
            st.session_state.word_counts = {k: len(v) for k, v in wbl.items()}
        except Exception:
            pass
        return wbl

    def _read_text_from_disk(fname: str) -> str:
        words_dir = os.path.join(os.path.dirname(__file__), "words")
        path = os.path.join(words_dir, fname)
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    def _read_default_text() -> Optional[str]:
        # Prefer the on-disk default in the editable repo
        try:
            return _read_text_from_disk("wordlist.txt")
        except Exception:
            pass
        # Fall back to packaged data if available
        try:
            return resources.files("battlewords.words").joinpath("wordlist.txt").read_text(encoding="utf-8")
        except Exception:
            return None

    try:
        text: Optional[str] = None
        source_label = "fallback"
        if selected_file:
            # Validate selection against available files to avoid bad paths
            available = set(get_wordlist_files())
            if selected_file not in available:
                raise FileNotFoundError(f"Selected word list '{selected_file}' not found in words/ directory.")
            text = _read_text_from_disk(selected_file)
            source_label = f"file:{selected_file}"
        else:
            text = _read_default_text()
            if text is not None:
                source_label = "default"
        if text is None:
            raise FileNotFoundError("No word list file found on disk or in packaged resources.")

        seen = {4: set(), 5: set(), 6: set()}
        for raw in text.splitlines():
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            if "#" in line:
                line = line.split("#", 1)[0].strip()
            word = line.upper()
            if not re.fullmatch(r"[A-Z]+", word):
                continue
            L = len(word)
            if L in (4, 5, 6) and word not in seen[L]:
                words_by_len[L].append(word)
                seen[L].add(word)

        counts = {k: len(v) for k, v in words_by_len.items()}
        if all(counts[k] >= MIN_REQUIRED for k in (4, 5, 6)):
            used_source = source_label
            return _finalize(words_by_len, used_source)

        # Per spec: fallback for any length below threshold
        mixed: Dict[int, List[str]] = {
            4: words_by_len[4] if counts[4] >= MIN_REQUIRED else FALLBACK_WORDS[4],
            5: words_by_len[5] if counts[5] >= MIN_REQUIRED else FALLBACK_WORDS[5],
            6: words_by_len[6] if counts[6] >= MIN_REQUIRED else FALLBACK_WORDS[6],
        }
        used_source = f"{source_label}+fallback" if any(counts[k] >= MIN_REQUIRED for k in (4, 5, 6)) else "fallback"
        return _finalize(mixed, used_source)
    except Exception:
        # Missing file or read error
        used_source = "fallback"
        return _finalize(FALLBACK_WORDS, used_source)
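

# Hypothetical UI wiring sketch (widget label and helper name are illustrative,
# not part of the app): list the available files and pass the selection straight
# into load_word_list(), per the NOTE in its docstring.
def _render_wordlist_picker() -> Dict[int, List[str]]:
    files = get_wordlist_files()
    choice = st.selectbox("Word list", files) if files else None
    return load_word_list(selected_file=choice)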


def compute_word_difficulties3(file_path, words_array=None):
    """
    1. Read and sanitize word list: uppercase A–Z only, skip comments/blank lines.
    2. Count occurrences of each letter across all words (A..Z only).
    3. Compute frequency f_l = count / n, rarity r_l = 1 - f_l for each letter.
    4. Count words sharing same first/last letters for each pair.
    5. If words_array provided, use it (uppercase); else use full list.
    6. For each word: get unique letters L_w, k = |L_w|.
    7. Compute average rarity a_w = sum(r_l for l in L_w) / k.
    8. Get count c_w of words with same first/last, uniqueness u_w = 1 / c_w.
    9. Difficulty d_w = [k * (26 - k)] / [(k + 1) * (a_w + u_w)] if denominator != 0, else 0.
    10. Return total difficulty (sum d_w) and dict of {word: d_w}.

    Original version: Battlewords v0.2.24 to 0.2.28
    2024-06: Updated to handle missing files gracefully and ensure A–Z filtering
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_lines = f.readlines()
    except Exception:
        return 0, {}

    # Sanitize lines similarly to load_word_list()
    cleaned_words = []
    for raw in raw_lines:
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if "#" in line:
            line = line.split("#", 1)[0].strip()
        word = line.upper()
        # keep only A–Z words
        if re.fullmatch(r"[A-Z]+", word):
            cleaned_words.append(word)

    W = cleaned_words
    n = len(W)
    if n == 0:
        return 0, {}

    letter_counts = {l: 0 for l in string.ascii_uppercase}
    start_end_counts = {}
    for w in W:
        letters = set(w)
        # Only count A..Z to avoid KeyError
        for l in letters:
            if l in letter_counts:
                letter_counts[l] += 1
        first, last = w[0], w[-1]
        key = (first, last)
        start_end_counts[key] = start_end_counts.get(key, 0) + 1

    f_l = {l: count / n for l, count in letter_counts.items()}
    r_l = {l: 1 - f for l, f in f_l.items()}

    if words_array is None:
        words_array = W
    else:
        # Ensure A–Z and uppercase for the selection as well
        words_array = [
            w.upper()
            for w in words_array
            if re.fullmatch(r"[A-Z]+", w.upper())
        ]

    difficulties = {}
    for w in words_array:
        L_w = set(w)
        k = len(L_w)
        if k == 0:
            continue
        a_w = sum(r_l.get(l, 0) for l in L_w) / k
        first, last = w[0], w[-1]
        c_w = start_end_counts.get((first, last), 1)
        u_w = 1 / c_w
        denominator = (k + 1) * (a_w + u_w)
        d_w = 0 if denominator == 0 else (k * (26 - k)) / denominator
        difficulties[w] = d_w

    total_difficulty = sum(difficulties.values())
    return total_difficulty, difficulties
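

# Hypothetical usage sketch (the default path is illustrative): score a list
# with version 3 and return the five hardest entries.
def _demo_difficulty3(path: str = "battlewords/words/wordlist.txt"):
    total, per_word = compute_word_difficulties3(path)
    hardest = sorted(per_word.items(), key=lambda kv: kv[1], reverse=True)[:5]
    return total, hardest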


def compute_word_difficulties2(file_path, words_array=None):
    """
    1. Read and sanitize word list: uppercase A–Z only, skip comments/blank lines.
    2. Compute corpus token frequencies p_l for letters (A..Z) from total occurrences.
    3. Count words sharing same first/last letters for each pair (start_end_counts).
    4. If words_array provided, use it (uppercase, A–Z only); else use full list W.
    5. For each word w: q_l(w) = c_l(w) / len(w). Difficulty = Σ_l q_l(w) * p_l,
       then scaled by (2 - u_w) where u_w = 1 / count(first, last).
    6. Return total difficulty and per-word difficulties.

    Version 2: uses letter occurrence frequencies instead of presence/absence.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_lines = f.readlines()
    except Exception:
        return 0, {}

    # Sanitize lines similarly to load_word_list()
    cleaned_words = []
    for raw in raw_lines:
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if "#" in line:
            line = line.split("#", 1)[0].strip()
        word = line.upper()
        if re.fullmatch(r"[A-Z]+", word):
            cleaned_words.append(word)

    W = cleaned_words
    if not W:
        return 0, {}

    # Start/end pair counts (same as before)
    start_end_counts: Dict[tuple[str, str], int] = {}
    for w in W:
        first, last = w[0], w[-1]
        key = (first, last)
        start_end_counts[key] = start_end_counts.get(key, 0) + 1

    # Corpus token frequencies p_l (counts every occurrence, not just presence)
    token_counts = {l: 0 for l in string.ascii_uppercase}
    for w in W:
        for l in w:
            if l in token_counts:
                token_counts[l] += 1
    total_tokens = sum(token_counts.values()) or 1
    p_l = {l: token_counts[l] / total_tokens for l in string.ascii_uppercase}

    # Candidate set
    if words_array is None:
        words_array = W
    else:
        words_array = [
            w.upper()
            for w in words_array
            if re.fullmatch(r"[A-Z]+", w.upper())
        ]

    difficulties: Dict[str, float] = {}
    for w in words_array:
        m = len(w)
        if m == 0:
            continue
        # q_l(w) from counts within the word (accounts for repeats)
        counts_in_w: Dict[str, int] = {}
        for ch in w:
            if ch in p_l:
                counts_in_w[ch] = counts_in_w.get(ch, 0) + 1
        # Base difficulty: alignment with common letters (q · p)
        commonness = sum((cnt / m) * p_l.get(l, 0.0) for l, cnt in counts_in_w.items())
        # Scale by start/end-pattern uniqueness
        first, last = w[0], w[-1]
        c_w = start_end_counts.get((first, last), 1)
        u_w = 1.0 / c_w  # uniqueness
        d_w = commonness * (2.0 - u_w)
        difficulties[w] = d_w

    total_difficulty = sum(difficulties.values())
    return total_difficulty, difficulties
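

# Worked micro-example on an assumed two-word corpus (for intuition only; real
# lists use 4–6 letter words). With W = ["AA", "AB"]: p_A = 3/4, p_B = 1/4; for
# "AB", commonness = 0.5*0.75 + 0.5*0.25 = 0.5, its start/end pair is unique so
# u_w = 1, and d_w = 0.5 * (2 - 1) = 0.5.
def _demo_difficulty2_tiny() -> None:
    import tempfile
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
        tmp.write("AA\nAB\n")
    try:
        total, per_word = compute_word_difficulties2(tmp.name)
        assert abs(per_word["AB"] - 0.5) < 1e-9
        assert abs(per_word["AA"] - 0.75) < 1e-9  # commonness 0.75, u_w = 1
    finally:
        os.unlink(tmp.name)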


def compute_word_difficulties(file_path, words_array=None):
    """
    1. Read and sanitize word list: uppercase A–Z only, skip comments/blank lines.
    2. Count occurrences of each letter across all words (A..Z only).
    3. Compute frequency f_l = count / n, rarity r_l = 1 - f_l for each letter.
    4. Count words sharing same first/last letters for each pair.
    5. If words_array provided, use it (uppercase); else use full list.
    6. For each word: get unique letters L_w, k = |L_w|.
    7. Compute weighted average rarity a_w = sum(r_l * count_in_word) / total_letters_in_word.
    8. Get count c_w of words with same first/last; uniqueness u_w = c_w / 18
       (scaled by a magic number; see the code below).
    9. Difficulty d_w = [k * (26 - k)] / [(k + 1) * (a_w + u_w)] if denominator != 0, else 0.
    10. Return total difficulty (sum d_w) and dict of {word: d_w}.

    VERSION 3.0
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_lines = f.readlines()
    except Exception:
        return 0, {}

    # Sanitize lines similarly to load_word_list()
    cleaned_words = []
    for raw in raw_lines:
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        if "#" in line:
            line = line.split("#", 1)[0].strip()
        word = line.upper()
        # keep only A–Z words
        if re.fullmatch(r"[A-Z]+", word):
            cleaned_words.append(word)

    W = cleaned_words
    n = len(W)
    if n == 0:
        return 0, {}

    letter_counts = {l: 0 for l in string.ascii_uppercase}
    start_end_counts = {}
    for w in W:
        letters = set(w)
        # Only count A..Z to avoid KeyError
        for l in letters:
            if l in letter_counts:
                letter_counts[l] += 1
        first, last = w[0], w[-1]
        key = (first, last)
        start_end_counts[key] = start_end_counts.get(key, 0) + 1

    f_l = {l: count / n for l, count in letter_counts.items()}
    r_l = {l: 1 - f for l, f in f_l.items()}

    if words_array is None:
        words_array = W
    else:
        # Ensure A–Z and uppercase for the selection as well
        words_array = [
            w.upper()
            for w in words_array
            if re.fullmatch(r"[A-Z]+", w.upper())
        ]

    difficulties = {}
    for w in words_array:
        # Guard against empty strings before dividing by len(w)
        L_w = set(w)
        k = len(L_w)
        if k == 0:
            continue
        # Count occurrences of each letter in the word
        letter_freq = {l: w.count(l) for l in L_w}
        # Compute weighted average rarity
        total_letters = len(w)
        a_w = sum(r_l.get(l, 0) * freq for l, freq in letter_freq.items()) / total_letters
        first, last = w[0], w[-1]
        c_w = start_end_counts.get((first, last), 1)
        u_w = c_w / 18  # magic number to scale uniqueness based on word lengths
        denominator = (k + 1) * (a_w + u_w)
        d_w = 0 if denominator == 0 else (k * (26 - k)) / denominator
        difficulties[w] = d_w

    total_difficulty = sum(difficulties.values())
    return total_difficulty, difficulties
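

# Hypothetical comparison harness (path and sample words are illustrative): run
# all three scorers over the same file to see how the heuristics differ.
if __name__ == "__main__":
    demo_path = os.path.join(os.path.dirname(__file__), "words", "wordlist.txt")
    for scorer in (compute_word_difficulties, compute_word_difficulties2, compute_word_difficulties3):
        total, per_word = scorer(demo_path, words_array=["APPLE", "GALAXY", "TREE"])
        print(f"{scorer.__name__}: total={total:.3f}",
              {w: round(d, 3) for w, d in per_word.items()})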