# battlewords/word_loader.py (version 0.2.29)
from __future__ import annotations
import re
import os
import string
from typing import Dict, List, Optional
import streamlit as st
from importlib import resources
# Minimal built-ins used if the external file is missing or too small
FALLBACK_WORDS: Dict[int, List[str]] = {
4: [
"TREE", "BOAT", "WIND", "FROG", "LION", "MOON", "FORK", "GLOW", "GAME", "CODE",
"DATA", "BLUE", "GOLD", "ROAD", "STAR",
],
5: [
"APPLE", "RIVER", "STONE", "PLANT", "MOUSE", "BOARD", "CHAIR", "SCALE", "SMILE", "CLOUD",
],
6: [
"ORANGE", "PYTHON", "STREAM", "MARKET", "FOREST", "THRIVE", "LOGGER", "BREATH", "DOMAIN", "GALAXY",
],
}
MIN_REQUIRED = 25  # Minimum entries per length before file contents are used (specs call for >= 500)
def get_wordlist_files() -> list[str]:
words_dir = os.path.join(os.path.dirname(__file__), "words")
if not os.path.isdir(words_dir):
return []
files = [f for f in os.listdir(words_dir) if f.lower().endswith(".txt")]
return sorted(files)
@st.cache_data(show_spinner=False)
def load_word_list(selected_file: Optional[str] = None) -> Dict[int, List[str]]:
"""
Load a word list, filter to uppercase A–Z, lengths in {4,5,6}, and dedupe while preserving order.
If `selected_file` is provided, load battlewords/words/<selected_file>.
Otherwise, try on-disk default battlewords/words/wordlist.txt; if unavailable, try packaged resource.
    If fewer than MIN_REQUIRED entries exist for a required length, fall back to the
    built-ins for that length (per specs).
    NOTE: st.cache_data keys the cache on the function arguments, so always pass the
    `selected_file` argument from the UI/generator to ensure the cache refreshes when
    the user picks a different file.
"""
words_by_len: Dict[int, List[str]] = {4: [], 5: [], 6: []}
used_source = "fallback"
def _finalize(wbl: Dict[int, List[str]], source: str) -> Dict[int, List[str]]:
try:
st.session_state.wordlist_source = source
st.session_state.wordlist_selected = selected_file or "wordlist.txt"
st.session_state.word_counts = {k: len(v) for k, v in wbl.items()}
except Exception:
pass
return wbl
def _read_text_from_disk(fname: str) -> str:
words_dir = os.path.join(os.path.dirname(__file__), "words")
path = os.path.join(words_dir, fname)
with open(path, "r", encoding="utf-8") as f:
return f.read()
def _read_default_text() -> Optional[str]:
# Prefer the on-disk default in the editable repo
try:
return _read_text_from_disk("wordlist.txt")
except Exception:
pass
# Fallback to packaged data if available
try:
return resources.files("battlewords.words").joinpath("wordlist.txt").read_text(encoding="utf-8")
except Exception:
return None
try:
text: Optional[str] = None
source_label = "fallback"
if selected_file:
# Validate selection against available files to avoid bad paths
available = set(get_wordlist_files())
if selected_file not in available:
raise FileNotFoundError(f"Selected word list '{selected_file}' not found in words/ directory.")
text = _read_text_from_disk(selected_file)
source_label = f"file:{selected_file}"
else:
text = _read_default_text()
if text is not None:
source_label = "default"
if text is None:
raise FileNotFoundError("No word list file found on disk or in packaged resources.")
seen = {4: set(), 5: set(), 6: set()}
for raw in text.splitlines():
line = raw.strip()
if not line or line.startswith("#"):
continue
if "#" in line:
line = line.split("#", 1)[0].strip()
word = line.upper()
if not re.fullmatch(r"[A-Z]+", word):
continue
L = len(word)
if L in (4, 5, 6) and word not in seen[L]:
words_by_len[L].append(word)
seen[L].add(word)
counts = {k: len(v) for k, v in words_by_len.items()}
if all(counts[k] >= MIN_REQUIRED for k in (4, 5, 6)):
used_source = source_label
return _finalize(words_by_len, used_source)
# Per spec: fallback for any length below threshold
mixed: Dict[int, List[str]] = {
4: words_by_len[4] if counts[4] >= MIN_REQUIRED else FALLBACK_WORDS[4],
5: words_by_len[5] if counts[5] >= MIN_REQUIRED else FALLBACK_WORDS[5],
6: words_by_len[6] if counts[6] >= MIN_REQUIRED else FALLBACK_WORDS[6],
}
used_source = f"{source_label}+fallback" if any(counts[k] >= MIN_REQUIRED for k in (4, 5, 6)) else "fallback"
return _finalize(mixed, used_source)
except Exception:
# Missing file or read error
used_source = "fallback"
return _finalize(FALLBACK_WORDS, used_source)
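
# Illustrative usage sketch (not the app's actual UI): a Streamlit page could pass the
# selected file so the st.cache_data key changes when the user picks a different list.
#
#     choices = get_wordlist_files()
#     selected = st.selectbox("Word list", choices) if choices else None
#     words_by_len = load_word_list(selected)
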
def compute_word_difficulties3(file_path, words_array=None):
"""
1. Read and sanitize word list: uppercase A–Z only, skip comments/blank lines.
    2. For each letter A..Z, count the number of words that contain it (presence, not total occurrences).
    3. Compute frequency f_l = count / n and rarity r_l = 1 - f_l for each letter.
4. Count words sharing same first/last letters for each pair.
5. If words_array provided, use it (uppercase); else use full list.
6. For each word: get unique letters L_w, k = |L_w|.
7. Compute average rarity a_w = sum(r_l for l in L_w) / k.
8. Get count c_w of words with same first/last, uniqueness u_w = 1 / c_w.
9. Difficulty d_w = [k * (26 - k)] / [(k + 1) * (a_w + u_w)] if denominator != 0, else 0.
10. Return total difficulty (sum d_w) and dict of {word: d_w}.
Original Version: Battlewords v0.2.24 to 0.2.28
2024-06: Updated to handle missing files gracefully and ensure A–Z filtering
"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
raw_lines = f.readlines()
except Exception:
return 0, {}
# Sanitize lines similarly to load_word_list()
cleaned_words = []
for raw in raw_lines:
line = raw.strip()
if not line or line.startswith("#"):
continue
if "#" in line:
line = line.split("#", 1)[0].strip()
word = line.upper()
# keep only A–Z words
if re.fullmatch(r"[A-Z]+", word):
cleaned_words.append(word)
W = cleaned_words
n = len(W)
if n == 0:
return 0, {}
letter_counts = {l: 0 for l in string.ascii_uppercase}
start_end_counts = {}
for w in W:
letters = set(w)
# Only count A..Z to avoid KeyError
for l in letters:
if l in letter_counts:
letter_counts[l] += 1
first, last = w[0], w[-1]
key = (first, last)
start_end_counts[key] = start_end_counts.get(key, 0) + 1
f_l = {l: count / n for l, count in letter_counts.items()}
r_l = {l: 1 - f for l, f in f_l.items()}
if words_array is None:
words_array = W
else:
# Ensure A–Z and uppercase for the selection as well
words_array = [
w.upper()
for w in words_array
if re.fullmatch(r"[A-Z]+", w.upper())
]
difficulties = {}
for w in words_array:
L_w = set(w)
k = len(L_w)
if k == 0:
continue
a_w = sum(r_l.get(l, 0) for l in L_w) / k
first, last = w[0], w[-1]
c_w = start_end_counts.get((first, last), 1)
u_w = 1 / c_w
denominator = (k + 1) * (a_w + u_w)
d_w = 0 if denominator == 0 else (k * (26 - k)) / denominator
difficulties[w] = d_w
total_difficulty = sum(difficulties.values())
return total_difficulty, difficulties
def compute_word_difficulties2(file_path, words_array=None):
"""
1. Read and sanitize word list: uppercase A–Z only, skip comments/blank lines.
2. Compute corpus token frequencies p_l for letters (A..Z) from total occurrences.
3. Count words sharing same first/last letters for each pair (start_end_counts).
4. If words_array provided, use it (uppercase, A–Z only); else use full list W.
    5. For each word w: q_l(w) = c_l(w) / len(w). Base difficulty = Σ_l q_l(w) * p_l,
       then scaled by (2 - u_w) where u_w = 1 / count(first, last).
    6. Return total difficulty and per-word difficulties.
    Version 2: uses letter occurrence frequencies instead of presence/absence.
"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
raw_lines = f.readlines()
except Exception:
return 0, {}
# Sanitize lines similarly to load_word_list()
cleaned_words = []
for raw in raw_lines:
line = raw.strip()
if not line or line.startswith("#"):
continue
if "#" in line:
line = line.split("#", 1)[0].strip()
word = line.upper()
if re.fullmatch(r"[A-Z]+", word):
cleaned_words.append(word)
W = cleaned_words
if not W:
return 0, {}
# Start/end pair counts (same as before)
start_end_counts: Dict[tuple[str, str], int] = {}
for w in W:
first, last = w[0], w[-1]
key = (first, last)
start_end_counts[key] = start_end_counts.get(key, 0) + 1
# Corpus token frequencies p_l (counts every occurrence, not just presence)
token_counts = {l: 0 for l in string.ascii_uppercase}
for w in W:
for l in w:
if l in token_counts:
token_counts[l] += 1
total_tokens = sum(token_counts.values()) or 1
p_l = {l: token_counts[l] / total_tokens for l in string.ascii_uppercase}
# Candidate set
if words_array is None:
words_array = W
else:
words_array = [
w.upper()
for w in words_array
if re.fullmatch(r"[A-Z]+", w.upper())
]
difficulties: Dict[str, float] = {}
for w in words_array:
m = len(w)
if m == 0:
continue
# q_l(w) from counts within the word (accounts for repeats)
counts_in_w: Dict[str, int] = {}
for ch in w:
if ch in p_l:
counts_in_w[ch] = counts_in_w.get(ch, 0) + 1
# Base difficulty: alignment with common letters (q · p)
commonness = sum((cnt / m) * p_l.get(l, 0.0) for l, cnt in counts_in_w.items())
# Optional scaling for common start/end patterns
first, last = w[0], w[-1]
c_w = start_end_counts.get((first, last), 1)
u_w = 1.0 / c_w # uniqueness
d_w = commonness * (2.0 - u_w)
difficulties[w] = d_w
total_difficulty = sum(difficulties.values())
return total_difficulty, difficulties
def compute_word_difficulties(file_path, words_array=None):
"""
1. Read and sanitize word list: uppercase A–Z only, skip comments/blank lines.
    2. For each letter A..Z, count the number of words that contain it (presence, not total occurrences).
    3. Compute frequency f_l = count / n and rarity r_l = 1 - f_l for each letter.
4. Count words sharing same first/last letters for each pair.
5. If words_array provided, use it (uppercase); else use full list.
6. For each word: get unique letters L_w, k = |L_w|.
7. Compute weighted average rarity a_w = sum(r_l * count_in_word) / total_letters_in_word.
    8. Get count c_w of words with the same first/last letters; u_w = c_w / 18
       (a scaled pair count, not the inverse used in the original version).
9. Difficulty d_w = [k * (26 - k)] / [(k + 1) * (a_w + u_w)] if denominator != 0, else 0.
10. Return total difficulty (sum d_w) and dict of {word: d_w}.
    VERSION 3.0
"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
raw_lines = f.readlines()
except Exception:
return 0, {}
# Sanitize lines similarly to load_word_list()
cleaned_words = []
for raw in raw_lines:
line = raw.strip()
if not line or line.startswith("#"):
continue
if "#" in line:
line = line.split("#", 1)[0].strip()
word = line.upper()
# keep only A–Z words
if re.fullmatch(r"[A-Z]+", word):
cleaned_words.append(word)
W = cleaned_words
n = len(W)
if n == 0:
return 0, {}
letter_counts = {l: 0 for l in string.ascii_uppercase}
start_end_counts = {}
for w in W:
letters = set(w)
# Only count A..Z to avoid KeyError
for l in letters:
if l in letter_counts:
letter_counts[l] += 1
first, last = w[0], w[-1]
key = (first, last)
start_end_counts[key] = start_end_counts.get(key, 0) + 1
f_l = {l: count / n for l, count in letter_counts.items()}
r_l = {l: 1 - f for l, f in f_l.items()}
if words_array is None:
words_array = W
else:
# Ensure A–Z and uppercase for the selection as well
words_array = [
w.upper()
for w in words_array
if re.fullmatch(r"[A-Z]+", w.upper())
]
difficulties = {}
for w in words_array:
# Count occurrences of each letter in the word
letter_freq = {l: w.count(l) for l in set(w)}
# Compute weighted average rarity
total_letters = len(w)
a_w = sum(r_l.get(l, 0) * freq for l, freq in letter_freq.items()) / total_letters
L_w = set(w)
k = len(L_w)
if k == 0:
continue
first, last = w[0], w[-1]
c_w = start_end_counts.get((first, last), 1)
        u_w = c_w / 18  # scaled start/end pair count; 18 is a magic constant (larger c_w lowers difficulty)
denominator = (k + 1) * (a_w + u_w)
d_w = 0 if denominator == 0 else (k * (26 - k)) / denominator
difficulties[w] = d_w
total_difficulty = sum(difficulties.values())
return total_difficulty, difficulties
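

if __name__ == "__main__":
    # Minimal smoke test, assuming a default words/wordlist.txt exists next to this module.
    # Outside a Streamlit runtime the session_state bookkeeping in _finalize() is silently
    # skipped (it is wrapped in try/except).
    words = load_word_list()
    print({length: len(entries) for length, entries in words.items()})

    sample_path = os.path.join(os.path.dirname(__file__), "words", "wordlist.txt")
    if os.path.isfile(sample_path):
        total, per_word = compute_word_difficulties(sample_path, words_array=words[5][:10])
        print(f"Total difficulty of first ten 5-letter words: {total:.2f}")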