Spaces:

GenAIDevTOProd
/

slotmatch

Running

slotmatch / extractor.py

Upload folder using huggingface_hub

8983b2d verified 3 months ago

1.97 kB

	from slotmatch.schema import SchemaValidator
	from slotmatch.utils import extract_value_by_regex, fuzzy_match_key, compute_confidence

	class SlotExtractor:
	    """Pull the values named by a schema out of free-form text.

	    The schema passed to the constructor is handed to ``SchemaValidator``
	    and the result of its ``get_schema()`` is kept as ``self.schema``.
	    Each schema value is used as the target type for coercion.
	    NOTE(review): schema values are assumed to be callables such as
	    ``int``, ``float``, ``bool`` or ``str`` -- confirm against
	    ``SchemaValidator``.
	    """

	    # Lower-cased strings that count as True when a slot's type is ``bool``.
	    _TRUE_STRINGS = ("true", "yes", "1")

	    def __init__(self, schema: dict):
	        """Validate ``schema`` and cache the resulting mapping and its keys."""
	        self.validator = SchemaValidator(schema)
	        self.schema = self.validator.get_schema()
	        self.schema_keys = list(self.schema.keys())

	    def extract(self, text: str) -> dict:
	        """Return one ``{"value", "confidence"}`` entry per schema key.

	        For every key, in order:

	        1. look the key up directly with ``extract_value_by_regex``;
	        2. otherwise fuzzy-match the key against the key-like tokens found
	           in ``text`` and look the matched token up instead, scaling the
	           confidence by the match score;
	        3. otherwise record ``None`` with a confidence of ``0.0``.

	        Args:
	            text: The raw text to search.

	        Returns:
	            A dict mapping every schema key to a dict with the coerced
	            ``"value"`` (or ``None``) and a numeric ``"confidence"``.
	        """
	        result = {}
	        # The candidate keys depend only on ``text``, so they are computed at
	        # most once per call, and only if some key misses the direct lookup.
	        candidate_keys = None

	        for expected_key in self.schema_keys:
	            expected_type = self.schema[expected_key]

	            # 1. Try regex directly
	            raw_value = extract_value_by_regex(text, expected_key)
	            if raw_value is not None:
	                result[expected_key] = {
	                    "value": self._coerce_type(raw_value, expected_type),
	                    "confidence": compute_confidence("regex"),
	                }
	                continue

	            # 2. Try fuzzy match
	            if candidate_keys is None:
	                candidate_keys = self._get_all_keys_from_text(text)
	            fuzzy_key, score = fuzzy_match_key(expected_key, candidate_keys)
	            if fuzzy_key:
	                raw_value = extract_value_by_regex(text, fuzzy_key)
	                if raw_value is not None:
	                    result[expected_key] = {
	                        "value": self._coerce_type(raw_value, expected_type),
	                        "confidence": compute_confidence("fuzzy") * score,
	                    }
	                    continue

	            # 3. Fallback
	            result[expected_key] = {
	                "value": None,
	                "confidence": 0.0,
	            }

	        return result

	    def _get_all_keys_from_text(self, text: str) -> list:
	        """Return the distinct key-like tokens that appear in ``text``.

	        A token is a run of word characters and hyphens, optionally wrapped
	        in single or double quotes, followed by optional whitespace and a
	        ``:`` or ``=``. The order of the returned list is unspecified
	        because duplicates are removed through a set.
	        """
	        import re

	        pattern = r'["\']?([\w-]+)["\']?\s*[:=]'
	        return list(set(re.findall(pattern, text)))

	    def _coerce_type(self, value, expected_type):
	        """Convert ``value`` to ``expected_type``, falling back to ``value``.

	        For ``bool`` the lower-cased value is tested against ``true``,
	        ``yes`` and ``1``; anything else yields ``False``. For every other
	        type ``expected_type(value)`` is called.

	        Conversion is best effort: if it raises an ordinary exception
	        (bad literal, a value without ``.lower()``, ...) the original value
	        is returned unchanged. ``SystemExit`` and ``KeyboardInterrupt`` are
	        deliberately not swallowed.
	        """
	        try:
	            if expected_type == bool:
	                return value.lower() in self._TRUE_STRINGS
	            return expected_type(value)
	        except Exception:
	            return value  # fallback to original