| """ | |
| Preprocessing utilities for polymer classification app. | |
| Adapted from the original scripts/preprocess_dataset.py for Hugging Face Spaces deployment. | |
| """ | |
| import numpy as np | |
| from scipy.interpolate import interp1d | |
| from scipy.signal import savgol_filter | |
| from sklearn.preprocessing import minmax_scale | |
| # Default resample target | |
| TARGET_LENGTH = 500 | |

def remove_baseline(y):
    """Simple baseline correction using polynomial fitting (order 2)."""
    x = np.arange(len(y))
    coeffs = np.polyfit(x, y, deg=2)
    baseline = np.polyval(coeffs, x)
    return y - baseline
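
# Illustrative sketch (synthetic data, not part of the original script): a signal
# riding on a slow quadratic drift should come back roughly centered on zero.
#
#   y_demo = np.sin(np.linspace(0, 20, 300)) + 0.002 * np.arange(300) ** 2
#   y_flat = remove_baseline(y_demo)   # quadratic trend is fitted and subtracted
#   assert y_flat.shape == y_demo.shape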

def normalize_spectrum(y):
    """Min-max normalization to [0, 1]."""
    return minmax_scale(y)

def smooth_spectrum(y, window_length=11, polyorder=2):
    """Apply Savitzky-Golay smoothing."""
    if len(y) < window_length:
        # Shrink the window to the largest odd length that fits the signal
        window_length = len(y) if len(y) % 2 == 1 else len(y) - 1
    if window_length < 3:
        return y
    return savgol_filter(y, window_length, polyorder)
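
# Illustrative sketch (hypothetical values, not part of the original script):
#
#   noisy = np.sin(np.linspace(0, 6, 200)) + np.random.default_rng(0).normal(0, 0.1, 200)
#   smoothed = smooth_spectrum(noisy)       # default 11-point window, polynomial order 2
#   short = smooth_spectrum(np.ones(5))     # window is shrunk to 5 for the short input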

def resample_spectrum(x, y, target_len=TARGET_LENGTH):
    """
    Resample a spectrum to a fixed number of points using linear interpolation.

    Args:
        x (array-like): Wavenumber values
        y (array-like): Intensity values
        target_len (int): Target number of points

    Returns:
        np.ndarray: Resampled intensity values
    """
    # Ensure inputs are numpy arrays
    x = np.asarray(x)
    y = np.asarray(y)

    # Check for valid input
    if len(x) != len(y):
        raise ValueError(f"x and y must have same length: {len(x)} vs {len(y)}")
    if len(x) < 2:
        raise ValueError("Need at least 2 points for interpolation")

    # Sort by x values to ensure monotonic order
    sort_idx = np.argsort(x)
    x_sorted = x[sort_idx]
    y_sorted = y[sort_idx]

    # Check for duplicate x values
    if len(np.unique(x_sorted)) != len(x_sorted):
        # Remove duplicates by averaging y values for same x
        x_unique, inverse_indices = np.unique(x_sorted, return_inverse=True)
        y_unique = np.zeros_like(x_unique, dtype=float)
        for i in range(len(x_unique)):
            mask = inverse_indices == i
            y_unique[i] = np.mean(y_sorted[mask])
        x_sorted, y_sorted = x_unique, y_unique

    # Create interpolation function
    f_interp = interp1d(x_sorted, y_sorted, kind='linear', bounds_error=False, fill_value=np.nan)

    # Generate uniform grid
    x_uniform = np.linspace(min(x_sorted), max(x_sorted), target_len)
    y_uniform = f_interp(x_uniform)

    return y_uniform
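
# Illustrative sketch (made-up wavenumbers, not part of the original script):
# unsorted input with a duplicated x value is sorted, averaged, then resampled.
#
#   x_demo = [400.0, 1800.0, 1000.0, 1000.0]
#   y_demo = [0.1, 0.3, 0.8, 0.6]
#   y_grid = resample_spectrum(x_demo, y_demo, target_len=500)  # 500 points on 400-1800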

def preprocess_spectrum(x, y, target_len=TARGET_LENGTH, baseline_correction=False,
                        apply_smoothing=False, normalize=False):
    """
    Complete preprocessing pipeline for a single spectrum.

    Args:
        x (array-like): Wavenumber values
        y (array-like): Intensity values
        target_len (int): Number of points to resample to
        baseline_correction (bool): Whether to apply baseline removal
        apply_smoothing (bool): Whether to apply Savitzky-Golay smoothing
        normalize (bool): Whether to apply min-max normalization

    Returns:
        np.ndarray: Preprocessed spectrum
    """
    # Resample onto a uniform grid first
    y_processed = resample_spectrum(x, y, target_len=target_len)

    # Optional preprocessing steps
    if baseline_correction:
        y_processed = remove_baseline(y_processed)
    if apply_smoothing:
        y_processed = smooth_spectrum(y_processed)
    if normalize:
        y_processed = normalize_spectrum(y_processed)

    return y_processed
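

# A minimal, self-contained usage sketch (the synthetic spectrum and the
# 400-1800 cm^-1 range are assumptions, not part of the original module)
# exercising the full pipeline end to end.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    x_demo = np.linspace(400, 1800, 1200)
    y_demo = (
        np.exp(-((x_demo - 1000) ** 2) / (2 * 20 ** 2))  # a single synthetic peak
        + 5e-7 * (x_demo - 400) ** 2                     # slow quadratic baseline
        + rng.normal(0, 0.02, x_demo.size)               # measurement noise
    )

    processed = preprocess_spectrum(
        x_demo, y_demo,
        target_len=TARGET_LENGTH,
        baseline_correction=True,
        apply_smoothing=True,
        normalize=True,
    )
    print(processed.shape)                    # (500,)
    print(processed.min(), processed.max())   # 0.0 and 1.0 after min-max scaling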