fix first seen date feat
Browse files
data.py
CHANGED
|
@@ -68,12 +68,21 @@ def generate_fake_dates(num_days: int = 7) -> List[str]:
|
|
| 68 |
|
| 69 |
def parse_json_field(value) -> dict:
|
| 70 |
"""Safely parse a JSON field that might be a string or dict."""
|
|
|
|
|
|
|
| 71 |
if isinstance(value, str):
|
| 72 |
try:
|
| 73 |
return json.loads(value)
|
| 74 |
except:
|
| 75 |
return {}
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
def extract_date_from_path(path: str, pattern: str) -> Optional[str]:
|
| 79 |
"""Extract date from file path using regex pattern."""
|
|
@@ -368,21 +377,37 @@ def get_fake_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
|
|
| 368 |
|
| 369 |
def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_name: str, device: str, gpu_type: str) -> Optional[str]:
|
| 370 |
"""Find the first date when a specific test failure appeared in historical data."""
|
| 371 |
-
if historical_df.empty:
|
| 372 |
return None
|
| 373 |
|
| 374 |
try:
|
| 375 |
-
|
|
|
|
|
|
|
| 376 |
if model_data.empty:
|
| 377 |
return None
|
| 378 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
# Check each date (oldest first) for this failure
|
| 380 |
for _, row in model_data.sort_values('date').iterrows():
|
| 381 |
-
|
| 382 |
-
if
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
return None
|
| 387 |
|
| 388 |
except Exception as e:
|
|
|
|
| 68 |
|
| 69 |
def parse_json_field(value) -> dict:
    """Safely parse a JSON field that might be a string or dict.

    Args:
        value: The raw field. Handled cases, in order:
            * ``None`` -> ``{}``
            * ``str`` -> decoded with ``json.loads``; ``{}`` if decoding fails
            * ``dict`` -> returned unchanged (same object, not a copy)
            * any other iterable -> passed to ``dict(...)``; ``{}`` if that
              conversion fails
            * anything else (numbers, NaN and other non-iterable scalars)
              -> ``{}``

    Returns:
        The parsed mapping, or ``{}`` when nothing usable could be extracted.
        Note that a string holding valid non-object JSON (e.g. ``"[1, 2]"``)
        is returned as whatever ``json.loads`` produces, so callers that
        require a dict should still check the type.
    """
    if value is None:
        return {}

    if isinstance(value, str):
        try:
            return json.loads(value)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass; RecursionError
            # covers pathologically nested documents.
            return {}

    if isinstance(value, dict):
        return value

    # No up-front missing-value test on purpose: calling a vectorised
    # "is missing" check on an array-like value yields an array whose truth
    # value is ambiguous and raises. Scalar missing markers such as float NaN
    # are not iterable, so they fall through to the empty dict below.
    if not hasattr(value, '__iter__'):
        return {}

    try:
        return dict(value)
    except (TypeError, ValueError):
        # Not a mapping-like object or an iterable of key/value pairs.
        return {}
|
| 86 |
|
| 87 |
def extract_date_from_path(path: str, pattern: str) -> Optional[str]:
|
| 88 |
"""Extract date from file path using regex pattern."""
|
|
|
|
| 377 |
|
| 378 |
def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_name: str, device: str, gpu_type: str) -> Optional[str]:
|
| 379 |
"""Find the first date when a specific test failure appeared in historical data."""
|
| 380 |
+
if historical_df is None or historical_df.empty:
|
| 381 |
return None
|
| 382 |
|
| 383 |
try:
|
| 384 |
+
model_name_lower = model_name.lower()
|
| 385 |
+
# Filter by model name (case-insensitive)
|
| 386 |
+
model_data = historical_df[historical_df.index.str.lower() == model_name_lower].copy()
|
| 387 |
if model_data.empty:
|
| 388 |
return None
|
| 389 |
|
| 390 |
+
# Ensure we have a 'date' column
|
| 391 |
+
if 'date' not in model_data.columns:
|
| 392 |
+
return None
|
| 393 |
+
|
| 394 |
# Check each date (oldest first) for this failure
|
| 395 |
for _, row in model_data.sort_values('date').iterrows():
|
| 396 |
+
failures_raw = row.get(f'failures_{device}')
|
| 397 |
+
if failures_raw is None or pd.isna(failures_raw):
|
| 398 |
+
continue
|
| 399 |
+
|
| 400 |
+
# Parse failures (could be dict, string, or already parsed)
|
| 401 |
+
failures = parse_json_field(failures_raw)
|
| 402 |
+
if not isinstance(failures, dict) or gpu_type not in failures:
|
| 403 |
+
continue
|
| 404 |
+
|
| 405 |
+
# Check each test in this gpu_type
|
| 406 |
+
for test in failures.get(gpu_type, []):
|
| 407 |
+
if isinstance(test, dict) and test.get('line', '') == test_name:
|
| 408 |
+
date_value = row.get('date')
|
| 409 |
+
return date_value if date_value else None
|
| 410 |
+
|
| 411 |
return None
|
| 412 |
|
| 413 |
except Exception as e:
|