Spaces:
Runtime error
Runtime error
Create preprocessor.py
Browse files- data/preprocessor.py +40 -0
data/preprocessor.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Data Preprocessor for ChipVerifyAI
|
| 4 |
+
Preprocess datasets for ML training and inference
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
class DataPreprocessor:
    """Preprocess data for ML models.

    Holds the list of feature column names that ``preprocess_for_ml``
    imputes; columns from that list that are absent from a given
    DataFrame are skipped.
    """

    def __init__(self):
        # Names of the feature columns whose missing values get imputed.
        self.feature_columns = [
            'lines_of_code', 'module_count', 'signal_count', 'always_blocks',
            'assign_statements', 'if_statements', 'case_statements', 'for_loops',
            'function_count', 'task_count', 'clock_domains', 'reset_signals',
            'interface_signals', 'memory_instances', 'fsm_count', 'pipeline_stages',
            'arithmetic_units', 'complexity_score', 'has_memory', 'has_fsm',
            'has_pipeline', 'has_floating_point', 'is_complex', 'is_large',
        ]

    @staticmethod
    def _is_boolean_like(column: pd.Series) -> bool:
        """Return True if *column* holds only booleans (plus missing values)."""
        if pd.api.types.is_bool_dtype(column):
            return True
        # A NumPy bool column cannot store NaN/None, so a boolean flag with
        # gaps is held as object dtype; inspect its values instead.
        return (
            column.dtype == object
            and pd.api.types.infer_dtype(column, skipna=True) == 'boolean'
        )

    def preprocess_for_ml(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocess DataFrame for ML training.

        Works on a copy, so ``df`` itself is not modified.

        For every name in ``self.feature_columns`` that exists in ``df``:

        * boolean-like columns (bool dtype, nullable boolean, or object
          columns containing only booleans and missing values) have their
          missing entries filled with ``False``;
        * all other columns have missing entries filled with the column
          median (a column with no valid values is left unchanged because
          its median is itself missing).

        Afterwards every bool-dtype column in the frame, feature or not,
        is converted to int.

        Args:
            df: Input data; may contain columns outside ``feature_columns``.

        Returns:
            A new DataFrame with imputed features and booleans encoded as
            0/1 integers.
        """
        processed_df = df.copy()

        # Fill missing
        for col in self.feature_columns:
            if col not in processed_df.columns:
                continue
            column = processed_df[col]
            if self._is_boolean_like(column):
                # Go through the nullable dtype so missing entries survive
                # the cast, then land on plain bool so the int conversion
                # below picks the column up.
                processed_df[col] = (
                    column.astype('boolean').fillna(False).astype(bool)
                )
            else:
                processed_df[col] = column.fillna(column.median())

        # Convert booleans to int
        bool_cols = processed_df.select_dtypes(include=['bool']).columns
        if len(bool_cols) > 0:
            processed_df[bool_cols] = processed_df[bool_cols].astype(int)

        # Optional: Remove outliers here if needed

        return processed_df
|