Skitzo-4152 committed on
Commit
81d1046
·
verified ·
1 Parent(s): c3d23b3

Create preprocessor.py

Browse files
Files changed (1) hide show
  1. data/preprocessor.py +40 -0
data/preprocessor.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Data Preprocessor for ChipVerifyAI
4
+ Preprocess datasets for ML training and inference
5
+ """
6
+
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
class DataPreprocessor:
    """Preprocess data for ML models.

    Holds the list of feature column names that get missing-value
    imputation, and converts boolean columns to integers.
    """

    def __init__(self):
        # Feature columns eligible for missing-value imputation. Columns that
        # are absent from an input frame are simply skipped.
        self.feature_columns = [
            'lines_of_code', 'module_count', 'signal_count', 'always_blocks',
            'assign_statements', 'if_statements', 'case_statements', 'for_loops',
            'function_count', 'task_count', 'clock_domains', 'reset_signals',
            'interface_signals', 'memory_instances', 'fsm_count', 'pipeline_stages',
            'arithmetic_units', 'complexity_score', 'has_memory', 'has_fsm',
            'has_pipeline', 'has_floating_point', 'is_complex', 'is_large',
        ]

    @staticmethod
    def _is_boolean_like(series: pd.Series) -> bool:
        """Return True if *series* holds only booleans and/or missing values."""
        if pd.api.types.is_bool_dtype(series.dtype):
            # NumPy bool or pandas nullable "boolean" dtype.
            return True
        # A NumPy bool column cannot hold NaN/None, so a flag column with
        # missing entries shows up as object dtype; inspect its values.
        return (
            series.dtype == object
            and pd.api.types.infer_dtype(series, skipna=True) == 'boolean'
        )

    def preprocess_for_ml(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocess DataFrame for ML training.

        For every name in ``self.feature_columns`` present in *df*:

        * boolean-like columns have missing values replaced by ``False``;
        * all other columns have missing values replaced by the column
          median, or by ``0`` when the median is undefined (every value
          missing).

        Afterwards every boolean column in the frame (feature or not) is
        converted to integers.

        Args:
            df: Input frame. It is not modified; a copy is processed.

        Returns:
            A new DataFrame with imputed feature columns and integer-encoded
            boolean columns.

        Raises:
            TypeError: Propagated from pandas when a non-boolean feature
                column holds values for which no median can be computed
                (e.g. strings).
        """
        processed_df = df.copy()

        # Fill missing
        for col in self.feature_columns:
            if col not in processed_df.columns:
                continue
            series = processed_df[col]

            if self._is_boolean_like(series):
                # Going through the nullable "boolean" dtype lets us fill the
                # gaps and end up with a plain bool column, so the bool -> int
                # conversion below picks it up.
                processed_df[col] = (
                    series.astype('boolean').fillna(False).astype(bool)
                )
                continue

            if not series.isna().any():
                continue

            median = series.median()
            if pd.isna(median):
                # Every value is missing, so there is no median to use.
                median = 0
            processed_df[col] = series.fillna(median)

        # Convert booleans to int
        bool_cols = processed_df.select_dtypes(include=['bool']).columns
        if len(bool_cols) > 0:
            processed_df[bool_cols] = processed_df[bool_cols].astype(int)

        # Optional: Remove outliers here if needed

        return processed_df