File size: 3,264 Bytes
ae6148e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import ast
import re
import keyword


def check_common_issues(code):
    """Check a Python source string for common coding issues.

    The checks are heuristic and purely static; the code is never executed.
    Four things are inspected:

    * use of the ``pd.`` / ``np.`` aliases without a matching
      ``import pandas`` / ``import numpy`` in the text;
    * names that are read but never bound anywhere in the code (a single flat
      namespace is used, so scoping and ordering are not considered);
    * bare ``except:`` clauses;
    * lines longer than 120 characters, indentation included.

    Args:
        code: Python source code as a string.

    Returns:
        A list of human-readable issue strings (empty if nothing was found),
        in the order the checks are listed above. If the code cannot be
        parsed, a syntax warning is emitted in place of the undefined-name
        check and the remaining text-based checks still run.
    """
    # Imported locally so this function does not depend on the module's import
    # block. ``dir(__builtins__)`` is not reliable: outside ``__main__`` the
    # ``__builtins__`` global is a plain dict, and ``dir()`` of a dict lists
    # dict methods instead of builtin names.
    import builtins

    issues = []

    # Check for missing imports. The lookbehind keeps identifiers that merely
    # end in "pd"/"np" (e.g. ``tmpd.x``) or attribute chains (``self.pd.x``)
    # from triggering the check.
    if re.search(r'(?<![\w.])pd\.', code) and "import pandas" not in code:
        issues.append("Error: 'pd' used but 'pandas' not imported.")
    if re.search(r'(?<![\w.])np\.', code) and "import numpy" not in code:
        issues.append("Error: 'np' used but 'numpy' not imported.")

    # Check for undefined variables using AST
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError) as e:
        # ValueError covers sources that cannot be compiled at all (e.g. NUL
        # bytes on older Python versions).
        issues.append(f"Warning: Syntax error in code: {e}. Unable to check for undefined variables.")
    else:
        bound_names = set()
        used_names = set()
        has_star_import = False

        for node in ast.walk(tree):
            if isinstance(node, ast.Name):
                if isinstance(node.ctx, ast.Store):
                    bound_names.add(node.id)
                elif isinstance(node.ctx, ast.Load):
                    used_names.add(node.id)
            elif isinstance(node, (ast.Import, ast.ImportFrom)):
                for alias in node.names:
                    if alias.name == "*":
                        has_star_import = True
                    else:
                        # ``import a.b`` binds ``a``; ``import a.b as c`` binds ``c``.
                        bound_names.add(alias.asname or alias.name.split(".")[0])
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                bound_names.add(node.name)
            elif isinstance(node, ast.arg):
                bound_names.add(node.arg)
            elif isinstance(node, ast.ExceptHandler) and node.name:
                # ``except Foo as err`` binds ``err`` (stored as a plain str).
                bound_names.add(node.name)

        # Exclude built-ins, keywords, and common module names
        excluded = (
            set(keyword.kwlist)
            | set(dir(builtins))
            | {'numpy', 'pandas', 'sklearn', 'torch', 'tensorflow'}
        )
        # Sorted so the message is deterministic across runs.
        undefined_vars = sorted(used_names - bound_names - excluded)
        # A star import can bind arbitrary names, so nothing can be claimed
        # as undefined when one is present.
        if undefined_vars and not has_star_import:
            issues.append(f"Warning: Undefined variables detected: {', '.join(undefined_vars)}.")

    # Check for bare except clauses. Each clause is matched on its own line so
    # that a typed ``except X:`` elsewhere does not hide a bare one.
    if re.search(r'^[ \t]*except[ \t]*:', code, re.MULTILINE):
        issues.append("Warning: Bare 'except:' clause detected. Specify exception type for better error handling.")

    # Check for overly long lines. Only trailing whitespace is ignored;
    # indentation counts towards the line length.
    lines = code.split('\n')
    long_lines = [i + 1 for i, line in enumerate(lines) if len(line.rstrip()) > 120]
    if long_lines:
        issues.append(f"Warning: Lines {', '.join(map(str, long_lines))} exceed 120 characters. Consider reformatting.")

    return issues


def check_ml_issues(code):
    """Scan source text for machine-learning workflow smells.

    Every check is a plain substring test on ``code``; nothing is parsed or
    executed. Returns a list of warning strings, in a fixed order, for each
    heuristic that fires (an empty list when none do).
    """
    # Facts about the text that several heuristics share.
    known_models = ("LogisticRegression", "RandomForest", "SVC")
    mentions_model = any(name in code for name in known_models)
    has_split = "train_test_split" in code
    has_cross_validation = "cross_val_score" in code or "GridSearchCV" in code

    # (condition, message) pairs; the order here is the order of the output.
    heuristics = (
        (
            # Logistic regression without any scaler mentioned.
            "LogisticRegression" in code and "StandardScaler" not in code,
            "Warning: LogisticRegression used without data scaling. Consider using StandardScaler for better performance.",
        ),
        (
            # A known model appears but no train/test split does.
            mentions_model and not has_split,
            "Warning: No train-test split detected. Use sklearn.model_selection.train_test_split to evaluate model performance.",
        ),
        (
            # A known model appears but no cross-validation helper does.
            mentions_model and not has_cross_validation,
            "Warning: No cross-validation detected. Consider using cross_val_score or GridSearchCV for robust model evaluation.",
        ),
        (
            # Predictions are made but no train/test split is mentioned.
            ".predict(" in code and not has_split,
            "Warning: Model prediction used without train-test split. Validate model on separate test data to avoid overfitting.",
        ),
    )

    return [message for fired, message in heuristics if fired]