Spaces:
Sleeping
Sleeping
| from transformers import AutoTokenizer, AutoModel | |
| import torch | |
| import re | |
| from .issues import check_common_issues, check_ml_issues | |
| from scripts.logger import get_logger | |
| logger = get_logger(__name__) | |
# Process-wide cache of the (tokenizer, model) pair; populated on the first
# successful load so later calls do not re-read the weights.
_CODEBERT_CACHE = None


def load_codebert():
    """Load the CodeBERT tokenizer and model, reusing them across calls.

    The first successful call loads "microsoft/codebert-base", moves the model
    to CUDA when ``torch.cuda.is_available()`` is true, and caches the pair at
    module level. Subsequent calls return the cached objects. Failed loads are
    not cached, so a later call can retry.

    Returns:
        A ``(tokenizer, model)`` tuple on success, or ``(None, None)`` if
        loading raised any exception (the error is logged).
    """
    global _CODEBERT_CACHE
    if _CODEBERT_CACHE is not None:
        return _CODEBERT_CACHE
    try:
        tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
        model = AutoModel.from_pretrained("microsoft/codebert-base")
        if torch.cuda.is_available():
            model.to('cuda')
        logger.info("CodeBERT loaded successfully.")
        # Only remember a fully initialised pair; partial failures fall
        # through to the except branch and leave the cache empty.
        _CODEBERT_CACHE = (tokenizer, model)
        return _CODEBERT_CACHE
    except Exception as e:
        logger.error(f"Error loading CodeBERT: {e}")
        return None, None
# One alternation scanned left to right, so whichever construct starts first
# wins: a '#' inside a string is part of the string, and a quote inside a
# comment is part of the comment.
_STRIP_PATTERN = re.compile(
    "|".join(
        (
            # Triple-quoted blocks (docstrings / multi-line strings), both quote styles.
            r'(?P<triple>"""[\s\S]*?"""' + r"|'''[\s\S]*?''')",
            # Ordinary single-line string literals, honouring backslash escapes.
            r'(?P<string>"(?:\\[\s\S]|[^"\\\n])*"' + r"|'(?:\\[\s\S]|[^'\\\n])*')",
            # A '#' comment up to (not including) the end of the line.
            r"(?P<comment>#[^\n]*)",
        )
    )
)


def _keep_plain_strings(match):
    """Return ordinary string literals unchanged; drop comments and triple-quoted blocks."""
    return match.group("string") or ""


def strip_comments(code):
    """Remove single-line and multi-line comments from code.

    ``#`` comments are removed up to the end of their line (the newline is
    kept), and every triple-quoted block, whether delimited by three double
    quotes or three single quotes, is removed entirely. Ordinary string
    literals are left untouched, so a ``#`` inside a string is not mistaken
    for a comment.

    Args:
        code: Source text to clean.

    Returns:
        The source text with comments and triple-quoted blocks removed.
    """
    return _STRIP_PATTERN.sub(_keep_plain_strings, code)
def analyze_with_codebert(code):
    """Run CodeBERT over ``code`` and return heuristic review feedback.

    The comment-stripped source is tokenized (truncated to 512 tokens) and
    passed through the model; the feedback itself comes from the length check
    and the ``check_common_issues`` / ``check_ml_issues`` helpers.

    Args:
        code: Source text to analyze.

    Returns:
        A newline-joined feedback string. Returns "CodeBERT not loaded." when
        the tokenizer or model is unavailable, or "Error analyzing code: ..."
        if any step raises.
    """
    tokenizer, model = load_codebert()
    if tokenizer is None or model is None:
        return "CodeBERT not loaded."
    try:
        # Strip comments to avoid processing comment text
        clean_code = strip_comments(code)

        # Tokenize the stripped source; feeding the raw ``code`` here would
        # defeat the purpose of stripping comments above.
        inputs = tokenizer(
            clean_code,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True,
        )
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pooled embedding over the token axis. NOTE(review): nothing
        # below consumes it yet; it is kept so inference errors still surface
        # through the except branch.
        _embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

        # Basic feedback based on code length and complexity
        feedback = [f"Code analyzed with CodeBERT. Length: {len(code)} characters."]
        if len(code) > 1000:
            feedback.append("Warning: Code is lengthy (>1000 characters), consider refactoring for readability.")

        # Heuristic checks for common issues
        feedback.extend(check_common_issues(clean_code))

        # Machine learning-specific checks if relevant.
        # NOTE(review): detection and the ML checks run on the raw source, so
        # a library name appearing only in a comment also triggers them;
        # confirm whether that is intended.
        if any(lib in code for lib in ["sklearn", "tensorflow", "torch"]):
            feedback.extend(check_ml_issues(code))

        # Only the header line is present when no check reported anything.
        if len(feedback) == 1:
            feedback.append("No critical issues detected by heuristic checks.")
        return "\n".join(feedback)
    except Exception as e:
        logger.error(f"Error analyzing code with CodeBERT: {e}")
        return f"Error analyzing code: {str(e)}"