import streamlit as st
import pandas as pd
import json
import io
import os
from utils import add_log
# Handle missing dependencies: fall back to lightweight stand-ins so the
# module still imports when `datasets` is unavailable
try:
    from datasets import Dataset, DatasetDict
except ImportError:
    class Dataset:
        """Minimal stand-in mimicking the parts of datasets.Dataset used here."""

        def __init__(self, data, column_names):
            self.data = data
            self.column_names = column_names

        def __len__(self):
            return len(self.data)

        @classmethod
        def from_list(cls, items):
            return cls(items, list(items[0].keys()) if items else [])

        @classmethod
        def from_dict(cls, dict_obj):
            return cls(dict_obj, list(dict_obj.keys()))

        @classmethod
        def from_pandas(cls, df):
            return cls(df, df.columns.tolist())

        def train_test_split(self, test_size=0.2):
            # No real shuffling/splitting in the fallback: both splits
            # reference the full dataset
            return {"train": self, "test": self}

    class DatasetDict(dict):
        pass
def process_python_dataset(uploaded_file, dataset_name):
    """
    Process an uploaded Python dataset file.

    Supports .py, .json, and .csv formats.

    Args:
        uploaded_file: The uploaded file object
        dataset_name: Name to identify the dataset

    Returns:
        bool: Success status
    """
    try:
        file_extension = uploaded_file.name.split('.')[-1].lower()

        if file_extension == 'py':
            # Process Python file: each top-level function or class
            # definition becomes a separate example
            content = uploaded_file.read().decode('utf-8')
            examples = split_python_file(content)
            dataset = create_dataset_from_examples(examples)
        elif file_extension == 'json':
            # Process JSON file: a list of records or a dict of columns
            content = json.loads(uploaded_file.read().decode('utf-8'))
            if isinstance(content, list):
                dataset = Dataset.from_list(content)
            else:
                dataset = Dataset.from_dict(content)
        elif file_extension == 'csv':
            # Process CSV file
            df = pd.read_csv(uploaded_file)
            dataset = Dataset.from_pandas(df)
        else:
            add_log(f"Unsupported file format: {file_extension}", "ERROR")
            return False

        # Split into train/validation sets
        train_test_split = dataset.train_test_split(test_size=0.2)

        # Create a DatasetDict
        dataset_dict = DatasetDict({
            'train': train_test_split['train'],
            'validation': train_test_split['test']
        })
        # Store in session state (initialize the registry on first use)
        if 'datasets' not in st.session_state:
            st.session_state.datasets = {}
        st.session_state.datasets[dataset_name] = {
            'data': dataset_dict,
            'info': {
                'name': dataset_name,
                'size': len(dataset),
                'train_size': len(train_test_split['train']),
                'validation_size': len(train_test_split['test']),
                'columns': dataset.column_names,
                'created_at': pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
            }
        }
| add_log(f"Dataset '{dataset_name}' processed successfully with {len(dataset)} examples") | |
| return True | |
| except Exception as e: | |
| add_log(f"Error processing dataset: {str(e)}", "ERROR") | |
| return False | |
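
# Hypothetical wiring sketch (the widget labels and dataset name below are
# assumptions for illustration, not part of this module): in a Streamlit
# page, the processor would typically be driven by a file uploader, e.g.
#
#   uploaded = st.file_uploader("Upload dataset", type=["py", "json", "csv"])
#   if uploaded is not None and st.button("Process"):
#       if process_python_dataset(uploaded, dataset_name="my_dataset"):
#           st.success("Dataset processed")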
def split_python_file(content):
    """
    Split a Python file's content into separate code examples.

    Args:
        content: String content of a Python file

    Returns:
        list: List of dicts, each with a 'code' key holding one example
    """
    examples = []

    # Simple split on top-level function or class definitions
    lines = content.split('\n')
    current_example = []

    for line in lines:
        if (line.startswith('def ') or line.startswith('class ')) and current_example:
            # Start of a new function/class: save the previous block
            examples.append('\n'.join(current_example))
            current_example = [line]
        else:
            current_example.append(line)

    # Add the last example
    if current_example:
        examples.append('\n'.join(current_example))

    # If no examples were extracted, use the whole file as one example
    if not examples:
        examples = [content]

    return [{'code': example} for example in examples]
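
# Illustration (not executed; the snippet is made up): a file with two
# top-level definitions yields one dict per definition:
#
#   split_python_file("def a():\n    pass\ndef b():\n    pass")
#   == [{'code': 'def a():\n    pass'}, {'code': 'def b():\n    pass'}]
#
# Indented (nested) definitions stay attached to their enclosing block,
# since only lines starting at column 0 trigger a split.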
def create_dataset_from_examples(examples):
    """
    Create a dataset from code examples.

    Args:
        examples: List of code examples

    Returns:
        Dataset: Hugging Face dataset
    """
    return Dataset.from_list(examples)
def validate_dataset_structure(dataset):
    """
    Validate that the dataset has the required structure for training.

    Args:
        dataset: Hugging Face dataset

    Returns:
        bool: True if valid, False otherwise
    """
    if 'code' not in dataset.column_names:
        add_log("Dataset missing 'code' column required for training", "ERROR")
        return False
    return True
def list_available_datasets():
    """
    List all available datasets in session state.

    Returns:
        list: List of dataset names
    """
    if 'datasets' in st.session_state:
        return list(st.session_state.datasets.keys())
    return []
def get_dataset_info(dataset_name):
    """
    Get information about a dataset.

    Args:
        dataset_name: Name of the dataset

    Returns:
        dict: Dataset information, or None if not found
    """
    if 'datasets' in st.session_state and dataset_name in st.session_state.datasets:
        return st.session_state.datasets[dataset_name]['info']
    return None
def get_dataset(dataset_name):
    """
    Get a dataset by name.

    Args:
        dataset_name: Name of the dataset

    Returns:
        DatasetDict: The dataset splits ('train' and 'validation'),
        or None if not found
    """
    if 'datasets' in st.session_state and dataset_name in st.session_state.datasets:
        return st.session_state.datasets[dataset_name]['data']
    return None
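
# Minimal smoke test of the pure helpers, runnable without a Streamlit
# session. A sketch: the sample source is made up, and it assumes either
# the fallback classes above or a `datasets` release with Dataset.from_list.
if __name__ == "__main__":
    sample = (
        "def greet(name):\n"
        "    return f'Hello, {name}!'\n"
        "\n"
        "class Greeter:\n"
        "    pass\n"
    )
    examples = split_python_file(sample)
    print(f"Extracted {len(examples)} code examples")
    dataset = create_dataset_from_examples(examples)
    print(f"Columns: {dataset.column_names}")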