File size: 1,978 Bytes
f2bab5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""

Command-line interface for the Cloud Agents system.

"""
import click
import asyncio
import logging
from .coordinator import Coordinator
from .scaling import ScalingManager
from .config import settings

# Configure the root logger at INFO level as soon as this module is imported,
# then create the module-level logger that the commands below report errors to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Root command group: subcommands attach themselves through ``@cli.command()``.
# The docstring is the group's help text, so it doubles as the function body.
@click.group()
def cli():
    """Cloud Agents CLI for distributed model training."""

# ``train`` subcommand.
#
# Options:
#   --num-epochs       forwarded to the coordinator as 'num_epochs' (default 1)
#   --steps-per-epoch  forwarded to the coordinator as 'steps_per_epoch' (default 100)
#
# Runs ``ScalingManager.monitor_and_scale()`` as a background task for the
# duration of ``Coordinator.coordinate_training()``. Any exception from setup
# or training is logged and re-raised to the caller.
@cli.command()
@click.option('--num-epochs', default=1, help='Number of training epochs')
@click.option('--steps-per-epoch', default=100, help='Steps per epoch')
def train(num_epochs, steps_per_epoch):
    """Start distributed training."""
    try:
        coordinator = Coordinator()
        scaling_manager = ScalingManager()

        async def run_training():
            # Hold a strong reference to the monitor task: the event loop only
            # keeps weak references, so an unreferenced task may be garbage
            # collected while it is still running.
            monitor_task = asyncio.create_task(
                scaling_manager.monitor_and_scale()
            )
            try:
                await coordinator.coordinate_training({
                    'num_epochs': num_epochs,
                    'steps_per_epoch': steps_per_epoch
                })
            finally:
                # Stop the monitor once training ends (successfully or not)
                # and drain it so its outcome is always retrieved.
                # return_exceptions=True keeps a monitor failure from masking
                # an exception raised by the training call itself.
                monitor_task.cancel()
                outcome = (await asyncio.gather(
                    monitor_task, return_exceptions=True
                ))[0]
                if isinstance(outcome, Exception) and not isinstance(
                    outcome, asyncio.CancelledError
                ):
                    logger.warning(f"Scaling monitor failed: {outcome}")

        asyncio.run(run_training())

    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise

# ``status`` subcommand: fetches the status mapping from
# ``ScalingManager.get_cluster_status()`` and echoes one line per field.
# Any exception is logged and re-raised to the caller.
@cli.command()
def status():
    """Get cluster status."""
    # (label, key in the status mapping, format spec) in display order.
    # An empty spec formats the value exactly as a bare f-string field would.
    fields = (
        ("Total Agents", 'total_agents', ""),
        ("Busy Agents", 'busy_agents', ""),
        ("Idle Agents", 'idle_agents', ""),
        ("Utilization", 'utilization', ".2%"),
        ("Can Scale Up", 'can_scale_up', ""),
        ("Can Scale Down", 'can_scale_down', ""),
    )
    try:
        report = ScalingManager().get_cluster_status()

        click.echo("Cluster Status:")
        # Look up, format and echo one field at a time so that a missing key
        # surfaces only after the preceding lines have been printed.
        for label, key, spec in fields:
            click.echo(f"{label}: {format(report[key], spec)}")

    except Exception as e:
        logger.error(f"Failed to get status: {e}")
        raise

# Invoke the click command group only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    cli()