|
|
"""
|
|
|
Command-line interface for the Cloud Agents system.
|
|
|
"""
|
|
|
import click
|
|
|
import asyncio
|
|
|
import logging
|
|
|
from .coordinator import Coordinator
|
|
|
from .scaling import ScalingManager
|
|
|
from .config import settings
|
|
|
|
|
|
# Configure the root logger at import time so INFO-level records are emitted.
logging.basicConfig(level=logging.INFO)
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@click.group()
def cli():
    """Cloud Agents CLI for distributed model training."""
    # Root command group: it does no work itself and only serves as the
    # attachment point for the subcommands registered via @cli.command().
    # The docstring above doubles as the help text click displays, so it is
    # kept as the function's sole statement.
|
|
|
|
|
|
@cli.command()
@click.option('--num-epochs', default=1, help='Number of training epochs')
@click.option('--steps-per-epoch', default=100, help='Steps per epoch')
def train(num_epochs, steps_per_epoch):
    """Start distributed training."""
    # The docstring above is displayed by click as the command's help text,
    # so implementation notes live in comments instead.
    #
    # Parameters (supplied by click from the command line):
    #   num_epochs      -- value of --num-epochs (default 1)
    #   steps_per_epoch -- value of --steps-per-epoch (default 100)
    #
    # Any exception raised while setting up or running training is logged
    # and then re-raised unchanged.
    try:
        coordinator = Coordinator()
        scaling_manager = ScalingManager()

        async def run_training():
            # Keep a strong reference to the background task: the event loop
            # only holds weak references, so an unreferenced task may be
            # garbage-collected before it finishes.
            monitor_task = asyncio.create_task(
                scaling_manager.monitor_and_scale()
            )
            try:
                await coordinator.coordinate_training({
                    'num_epochs': num_epochs,
                    'steps_per_epoch': steps_per_epoch,
                })
            finally:
                # Stop the monitor deterministically once training ends
                # (successfully or not) and wait for it to unwind.
                monitor_task.cancel()
                (outcome,) = await asyncio.gather(
                    monitor_task, return_exceptions=True
                )
                # A CancelledError is the expected result of cancel();
                # anything else means the monitor itself had failed.
                if isinstance(outcome, BaseException) and not isinstance(
                    outcome, asyncio.CancelledError
                ):
                    logger.warning(f"Scaling monitor failed: {outcome}")

        asyncio.run(run_training())

    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise
|
|
|
|
|
|
@cli.command()
def status():
    """Get cluster status."""
    # Output layout: (line template, key looked up in the status mapping),
    # listed in the order the lines are printed.
    report_rows = (
        ("Total Agents: {}", 'total_agents'),
        ("Busy Agents: {}", 'busy_agents'),
        ("Idle Agents: {}", 'idle_agents'),
        ("Utilization: {:.2%}", 'utilization'),
        ("Can Scale Up: {}", 'can_scale_up'),
        ("Can Scale Down: {}", 'can_scale_down'),
    )

    try:
        cluster = ScalingManager().get_cluster_status()

        click.echo("Cluster Status:")
        # Each value is looked up and formatted right before it is echoed,
        # so a missing key fails only after the preceding lines are printed.
        for template, key in report_rows:
            click.echo(template.format(cluster[key]))

    except Exception as e:
        # Log the failure, then let the original exception propagate.
        logger.error(f"Failed to get status: {e}")
        raise
|
|
|
|
|
|
# Invoke the click command group only when this file is run as a script.
if __name__ == '__main__':
    cli()