# Mentors4EDU's picture
# Upload 14 files
# f2bab5e verified
"""
Command-line interface for the Cloud Agents system.
"""
import click
import asyncio
import logging
from .coordinator import Coordinator
from .scaling import ScalingManager
from .config import settings
# Configure the root logger at INFO level as an import-time side effect.
# basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
@click.group()
def cli():
    """Cloud Agents CLI for distributed model training."""
    # Intentionally empty: this function only serves as the root command
    # group that the subcommands below attach to via @cli.command().
    # The docstring is kept verbatim because Click shows it as help text.
# `train` subcommand: runs the scaling monitor in the background while the
# coordinator drives training, all inside one event loop owned by this command.
#
# Parameters (supplied by Click from the command line):
#   num_epochs       -- value of --num-epochs (default 1)
#   steps_per_epoch  -- value of --steps-per-epoch (default 100)
# Raises: any exception from setup or training is logged and re-raised.
@cli.command()
@click.option('--num-epochs', default=1, help='Number of training epochs')
@click.option('--steps-per-epoch', default=100, help='Steps per epoch')
def train(num_epochs, steps_per_epoch):
    """Start distributed training."""
    # The docstring above is kept short and unchanged because Click uses it
    # as the command's --help text; implementation notes live in comments.
    try:
        coordinator = Coordinator()
        scaling_manager = ScalingManager()

        async def run_training():
            # Keep a strong reference to the background task: the event loop
            # only holds weak references, so an unreferenced task can be
            # garbage-collected before it finishes.
            monitor_task = asyncio.create_task(
                scaling_manager.monitor_and_scale()
            )
            try:
                await coordinator.coordinate_training({
                    'num_epochs': num_epochs,
                    'steps_per_epoch': steps_per_epoch,
                })
            finally:
                # Training is over (or failed): stop the monitor and wait for
                # it to unwind. return_exceptions=True hands back the task's
                # outcome instead of raising, so a monitor failure never
                # masks the training result.
                monitor_task.cancel()
                (outcome,) = await asyncio.gather(
                    monitor_task, return_exceptions=True
                )
                monitor_failed = (
                    isinstance(outcome, Exception)
                    and not isinstance(outcome, asyncio.CancelledError)
                )
                if monitor_failed:
                    logger.warning(
                        "Scaling monitor stopped with an error: %s", outcome
                    )

        asyncio.run(run_training())
    except Exception as e:
        # Top-level command boundary: log for the operator, then re-raise so
        # the process still exits with a failure.
        logger.error(f"Training failed: {e}")
        raise
@cli.command()
def status():
    """Get cluster status."""
    # One (output template, status key) pair per report line, in display
    # order. Each value is looked up and printed one line at a time.
    report_rows = (
        ("Total Agents: {}", 'total_agents'),
        ("Busy Agents: {}", 'busy_agents'),
        ("Idle Agents: {}", 'idle_agents'),
        ("Utilization: {:.2%}", 'utilization'),
        ("Can Scale Up: {}", 'can_scale_up'),
        ("Can Scale Down: {}", 'can_scale_down'),
    )
    try:
        snapshot = ScalingManager().get_cluster_status()
        click.echo("Cluster Status:")
        for template, key in report_rows:
            click.echo(template.format(snapshot[key]))
    except Exception as exc:
        # Log the failure, then re-raise it to the caller unchanged.
        logger.error(f"Failed to get status: {exc}")
        raise
# Invoke the CLI group when this file is executed as the main module.
# NOTE(review): the relative imports at the top of this file presumably
# require launching through the package (e.g. `python -m <package>.<module>`)
# rather than running the file directly — confirm.
if __name__ == '__main__':
    cli()