Upload 14 files

Browse files

- .env.example +9 -0
- MODEL_CARD.md +159 -0
- README.md +47 -3
- cloud_agents/__init__.py +10 -0
- cloud_agents/agent.py +161 -0
- cloud_agents/cli.py +64 -0
- cloud_agents/config.py +23 -0
- cloud_agents/coordinator.py +208 -0
- cloud_agents/couchdb_client.py +141 -0
- cloud_agents/db_views.py +74 -0
- cloud_agents/scaling.py +153 -0
- cloud_agents/tensor_ops.py +75 -0
- requirements.txt +10 -0
- setup.py +30 -0

.env.example
ADDED
@@ -0,0 +1,9 @@
```
COUCHDB_URL=http://localhost:5984
COUCHDB_USER=admin
COUCHDB_PASSWORD=password
COORDINATOR_HOST=localhost
COORDINATOR_PORT=8000
MODEL_ID=OpenPeerAI/OpenPeerLLM
RAY_HEAD_PORT=6379
BATCH_SIZE=32
GRADIENT_ACCUMULATION_STEPS=4
```
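For illustration, these values can be sanity-checked before launching anything; a minimal sketch using python-dotenv (already in requirements.txt; the file name `check_env.py` is hypothetical — `config.py` below reads the same file through pydantic-settings):

```python
# check_env.py -- standalone sanity check for the .env file (illustrative)
from dotenv import dotenv_values

config = dotenv_values(".env")  # parses KEY=VALUE pairs without touching os.environ
for key in ("COUCHDB_URL", "COORDINATOR_HOST", "COORDINATOR_PORT"):
    # fail fast if a required key is missing before starting any agents
    assert key in config, f"missing {key} in .env"
print(config["COUCHDB_URL"])
```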
MODEL_CARD.md
ADDED
@@ -0,0 +1,159 @@
# Model Card: Cloud Agents for OpenPeerLLM

## Model Details

- **Model Type:** Distributed Training System for Language Models
- **Primary Purpose:** Training large language models in a distributed environment
- **Framework:** PyTorch with Ray
- **Target Model:** [OpenPeerLLM](https://huggingface.co/OpenPeerAI/OpenPeerLLM)
- **License:** MIT

## Intended Use

### Primary Use

- Distributed training of large language models
- Grid-style distributed computation over tensors
- Horizontal scaling of model training infrastructure

### Out-of-Scope Uses

- Production deployment of models
- Single-machine training
- Real-time inference

## System Architecture

### Components

1. **Distributed Agents**
   - Lightweight worker nodes for distributed computing
   - Automatic scaling based on workload
   - Built-in fault tolerance and recovery

2. **CouchDB Coordination Layer**
   - Job distribution and management
   - State synchronization
   - Agent discovery and registration

3. **Tensor Operations**
   - Distributed gradient computation
   - Efficient parameter updates
   - Gradient averaging and clipping

4. **Training Orchestration**
   - Automated model checkpoint management
   - Dynamic load balancing
   - Progress monitoring and reporting

## Performance

### Scaling Characteristics

- **Minimum Agents:** 2
- **Maximum Agents:** 10 (configurable)
- **Scale-up Threshold:** 80% utilization
- **Scale-down Threshold:** 30% utilization
- **Auto-scaling:** Yes, based on workload (see the sketch after this card)

### Resource Requirements

- **Per Agent:**
  - CPU: 1 core minimum
  - GPU: Optional; supports fractional GPU allocation
  - Memory: Varies with model size
  - Network: Reliable connection to CouchDB and the other agents

## Limitations

1. **Network Dependency**
   - Requires stable network connectivity between agents
   - CouchDB must be accessible to all agents

2. **Scaling Limits**
   - Upper bound on the number of concurrent agents
   - Network latency can slow gradient synchronization

3. **Resource Management**
   - Requires careful monitoring of resource utilization
   - GPU memory management is crucial for large models

## Training Details

### Training Data

- Uses the same training data as OpenPeerLLM
- Supports distributed batch processing
- Configurable gradient accumulation steps

### Training Procedure

1. **Initialization**
   - Model weights loaded from the Hugging Face Hub
   - Agents register with the coordinator
   - Initial state distributed to all agents

2. **Training Loop**
   - Distributed gradient computation
   - Synchronized parameter updates
   - Regular checkpointing
   - Automatic agent scaling

### Hyperparameters

Configurable through environment variables:
- Batch size
- Gradient accumulation steps
- Number of epochs
- Learning rate
- Scaling thresholds

## Getting Started

1. **Installation**
   ```bash
   pip install -r requirements.txt
   ```

2. **Configuration**
   - Copy `.env.example` to `.env`
   - Configure the CouchDB connection
   - Set the desired training parameters

3. **Launch Training**
   ```bash
   python -m cloud_agents.cli train --num-epochs 3 --steps-per-epoch 100
   ```

4. **Monitor Progress**
   ```bash
   python -m cloud_agents.cli status
   ```

## Ethical Considerations

- Resource efficiency through intelligent scaling
- Environmental impact minimized via workload-based scaling
- Distributed approach reduces single-point-of-failure risks

## Maintenance

This system is maintained as an open-source project. Users are encouraged to:
- Report issues and bugs
- Suggest improvements
- Contribute to the codebase
- Share performance metrics and optimization strategies

## Citation

If you use this system in your research, please cite:

```bibtex
@software{cloud_agents_2025,
  title  = {Cloud Agents: Distributed Training System for OpenPeerLLM},
  year   = {2025},
  author = {Andrew Magdy Kamal},
  url    = {https://huggingface.co/OpenPeerAI/Cloud-Agents},
  note   = {Distributed computing framework for training large language models}
}
```
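The scale-up/scale-down rule stated under Scaling Characteristics is simple enough to express in a few lines. This sketch mirrors the card's thresholds (the real logic lives in `cloud_agents/scaling.py`; the function name is hypothetical):

```python
def scaling_decision(busy: int, total: int,
                     min_agents: int = 2, max_agents: int = 10,
                     up: float = 0.8, down: float = 0.3) -> int:
    """Return how many agents to add (+) or remove (-), per the thresholds above."""
    if total == 0:
        return min_agents  # bootstrap an empty cluster
    utilization = busy / total
    if utilization >= up and total < max_agents:
        return min(2, max_agents - total)   # scale up by at most 2 at a time
    if utilization <= down and total > min_agents:
        return -min(1, total - min_agents)  # scale down by 1 at a time
    return 0

assert scaling_decision(busy=9, total=10) == 0  # already at max, no scale-up
assert scaling_decision(busy=8, total=9) == 1   # ~89% utilization, room for 1 more
```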
README.md
CHANGED
@@ -1,3 +1,47 @@
# Cloud Agents for Distributed Model Training

A lightweight, horizontally scalable distributed computing system for training large language models, designed specifically for OpenPeerLLM.

## Features

- Distributed tensor operations for model training
- CouchDB-based coordination layer
- Automatic agent discovery and load balancing
- Horizontal scaling capabilities
- Fault tolerance and recovery
- Integration with OpenPeerAI's OpenPeerLLM

## Installation

```bash
pip install -r requirements.txt
```

## Configuration

1. Set up a CouchDB instance
2. Copy `.env.example` to `.env` and configure your settings
3. Start the coordinator node
4. Launch agent nodes

## Quick Start

```bash
# Start coordinator
python -m cloud_agents.coordinator

# Start agent (on each machine)
python -m cloud_agents.agent
```

## Architecture

- `coordinator`: Manages job distribution and agent coordination
- `agent`: Handles tensor operations and model training
- `couchdb_client`: Interface for CouchDB communication
- `tensor_ops`: Distributed tensor operations
- `scaling`: Horizontal scaling of the agent pool
- `db_views`: CouchDB design documents for efficient querying

## License

MIT
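For a single-machine smoke test, the same pieces can also be driven from Python. A hedged sketch, assuming a running Ray head and CouchDB instance, with at least one agent launched as shown above:

```python
import asyncio
from cloud_agents import Coordinator
from cloud_agents.scaling import ScalingManager

async def main():
    coordinator = Coordinator()   # downloads the model and stores the initial state
    scaler = ScalingManager()     # connects to the configured Ray cluster
    # Run the autoscaler in the background while training proceeds.
    asyncio.create_task(scaler.monitor_and_scale())
    await coordinator.coordinate_training({"num_epochs": 1, "steps_per_epoch": 10})

asyncio.run(main())
```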
cloud_agents/__init__.py
ADDED
@@ -0,0 +1,10 @@
```python
"""
Cloud Agents package initialization.
"""
from .agent import Agent
from .coordinator import Coordinator
from .couchdb_client import CouchDBClient
from .config import settings

__version__ = "0.1.0"
__all__ = ["Agent", "Coordinator", "CouchDBClient", "settings"]
```
cloud_agents/agent.py
ADDED
@@ -0,0 +1,161 @@
```python
"""
Base agent class for distributed computing.
"""
import torch
import ray
import uuid
import asyncio
from typing import Dict, Any, Optional
import logging
from .couchdb_client import CouchDBClient
from .config import settings

logger = logging.getLogger(__name__)

@ray.remote
class Agent:
    """Distributed computing agent for tensor operations and model training."""

    def __init__(self):
        self.agent_id = str(uuid.uuid4())
        self.db_client = CouchDBClient()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.current_job: Optional[Dict] = None
        self._register_agent()
        self._start_heartbeat()

    def get_id(self) -> str:
        """Return this agent's id (used by the scaling manager)."""
        return self.agent_id

    def _register_agent(self):
        """Register agent with the cluster."""
        capabilities = {
            "device": str(self.device),
            "cuda_available": torch.cuda.is_available(),
            "cuda_devices": torch.cuda.device_count() if torch.cuda.is_available() else 0,
            "memory_available": torch.cuda.get_device_properties(0).total_memory if torch.cuda.is_available() else 0
        }
        success = self.db_client.register_agent(self.agent_id, capabilities)
        if not success:
            raise RuntimeError("Failed to register agent")

    def _start_heartbeat(self):
        """Start agent heartbeat. Requires a running event loop (true inside a Ray async actor)."""
        async def heartbeat_loop():
            while True:
                try:
                    self.db_client.update_heartbeat(self.agent_id)
                    await asyncio.sleep(30)
                except Exception as e:
                    logger.error(f"Heartbeat error: {e}")
                    await asyncio.sleep(5)

        asyncio.create_task(heartbeat_loop())

    def process_tensors(self, tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Process tensor operations."""
        results = {}
        for name, tensor in tensors.items():
            tensor = tensor.to(self.device)
            # Perform tensor operations
            results[name] = self._compute_tensor(tensor)
        return results

    def _compute_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
        """Compute operations on a single tensor."""
        # Add custom tensor operations here; the default is a pass-through.
        return tensor

    async def run(self):
        """Main agent loop."""
        while True:
            try:
                # Try to claim a job
                job = self.db_client.claim_job(self.agent_id)
                if job:
                    self.current_job = job
                    await self._process_job(job)
                else:
                    await asyncio.sleep(1)
            except Exception as e:
                logger.error(f"Error in agent loop: {e}")
                await asyncio.sleep(5)

    async def _process_job(self, job: Dict[str, Any]):
        """Process a claimed job."""
        try:
            job_type = job['type']
            params = job['params']

            result = None
            if job_type == 'gradient_computation':
                result = await self._compute_gradients(params)
            elif job_type == 'model_update':
                result = await self._update_model(params)

            # Store job results
            self.db_client.update_job_status(
                job['_id'],
                'completed',
                result
            )
        except Exception as e:
            logger.error(f"Job processing error: {e}")
            self.db_client.update_job_status(
                job['_id'],
                'failed',
                {'error': str(e)}
            )
        finally:
            self.current_job = None

    async def _compute_gradients(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Compute gradients for model training."""
        try:
            # Load model checkpoint
            checkpoint = params.get('checkpoint')
            if checkpoint:
                state_dict = torch.load(checkpoint, map_location=self.device)
                # Compute gradients
                gradients = self._compute_model_gradients(state_dict, params.get('batch'))
                # Store gradients in CouchDB
                gradient_id = self.db_client.store_gradients(
                    self.current_job['_id'],
                    gradients
                )
                return {'gradient_id': gradient_id}
        except Exception as e:
            logger.error(f"Gradient computation error: {e}")
            raise

    def _compute_model_gradients(self, state_dict: Dict[str, torch.Tensor], batch: Dict[str, Any]) -> Dict[str, Any]:
        """Collect gradients from a model state in a serializable format.

        Note: this is effectively a stub. Tensors loaded from a state_dict
        carry no .grad data; a full implementation would first run a
        forward/backward pass over `batch`.
        """
        gradients = {}
        for name, param in state_dict.items():
            if param.requires_grad:
                grad = param.grad
                if grad is not None:
                    gradients[name] = grad.cpu().numpy().tolist()
        return gradients

    async def _update_model(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Update model with new parameters."""
        try:
            new_state = params.get('state')
            if new_state:
                # Apply model updates
                state_id = self.db_client.store_model_state(new_state)
                return {'state_id': state_id}
        except Exception as e:
            logger.error(f"Model update error: {e}")
            raise

    def shutdown(self):
        """Shutdown the agent."""
        # Mark this agent's record inactive in the agents database
        try:
            db = self.db_client.server['agents']
            doc = db[self.agent_id]
            doc['status'] = 'inactive'
            db.save(doc)
        except Exception as e:
            logger.error(f"Failed to mark agent inactive: {e}")
        # Clean up resources
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
```
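Since `Agent` is a Ray actor, the simplest interaction is remote construction followed by a remote method call. A minimal sketch, assuming CouchDB is reachable with the configured credentials (construction registers the agent, so it fails without one):

```python
import ray
import torch
from cloud_agents.agent import Agent

ray.init()  # or ray.init(address="auto") to join an existing cluster
agent = Agent.remote()  # __init__ registers the agent and starts its heartbeat

# Ship a batch of tensors to the actor and fetch the processed results.
out = ray.get(agent.process_tensors.remote({"x": torch.randn(4, 4)}))
print(out["x"].shape)
```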
cloud_agents/cli.py
ADDED
@@ -0,0 +1,64 @@
```python
"""
Command-line interface for the Cloud Agents system.
"""
import click
import asyncio
import logging
from .coordinator import Coordinator
from .scaling import ScalingManager

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@click.group()
def cli():
    """Cloud Agents CLI for distributed model training."""
    pass

@cli.command()
@click.option('--num-epochs', default=1, help='Number of training epochs')
@click.option('--steps-per-epoch', default=100, help='Steps per epoch')
def train(num_epochs, steps_per_epoch):
    """Start distributed training."""
    try:
        coordinator = Coordinator()
        scaling_manager = ScalingManager()

        async def run_training():
            # Start scaling manager in the background
            asyncio.create_task(scaling_manager.monitor_and_scale())

            # Start training
            await coordinator.coordinate_training({
                'num_epochs': num_epochs,
                'steps_per_epoch': steps_per_epoch
            })

        asyncio.run(run_training())

    except Exception as e:
        logger.error(f"Training failed: {e}")
        raise

@cli.command()
def status():
    """Get cluster status."""
    try:
        scaling_manager = ScalingManager()
        status = scaling_manager.get_cluster_status()

        click.echo("Cluster Status:")
        click.echo(f"Total Agents: {status['total_agents']}")
        click.echo(f"Busy Agents: {status['busy_agents']}")
        click.echo(f"Idle Agents: {status['idle_agents']}")
        click.echo(f"Utilization: {status['utilization']:.2%}")
        click.echo(f"Can Scale Up: {status['can_scale_up']}")
        click.echo(f"Can Scale Down: {status['can_scale_down']}")

    except Exception as e:
        logger.error(f"Failed to get status: {e}")
        raise

if __name__ == '__main__':
    cli()
```
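Because the commands are plain click groups, they can be exercised in-process with click's built-in test runner; a sketch (the file name `test_cli.py` is hypothetical):

```python
# test_cli.py -- illustrative in-process invocation of the CLI
from click.testing import CliRunner
from cloud_agents.cli import cli

runner = CliRunner()
result = runner.invoke(cli, ["status"])  # runs the status command without a subprocess
print(result.exit_code, result.output)
```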
cloud_agents/config.py
ADDED
@@ -0,0 +1,23 @@
```python
"""
Configuration settings for Cloud Agents.
"""
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    """Settings for Cloud Agents configuration."""
    COUCHDB_URL: str = "http://localhost:5984"
    COUCHDB_USER: str = "admin"
    COUCHDB_PASSWORD: str = "password"
    COORDINATOR_HOST: str = "localhost"
    COORDINATOR_PORT: int = 8000
    MODEL_ID: str = "OpenPeerAI/OpenPeerLLM"
    RAY_HEAD_PORT: int = 6379
    BATCH_SIZE: int = 32
    GRADIENT_ACCUMULATION_STEPS: int = 4

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"

settings = Settings()
```
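With pydantic-settings, process environment variables take precedence over `.env` values, which is how per-machine overrides work without editing the file. A quick sketch:

```python
import os
from cloud_agents.config import Settings

# Environment variables override values read from .env.
os.environ["BATCH_SIZE"] = "64"
print(Settings().BATCH_SIZE)  # -> 64, coerced to int by the field's type annotation
```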
cloud_agents/coordinator.py
ADDED
@@ -0,0 +1,208 @@
```python
"""
Coordinator for distributed model training.
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List, Any, Optional
import asyncio
import logging
from datetime import datetime
from huggingface_hub import snapshot_download
import os
import ray
from .couchdb_client import CouchDBClient
from .config import settings
from .tensor_ops import TensorOps

logger = logging.getLogger(__name__)

class Coordinator:
    """Coordinator for distributed training of OpenPeerLLM."""

    def __init__(self):
        self.db_client = CouchDBClient()
        self.model_id = settings.MODEL_ID
        self.batch_size = settings.BATCH_SIZE
        self.gradient_accumulation_steps = settings.GRADIENT_ACCUMULATION_STEPS
        self._initialize_model()

    def _initialize_model(self):
        """Initialize the model and tokenizer."""
        try:
            # Download model and tokenizer from Hugging Face
            cache_dir = snapshot_download(self.model_id)
            self.model = AutoModelForCausalLM.from_pretrained(cache_dir)
            self.tokenizer = AutoTokenizer.from_pretrained(cache_dir)

            # Store initial model state
            initial_state = {
                'model_state': self.model.state_dict(),
                'step': 0,
                'epoch': 0
            }
            self.db_client.store_model_state(initial_state)

        except Exception as e:
            logger.error(f"Failed to initialize model: {e}")
            raise

    async def coordinate_training(self, training_config: Dict[str, Any]):
        """Coordinate distributed training across agents."""
        try:
            num_epochs = training_config.get('num_epochs', 1)
            steps_per_epoch = training_config.get('steps_per_epoch', 100)

            for epoch in range(num_epochs):
                logger.info(f"Starting epoch {epoch}")
                await self._train_epoch(epoch, steps_per_epoch)

                # Save checkpoint after each epoch
                self._save_checkpoint(epoch)
        except Exception as e:
            logger.error(f"Training coordination error: {e}")
            raise

    async def _train_epoch(self, epoch: int, steps_per_epoch: int):
        """Train for one epoch."""
        for step in range(steps_per_epoch):
            # Get active agents
            active_agents = self.db_client.get_active_agents()
            if not active_agents:
                logger.warning("No active agents available")
                await asyncio.sleep(5)
                continue

            # Distribute gradient computation jobs
            gradient_jobs = await self._distribute_gradient_computation(
                active_agents,
                self.batch_size
            )

            # Collect and process gradients
            gradients = await self._collect_gradients(gradient_jobs)
            if gradients:
                # Update model with collected gradients
                self._update_model_parameters(gradients)

                # Distribute updated model state to agents
                await self._distribute_model_update()

    async def _distribute_gradient_computation(
        self,
        agents: List[Dict[str, Any]],
        batch_size: int
    ) -> List[str]:
        """Distribute gradient computation jobs to available agents."""
        job_ids = []

        # Get current model state
        current_state = self.db_client.get_latest_model_state()
        if not current_state:
            raise RuntimeError("No model state available")

        # Create one gradient computation job per agent
        for agent in agents:
            job_id = self.db_client.create_job(
                'gradient_computation',
                {
                    'batch_size': batch_size,
                    'state': current_state['state']
                }
            )
            job_ids.append(job_id)

        return job_ids

    async def _collect_gradients(self, job_ids: List[str]) -> Optional[List[Dict[str, Any]]]:
        """Collect gradients from completed jobs."""
        timeout = 300  # 5 minutes timeout

        async def wait_for_job(job_id: str) -> Optional[Dict[str, Any]]:
            loop = asyncio.get_event_loop()
            start_time = loop.time()
            while True:
                if loop.time() - start_time > timeout:
                    logger.warning(f"Job {job_id} timed out")
                    return None

                job = self.db_client.get_job(job_id)
                if job['status'] == 'completed':
                    gradient_id = job['result']['gradient_id']
                    return self.db_client.get_gradients(gradient_id)
                elif job['status'] == 'failed':
                    logger.error(f"Job {job_id} failed: {job.get('result', {}).get('error')}")
                    return None

                await asyncio.sleep(1)

        # Wait for all gradient computations to complete
        gradient_tasks = [wait_for_job(job_id) for job_id in job_ids]
        gradients = await asyncio.gather(*gradient_tasks)

        # Filter out None results (failed jobs)
        return [g for g in gradients if g is not None]

    def _update_model_parameters(self, gradients: List[Dict[str, Any]]):
        """Update model parameters with collected gradients."""
        try:
            # Average gradients from all workers
            avg_gradients = TensorOps.average_gradients([
                {k: torch.tensor(v) for k, v in g.items()}
                for g in gradients
            ])

            # Apply gradient clipping
            clipped_gradients = TensorOps.gradient_clipping(avg_gradients, max_norm=1.0)

            # Apply a plain SGD step; transformer configs normally do not
            # define a learning rate, so fall back to a default if absent.
            lr = getattr(self.model.config, 'learning_rate', 1e-5)
            with torch.no_grad():
                for name, param in self.model.named_parameters():
                    if name in clipped_gradients:
                        param.sub_(clipped_gradients[name] * lr)

        except Exception as e:
            logger.error(f"Error updating model parameters: {e}")
            raise

    async def _distribute_model_update(self):
        """Distribute updated model state to all agents."""
        try:
            # Store updated model state
            state = {
                'model_state': self.model.state_dict(),
                'timestamp': datetime.utcnow().isoformat()
            }
            state_id = self.db_client.store_model_state(state)

            # Create model update jobs for all active agents
            active_agents = self.db_client.get_active_agents()
            for agent in active_agents:
                self.db_client.create_job(
                    'model_update',
                    {
                        'state_id': state_id,
                        'state': state
                    }
                )

        except Exception as e:
            logger.error(f"Error distributing model update: {e}")
            raise

    def _save_checkpoint(self, epoch: int):
        """Save a checkpoint of the current model state."""
        try:
            checkpoint_dir = os.path.join(os.getcwd(), 'checkpoints')
            os.makedirs(checkpoint_dir, exist_ok=True)

            checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pt")
            torch.save({
                'epoch': epoch,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict() if hasattr(self, 'optimizer') else None
            }, checkpoint_path)

            logger.info(f"Saved checkpoint for epoch {epoch}")

        except Exception as e:
            logger.error(f"Error saving checkpoint: {e}")
            raise
```
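A checkpoint written by `_save_checkpoint` can be restored with plain `torch.load`; a hedged sketch assuming the model loads with default `from_pretrained` settings:

```python
import torch
from transformers import AutoModelForCausalLM

# Restore the checkpoint format written by Coordinator._save_checkpoint (epoch 0 here).
ckpt = torch.load("checkpoints/checkpoint_epoch_0.pt", map_location="cpu")
model = AutoModelForCausalLM.from_pretrained("OpenPeerAI/OpenPeerLLM")
model.load_state_dict(ckpt["model_state_dict"])
print(f"resumed from epoch {ckpt['epoch']}")
```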
cloud_agents/couchdb_client.py
ADDED
@@ -0,0 +1,141 @@
```python
"""
CouchDB client for distributed coordination.
"""
import couchdb
import uuid
from datetime import datetime
from typing import Dict, List, Optional, Any
from .config import settings

class CouchDBClient:
    """Client for interacting with CouchDB for distributed coordination."""

    def __init__(self):
        self.server = couchdb.Server(settings.COUCHDB_URL)
        self.server.resource.credentials = (
            settings.COUCHDB_USER,
            settings.COUCHDB_PASSWORD
        )
        self._ensure_databases()

    def _ensure_databases(self):
        """Ensure required databases exist."""
        required_dbs = ['agents', 'jobs', 'gradients', 'model_state']
        for db_name in required_dbs:
            if db_name not in self.server:
                self.server.create(db_name)

    def register_agent(self, agent_id: str, capabilities: Dict[str, Any]) -> bool:
        """Register an agent in the cluster."""
        db = self.server['agents']
        doc = {
            '_id': agent_id,
            'status': 'active',
            'capabilities': capabilities,
            'last_heartbeat': datetime.utcnow().isoformat(),
            'current_job': None
        }
        try:
            db.save(doc)
            return True
        except couchdb.http.ResourceConflict:
            return False

    def update_heartbeat(self, agent_id: str) -> bool:
        """Update agent heartbeat."""
        db = self.server['agents']
        try:
            doc = db[agent_id]
            doc['last_heartbeat'] = datetime.utcnow().isoformat()
            db.save(doc)
            return True
        except couchdb.http.ResourceNotFound:
            return False

    def create_job(self, job_type: str, params: Dict[str, Any]) -> str:
        """Create a new job in the job queue."""
        db = self.server['jobs']
        job_id = str(uuid.uuid4())
        doc = {
            '_id': job_id,
            'type': job_type,
            'params': params,
            'status': 'pending',
            'created_at': datetime.utcnow().isoformat(),
            'assigned_to': None
        }
        db.save(doc)
        return job_id

    def get_job(self, job_id: str) -> Optional[Dict[str, Any]]:
        """Fetch a job document by id (used by the coordinator to poll results)."""
        db = self.server['jobs']
        try:
            return db[job_id]
        except couchdb.http.ResourceNotFound:
            return None

    def claim_job(self, agent_id: str) -> Optional[Dict[str, Any]]:
        """Attempt to claim a pending job; CouchDB's revision check makes the claim atomic."""
        db = self.server['jobs']
        for row in db.view('_all_docs', include_docs=True):
            doc = row.doc
            if doc.get('status') == 'pending':
                try:
                    doc['status'] = 'in_progress'
                    doc['assigned_to'] = agent_id
                    doc['claimed_at'] = datetime.utcnow().isoformat()
                    db.save(doc)
                    return doc
                except couchdb.http.ResourceConflict:
                    # Another agent claimed this job first; try the next one
                    continue
        return None

    def update_job_status(self, job_id: str, status: str, result: Optional[Dict[str, Any]] = None) -> bool:
        """Update job status and optionally store results."""
        db = self.server['jobs']
        try:
            doc = db[job_id]
            doc['status'] = status
            if result:
                doc['result'] = result
            doc['updated_at'] = datetime.utcnow().isoformat()
            db.save(doc)
            return True
        except couchdb.http.ResourceNotFound:
            return False

    def store_gradients(self, job_id: str, gradients: Dict[str, Any]) -> str:
        """Store computed gradients."""
        db = self.server['gradients']
        gradient_id = str(uuid.uuid4())
        doc = {
            '_id': gradient_id,
            'job_id': job_id,
            'gradients': gradients,
            'timestamp': datetime.utcnow().isoformat()
        }
        db.save(doc)
        return gradient_id

    def get_gradients(self, gradient_id: str) -> Optional[Dict[str, Any]]:
        """Fetch stored gradients by id (the name -> values mapping)."""
        db = self.server['gradients']
        try:
            return db[gradient_id]['gradients']
        except couchdb.http.ResourceNotFound:
            return None

    def get_active_agents(self) -> List[Dict[str, Any]]:
        """Get list of currently active agents."""
        db = self.server['agents']
        active_agents = []
        for row in db.view('_all_docs', include_docs=True):
            doc = row.doc
            if doc.get('status') == 'active':
                active_agents.append(doc)
        return active_agents

    def store_model_state(self, state: Dict[str, Any]) -> str:
        """Store current model state."""
        db = self.server['model_state']
        state_id = str(uuid.uuid4())
        doc = {
            '_id': state_id,
            'state': state,
            'timestamp': datetime.utcnow().isoformat()
        }
        db.save(doc)
        return state_id

    def get_latest_model_state(self) -> Optional[Dict[str, Any]]:
        """Retrieve the most recent model state by timestamp.

        Document ids are random UUIDs, so _all_docs order is unrelated to
        recency; scan and keep the newest timestamp instead.
        """
        db = self.server['model_state']
        latest = None
        for row in db.view('_all_docs', include_docs=True):
            doc = row.doc
            if latest is None or doc['timestamp'] > latest['timestamp']:
                latest = doc
        return latest
```
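The job lifecycle (enqueue, claim, complete) can be walked through directly against a local CouchDB; a sketch with hypothetical ids:

```python
from cloud_agents.couchdb_client import CouchDBClient

client = CouchDBClient()  # connects using the settings loaded from .env

# Enqueue a job, claim it as a worker, then mark it finished.
job_id = client.create_job("gradient_computation", {"batch_size": 32})
job = client.claim_job(agent_id="agent-123")          # illustrative agent id
if job is not None:
    client.update_job_status(job["_id"], "completed",
                             {"gradient_id": "example-gradient-id"})
```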
cloud_agents/db_views.py
ADDED
@@ -0,0 +1,74 @@
```python
"""
Database views for CouchDB.
"""
from typing import Dict

# Views to be created in CouchDB for efficient querying

VIEWS: Dict[str, Dict] = {
    'agents': {
        '_design/agents': {
            'views': {
                'active': {
                    'map': '''function(doc) {
                        if (doc.status === 'active') {
                            emit(doc._id, doc);
                        }
                    }'''
                },
                'by_status': {
                    'map': '''function(doc) {
                        emit(doc.status, doc);
                    }'''
                }
            }
        }
    },
    'jobs': {
        '_design/jobs': {
            'views': {
                'pending': {
                    'map': '''function(doc) {
                        if (doc.status === 'pending') {
                            emit(doc._id, doc);
                        }
                    }'''
                },
                'by_agent': {
                    'map': '''function(doc) {
                        if (doc.assigned_to) {
                            emit(doc.assigned_to, doc);
                        }
                    }'''
                }
            }
        }
    },
    'gradients': {
        '_design/gradients': {
            'views': {
                'by_job': {
                    'map': '''function(doc) {
                        emit(doc.job_id, doc);
                    }'''
                },
                'by_timestamp': {
                    'map': '''function(doc) {
                        emit(doc.timestamp, doc);
                    }'''
                }
            }
        }
    },
    'model_state': {
        '_design/model_state': {
            'views': {
                'by_timestamp': {
                    'map': '''function(doc) {
                        emit(doc.timestamp, doc);
                    }'''
                }
            }
        }
    }
}
```
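These design documents are declared but nothing in the upload installs them; a hedged sketch of how they could be pushed to CouchDB and queried with the same `couchdb` package the client uses:

```python
import couchdb
from cloud_agents.db_views import VIEWS
from cloud_agents.config import settings

server = couchdb.Server(settings.COUCHDB_URL)
server.resource.credentials = (settings.COUCHDB_USER, settings.COUCHDB_PASSWORD)

for db_name, designs in VIEWS.items():
    db = server[db_name]
    for design_id, design_doc in designs.items():
        if design_id not in db:   # install once; updating requires the current _rev
            db[design_id] = design_doc

# Once installed, queries can hit the indexed views instead of _all_docs scans:
for row in server["jobs"].view("jobs/pending"):
    print(row.id)
```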
cloud_agents/scaling.py
ADDED
@@ -0,0 +1,153 @@
```python
"""
Scaling manager for horizontal scaling of cloud agents.
"""
import ray
import asyncio
import logging
from typing import Dict, Any
from datetime import datetime, timedelta
from .couchdb_client import CouchDBClient
from .agent import Agent
from .config import settings

logger = logging.getLogger(__name__)

class ScalingManager:
    """Manager for horizontal scaling of cloud agents."""

    def __init__(self):
        self.db_client = CouchDBClient()
        self._initialize_ray()
        self.min_agents = 2
        self.max_agents = 10
        self.scale_up_threshold = 0.8    # Scale up when 80% of agents are busy
        self.scale_down_threshold = 0.3  # Scale down when less than 30% of agents are busy
        self.agent_refs: Dict[str, ray.actor.ActorHandle] = {}

    def _initialize_ray(self):
        """Initialize Ray for distributed computing."""
        if not ray.is_initialized():
            ray.init(address=f"ray://{settings.COORDINATOR_HOST}:{settings.RAY_HEAD_PORT}")

    async def monitor_and_scale(self):
        """Monitor cluster health and scale as needed."""
        while True:
            try:
                await self._check_agent_health()
                await self._scale_cluster()
                await asyncio.sleep(60)  # Check every minute
            except Exception as e:
                logger.error(f"Error in monitor and scale loop: {e}")
                await asyncio.sleep(5)

    async def _check_agent_health(self):
        """Check health of all agents and remove dead ones."""
        try:
            active_agents = self.db_client.get_active_agents()
            current_time = datetime.utcnow()

            for agent in active_agents:
                last_heartbeat = datetime.fromisoformat(agent['last_heartbeat'])
                if current_time - last_heartbeat > timedelta(minutes=5):
                    # Agent is considered dead
                    logger.warning(f"Agent {agent['_id']} appears to be dead. Removing...")
                    await self._remove_agent(agent['_id'])
        except Exception as e:
            logger.error(f"Error checking agent health: {e}")
            raise

    async def _scale_cluster(self):
        """Scale the cluster based on workload."""
        try:
            active_agents = self.db_client.get_active_agents()
            total_agents = len(active_agents)
            busy_agents = len([a for a in active_agents if a['current_job'] is not None])

            if total_agents < 1:
                # Always ensure at least one agent is running
                await self._add_agent()
                return

            utilization = busy_agents / total_agents

            # Scale up if needed
            if utilization >= self.scale_up_threshold and total_agents < self.max_agents:
                num_to_add = min(2, self.max_agents - total_agents)  # Add up to 2 agents at a time
                logger.info(f"Scaling up: Adding {num_to_add} agents")
                for _ in range(num_to_add):
                    await self._add_agent()

            # Scale down if needed
            elif utilization <= self.scale_down_threshold and total_agents > self.min_agents:
                num_to_remove = min(1, total_agents - self.min_agents)  # Remove 1 agent at a time
                logger.info(f"Scaling down: Removing {num_to_remove} agents")
                idle_agents = [a for a in active_agents if a['current_job'] is None]
                for _ in range(num_to_remove):
                    if idle_agents:
                        await self._remove_agent(idle_agents.pop()['_id'])

        except Exception as e:
            logger.error(f"Error scaling cluster: {e}")
            raise

    async def _add_agent(self):
        """Add a new agent to the cluster."""
        try:
            # Create a new agent actor (Agent is already declared with @ray.remote)
            agent_ref = Agent.options(
                num_cpus=1,
                num_gpus=0.5 if ray.cluster_resources().get("GPU", 0) else 0
            ).remote()

            # Store reference
            agent_id = await agent_ref.get_id.remote()
            self.agent_refs[agent_id] = agent_ref

            # Start the agent's main loop without blocking on it
            agent_ref.run.remote()

            logger.info(f"Added new agent {agent_id}")
            return agent_id

        except Exception as e:
            logger.error(f"Error adding agent: {e}")
            raise

    async def _remove_agent(self, agent_id: str):
        """Remove an agent from the cluster."""
        try:
            # Get agent reference
            agent_ref = self.agent_refs.get(agent_id)
            if agent_ref:
                # Shutdown agent gracefully
                await agent_ref.shutdown.remote()
                # Remove from Ray
                ray.kill(agent_ref)
                # Remove from local tracking
                del self.agent_refs[agent_id]

            logger.info(f"Removed agent {agent_id}")

        except Exception as e:
            logger.error(f"Error removing agent: {e}")
            raise

    def get_cluster_status(self) -> Dict[str, Any]:
        """Get current status of the cluster."""
        try:
            active_agents = self.db_client.get_active_agents()
            total_agents = len(active_agents)
            busy_agents = len([a for a in active_agents if a['current_job'] is not None])

            return {
                'total_agents': total_agents,
                'busy_agents': busy_agents,
                'idle_agents': total_agents - busy_agents,
                'utilization': busy_agents / total_agents if total_agents > 0 else 0,
                'can_scale_up': total_agents < self.max_agents,
                'can_scale_down': total_agents > self.min_agents
            }

        except Exception as e:
            logger.error(f"Error getting cluster status: {e}")
            raise
```
cloud_agents/tensor_ops.py
ADDED
@@ -0,0 +1,75 @@
```python
"""
Tensor operations for distributed computing.
"""
import torch
import numpy as np
from typing import Dict, List, Union

class TensorOps:
    """Utility class for distributed tensor operations."""

    @staticmethod
    def split_tensor(tensor: torch.Tensor, num_parts: int) -> List[torch.Tensor]:
        """Split a tensor into multiple parts for distributed processing."""
        return list(torch.chunk(tensor, num_parts))

    @staticmethod
    def merge_tensors(tensors: List[torch.Tensor], dim: int = 0) -> torch.Tensor:
        """Merge multiple tensors back into a single tensor."""
        return torch.cat(tensors, dim=dim)

    @staticmethod
    def average_gradients(gradients: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
        """Average gradients from multiple workers."""
        avg_gradients = {}
        for key in gradients[0].keys():
            avg_gradients[key] = torch.mean(torch.stack([g[key] for g in gradients]), dim=0)
        return avg_gradients

    @staticmethod
    def serialize_tensor(tensor: torch.Tensor) -> Dict[str, Union[List, str]]:
        """Serialize a tensor for storage/transmission."""
        return {
            'data': tensor.cpu().numpy().tolist(),
            'shape': list(tensor.shape),
            'dtype': str(tensor.dtype)
        }

    @staticmethod
    def deserialize_tensor(tensor_dict: Dict[str, Union[List, str]]) -> torch.Tensor:
        """Deserialize a tensor from storage/transmission format."""
        data = np.array(tensor_dict['data'])
        shape = tensor_dict['shape']
        dtype = getattr(torch, tensor_dict['dtype'].split('.')[-1])
        return torch.tensor(data, dtype=dtype).reshape(shape)

    @staticmethod
    def gradient_clipping(gradients: Dict[str, torch.Tensor], max_norm: float) -> Dict[str, torch.Tensor]:
        """Clip gradients by their global norm to prevent exploding gradients.

        The tensors here are detached values, not parameters with .grad,
        so the norm is computed and the scaling applied directly.
        """
        norms = [v.norm() for v in gradients.values() if v is not None]
        if not norms:
            return gradients
        total_norm = torch.norm(torch.stack(norms))
        clip_coef = max_norm / (total_norm + 1e-6)
        if clip_coef < 1:
            for k, v in gradients.items():
                if v is not None:
                    gradients[k] = v * clip_coef
        return gradients

    @staticmethod
    def reduce_precision(tensor: torch.Tensor, bits: int = 16) -> torch.Tensor:
        """Reduce tensor precision for efficient transmission."""
        if bits == 16:
            return tensor.half()
        elif bits == 32:
            return tensor.float()
        else:
            raise ValueError("Unsupported precision bits")

    @staticmethod
    def shard_tensor(tensor: torch.Tensor, shard_size: int) -> List[torch.Tensor]:
        """Shard a tensor into smaller pieces for distributed processing."""
        return [tensor[i:i + shard_size] for i in range(0, tensor.size(0), shard_size)]

    @staticmethod
    def compute_parameter_norm(parameters: Dict[str, torch.Tensor]) -> float:
        """Compute the total norm of all parameters."""
        total_norm = 0.0
        for param in parameters.values():
            total_norm += param.norm().item() ** 2
        return total_norm ** 0.5
```
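These helpers compose into the coordinator's gradient path (average, clip, serialize for CouchDB); a self-contained sketch with dummy gradients:

```python
import torch
from cloud_agents.tensor_ops import TensorOps

# Average two workers' gradients, clip, then round-trip through serialization.
g1 = {"w": torch.tensor([1.0, 2.0])}
g2 = {"w": torch.tensor([3.0, 4.0])}
avg = TensorOps.average_gradients([g1, g2])        # {"w": tensor([2., 3.])}
clipped = TensorOps.gradient_clipping(avg, max_norm=1.0)

payload = TensorOps.serialize_tensor(clipped["w"])  # JSON-friendly dict
restored = TensorOps.deserialize_tensor(payload)
assert torch.allclose(restored, clipped["w"])
```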
requirements.txt
ADDED
@@ -0,0 +1,10 @@
```
torch>=2.0.0
transformers>=4.30.0
couchdb>=1.2
numpy>=1.24.0
python-dotenv>=1.0.0
aiohttp>=3.8.0
pydantic>=2.0.0
pydantic-settings>=2.0.0
ray>=2.6.0
click>=8.0.0
tqdm>=4.65.0
huggingface_hub>=0.16.0
```
setup.py
ADDED
@@ -0,0 +1,30 @@
```python
from setuptools import setup, find_packages

setup(
    name="cloud_agents",
    version="0.1.0",
    packages=find_packages(),
    install_requires=[
        "torch>=2.0.0",
        "transformers>=4.30.0",
        "couchdb>=1.2",
        "numpy>=1.24.0",
        "python-dotenv>=1.0.0",
        "aiohttp>=3.8.0",
        "pydantic>=2.0.0",
        "pydantic-settings>=2.0.0",
        "ray>=2.6.0",
        "click>=8.0.0",
        "tqdm>=4.65.0",
        "huggingface_hub>=0.16.0",
    ],
    python_requires=">=3.8",
    author="Andrew Magdy Kamal",
    description="Distributed cloud agents for training OpenPeerLLM",
    long_description=open("README.md").read(),
    long_description_content_type="text/markdown",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.8",
    ],
)
```