Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- orchestrator.py +77 -0
orchestrator.py
CHANGED
|
@@ -525,3 +525,80 @@ class AgentOrchestrator:
|
|
| 525 |
self.logger.error(f"Resource error: {error}")
|
| 526 |
# Implement recovery logic
|
| 527 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
self.logger.error(f"Resource error: {error}")
|
| 526 |
# Implement recovery logic
|
| 527 |
pass
|
| 528 |
+
|
| 529 |
+
async def _recover_agent(self, agent_id: str):
|
| 530 |
+
"""Recover a failed agent."""
|
| 531 |
+
try:
|
| 532 |
+
agent = self.agents[agent_id]
|
| 533 |
+
|
| 534 |
+
# Log recovery attempt
|
| 535 |
+
self.logger.info(f"Attempting to recover agent {agent_id}")
|
| 536 |
+
|
| 537 |
+
# Reset agent state
|
| 538 |
+
agent.state = AgentState.IDLE
|
| 539 |
+
agent.load = 0
|
| 540 |
+
agent.last_active = datetime.now()
|
| 541 |
+
|
| 542 |
+
# Reassign any tasks that were assigned to this agent
|
| 543 |
+
for task_id, task in self.tasks.items():
|
| 544 |
+
if task.assigned_to == agent_id:
|
| 545 |
+
await self._reassign_task(task_id)
|
| 546 |
+
|
| 547 |
+
# Update metrics
|
| 548 |
+
agent.metrics["recovery_attempts"] = agent.metrics.get("recovery_attempts", 0) + 1
|
| 549 |
+
|
| 550 |
+
self.logger.info(f"Successfully recovered agent {agent_id}")
|
| 551 |
+
return True
|
| 552 |
+
|
| 553 |
+
except Exception as e:
|
| 554 |
+
self.logger.error(f"Failed to recover agent {agent_id}: {e}")
|
| 555 |
+
return False
|
| 556 |
+
|
| 557 |
+
async def _recover_task(self, task_id: str):
|
| 558 |
+
"""Recover a failed task."""
|
| 559 |
+
try:
|
| 560 |
+
task = self.tasks[task_id]
|
| 561 |
+
|
| 562 |
+
# Log recovery attempt
|
| 563 |
+
self.logger.info(f"Attempting to recover task {task_id}")
|
| 564 |
+
|
| 565 |
+
# Reset task state
|
| 566 |
+
task.state = "pending"
|
| 567 |
+
task.assigned_to = None
|
| 568 |
+
|
| 569 |
+
# Try to reassign the task
|
| 570 |
+
await self._reassign_task(task_id)
|
| 571 |
+
|
| 572 |
+
self.logger.info(f"Successfully recovered task {task_id}")
|
| 573 |
+
return True
|
| 574 |
+
|
| 575 |
+
except Exception as e:
|
| 576 |
+
self.logger.error(f"Failed to recover task {task_id}: {e}")
|
| 577 |
+
return False
|
| 578 |
+
|
| 579 |
+
async def _recover_resource(self, resource_id: str):
|
| 580 |
+
"""Recover a failed resource."""
|
| 581 |
+
try:
|
| 582 |
+
# Log recovery attempt
|
| 583 |
+
self.logger.info(f"Attempting to recover resource {resource_id}")
|
| 584 |
+
|
| 585 |
+
# Release any locks on the resource
|
| 586 |
+
if resource_id in self.resource_locks:
|
| 587 |
+
lock = self.resource_locks[resource_id]
|
| 588 |
+
if lock.locked():
|
| 589 |
+
lock.release()
|
| 590 |
+
|
| 591 |
+
# Reset resource state
|
| 592 |
+
if resource_id in self.resource_pool:
|
| 593 |
+
self.resource_pool[resource_id] = {
|
| 594 |
+
"state": "available",
|
| 595 |
+
"last_error": None,
|
| 596 |
+
"last_recovery": datetime.now()
|
| 597 |
+
}
|
| 598 |
+
|
| 599 |
+
self.logger.info(f"Successfully recovered resource {resource_id}")
|
| 600 |
+
return True
|
| 601 |
+
|
| 602 |
+
except Exception as e:
|
| 603 |
+
self.logger.error(f"Failed to recover resource {resource_id}: {e}")
|
| 604 |
+
return False
|