Spaces:

Victarry
/

PP-schedule-visualizer

Runtime error

App Files Files Community

Victarry committed on Mar 13

Commit

f140d7b

1 Parent(s): 86eaa70

Add visualization for 1F1B overlap.

Browse files

Files changed (4) hide show

main.py +1 -2
src/execution_model.py +20 -3
src/strategies.py +3 -1
src/visualizer.py +100 -13

main.py CHANGED Viewed

@@ -105,8 +105,7 @@ def run_1f1b_overlap(cfg: DictConfig) -> None:
     )
     schedule = generate_1f1b_overlap_schedule(schedule_config)
     schedule.execute()
-    schedule.show()
-    # visualize_pipeline_parallelism_dash(schedule, port=cfg.visualization_port)
 if __name__ == "__main__":

     )
     schedule = generate_1f1b_overlap_schedule(schedule_config)
     schedule.execute()
+    visualize_pipeline_parallelism_dash(schedule, port=cfg.visualization_port)
 if __name__ == "__main__":

src/execution_model.py CHANGED Viewed

@@ -158,9 +158,8 @@ class ScheduleConfig:
                 # Check if we have a specific time for this combination
                 if (op_type1, op_type2) in self.overlapped_op_times:
                     return self.overlapped_op_times[(op_type1, op_type2)]
-                # Otherwise, use the sum of individual times
-                return (self.get_op_time(op_type1, stage_id) +
-                        self.get_op_time(op_type2, stage_id))
         if op_type not in self.op_times:
             raise ValueError(f"Invalid operation type: {op_type}")
@@ -184,6 +183,12 @@ class Schedule:
         self.config = config
         self.init_operations()
     def init_operations(self):
         op_types = ["forward", "backward"]
@@ -197,10 +202,21 @@ class Schedule:
                     )
     def get_op(self, batch_id: int, stage_id: int, op_type: str):
         return self.ops[(batch_id, stage_id, op_type)]
     def get_dependencies(self, op: Operation, include_device_dependency=True):
         deps = []
         if op.op_type == "forward":
             if op.stage_id > 0:
                 deps.append(
@@ -272,6 +288,7 @@ class Schedule:
             print(f"\nTotal execution time: {total_time:.2f}")
     def execute(self):
         def execute_op(op: Operation):
             if op.end_time is not None:
                 return

                 # Check if we have a specific time for this combination
                 if (op_type1, op_type2) in self.overlapped_op_times:
                     return self.overlapped_op_times[(op_type1, op_type2)]
+                # Otherwise, use the max of individual times plus a small overhead
+                return max(self.get_op_time(op_type1, stage_id), self.get_op_time(op_type2, stage_id)) + 0.2
         if op_type not in self.op_times:
             raise ValueError(f"Invalid operation type: {op_type}")
         self.config = config
         self.init_operations()
+        self.op_to_overlapped = {}
+    def register_overlapped_operation(self, overlapped_op: OverlappedOperation):
+        for op in overlapped_op.operations:
+            self.op_to_overlapped[(op.batch_id, op.stage_id, op.op_type)] = overlapped_op
+            self.ops[(op.batch_id, op.stage_id, op.op_type)] = overlapped_op
     def init_operations(self):
         op_types = ["forward", "backward"]
                     )
     def get_op(self, batch_id: int, stage_id: int, op_type: str):
+        if (batch_id, stage_id, op_type) in self.op_to_overlapped:
+            return self.op_to_overlapped[(batch_id, stage_id, op_type)]
         return self.ops[(batch_id, stage_id, op_type)]
     def get_dependencies(self, op: Operation, include_device_dependency=True):
         deps = []
+        if isinstance(op, OverlappedOperation):
+            for sub_op in op.operations:
+                deps.extend(self.get_dependencies(sub_op, include_device_dependency=False))
+            if include_device_dependency:
+                device_index = self.device_queues[op.device_id].ops.index(op)
+                if device_index > 0:
+                    deps.append((self.device_queues[op.device_id].ops[device_index - 1], 0.0))
+            return deps
         if op.op_type == "forward":
             if op.stage_id > 0:
                 deps.append(
             print(f"\nTotal execution time: {total_time:.2f}")
     def execute(self):
+        # TODO: change the execution order to topological order via DAG
         def execute_op(op: Operation):
             if op.end_time is not None:
                 return

src/strategies.py CHANGED Viewed

@@ -114,7 +114,9 @@ def generate_1f1b_overlap_schedule(config: ScheduleConfig):
         for _ in range(steady_batches):
             fwd_op = schedule.get_op(fwd_batch_id, i, "forward")
             bwd_op = schedule.get_op(bwd_batch_id, i, "backward")
-            schedule.device_queues[i].add_operation(OverlappedOperation([fwd_op, bwd_op]))
             fwd_batch_id += 1
             bwd_batch_id += 1

         for _ in range(steady_batches):
             fwd_op = schedule.get_op(fwd_batch_id, i, "forward")
             bwd_op = schedule.get_op(bwd_batch_id, i, "backward")
+            overlapped_op = OverlappedOperation([fwd_op, bwd_op])
+            schedule.register_overlapped_operation(overlapped_op)
+            schedule.device_queues[i].add_operation(overlapped_op)
             fwd_batch_id += 1
             bwd_batch_id += 1

src/visualizer.py CHANGED Viewed

@@ -8,7 +8,7 @@ from functools import lru_cache
 import webbrowser
 from threading import Timer
-from src.execution_model import Schedule
 def convert_schedule_to_visualization_format(schedule: Schedule):
@@ -32,15 +32,37 @@ def convert_schedule_to_visualization_format(schedule: Schedule):
         visualization_data[device_id] = []
         for op in device_queue.ops:
-            visualization_data[device_id].append(
-                {
-                    "type": op.op_type,
-                    "batch": op.batch_id + 1,  # +1 because batch_id is 0-indexed
-                    "stage": op.stage_id,
-                    "start_time": op.start_time,
-                    "duration": op.end_time - op.start_time,
-                }
-            )
     return visualization_data
@@ -103,13 +125,30 @@ def get_color(op_type: str, stage_id: int, num_devices: int):
         "#99cc99",  # Pale green
         "#c6e6c6",  # Pastel green
     ]
     virtual_stage = stage_id // num_devices
     # If virtual_stage is beyond our color list, cycle through the colors
     color_index = virtual_stage % len(forward_colors)
-    if op_type == "forward":
         return forward_colors[color_index]
     elif op_type == "backward":
         return backward_colors[color_index]
@@ -191,6 +230,14 @@ def create_pipeline_figure(
                 color = get_color(task["type"], task["stage"], num_devices)
                 text_color = "black"
                 name = "Backward (Weight)"
             else:
                 color = empty_color
                 text_color = "black"
@@ -222,14 +269,34 @@ def create_pipeline_figure(
                 dict(
                     x=start_time + duration / 2,
                     y=y_pos,
-                    text=f"{task['batch']}",
                     showarrow=False,
                     font=dict(color=text_color, size=12, family="Arial, bold"),
                 )
             )
             # Prepare hover data (add traces in batches later)
-            hover_text = f"Batch: {task['batch']}<br>Stage: {task['stage']}<br>Type: {name}<br>Start: {task['start_time']:.2f}<br>End: {task['start_time'] + task['duration']:.2f}<br>Duration: {task['duration']:.2f}"
             hover_traces.append(
                 dict(
@@ -268,6 +335,13 @@ def create_pipeline_figure(
             virtual_stage = task["stage"] // num_devices
             max_virtual_stage = max(max_virtual_stage, virtual_stage)
     # Add forward and backward items for each virtual stage
     for vs in range(max_virtual_stage + 1):
         legend_items.append(
@@ -300,6 +374,15 @@ def create_pipeline_figure(
                     color=get_color("backward_W", vs * num_devices, num_devices),
                 )
             )
     # If no tasks found, add default legend items
     if not legend_items:
@@ -314,6 +397,10 @@ def create_pipeline_figure(
                 name="Backward Weight (VS 0)",
                 color=get_color("backward_W", 0, num_devices),
             ),
         ]
     for i, item in enumerate(legend_items):

 import webbrowser
 from threading import Timer
+from src.execution_model import Schedule, OverlappedOperation
 def convert_schedule_to_visualization_format(schedule: Schedule):
         visualization_data[device_id] = []
         for op in device_queue.ops:
+            # Handle both regular Operations and OverlappedOperations
+            if isinstance(op, OverlappedOperation):
+                visualization_data[device_id].append(
+                    {
+                        "type": op.op_type,
+                        "batch": op.batch_id + 1,  # +1 because batch_id is 0-indexed
+                        "stage": op.stage_id,
+                        "start_time": op.start_time,
+                        "duration": op.end_time - op.start_time,
+                        "is_overlapped": True,
+                        "operations": [
+                            {
+                                "type": nested_op.op_type,
+                                "batch": nested_op.batch_id + 1,
+                                "stage": nested_op.stage_id
+                            }
+                            for nested_op in op.operations
+                        ]
+                    }
+                )
+            else:
+                visualization_data[device_id].append(
+                    {
+                        "type": op.op_type,
+                        "batch": op.batch_id + 1,  # +1 because batch_id is 0-indexed
+                        "stage": op.stage_id,
+                        "start_time": op.start_time,
+                        "duration": op.end_time - op.start_time,
+                        "is_overlapped": False
+                    }
+                )
     return visualization_data
         "#99cc99",  # Pale green
         "#c6e6c6",  # Pastel green
     ]
+    # Purple palette for overlapped operations
+    overlapped_colors = [
+        "#9966cc",  # Amethyst
+        "#8a2be2",  # Blue violet
+        "#9370db",  # Medium purple
+        "#6a5acd",  # Slate blue
+        "#7b68ee",  # Medium slate blue
+        "#ba55d3",  # Medium orchid
+        "#9932cc",  # Dark orchid
+        "#d8bfd8",  # Thistle
+        "#e6e6fa",  # Lavender
+        "#dda0dd",  # Plum
+    ]
     virtual_stage = stage_id // num_devices
     # If virtual_stage is beyond our color list, cycle through the colors
     color_index = virtual_stage % len(forward_colors)
+    # Handle overlapped operations
+    if op_type.startswith("overlapped_"):
+        return overlapped_colors[color_index]
+    elif op_type == "forward":
         return forward_colors[color_index]
     elif op_type == "backward":
         return backward_colors[color_index]
                 color = get_color(task["type"], task["stage"], num_devices)
                 text_color = "black"
                 name = "Backward (Weight)"
+            elif task["type"].startswith("overlapped_"):
+                color = get_color(task["type"], task["stage"], num_devices)
+                text_color = "white"
+                name = "Overlapped"
+                # Create a more descriptive name for the hover text
+                if "is_overlapped" in task and task["is_overlapped"]:
+                    op_types = [op["type"] for op in task["operations"]]
+                    name = f"Overlapped ({', '.join(op_types)})"
             else:
                 color = empty_color
                 text_color = "black"
                 dict(
                     x=start_time + duration / 2,
                     y=y_pos,
+                    text=f"{task['batch']}" + ("*" if task.get("is_overlapped", False) else ""),
                     showarrow=False,
                     font=dict(color=text_color, size=12, family="Arial, bold"),
                 )
             )
             # Prepare hover data (add traces in batches later)
+            if task.get("is_overlapped", False):
+                # Enhanced hover text for overlapped operations
+                op_details = "<br>".join([
+                    f"- {op['type']} (Batch {op['batch']}, Stage {op['stage']})"
+                    for op in task["operations"]
+                ])
+                hover_text = (
+                    f"Overlapped Operations:<br>{op_details}<br>"
+                    f"Start: {task['start_time']:.2f}<br>"
+                    f"End: {task['start_time'] + task['duration']:.2f}<br>"
+                    f"Duration: {task['duration']:.2f}"
+                )
+            else:
+                hover_text = (
+                    f"Batch: {task['batch']}<br>"
+                    f"Stage: {task['stage']}<br>"
+                    f"Type: {name}<br>"
+                    f"Start: {task['start_time']:.2f}<br>"
+                    f"End: {task['start_time'] + task['duration']:.2f}<br>"
+                    f"Duration: {task['duration']:.2f}"
+                )
             hover_traces.append(
                 dict(
             virtual_stage = task["stage"] // num_devices
             max_virtual_stage = max(max_virtual_stage, virtual_stage)
+    # Check if overlapped operations exist
+    has_overlapped = any(
+        task.get("is_overlapped", False)
+        for device in schedule_data
+        for task in schedule_data[device]
+    )
     # Add forward and backward items for each virtual stage
     for vs in range(max_virtual_stage + 1):
         legend_items.append(
                     color=get_color("backward_W", vs * num_devices, num_devices),
                 )
             )
+        # Add entry for overlapped operations if they exist
+        if has_overlapped:
+            legend_items.append(
+                dict(
+                    name=f"Overlapped (VS {vs})",
+                    color=get_color("overlapped_", vs * num_devices, num_devices),
+                )
+            )
     # If no tasks found, add default legend items
     if not legend_items:
                 name="Backward Weight (VS 0)",
                 color=get_color("backward_W", 0, num_devices),
             ),
+            dict(
+                name="Overlapped (VS 0)",
+                color=get_color("overlapped_", 0, num_devices),
+            ),
         ]
     for i, item in enumerate(legend_items):