Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/pipeline

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""pipeline schedule"""

16from abc import ABC, abstractmethod

17from enum import Enum, auto

18from collections import defaultdict

19import itertools

20import bisect

21import logging

22import re

23import hyper_parallel

24from hyper_parallel.platform import get_platform

25from hyper_parallel.core.fully_shard.api import HSDPModule

26from hyper_parallel.core.pipeline_parallel.utils import BatchDimSpec

# Backend abstraction used throughout this module (module traversal,
# micro-batch splitting and P2P op construction all go through it).
platform = get_platform()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

class MetaStepType(Enum):
    """Specify the enumeration type for MetaStep.

    Values come from ``auto()``, so the declaration order below fixes each
    member's integer value; append new members rather than inserting them.
    """
    # Compute steps.
    FWD = auto()
    BWD = auto()
    BWD_INPUT = auto()
    BWD_WEIGHT = auto()
    # Point-to-point communication steps.
    FWD_RECV = auto()
    FWD_SEND = auto()
    BWD_RECV = auto()
    BWD_SEND = auto()
    # Composite P2P: a contiguous run of FWD_SEND/FWD_RECV/BWD_SEND/BWD_RECV
    # coalesced by ``coalesce_p2p`` into one step whose ``sub_steps`` the runtime
    # groups by peer and issues as ``batch_isend_irecv`` (same-peer send+recv ->
    # duplex). Only produced under ``p2p_transport="batch"``.
    BATCH_SEND_RECV = auto()
    # Composite overlap steps; the real FWD/BWD work lives in ``sub_steps``.
    OVERLAP_F_B = auto()
    OVERLAP_B_F = auto()
    # FSDP control steps (dispatched through ``_FSDP_STEP_HANDLERS``).
    FSDP_UNSHARD = auto()
    FSDP_RESHARD = auto()
    FSDP_REDUCE_GRAD = auto()
    # Activation-swap control steps (presumably injected when ``swap=True``;
    # see ``inject_pipeline_swap_steps``).
    SWAP_SET_GROUP = auto()
    SWAP_LAUNCH_OFFLOAD = auto()
    SWAP_WAIT_OFFLOAD = auto()
    SWAP_LAUNCH_LOAD = auto()
    SWAP_WAIT_LOAD = auto()

class MetaStep:
    """
    Meta step of PipelineSchedule.
    An execution list composed of MetaStep can be constructed
    and fed into the PipelineSchedule for execution.

    Args:
        micro_index (int | None): The index of micro-batch. ``None`` for
            composite types (``OVERLAP_F_B`` / ``OVERLAP_B_F``) whose real
            micro index lives in each ``sub_steps`` entry.
        meta_type (MetaStepType): Specify the type of current step
            (exposed through the ``type`` property).
        stage_index (int | None): Stage index of current step. ``None``
            for composite types; use ``sub_steps`` to get each direction's
            stage.
        sub_steps (tuple[MetaStep, ...] | None): For composite types
            only: ``(fwd, bwd)`` for ``OVERLAP_F_B``, ``(bwd, fwd)`` for
            ``OVERLAP_B_F``.
        boundary_p2p (tuple[MetaStep, ...] | None): For ``OVERLAP_B_F`` under
            the ``"boundary"`` P2P transport only: P2P steps to issue at the
            fwd/bwd boundary inside the overlap (the forward's ``FWD_SEND``
            plus the next slot's recvs), hoisted out of the following gap by
            ``attach_fwd_boundary_p2p``. Issued via
            :meth:`PipelineScheduleRuntime.exec_boundary_p2p`.

    Note:
        ``boundary_p2p`` takes no part in equality, hashing or the string
        form, so it is not restored by :meth:`from_str`.
    """

    def __init__(self, micro_index, meta_type, stage_index, sub_steps=None,
                 boundary_p2p=None):
        self._type = meta_type
        self._micro_index = micro_index
        self._stage_index = stage_index
        self._sub_steps = sub_steps
        self._boundary_p2p = boundary_p2p

    @property
    def micro_index(self):
        """Return the micro-batch index of this step."""
        return self._micro_index

    @property
    def stage_index(self):
        """Return the stage index of this step."""
        return self._stage_index

    @property
    def type(self):
        """Return the MetaStepType of this step."""
        return self._type

    @property
    def sub_steps(self):
        """Sub-steps for composite types: ``(fwd, bwd)`` for OVERLAP_F_B,
        ``(bwd, fwd)`` for OVERLAP_B_F, or ``None``."""
        return self._sub_steps

    @property
    def boundary_p2p(self):
        """P2P steps to issue at the overlap's fwd/bwd boundary, or ``None``."""
        return self._boundary_p2p

    def __eq__(self, value):
        if not isinstance(value, MetaStep):
            return NotImplemented
        return (self.type == value.type
                and self.micro_index == value.micro_index
                and self.stage_index == value.stage_index
                and self.sub_steps == value.sub_steps)

    def __ne__(self, value):
        if not isinstance(value, MetaStep):
            return NotImplemented
        return not self.__eq__(value)

    def __hash__(self):
        # ``sub_steps`` is left out on purpose: equal steps still hash equal,
        # and a list-valued ``sub_steps`` would otherwise be unhashable.
        return hash((self.type, self.micro_index, self.stage_index))

    def __str__(self):
        if self.sub_steps:
            sub = ", ".join(str(s) for s in self.sub_steps)
            return (f"MetaStep(type={self.type}, micro_index={self.micro_index}, "
                    f"stage_index={self.stage_index}, sub_steps=[{sub}])")
        return f"MetaStep(type={self.type}, micro_index={self.micro_index}, stage_index={self.stage_index})"

    def __repr__(self):
        return self.__str__()

    @staticmethod
    def _split_top_level(text):
        """Split ``text`` on commas that are not nested inside ``()`` or ``[]``."""
        parts = []
        depth = 0
        start = 0
        for pos, char in enumerate(text):
            if char in "([":
                depth += 1
            elif char in ")]":
                depth -= 1
                if depth < 0:
                    raise ValueError(f"Unbalanced brackets in MetaStep string: {text!r}")
            elif char == "," and depth == 0:
                parts.append(text[start:pos].strip())
                start = pos + 1
        if depth != 0:
            raise ValueError(f"Unbalanced brackets in MetaStep string: {text!r}")
        tail = text[start:].strip()
        if tail:
            parts.append(tail)
        return parts

    @staticmethod
    def _parse_optional_int(token, field_name):
        """Convert ``token`` to ``int``, mapping the literal ``None`` to ``None``."""
        if token == "None":
            return None
        try:
            return int(token)
        except ValueError as err:
            raise ValueError(
                f"Invalid {field_name} {token!r} in MetaStep string.") from err

    @staticmethod
    def from_str(step_str):
        """Parse a MetaStep from its string representation.

        Accepts exactly the text produced by ``str(step)`` / ``repr(step)``,
        including nested ``sub_steps``, so
        ``MetaStep.from_str(str(step)) == step`` holds.

        Args:
            step_str (str): Text such as
                ``"MetaStep(type=MetaStepType.FWD, micro_index=0, stage_index=1)"``.

        Returns:
            MetaStep: The parsed step; ``sub_steps`` is a tuple when present
            and ``boundary_p2p`` is always ``None``.

        Raises:
            TypeError: If ``step_str`` is not a string.
            ValueError: If the text does not follow the ``__str__`` format or
                names an unknown ``MetaStepType`` member.
        """
        if not isinstance(step_str, str):
            raise TypeError(f"step_str must be str, but got {type(step_str)}.")
        text = step_str.strip()
        prefix = "MetaStep("
        if not text.startswith(prefix) or not text.endswith(")"):
            raise ValueError(f"Cannot parse MetaStep from {step_str!r}.")

        fields = {}
        for item in MetaStep._split_top_level(text[len(prefix):-1]):
            key, sep, raw = item.partition("=")
            if not sep:
                raise ValueError(f"Malformed field {item!r} in MetaStep string.")
            fields[key.strip()] = raw.strip()
        missing = [name for name in ("type", "micro_index", "stage_index")
                   if name not in fields]
        if missing:
            raise ValueError(f"MetaStep string {step_str!r} lacks fields {missing}.")

        # ``str(member)`` is "MetaStepType.NAME"; a bare "NAME" is accepted too.
        type_name = fields["type"].rpartition(".")[2]
        try:
            meta_type = MetaStepType[type_name]
        except KeyError as err:
            raise ValueError(f"Unknown MetaStepType {fields['type']!r}.") from err

        sub_steps = None
        raw_subs = fields.get("sub_steps")
        if raw_subs is not None:
            if not (raw_subs.startswith("[") and raw_subs.endswith("]")):
                raise ValueError(f"Malformed sub_steps {raw_subs!r} in MetaStep string.")
            sub_steps = tuple(MetaStep.from_str(chunk)
                              for chunk in MetaStep._split_top_level(raw_subs[1:-1]))

        return MetaStep(
            MetaStep._parse_optional_int(fields["micro_index"], "micro_index"),
            meta_type,
            MetaStep._parse_optional_int(fields["stage_index"], "stage_index"),
            sub_steps=sub_steps,
        )

146

147

def generate_stage_to_rank_mapping(real_stage_num, stage_num, style='loop'):
    """Map every stage index in ``range(stage_num)`` to a rank.

    Args:
        real_stage_num: Number of ranks the stages are laid out over.
        stage_num: Total number of stages.
        style: ``'loop'`` assigns stages round-robin
            (``stage % real_stage_num``); ``'v'`` walks the ranks upwards,
            repeats the end rank at each turn and then walks back down.

    Returns:
        dict: ``{stage_index: rank}``.

    Raises:
        ValueError: For an unknown ``style``, or for ``'v'`` when
            ``stage_num`` is not a multiple of ``real_stage_num``.
    """
    if style not in ('loop', 'v'):
        raise ValueError(f"Unsupported stage rank mapping style: {style}")

    if style == 'loop':
        round_robin = {}
        for stage in range(stage_num):
            round_robin[stage] = stage % real_stage_num
        return round_robin

    if stage_num % real_stage_num:
        raise ValueError(
            f"stage_num {stage_num} must be evenly divisible by real_stage_num {real_stage_num} for V schedules."
        )

    zigzag = {}
    rank = 0
    for stage in range(stage_num):
        zigzag[stage] = rank
        at_turn = (stage + 1) % real_stage_num == 0
        if not at_turn:
            # Even sweeps climb the ranks, odd sweeps descend them.
            descending = (stage // real_stage_num) % 2
            rank += -1 if descending else 1
    return zigzag

169

170

def generate_rank_to_stage_mapping(real_stage_num, stage_num, style='loop'):
    """Group stage indices by the rank that owns them.

    Builds the stage -> rank mapping with
    ``generate_stage_to_rank_mapping`` and inverts it.

    Returns:
        dict: ``{rank: [stage_index, ...]}`` with each list in ascending order.
    """
    stages_by_rank = {}
    placement = generate_stage_to_rank_mapping(real_stage_num, stage_num, style)
    for stage, owner in placement.items():
        stages_by_rank.setdefault(owner, []).append(stage)
    return {owner: sorted(owned) for owner, owned in stages_by_rank.items()}

181

182

def iter_leaf_meta_steps(step):
    """Yield the leaf MetaSteps of ``step``, flattening OVERLAP containers.

    ``OVERLAP_F_B`` and ``OVERLAP_B_F`` both keep their real FWD/BWD work in
    ``sub_steps``, and the FSDP unshard/reshard injection pass depends on this
    expansion to see work buried inside an overlap. Both composite types must
    be expanded: leaving out ``OVERLAP_B_F`` let an overlapped FWD run against
    a resharded stage ("expected HSDPModule parameters in unsharded state").

    ``None`` yields nothing; a composite step without ``sub_steps`` is yielded
    as a leaf.
    """
    if step is None:
        return
    composite_types = (MetaStepType.OVERLAP_F_B, MetaStepType.OVERLAP_B_F)
    if step.type not in composite_types or not step.sub_steps:
        yield step
        return
    for child in step.sub_steps:
        yield from iter_leaf_meta_steps(child)

201

202

class PipelineContext:
    """State of one run, passed to custom execution functions (see
    :meth:`PipelineScheduleRuntime.register_custom_function`).

    This is a passive record for a single
    :meth:`PipelineScheduleRuntime.run_microbatches` call. The P2P helpers
    (``wait_fwd_recv`` / ``wait_bwd_recv`` / ``send_fwd`` / ``send_bwd``) and
    the ``enable_dxdw_split`` flag belong to the schedule, so a callback
    reaches them via :attr:`schedule`, e.g.
    ``ctx.schedule.send_bwd(stage, micro_index)``.

    Attributes:
        schedule: The owning :class:`PipelineScheduleRuntime`.
        arg_mbs: Per-micro-batch positional args.
        kwarg_mbs: Per-micro-batch keyword args.
        losses: Mutable list collecting per-step losses.
    """

    def __init__(self, schedule: "PipelineScheduleRuntime", arg_mbs: list,
                 kwarg_mbs: list, losses: list) -> None:
        """Store the schedule together with this run's inputs and loss list."""
        self.schedule, self.arg_mbs = schedule, arg_mbs
        self.kwarg_mbs, self.losses = kwarg_mbs, losses

227

228

229def _exec_fsdp_unshard(stage):

230 """Unshard every HSDPModule in the stage's submodule tree."""

231 for _, module in platform.get_cells_and_names(stage.submodule):

232 if isinstance(module, HSDPModule):

233 module.unshard()

234

235

236def _exec_fsdp_reshard(stage):

237 """Reshard every HSDPModule in the stage's submodule tree."""

238 for _, module in platform.get_cells_and_names(stage.submodule):

239 if isinstance(module, HSDPModule):

240 module.reshard()

241

242

def _exec_fsdp_reduce_grad(stage):
    """Run the stage's FSDP post-backward gradient reduction.

    Delegates to ``stage.execute_reduce_grad()``; kept as a free function so
    it fits the ``handler(stage)`` shape used by ``_FSDP_STEP_HANDLERS``.
    """
    stage.execute_reduce_grad()

246

247

# FSDP control MetaStep -> handler(stage). Membership also marks which
# MetaStepTypes are FSDP control steps, so the runtime loop dispatches with a
# single table lookup instead of re-switching on the step type.
# Every handler takes the stage as its only argument.
_FSDP_STEP_HANDLERS = {
    MetaStepType.FSDP_UNSHARD: _exec_fsdp_unshard,
    MetaStepType.FSDP_RESHARD: _exec_fsdp_reshard,
    MetaStepType.FSDP_REDUCE_GRAD: _exec_fsdp_reduce_grad,
}

256

257

258class PipelineScheduleRuntime(ABC):

259 """

260 Base class for pipeline schedule.

261 Implements the `split_microbatches` and `run_microbatches` method.

262 Derived classes should implement `run_microbatches` method and `run` method.

263

264 Supports registering **custom execution functions** for any

265 :class:`MetaStepType` via :meth:`register_custom_function`. When

266 ``run_microbatches`` encounters a step whose type has a registered

267 handler, it creates a :class:`PipelineContext` and delegates execution

268 to the handler instead of using the built-in logic.

269

270 Args:

271 stages (list[PipelineStage], PipelineStage): PipelineStage used to run_microbatches.

272 micro_batch_num (int): The number of micro-batch.

273 args_batch_dim (int | BatchDimSpec | list | tuple, optional): Per

274 positional-arg batch dim, indexed by arg position. Entries may be

275 plain ``int`` (or ``None`` to keep the default); a single-input

276 model may pass a bare ``int``/``BatchDimSpec`` instead of a

277 one-element list (wrapped automatically). Default ``None``.

278 kwargs_batch_dim (dict, optional): Per keyword-arg batch dim, mapping

279 arg name to a plain ``int`` or ``BatchDimSpec``. Default ``None``.

280 swap (bool, optional): Whether to inject pipeline activation swap

281 control steps. Supported by ``ScheduleGPipe``, ``Schedule1F1B``,

282 and ``ScheduleInterleaved1F1B``. Default ``False``.

283 p2p_transport (str, optional): How pipeline send/recv are issued.

284 ``"auto"`` (default) — gap-time duplex batching on overlap_b_f

285 schedules (``coalesce_p2p``: same-peer send+recv as one

286 ``batch_isend_irecv``, TX||RX; hardware-validated and measured a

287 net win on real workloads), plain per-op ``isend``/``irecv``

288 everywhere else. ``"plain"`` — force per-op ``isend``/``irecv``

289 (escape hatch for transports or topologies where batching

290 misbehaves). ``"batch"`` — duplex batching explicitly (what

291 ``"auto"`` picks under overlap_b_f). ``"boundary"`` —

292 EXPERIMENTAL fwd-boundary batching: each overlap's ``F_SEND`` +

293 the next slot's recvs go out mid-overlap, right after the forward,

294 as per-op solo batches; only ``B_SEND`` waits for the backward.

295 Avoids the duplex handle's send-coupling (a2a-friendly) and posts

296 the activation send ~half a slot early, but is not yet

297 hardware-validated — opt in deliberately. Must be set identically

298 on every rank — HCCL cannot match a batched op against a plain one

299 (EI0005).

300 """

301

302 _P2P_TRANSPORTS = ("auto", "plain", "batch", "boundary")

303

def __init__(self,
             stages,
             micro_batch_num,
             args_batch_dim=None,
             kwargs_batch_dim=None,
             output_concat_dim=None,
             overlap_p2p=False,
             swap=False,
             p2p_transport="auto"):
    """Validate the arguments, initialise the stages and reset P2P bookkeeping.

    The statement order matters: ``stages`` must be checked before anything
    indexes it, and ``_init_stages`` / ``_build_stage_to_rank_index`` run
    while only the attributes assigned above them exist.

    Raises:
        ValueError: If ``p2p_transport`` is not one of ``_P2P_TRANSPORTS``.
        TypeError: Propagated from ``_check_stages`` or the batch-dim
            normalisers on badly typed arguments.
    """
    if p2p_transport not in self._P2P_TRANSPORTS:
        raise ValueError(
            f"p2p_transport must be one of {self._P2P_TRANSPORTS}, got "
            f"{p2p_transport!r}"
        )
    self.stages = self._check_stages(stages)
    self.micro_batch_num = micro_batch_num
    self._args_batch_dim = self._normalize_args_batch_dim(args_batch_dim)
    self._kwargs_batch_dim = self._normalize_kwargs_batch_dim(kwargs_batch_dim)
    self._output_concat_dim = output_concat_dim
    # Splitter later called by ``split_microbatches`` as ``(args, kwargs)``.
    self.split_micro_batch = platform.micro_batch(self.micro_batch_num,
                                                  self._args_batch_dim, self._kwargs_batch_dim)
    self.n_local_stages = len(self.stages)
    # stage_index -> local stage object.
    self._stage_dict = self.convert_stages_dict()
    # Total stage count divided by the stages held locally (presumably the
    # number of pipeline ranks -- confirm against the stage implementation).
    self.real_stage_num = self.stages[0].stage_num // self.n_local_stages
    self._stage_num = self.stages[0].stage_num
    # Filled in by the subclass's ``_build_stage_to_rank_index`` below.
    self._stage_to_rank_index = None
    self._overlap_p2p = overlap_p2p
    self.exec_order = {}
    self._init_stages()
    self._build_stage_to_rank_index()
    # (stage_index, micro_index) -> recv handles awaiting ``wait_*_recv``.
    self.fwd_handle_cache = {}
    self.bwd_handle_cache = {}
    # MetaStepType -> custom handler, see ``register_custom_function``.
    self._custom_fn_map = {}
    self._pp_swap_enabled = swap
    # Outstanding async send handle groups for the in-flight
    # ``run_microbatches`` call; reset per run and drained at its end.
    self._send_handles = []
    # ``p2p_transport`` resolves in ``build_exec_order`` (it needs the
    # subclass's ``_overlap_b_f``) to one of:
    #
    # * ``"batch"`` (the ``"auto"`` default on overlap_b_f schedules) —
    #   gap-time duplex via ``coalesce_p2p``: same-peer send+recv as one
    #   ``batch_isend_irecv`` (TX||RX on the full-duplex link).
    #   Hardware-validated and MEASURED a net win on real workloads — the
    #   duplex saving outweighs its known cost (MS's single handle couples
    #   the riding send into the compute-gating recv wait, which can
    #   shave EP a2a overlap).
    # * ``"plain"`` (the ``"auto"`` default elsewhere) — per-op
    #   ``isend``/``irecv``, the upstream-original path.
    # * ``"boundary"`` (EXPERIMENTAL, explicit opt-in only) — fwd-boundary
    #   batching. ``attach_fwd_boundary_p2p`` hangs each steady gap's
    #   ``F_SEND`` (its data is ready when the overlap's forward finishes
    #   — the backward, ~2x FLOPs, is the long pole) plus the next slot's
    #   recvs on the OVERLAP_B_F step; the stage's after-forward hook
    #   fires ``exec_boundary_p2p`` at the fwd/bwd boundary, while the
    #   backward is still running. The send leaves roughly half a slot
    #   early — the only mode that moves the SENDER's post time — and the
    #   recv handles carry no send (a2a-friendly). Every op is a per-op
    #   solo batch and ``coalesce_p2p`` is NOT run: a hoisted recv cannot
    #   stay duplexed with a send that is only ready later, and
    #   asymmetric shapes (one end duplex, other end split) hang —
    #   all-solo keeps per-pair batch sequences complementary
    #   ([S,R] vs [R,S]), safe under both candidate HCCL pairing
    #   semantics. Promote to the auto default only after it earns both
    #   a hardware accuracy pass and a perf win over "batch".
    #
    # Two transport invariants for any future rewrite: plain per-pair
    # streams need the gap's recv-first/send-first complementarity (a
    # send-crossing hoist made both ends recv-first -> rendezvous deadlock,
    # 2026-06), and batch pairing needs per-pair shape mirroring (a
    # one-sided split made 2 solos face 1 duplex -> hang, 2026-06).
    self._p2p_transport = p2p_transport
    # Effective mode + per-op batch gating; set by ``build_exec_order``.
    self._p2p_mode = None
    self._batch_p2p = False
    # OVERLAP steps whose boundary_p2p was already issued this run (the
    # stage after-forward hook and the post-step safety net are both
    # allowed to call exec_boundary_p2p; reset per run_microbatches call).
    self._boundary_issued = set()
    # (fwd stage_index, micro_index) -> armed OVERLAP step, consumed by the
    # stage after-forward hook to fire the boundary issue mid-overlap.
    self._pending_boundary = {}

386

def register_custom_function(self, step_type: MetaStepType, fn) -> None:
    """Register a custom execution function for the given step type.

    When :meth:`run_microbatches` encounters a :class:`MetaStep` whose
    ``type`` matches ``step_type``, it calls ``fn(step, ctx)`` instead
    of the built-in logic. Registering again for the same ``step_type``
    replaces the previous function.

    Args:
        step_type: The :class:`MetaStepType` to intercept.
        fn: A callable with signature ``(step: MetaStep, ctx: PipelineContext) -> None``.
            It is stored as given; no callability check is made here.

    Example:
        >>> def my_overlap_callback(step, ctx):
        ...     fwd_step, bwd_step = step.sub_steps
        ...     # custom parallel execution logic
        >>> schedule.register_custom_function(MetaStepType.OVERLAP_F_B, my_overlap_callback)
    """
    self._custom_fn_map[step_type] = fn

405

406 def _inject_local_fsdp_actions(self):

407 """Annotate the local rank schedule with optional FSDP control actions."""

408 current_rank = self._stage_to_rank_index[self.stages[0].stage_index]

409 managed_stage_indices = {

410 stage.stage_index

411 for stage in self.stages

412 if isinstance(stage.submodule, HSDPModule)

413 }

414 if not managed_stage_indices:

415 return

416 if len(managed_stage_indices) != len(self.stages):

417 raise RuntimeError(

418 "When injecting fsdp_action, expect all stages to be HSDPModule. "

419 "Check whether all separated modules are wrapped with 'fully_shard'."

420 )

421 rank_actions = add_fsdp_unshard_reshard(self.exec_order[current_rank], managed_stage_indices)

422 self.exec_order[current_rank] = add_fsdp_reduce_grad(

423 rank_actions,

424 managed_stage_indices,

425 self.micro_batch_num,

426 )

427

428 def _inject_local_pp_swap_actions(self):

429 """Annotate the local rank schedule with pipeline activation-swap actions."""

430 if not self._pp_swap_enabled:

431 return

432 current_rank = self._stage_to_rank_index[self.stages[0].stage_index]

433 from hyper_parallel.core.pipeline_parallel.pipeline_swap import ( # pylint: disable=C0415

434 inject_pipeline_swap_steps,

435 )

436 self.exec_order[current_rank] = inject_pipeline_swap_steps(self.exec_order[current_rank])

437

@abstractmethod
def _build_stage_to_rank_index(self) -> None:
    """
    Build attribute of _stage_to_rank_index.
    Each subclass constructs it according to its own schedule style.

    Called from ``__init__`` right after ``_init_stages``; the result is
    indexed by stage index to find the owning rank.
    """

444

@abstractmethod
def construct_exec_order(self) -> None:
    """Build exec order: PP compute and PP comms (Send/Recv).

    Called by ``build_exec_order`` before the swap/FSDP injection passes,
    which read ``self.exec_order`` indexed by rank.
    """

448

def build_exec_order(self) -> None:
    """Construct the execution order, then apply the optional rewrite passes.

    ``p2p_transport`` is resolved here: ``"auto"`` turns into ``"batch"``
    (the measured-beneficial duplex) when the schedule runs with
    ``overlap_b_f`` and into ``"plain"`` otherwise. The P2P order-rewrite
    pass for the resolved mode runs last, after swap/FSDP injection, so it
    works on the final per-rank order.
    """
    resolved = self._p2p_transport
    if resolved == "auto":
        resolved = "batch" if getattr(self, "_overlap_b_f", False) else "plain"
    self._p2p_mode, self._batch_p2p = resolved, resolved != "plain"

    self.construct_exec_order()
    self._inject_local_pp_swap_actions()
    self._inject_local_fsdp_actions()

    rewrite_pass = None
    if resolved == "boundary":
        # fwd-boundary mode: the forward's F_SEND and the next slot's recvs
        # are attached to the OVERLAP step and issued mid-overlap, right
        # after the forward. All ops stay per-op solo batches, so
        # coalesce_p2p is deliberately not run (see __init__).
        rewrite_pass = attach_fwd_boundary_p2p
    elif resolved == "batch":
        # Merge contiguous P2P runs into BATCH_SEND_RECV so same-peer
        # send+recv go out as one duplex batch. NOTE: this couples the
        # riding send into the compute-gating recv wait (see __init__).
        rewrite_pass = coalesce_p2p
    if rewrite_pass is not None:
        self.exec_order = rewrite_pass(self.exec_order)

478

def convert_stages_dict(self):
    """Return ``{stage_index: stage}`` for every local stage.

    When two stages share an index, the later one in ``self.stages`` wins.
    """
    return {local_stage.stage_index: local_stage for local_stage in self.stages}

485

def split_microbatches(self, args, kwargs):
    """Split the call inputs into per-micro-batch args and kwargs.

    With no inputs at all, returns ``micro_batch_num`` empty lists and
    ``micro_batch_num`` empty dicts; otherwise defers to
    ``self.split_micro_batch``.

    Returns:
        tuple: ``(args_per_micro_batch, kwargs_per_micro_batch)``.
    """
    if not (args or kwargs):
        count = self.micro_batch_num
        return [[] for _ in range(count)], [{} for _ in range(count)]
    per_mb_args, per_mb_kwargs = self.split_micro_batch(args, kwargs)
    return per_mb_args, per_mb_kwargs

492

@staticmethod
def _to_spec(elem):
    """Turn one batch-dim entry into a ``BatchDimSpec``.

    ``None`` and existing ``BatchDimSpec`` objects are returned as they are;
    a plain ``int`` is wrapped. ``bool`` is refused although it subclasses
    ``int``, so ``True``/``False`` never get read as dims 1/0.

    Raises:
        TypeError: For any other entry type.
    """
    if elem is None or isinstance(elem, BatchDimSpec):
        return elem
    is_plain_int = isinstance(elem, int) and not isinstance(elem, bool)
    if not is_plain_int:
        raise TypeError(
            f"batch-dim entry must be int, BatchDimSpec or None, but got {type(elem)}.")
    return BatchDimSpec(elem)

507

@staticmethod
def _normalize_args_batch_dim(args_batch_dim):
    """Normalise ``args_batch_dim`` to ``None`` or a tuple of specs.

    ``args_batch_dim`` holds one entry per positional argument (see
    ``_MicroBatch``). A single-input model may pass a bare ``int`` or
    ``BatchDimSpec``, which is treated as a one-element sequence. Entries of
    a ``list``/``tuple`` may be plain ``int`` or ``None`` (keep the
    default). The result is always ``None`` or a
    ``tuple[BatchDimSpec | None]``, so per-arg indexing downstream is
    unaffected.

    Raises:
        TypeError: If the value (or one of its entries) has an
            unsupported type.
    """
    if args_batch_dim is None:
        return None
    is_bare_int = isinstance(args_batch_dim, int) and not isinstance(args_batch_dim, bool)
    if is_bare_int or isinstance(args_batch_dim, BatchDimSpec):
        entries = (args_batch_dim,)
    elif isinstance(args_batch_dim, (list, tuple)):
        entries = args_batch_dim
    else:
        raise TypeError(
            f"args_batch_dim must be int, BatchDimSpec or a list/tuple of them, "
            f"but got {type(args_batch_dim)}.")
    return tuple(map(PipelineScheduleRuntime._to_spec, entries))

530

531 @staticmethod

532 def _normalize_kwargs_batch_dim(kwargs_batch_dim):

533 """Accept plain ``int`` dict values: ``{\"x\": 0}`` -> ``{\"x\": BatchDimSpec(0)}``.

534

535 ``kwargs_batch_dim`` maps each keyword-arg name to its batch dim.

536 Returns ``None`` or a ``dict[str, BatchDimSpec | None]`` so downstream

537 per-key indexing is unchanged.

538 """

539 if kwargs_batch_dim is None:

540 return None

541 if not isinstance(kwargs_batch_dim, dict):

542 raise TypeError(

543 f"kwargs_batch_dim must be a dict[str, int | BatchDimSpec], "

544 f"but got {type(kwargs_batch_dim)}.")

545 return {k: PipelineScheduleRuntime._to_spec(v) for k, v in kwargs_batch_dim.items()}

546

547 def _check_stages(self, stages):

548 """check stages type."""

549 if isinstance(stages, hyper_parallel.PipelineStage):

550 return [stages]

551 if isinstance(stages, (list, tuple)):

552 for stage in stages:

553 if not isinstance(stage, hyper_parallel.PipelineStage):

554 raise TypeError(f"Argument 'stages' must be type of PipelineStage, \

555 list or tuple of PipelineStage, but got list or tuple of {type(stage)}.")

556 return stages

557 raise TypeError(f"Argument 'stages' must be type of PipelineStage, \

558 list or tuple of PipelineStage, but got type of {type(stages)}.")

559

560 def _init_stages(self):

561 """init stages."""

562 for stage in self.stages:

563 stage.init(self.n_local_stages)

564 # After-forward hook: lets the schedule issue fwd-boundary P2P the

565 # moment a forward chunk completes (no-op unless an OVERLAP step

566 # with boundary_p2p was armed for that (stage, micro)).

567 stage._after_forward_chunk = self._on_forward_chunk_done # pylint: disable=W0212

568

569 def _on_forward_chunk_done(self, stage_index, micro_index):

570 """Stage after-forward hook: fire the armed boundary P2P, if any.

571

572 Runs on the thread executing the forward (the overlap callback's

573 ``fwd_fn`` / the main thread), at the fwd/bwd boundary — the paired

574 backward is still running, so the boundary ops overlap it. Keyed by

575 the overlap's forward ``(stage_index, micro_index)``; unrelated

576 forwards (warm-up steps, recompute re-runs of past micros) miss the

577 key and no-op.

578 """

579 step = self._pending_boundary.pop((stage_index, micro_index), None)

580 if step is not None:

581 self.exec_boundary_p2p(step)

582

def run(self, *args, **kwargs):
    """Split the inputs into micro-batches, execute them and return the losses.

    Returns:
        list: The losses collected by ``run_microbatches``.
    """
    collected = []
    try:
        micro_args, micro_kwargs = self.split_microbatches(args, kwargs)
        self.run_microbatches(micro_args, micro_kwargs, collected)
    finally:
        # If an exception skips run_microbatches' end-of-iteration send
        # drain, issued isend/irecv handles would stay un-waited. Waiting
        # them here keeps the comm contract on the error path as well; on the
        # normal path there is nothing left to wait. See _drain_inflight_p2p.
        self._drain_inflight_p2p()
    return collected

596

def sync_shared_parameters_grad(self):
    """Call ``sync_shared_parameters_grad()`` on every local stage, in order."""
    for stage in self.stages:
        stage.sync_shared_parameters_grad()

601

def update_losses(self, stage, loss, losses):
    """Append ``loss`` to ``losses`` when ``stage`` is the last stage.

    Losses from any other stage are ignored.
    """
    if not stage.is_last_stage:
        return
    losses.append(loss)

606

@property
def enable_dxdw_split(self) -> bool:
    """Whether this schedule splits ``OVERLAP_B_F`` backward into dx/dw.

    Reads the optional ``_enable_dxdw_split`` attribute, which this base
    class never sets; schedules that do not define it report ``False``.
    """
    return getattr(self, "_enable_dxdw_split", False)

611

612 def _wait_p2p(self, handles):

613 for handle in handles:

614 if handle is not None:

615 handle.wait()

616

617 def _drain_inflight_p2p(self):

618 """Wait every P2P handle still in flight — error-path cleanup.

619

620 run_microbatches waits its deferred sends only in the end-of-iteration

621 drain; an exception mid-iteration unwinds past that drain, leaving issued

622 isend/irecv handles un-waited in ``_send_handles`` and the recv caches

623 (the ``CommHandle destroyed without calling wait()`` warning). run()'s

624 finally calls this so every handle is still ``wait()``-ed — honoring the

625 comm contract — on the error path too, and pops them so a later run()

626 does not re-wait stale handles (the recv caches are never reset per run).

627 No-op on the normal path: the drain already emptied ``_send_handles`` and

628 every cached recv was consumed.

629 """

630 while self._send_handles:

631 self._wait_p2p(self._send_handles.pop())

632 while self.fwd_handle_cache:

633 self._wait_p2p(self.fwd_handle_cache.popitem()[1])

634 while self.bwd_handle_cache:

635 self._wait_p2p(self.bwd_handle_cache.popitem()[1])

636

637 def _batched_issue(self, specs):

638 """Launch same-peer P2P ``specs`` as one ``batch_isend_irecv`` group.

639

640 ``specs`` are ``(op_type, tensor, peer_global_rank)`` from the stage's

641 ``*_specs`` builders (which carry the meta/bookkeeping side effects).

642 Returns ``[handle]`` (the single batch handle) or ``[]`` — shaped like

643 the per-op ``exec_*_ops`` return so the cache / drain paths are

644 unchanged. Only the launch is coalesced; matching stays per-peer FIFO.

645 """

646 if not specs:

647 return []

648 ops = [platform.p2p_op(op_type, tensor, peer) for op_type, tensor, peer in specs]

649 handle = platform.batch_isend_irecv(ops)

650 return [handle] if handle is not None else []

651

652 # --- P2P step primitives ------------------------------------------------

653 # One method per cross-rank comm action, used both by the runtime loop

654 # (``_exec_step``) and by OVERLAP callbacks (via ``ctx.schedule``). With

655 # ``overlap_p2p=True`` comm is decoupled from its compute: a recv caches its

656 # handles for the consuming step to ``wait_*`` later, and a send defers its

657 # handles to the end-of-iteration drain. With ``overlap_p2p=False`` every

658 # op waits inline.

659

def recv_fwd(self, stage: "hyper_parallel.PipelineStage", micro_index: int) -> None:
    """Post the forward recv for ``micro_index``.

    With ``overlap_p2p`` the handles are cached under
    ``(stage.stage_index, micro_index)`` for a later ``wait_fwd_recv``;
    otherwise they are waited immediately.
    """
    if self._batch_p2p:
        pending = self._batched_issue(stage.fwd_recv_specs(micro_index))
    else:
        pending = stage.exec_fwd_recv_ops(micro_index)
    if not self._overlap_p2p:
        self._wait_p2p(pending)
        return
    self.fwd_handle_cache[(stage.stage_index, micro_index)] = pending

668

def recv_bwd(self, stage: "hyper_parallel.PipelineStage", micro_index: int) -> None:
    """Post the backward recv for ``micro_index``.

    With ``overlap_p2p`` the handles are cached under
    ``(stage.stage_index, micro_index)`` for a later ``wait_bwd_recv``;
    otherwise they are waited immediately.
    """
    if self._batch_p2p:
        pending = self._batched_issue(stage.bwd_recv_specs(micro_index))
    else:
        pending = stage.exec_bwd_recv_ops(micro_index)
    if not self._overlap_p2p:
        self._wait_p2p(pending)
        return
    self.bwd_handle_cache[(stage.stage_index, micro_index)] = pending

677

def wait_fwd_recv(self, stage_index: int, micro_index: int) -> None:
    """Remove and wait the forward recv handles cached by :meth:`recv_fwd`.

    Does nothing when no handles (or an empty group) are cached for the key.
    """
    cached = self.fwd_handle_cache.pop((stage_index, micro_index), None)
    if not cached:
        return
    self._wait_p2p(cached)

683

def wait_bwd_recv(self, stage_index: int, micro_index: int) -> None:
    """Remove and wait the backward recv handles cached by :meth:`recv_bwd`.

    Does nothing when no handles (or an empty group) are cached for the key.
    """
    cached = self.bwd_handle_cache.pop((stage_index, micro_index), None)
    if not cached:
        return
    self._wait_p2p(cached)

689

def send_fwd(self, stage: "hyper_parallel.PipelineStage", micro_index: int) -> None:
    """Send the stage's forward output for ``micro_index`` to the next stage.

    With ``overlap_p2p`` the handle group is deferred to ``_send_handles``;
    otherwise it is waited immediately.
    """
    if self._batch_p2p:
        issued = self._batched_issue(stage.fwd_send_specs(micro_index))
    else:
        issued = stage.exec_fwd_send_ops(micro_index)
    handle_group = issued or []
    if self._overlap_p2p:
        # Store the group as a single entry: run_microbatches drains
        # _send_handles one group at a time, so a bare handle would be
        # iterated as if it were a list.
        self._send_handles.append(handle_group)
    else:
        self._wait_p2p(handle_group)

700

def send_bwd(self, stage: "hyper_parallel.PipelineStage", micro_index: int) -> None:
    """Send the stage's input gradient for ``micro_index`` to the previous stage.

    The scheduler's ``BWD_SEND`` step drives this call. It takes the input
    grad that the backward pass (the unified ``backward_one_chunk`` or, with
    ``enable_dxdw_split=True``, ``backward_input_one_chunk``) left in the
    stage's ``bwd_cache``. Invoking it by hand on top of the scheduled
    ``BWD_SEND`` would send the gradient twice.

    With ``overlap_p2p`` the handle group is deferred to ``_send_handles``;
    otherwise it is waited immediately.
    """
    if self._batch_p2p:
        issued = self._batched_issue(stage.bwd_send_specs(micro_index))
    else:
        issued = stage.exec_bwd_send_ops(micro_index)
    handle_group = issued or []
    if self._overlap_p2p:
        self._send_handles.append(handle_group)
    else:
        self._wait_p2p(handle_group)

716

717 def _arm_boundary(self, step):

718 """Register ``step`` for the stage after-forward hook; return its key.

719

720 No-op (returns ``None``) unless ``step`` carries ``boundary_p2p``. The

721 key is the overlap's forward ``(stage_index, micro_index)`` — exactly

722 what the hook receives when that forward chunk completes.

723 """

724 if not getattr(step, "boundary_p2p", None) or not step.sub_steps:

725 return None

726 fwd_sub = next((s for s in step.sub_steps

727 if s.type == MetaStepType.FWD), None)

728 if fwd_sub is None:

729 return None

730 key = (fwd_sub.stage_index, fwd_sub.micro_index)

731 self._pending_boundary[key] = step

732 return key

733

734 def _finish_boundary(self, step, armed_key) -> None:

735 """Post-step safety net: issue any boundary P2P the hook missed."""

736 self.exec_boundary_p2p(step)

737 if armed_key is not None:

738 self._pending_boundary.pop(armed_key, None)

739

def exec_boundary_p2p(self, step) -> None:
    """Issue the P2P ops stored in ``step.boundary_p2p``, at most once per run.

    Intended to be fired by the stage after-forward hook
    (``_on_forward_chunk_done``) as soon as the overlap's forward chunk
    completes, so the forward send leaves and the next slot's recvs are
    posted while the backward half is still running. ``_finish_boundary``
    calls it again after the step as a safety net; the ``_boundary_issued``
    set (keyed by ``id(step)`` and reset by ``run_microbatches``) makes the
    second call a no-op, so an overlap whose forward never went through
    ``forward_one_chunk`` still gets its ops issued, just later.

    Each op is routed through the regular per-op helpers (``send_fwd``,
    ``recv_fwd``, ``recv_bwd``, ``send_bwd``), so batching, recv-handle
    caching and deferred-send bookkeeping match the scheduled steps these
    ops replace. Steps without ``boundary_p2p`` are ignored.
    """
    ops = getattr(step, "boundary_p2p", None)
    if not ops or id(step) in self._boundary_issued:
        return
    self._boundary_issued.add(id(step))

    handlers = {
        MetaStepType.FWD_SEND: self.send_fwd,
        MetaStepType.FWD_RECV: self.recv_fwd,
        MetaStepType.BWD_RECV: self.recv_bwd,
        # attach_fwd_boundary_p2p never hoists BWD_SEND (its grad is
        # produced by the backward still in flight); defensive only.
        MetaStepType.BWD_SEND: self.send_bwd,
    }
    for sub in ops:
        stage = self._stage_dict[sub.stage_index]
        issue = handlers.get(sub.type)
        if issue is not None:
            issue(stage, sub.micro_index)

774

# Dispatch table for the sub-steps of a coalesced ``BATCH_SEND_RECV`` step:
# ``MetaStepType -> (specs builder name, route kind)``.
#   * The builder name is resolved on the stage with ``getattr`` and called
#     with the sub-step's micro index to obtain its P2P specs.
#   * The route kind (``"fwd"`` / ``"bwd"``) selects the recv handle cache
#     the batch handle is stored in, so ``wait_*_recv`` can find it.
#     ``None`` marks a send, which has no local consumer.
_BATCH_SUB_DISPATCH = {
    MetaStepType.FWD_RECV: ("fwd_recv_specs", "fwd"),
    MetaStepType.BWD_RECV: ("bwd_recv_specs", "bwd"),
    MetaStepType.FWD_SEND: ("fwd_send_specs", None),
    MetaStepType.BWD_SEND: ("bwd_send_specs", None),
}

784

def _exec_batch_send_recv(self, step) -> None:
    """Execute a coalesced P2P run: one ``batch_isend_irecv`` per peer.

    For every sub-step the matching specs builder on the stage is called
    (see ``_BATCH_SUB_DISPATCH``); each spec is an ``(op_type, tensor, peer)``
    triple. All ops are grouped by ``peer`` and one batch is issued per
    peer, so a send and a recv towards the same peer travel together.

    Handle routing:
      * ``overlap_p2p`` off: every batch is waited inline.
      * ``overlap_p2p`` on, batch contains at least one recv: the batch
        handle is stored in ``fwd_handle_cache`` / ``bwd_handle_cache``
        under ``(stage_index, micro_index)`` of each recv sub-step, to be
        waited by ``wait_*_recv`` (sends in the same batch ride along).
      * ``overlap_p2p`` on, send-only batch: the handle is deferred to
        ``_send_handles``.

    When the specs of one recv sub-step span several peers, every per-peer
    batch handle is kept in that sub-step's cache entry. (Previously each
    batch overwrote the entry, so only the last peer's handle was waited.)

    Args:
        step: A ``BATCH_SEND_RECV`` step whose ``sub_steps`` are the P2P
            steps to issue.
    """
    # peer -> [(op_type, tensor, peer, route)], insertion-ordered so batches
    # are issued in first-seen peer order. ``route`` is
    # ``(kind, stage_index, micro_index)`` for a recv, ``None`` for a send.
    by_peer = {}
    for sub in step.sub_steps:
        builder_name, kind = self._BATCH_SUB_DISPATCH[sub.type]
        stage = self._stage_dict[sub.stage_index]
        specs = getattr(stage, builder_name)(sub.micro_index)
        route = (kind, sub.stage_index, sub.micro_index) if kind is not None else None
        for op_type, tensor, peer in specs:
            by_peer.setdefault(peer, []).append((op_type, tensor, peer, route))

    # Routes whose cache entry was (re)initialised during this call; a later
    # batch for the same route appends instead of overwriting.
    routed = set()
    for items in by_peer.values():
        ops = [platform.p2p_op(op_type, tensor, peer) for op_type, tensor, peer, _ in items]
        handle = platform.batch_isend_irecv(ops)
        if handle is None:
            continue
        if not self._overlap_p2p:
            self._wait_p2p([handle])
            continue
        recv_routes = [route for *_, route in items if route is not None]
        if not recv_routes:
            self._send_handles.append([handle])
            continue
        # dict.fromkeys de-duplicates routes (a sub-step may contribute
        # several ops to the same batch) while preserving order.
        for route in dict.fromkeys(recv_routes):
            kind, si, mi = route
            cache = self.fwd_handle_cache if kind == "fwd" else self.bwd_handle_cache
            if route in routed:
                cache[(si, mi)].append(handle)
            else:
                # First batch for this route in this call: replace whatever
                # an earlier call may have left under the key.
                cache[(si, mi)] = [handle]
                routed.add(route)

826

def _assert_in_unshard_if_needed(self, stage, check_step):
    """Fail fast if an HSDP-wrapped stage is about to compute while sharded.

    Stages whose ``submodule`` is not an ``HSDPModule`` are ignored.

    Args:
        stage: Stage about to execute a compute step.
        check_step: The step being executed; only used in the error text.

    Raises:
        RuntimeError: If the submodule's HSDP scheduler state reports
            ``is_shard``.
    """
    module = stage.submodule
    if isinstance(module, HSDPModule) and module.hsdp_scheduler.hsdp_state.is_shard:
        raise RuntimeError(
            f"Executing MetaStep: {check_step}, expected HSDPModule parameters in unsharded "
            f"state, but got sharded parameters."
        )

837

def _exec_step(self, cur_step, arg_mbs, kwarg_mbs, losses):
    """Run a single built-in step (neither custom nor composite).

    Step families, in the order they are checked:
      * swap control steps -> ``_exec_pipeline_swap_step``;
      * P2P steps -> the matching ``send_*`` / ``recv_*`` helper;
      * ``FWD`` -> wait for the cached forward recv, run the forward chunk,
        then record the loss via ``update_losses``;
      * ``BWD_INPUT`` / ``BWD_WEIGHT`` / ``BWD`` -> wait for the cached
        backward recv, then run the matching backward chunk;
      * anything else -> the FSDP handler table; unknown types are no-ops.

    Compute steps first verify that an HSDP-wrapped stage is unsharded.

    Args:
        cur_step: The step to execute.
        arg_mbs: Per-micro-batch positional arguments.
        kwarg_mbs: Per-micro-batch keyword arguments.
        losses: Loss accumulator passed through to ``update_losses``.
    """
    stage = self._stage_dict[cur_step.stage_index]
    micro_index = cur_step.micro_index
    step_type = cur_step.type

    swap_types = (
        MetaStepType.SWAP_SET_GROUP,
        MetaStepType.SWAP_LAUNCH_OFFLOAD,
        MetaStepType.SWAP_WAIT_OFFLOAD,
        MetaStepType.SWAP_LAUNCH_LOAD,
        MetaStepType.SWAP_WAIT_LOAD,
    )
    if step_type in swap_types:
        self._exec_pipeline_swap_step(cur_step, arg_mbs, kwarg_mbs)
        return

    # Pure communication steps: one P2P helper each.
    comm_helpers = {
        MetaStepType.FWD_RECV: self.recv_fwd,
        MetaStepType.FWD_SEND: self.send_fwd,
        MetaStepType.BWD_RECV: self.recv_bwd,
        MetaStepType.BWD_SEND: self.send_bwd,
    }
    comm = comm_helpers.get(step_type)
    if comm is not None:
        comm(stage, micro_index)
        return

    if step_type == MetaStepType.FWD:
        self._assert_in_unshard_if_needed(stage, cur_step)
        self.wait_fwd_recv(stage.stage_index, micro_index)
        out = stage.forward_one_chunk(micro_index, arg_mbs[micro_index], kwarg_mbs[micro_index])
        self.update_losses(stage, out, losses)
        return

    # Backward flavours differ only in which stage method they call.
    backward_methods = {
        MetaStepType.BWD_INPUT: "backward_input_one_chunk",
        MetaStepType.BWD_WEIGHT: "backward_weight_one_chunk",
        MetaStepType.BWD: "backward_one_chunk",
    }
    backward_name = backward_methods.get(step_type)
    if backward_name is not None:
        self._assert_in_unshard_if_needed(stage, cur_step)
        self.wait_bwd_recv(stage.stage_index, micro_index)
        getattr(stage, backward_name)(micro_index)
        return

    # FSDP control steps dispatch via the handler table; any other type
    # is a no-op here (composite/custom types are handled upstream).
    fsdp_handler = _FSDP_STEP_HANDLERS.get(step_type)
    if fsdp_handler is not None:
        fsdp_handler(stage)

897

def _exec_pipeline_swap_step(self, cur_step, arg_mbs, kwarg_mbs):
    """Run one pipeline activation-swap control step.

    The swap helpers are imported inside the function. ``SWAP_LAUNCH_OFFLOAD``
    additionally receives this runtime and the micro-batch inputs; every
    other swap type only receives the step. Non-swap types do nothing.
    """
    from hyper_parallel.core.pipeline_parallel.pipeline_swap import (  # pylint: disable=C0415
        swap_launch_load,
        swap_launch_offload,
        swap_set_group,
        swap_wait_load,
        swap_wait_offload,
    )

    swap_type = cur_step.type
    if swap_type == MetaStepType.SWAP_LAUNCH_OFFLOAD:
        swap_launch_offload(cur_step, self, arg_mbs, kwarg_mbs)
        return

    step_only_handlers = {
        MetaStepType.SWAP_SET_GROUP: swap_set_group,
        MetaStepType.SWAP_WAIT_OFFLOAD: swap_wait_offload,
        MetaStepType.SWAP_LAUNCH_LOAD: swap_launch_load,
        MetaStepType.SWAP_WAIT_LOAD: swap_wait_load,
    }
    handler = step_only_handlers.get(swap_type)
    if handler is not None:
        handler(cur_step)

918

def run_microbatches(self, arg_mbs: list, kwarg_mbs: list, losses: list) -> None:
    """Walk this rank's execution order and run every step.

    Dispatch per step, first match wins:

    1. A custom function registered for the step's :attr:`MetaStep.type`
       is called with the step and a lazily created
       :class:`PipelineContext`.
    2. ``BATCH_SEND_RECV`` goes to :meth:`_exec_batch_send_recv`.
    3. Composite ``OVERLAP_F_B`` / ``OVERLAP_B_F`` steps with sub-steps and
       no registered handler run their ``sub_steps`` one after another
       through :meth:`_exec_step` — correct, but without comm/compute
       overlap.
    4. Everything else goes straight to :meth:`_exec_step`.

    Bubble slots (``None``) are skipped. After the loop, shared-parameter
    gradients are synchronised and all deferred send handles are waited.

    One ``DEBUG`` line is logged per non-bubble step:
    ``rank=<r> step=<i>/<n> <MetaStep>``. Enable with
    ``logging.getLogger('hyper_parallel.core.pipeline_parallel.scheduler')
    .setLevel(logging.DEBUG)`` to trace per-rank schedule advancement
    (useful when diagnosing deadlocks or callback ordering issues).

    Args:
        arg_mbs: Per-micro-batch positional arguments.
        kwarg_mbs: Per-micro-batch keyword arguments.
        losses: Loss accumulator handed to the forward steps.
    """
    real_stage_index = self.stages[0].stage_index % self.real_stage_num
    # Per-run state: deferred send handle groups and boundary-P2P bookkeeping.
    self._send_handles = []
    self._boundary_issued = set()
    self._pending_boundary = {}
    ctx = None  # built on first use by a custom step function

    ordered = self.exec_order[real_stage_index]
    total_steps = len(ordered)
    logger.debug(
        "run_microbatches start: rank=%d total_steps=%d micro_batch_num=%d",
        real_stage_index, total_steps, self.micro_batch_num,
    )

    for step_idx, cur_step in enumerate(ordered):
        if cur_step is None:
            continue

        logger.debug(
            "rank=%d step=%d/%d %s",
            real_stage_index, step_idx, total_steps, cur_step,
        )

        # Arm the fwd-boundary hook: for a step carrying boundary_p2p the
        # stage's after-forward hook can fire exec_boundary_p2p as soon as
        # the overlap's forward chunk completes, whatever callback runs it.
        armed_key = self._arm_boundary(cur_step)

        custom_fn = self._custom_fn_map.get(cur_step.type)
        if custom_fn is not None:
            if ctx is None:
                ctx = PipelineContext(self, arg_mbs, kwarg_mbs, losses)
            custom_fn(cur_step, ctx)
            # Safety net: if the forward hook never fired (custom fwd path),
            # issue the boundary ops now instead of dropping them.
            self._finish_boundary(cur_step, armed_key)
        elif cur_step.type == MetaStepType.BATCH_SEND_RECV:
            # Coalesced P2P block: one batch_isend_irecv per peer.
            self._exec_batch_send_recv(cur_step)
        elif (cur_step.type in (MetaStepType.OVERLAP_F_B, MetaStepType.OVERLAP_B_F)
              and cur_step.sub_steps):
            # Default for composite OVERLAP steps: sequential sub-steps.
            # ``add_send_recv`` already lays the surrounding P2P out in two
            # virtual slots, so this is equivalent to non-overlapped 1F1B.
            for sub in cur_step.sub_steps:
                self._exec_step(sub, arg_mbs, kwarg_mbs, losses)
            self._finish_boundary(cur_step, armed_key)
        else:
            self._exec_step(cur_step, arg_mbs, kwarg_mbs, losses)

    logger.debug(
        "run_microbatches end: rank=%d pending_send_handles=%d",
        real_stage_index, len(self._send_handles),
    )
    self.sync_shared_parameters_grad()
    # Drain deferred sends, one handle group at a time (last issued first).
    while self._send_handles:
        self._wait_p2p(self._send_handles.pop())

1002

1003

class _OverlapPhantom:
    """Internal marker used by :func:`add_send_recv` to expand an
    ``OVERLAP_F_B`` or ``OVERLAP_B_F`` step into two virtual time slots.

    An overlap step composes two sub-steps (``B + F`` or ``F + B``) that
    execute concurrently on the device but occupy **two** logical time
    slots in the column-scan sender timeline: the sender can only finish
    emitting the second sub-step's output after the first sub-step has
    completed. Treating an overlap step as a single slot would place the
    RECV triggered by the second sub-step too early on the receiver.

    Each overlap step is expanded into two phantoms:
      * ``is_first_half=True`` stands for the first sub-step's emission
        slot; the original overlap step is emitted into the output
        schedule here (only once).
      * ``is_first_half=False`` stands for the second sub-step's emission
        slot; only its send/recv comms are inserted.

    Attributes:
        obf_step: The original overlap step this phantom was made from.
        sub_step: The sub-step this phantom's slot represents.
        is_first_half: Whether this is the first of the two slots.
    """

    __slots__ = ('obf_step', 'sub_step', 'is_first_half')

    def __init__(self, obf_step, sub_step, is_first_half: bool):
        self.obf_step = obf_step
        self.sub_step = sub_step
        self.is_first_half = is_first_half

    def __repr__(self) -> str:
        # Readable form for schedule dumps; phantoms are otherwise opaque.
        return (
            f"{type(self).__name__}(obf_step={self.obf_step!r}, "
            f"sub_step={self.sub_step!r}, is_first_half={self.is_first_half!r})"
        )

1029

1030

def _expand_overlap_slots(scheduler, real_stage_num):
    """Give every OVERLAP step two virtual time slots.

    Args:
        scheduler: ``{rank: [MetaStep | None, ...]}`` compute schedule.
        real_stage_num: Number of physical ranks; ranks ``0..real_stage_num-1``
            are read from ``scheduler``.

    Returns:
        A new ``{rank: [MetaStep | _OverlapPhantom | None, ...]}`` dict.
        Each ``OVERLAP_F_B`` / ``OVERLAP_B_F`` step that has sub-steps is
        replaced by a first-half phantom (for ``sub_steps[0]``) followed by
        a second-half phantom (for ``sub_steps[1]``); every other entry is
        copied through unchanged.
    """
    expanded = {}
    for rank in range(real_stage_num):
        slots = []
        for op in scheduler[rank]:
            is_overlap = (
                op is not None
                and op.type in (MetaStepType.OVERLAP_F_B, MetaStepType.OVERLAP_B_F)
                and op.sub_steps
            )
            if not is_overlap:
                slots.append(op)
                continue
            slots.append(_OverlapPhantom(op, op.sub_steps[0], is_first_half=True))
            slots.append(_OverlapPhantom(op, op.sub_steps[1], is_first_half=False))
        expanded[rank] = slots
    return expanded

1052

1053

def _process_rank_items(real_stage_num, current_items, insert_step_comms, new_schedule):
    """Call ``insert_step_comms`` for each rank's current item, even ranks first.

    All even ranks are handled before any odd rank; this ordering is what
    avoids P2P deadlocks between adjacent ranks. Ranks with no item (missing
    or ``None``) are skipped. For an ``_OverlapPhantom`` the wrapped
    ``sub_step`` is passed on instead of the phantom itself.

    Args:
        real_stage_num: Number of physical ranks.
        current_items: ``{rank: item}`` for the current time slot.
        insert_step_comms: Callable ``(step, rank, new_schedule) -> None``.
        new_schedule: Schedule being built; handed to ``insert_step_comms``.
    """
    for first_rank in (0, 1):  # 0 -> even ranks, 1 -> odd ranks
        for rank in range(first_rank, real_stage_num, 2):
            item = current_items.get(rank)
            if item is None:
                continue
            target = item.sub_step if isinstance(item, _OverlapPhantom) else item
            insert_step_comms(target, rank, new_schedule)

1069

1070

def _column_scan_insert_comms(expanded, real_stage_num, insert_step_comms):
    """Column-scan an OVERLAP-expanded schedule and insert SEND/RECV steps.

    The schedule is processed one time slot (column) at a time. For each
    rank the slot's entry is first copied into the output:

      * a plain step or a ``None`` bubble is appended as-is (bubbles are
        kept so per-rank slot indexing stays aligned with the scan; the
        runtime loop skips ``None`` entries);
      * a first-half phantom appends its original overlap step, so the
        overlap step is emitted exactly once;
      * a second-half phantom appends nothing.

    Then :func:`_process_rank_items` inserts the comms for the whole
    column, even ranks before odd ranks to avoid P2P deadlocks between
    adjacent ranks.

    Args:
        expanded: Result of :func:`_expand_overlap_slots`.
        real_stage_num: Number of physical ranks.
        insert_step_comms: Callable ``(step, rank, new_schedule) -> None``
            that inserts SEND/RECV for a single FWD/BWD step.

    Returns:
        ``{rank: [MetaStep | None, ...]}`` final schedule.
    """
    max_length = max(len(order) for order in expanded.values())
    new_schedule = {rank: [] for rank in range(real_stage_num)}

    for time_step in range(max_length):
        current_items = {}
        for rank in range(real_stage_num):
            slots = expanded[rank]
            if time_step >= len(slots):
                # This rank's schedule is already exhausted.
                current_items[rank] = None
                continue
            item = slots[time_step]
            current_items[rank] = item
            if not isinstance(item, _OverlapPhantom):
                # Plain step or ``None`` bubble: copy through unchanged.
                new_schedule[rank].append(item)
            elif item.is_first_half:
                new_schedule[rank].append(item.obf_step)

        _process_rank_items(
            real_stage_num, current_items, insert_step_comms, new_schedule,
        )

    return new_schedule

1121

1122

# Step types that are pure point-to-point communication (sends and recvs in
# both directions); used to detect contiguous P2P runs in a schedule.
_P2P_STEP_TYPES = frozenset((
    MetaStepType.FWD_SEND,
    MetaStepType.FWD_RECV,
    MetaStepType.BWD_SEND,
    MetaStepType.BWD_RECV,
))

1127

1128

def coalesce_p2p(exec_order):
    """Merge each maximal contiguous run of >=2 P2P steps into one BATCH_SEND_RECV.

    A *run* is a maximal sequence of consecutive ``FWD_SEND`` / ``FWD_RECV``
    / ``BWD_SEND`` / ``BWD_RECV`` steps with no compute, overlap or bubble
    (``None``) step in between, so no recv of the run is consumed before
    the batch is issued and every send's data already exists. Such a run
    becomes a single :class:`MetaStep` of type ``BATCH_SEND_RECV`` whose
    ``sub_steps`` is the run in its original order (per-direction FIFO is
    kept); the runtime groups the sub-steps by peer and issues one
    ``batch_isend_irecv`` per peer. Runs of length 1 are left untouched
    (the per-op batched path still batches them). The input is not
    modified.

    Args:
        exec_order: ``{rank: [MetaStep | None, ...]}``.

    Returns:
        A new ``{rank: [...]}`` with contiguous P2P runs coalesced.
    """
    def _emit(run):
        # Short runs pass through; longer ones collapse into one batch step.
        if len(run) < 2:
            return list(run)
        return [MetaStep(None, MetaStepType.BATCH_SEND_RECV, None, sub_steps=tuple(run))]

    coalesced = {}
    for rank, order in exec_order.items():
        rebuilt = []
        pending = []  # P2P run currently being collected
        for step in order:
            if step is None or step.type not in _P2P_STEP_TYPES:
                # Run boundary: flush what was collected, then keep the step.
                rebuilt += _emit(pending)
                pending = []
                rebuilt.append(step)
            else:
                pending.append(step)
        rebuilt += _emit(pending)
        coalesced[rank] = rebuilt
    return coalesced

1169

1170

# The receiving half of the P2P step types (forward and backward recvs).
_RECV_STEP_TYPES = frozenset((MetaStepType.FWD_RECV, MetaStepType.BWD_RECV))

1172

1173

def attach_fwd_boundary_p2p(exec_order):
    """Move each overlap gap's boundary-safe P2P onto its OVERLAP_B_F step.

    For every ``OVERLAP_B_F`` step with sub-steps, the contiguous P2P run
    that directly follows it is split by data readiness at the overlap's
    fwd/bwd boundary (forward is the short side; backward is the long
    pole):

    * the forward's own ``FWD_SEND`` — its payload exists as soon as the
      forward sub-step finishes;
    * every ``FWD_RECV`` / ``BWD_RECV`` — no data dependency at all.

    These ops are taken out of the gap and stored on a rebuilt OVERLAP step
    as ``boundary_p2p`` (the ``FWD_SEND`` first, then the recvs in original
    order), to be issued by
    :meth:`PipelineScheduleRuntime.exec_boundary_p2p` while the backward is
    still running. ``BWD_SEND`` (its grad comes from that backward) and any
    send not produced by this overlap's forward stay in the gap. If nothing
    qualifies, the step and its gap are left exactly as they were.

    Why this is safe where a naive hoist+coalesce hung: with every op issued
    as a per-op solo batch, each pair's batch sequence per slot is
    ``[F_SEND, B_RECV]`` on the prev end and ``[F_RECV, B_SEND]`` on the
    next end — complementary at every position and equal in count, so it
    matches under both per-direction FIFO and per-pair shape-mirroring
    semantics. Per-direction FIFO data order is preserved because each
    direction's ops keep their relative order. The input is not modified.

    Args:
        exec_order: ``{rank: [MetaStep | None, ...]}``.

    Returns:
        A new ``{rank: [...]}`` with boundary P2P attached to OVERLAP steps.
    """
    attached = {}
    for rank, order in exec_order.items():
        rebuilt = []
        cursor = 0
        while cursor < len(order):
            step = order[cursor]
            cursor += 1
            is_overlap = (
                step is not None
                and step.type == MetaStepType.OVERLAP_B_F
                and step.sub_steps
            )
            if not is_overlap:
                rebuilt.append(step)
                continue
            run, run_end = _p2p_run_after(order, cursor)
            boundary, leftover = _split_boundary_run(step, run)
            if not boundary:
                # Nothing to hoist: keep the step; the gap is copied by the
                # following iterations.
                rebuilt.append(step)
                continue
            rebuilt.append(MetaStep(step.micro_index, step.type, step.stage_index,
                                    sub_steps=step.sub_steps, boundary_p2p=boundary))
            rebuilt.extend(leftover)
            # Skip the whole original run; it has been redistributed.
            cursor = run_end
        attached[rank] = rebuilt
    return attached

1230

1231

def _p2p_run_after(order, start):
    """Gather the contiguous run of P2P steps that begins at index ``start``.

    Args:
        order: One rank's schedule, ``[MetaStep | None, ...]``.
        start: Index at which the run is expected to begin.

    Returns:
        ``(run, end)``: ``run`` is the list of P2P steps found and ``end``
        is the index of the first entry past the run (a compute step, a
        ``None`` bubble, or the end of ``order``).
    """
    run = []
    for idx in range(start, len(order)):
        candidate = order[idx]
        if candidate is None or candidate.type not in _P2P_STEP_TYPES:
            return run, idx
        run.append(candidate)
    # Ran off the end of the schedule (or ``start`` was already past it).
    return run, max(start, len(order))

1244

1245

def _split_boundary_run(step, run):
    """Partition an overlap's trailing P2P run by fwd/bwd-boundary data readiness.

    Args:
        step: The overlap step; its first ``FWD`` sub-step (if any)
            identifies which ``FWD_SEND`` belongs to this overlap.
        run: The contiguous P2P steps that follow ``step``.

    Returns:
        ``(boundary, leftover)``. ``boundary`` is a tuple holding the
        overlap's own forward ``FWD_SEND`` (same stage and micro index as
        the forward sub-step) first, followed by every recv, each group in
        original order. ``leftover`` is a list of all remaining steps, in
        original order.
    """
    fwd_sub = None
    for sub in step.sub_steps:
        if sub.type == MetaStepType.FWD:
            fwd_sub = sub
            break

    own_sends, recvs, leftover = [], [], []
    for op in run:
        is_own_fwd_send = (
            fwd_sub is not None
            and op.type == MetaStepType.FWD_SEND
            and op.stage_index == fwd_sub.stage_index
            and op.micro_index == fwd_sub.micro_index
        )
        if is_own_fwd_send:
            own_sends.append(op)
        elif op.type in _RECV_STEP_TYPES:
            recvs.append(op)
        else:
            leftover.append(op)
    return tuple(own_sends + recvs), leftover

1268

1269

def split_overlap_dxdw(exec_order: dict) -> dict:
    """Split each OVERLAP_B_F backward into dx (inside the pair) and dw (after the gap).

    An overlap whose sub-steps are ``(BWD, FWD)`` is rebuilt with
    ``(BWD_INPUT, FWD)``, and the matching ``BWD_WEIGHT`` is inserted right
    after the contiguous P2P run that follows the overlap. The overlap then
    joins at ``max(dx, fwd)`` instead of ``max(dx + dw, fwd)``, so the gap's
    ``BWD_SEND`` (dx has already written its grad to ``bwd_cache``) and the
    next slot's recvs are issued a dw earlier; under ``overlap_p2p`` they
    are async and dw computes while they are in flight.

    Comm placement is untouched: the pass is meant to run after
    ``add_send_recv`` and only moves local compute, so the cross-rank
    matching order is unchanged and the P2P run stays contiguous (dw lands
    after it, not inside), leaving the gap shape seen by ``coalesce_p2p`` /
    ``attach_fwd_boundary_p2p`` intact.

    Backwards of the first stage (``stage_index == 0``) stay unified: their
    dx is a no-op, so splitting would only push the whole backward out of
    the overlap. Overlaps whose first sub-step is not ``BWD`` are also left
    as they are. The input is not modified.

    Args:
        exec_order: ``{rank: [MetaStep | None, ...]}``.

    Returns:
        A new ``{rank: [...]}`` with overlap backwards split into dx/dw.
    """
    result = {}
    for rank, order in exec_order.items():
        rebuilt = []
        cursor = 0
        total = len(order)
        while cursor < total:
            step = order[cursor]
            cursor += 1
            splittable = (
                step is not None
                and step.type == MetaStepType.OVERLAP_B_F
                and step.sub_steps
            )
            if splittable:
                bwd_sub, fwd_sub = step.sub_steps
                splittable = (bwd_sub.type == MetaStepType.BWD
                              and bwd_sub.stage_index != 0)
            if not splittable:
                rebuilt.append(step)
                continue

            dx = MetaStep(bwd_sub.micro_index, MetaStepType.BWD_INPUT, bwd_sub.stage_index)
            rebuilt.append(MetaStep(step.micro_index, step.type, step.stage_index,
                                    sub_steps=(dx, fwd_sub)))
            # Keep the trailing P2P run contiguous, then place dw behind it.
            run, cursor = _p2p_run_after(order, cursor)
            rebuilt.extend(run)
            rebuilt.append(MetaStep(bwd_sub.micro_index, MetaStepType.BWD_WEIGHT,
                                    bwd_sub.stage_index))
        result[rank] = rebuilt
    return result

1320

1321

def add_send_recv(scheduler, stage_num, real_stage_num, style='loop'):
    """Insert P2P send/recv steps into a per-rank compute schedule.

    Every ``FWD`` or ``BWD`` step whose consumer lives on another rank gets
    a ``FWD_SEND`` / ``BWD_SEND`` appended to the sender's schedule and the
    matching ``FWD_RECV`` / ``BWD_RECV`` appended to the receiver's.

    ``OVERLAP_F_B`` / ``OVERLAP_B_F`` composite steps are expanded into
    **two** virtual time slots for the column scan, so the RECV triggered
    by the **second** sub-step lands one slot later on the receiver —
    the sender can only emit the second sub-step's output after the first
    has completed.

    Within each time slot even ranks are processed before odd ranks, to
    avoid P2P deadlocks between adjacent ranks.

    The resulting per-gap op order (steady state:
    ``[B_RECV, B_SEND, F_RECV, F_SEND]``) is LOAD-BEARING, not cosmetic.
    Each adjacent rank pair shares one comm (HCCL split-by-group) on which
    plain send/recv execute in queue order; this layout makes one end of
    every pair recv-first and the other send-first, so the queue heads
    always match (recv<->send) and then the tails match (send<->recv). Any
    later pass that reorders P2P ops relative to EACH OTHER breaks this: a
    since-removed hoist variant that moved recvs across sends made both
    ends recv-first and deadlocked on hardware (2026-06).
    ``attach_fwd_boundary_p2p`` (the ``"boundary"`` transport) is safe: it
    keeps every per-direction FIFO and runs on the batch transport with
    per-op solo batches, whose per-pair sequences stay complementary.

    Args:
        scheduler: ``{rank: [MetaStep | None, ...]}`` — compute schedule
            with ``None`` for bubble slots.
        stage_num: Total number of virtual pipeline stages.
        real_stage_num: Number of physical ranks.
        style: Topology mapping — ``'loop'`` or ``'v'``.

    Returns:
        ``{rank: [MetaStep | None, ...]}`` — schedule with communication
        ops inserted.

    Raises:
        ValueError: If ``style`` is neither ``'loop'`` nor ``'v'`` and a
            stage-to-rank lookup is actually needed.
    """

    def stage_to_rank(stage_index: int) -> int:
        """Resolve the physical rank hosting virtual stage ``stage_index``."""
        if style == 'v':
            # Stages below real_stage_num map to themselves; later stages
            # are mirrored back from the last stage.
            if stage_index >= real_stage_num:
                return stage_num - 1 - stage_index
            return stage_index
        if style == 'loop':
            return stage_index % real_stage_num
        raise ValueError(f"Argument 'style' must be 'loop' or 'v', but got {style!r}.")

    def _cross_rank_peer(stage_index: int, neighbour: int):
        """Return the rank hosting ``neighbour``, or None if it is this stage's own rank."""
        peer = stage_to_rank(neighbour)
        return None if peer == stage_to_rank(stage_index) else peer

    def _insert_comms_for_step(step, rank, new_schedule):
        """Insert send/recv for a single FWD, BWD, or composite OVERLAP step."""
        if step is None:
            return
        kind = step.type

        if kind in (MetaStepType.OVERLAP_F_B, MetaStepType.OVERLAP_B_F):
            # Composite step: handle each sub-step on its own.
            for sub in step.sub_steps or ():
                _insert_comms_for_step(sub, rank, new_schedule)
            return

        stage_index = step.stage_index
        if kind == MetaStepType.FWD:
            if stage_index >= stage_num - 1:
                return  # last stage: nobody consumes the forward output
            neighbour = stage_index + 1
            send_type, recv_type = MetaStepType.FWD_SEND, MetaStepType.FWD_RECV
        elif kind == MetaStepType.BWD:
            if stage_index <= 0:
                return  # first stage: no previous stage to send a grad to
            neighbour = stage_index - 1
            send_type, recv_type = MetaStepType.BWD_SEND, MetaStepType.BWD_RECV
        else:
            return

        peer = _cross_rank_peer(stage_index, neighbour)
        if peer is None:
            return  # producer and consumer share a rank: no P2P needed
        new_schedule[rank].append(MetaStep(step.micro_index, send_type, stage_index))
        new_schedule[peer].append(MetaStep(step.micro_index, recv_type, neighbour))

    # Expand OVERLAP steps into two virtual slots, then column-scan.
    expanded = _expand_overlap_slots(scheduler, real_stage_num)
    return _column_scan_insert_comms(expanded, real_stage_num, _insert_comms_for_step)

1414

1415

# Sentinel marking a forced 1F1B-boundary bubble produced during alignment.
# Compared by identity (``is``), so it can never collide with a real step
# or with a ``None`` bubble.
_ALIGN_PAD = object()

1418

1419

def _step_dep_ready(step, rank, t, done, stage_num, stage_to_rank):
    """Tell whether ``step``'s cross-rank producer finished before time ``t``.

    A ``FWD`` at stage ``s`` depends on the ``FWD`` of stage ``s - 1``; a
    ``BWD`` at stage ``s`` depends on the ``BWD`` of stage ``s + 1``. Steps
    at a pipeline edge (first stage for FWD, last stage for BWD), steps
    whose producer stage lives on the same rank, and steps of any other
    type are always ready.

    Args:
        step: Step under consideration.
        rank: Rank that wants to execute ``step``.
        t: Current simulated time step.
        done: ``{(type, stage_index, micro_index): completion time}``.
        stage_num: Total number of virtual pipeline stages.
        stage_to_rank: Topology mapping from stage index to rank.

    Returns:
        ``True`` if the step may run at time ``t``.
    """
    si, mi = step.stage_index, step.micro_index
    if step.type == MetaStepType.FWD:
        at_edge = si == 0
        producer_stage = si - 1
    elif step.type == MetaStepType.BWD:
        at_edge = si == stage_num - 1
        producer_stage = si + 1
    else:
        return True

    if at_edge or stage_to_rank(producer_stage) == rank:
        return True
    # The producer has the same step type, one stage over, same micro-batch,
    # and must have completed strictly before ``t``.
    key = (step.type, producer_stage, mi)
    return key in done and done[key] < t

1440

1441

def _simulate_aligned_schedule(padded, stage_num, real_stage_num, stage_to_rank):
    """Simulate execution time-step by time-step, inserting bubbles where
    a step is not yet ready (cross-rank dep) or where the cooldown
    rhythm requires it.

    At each time step every rank looks at its next pending entry:

    * ``_ALIGN_PAD`` or ``None`` is consumed and emitted as a bubble;
    * a ``BWD`` during cooldown (no ``FWD`` left on the rank) that directly
      follows another cooldown ``BWD`` is delayed by one bubble, so
      cooldown alternates ``BWD`` / bubble;
    * a step whose cross-rank producer has not completed yet
      (:func:`_step_dep_ready`) is delayed by one bubble;
    * otherwise the step is emitted and recorded as done at this time.

    The simulation stops once every rank has consumed its entries or after
    a fixed budget of time steps. If the budget runs out first (which
    points at a dependency that can never be satisfied), a warning is
    logged and the partial schedule built so far is returned.

    Args:
        padded: ``{rank: [step | _ALIGN_PAD | None, ...]}`` after
            1F1B-boundary padding.
        stage_num: Total number of virtual pipeline stages.
        real_stage_num: Number of physical ranks.
        stage_to_rank: Topology mapping from stage to rank.

    Returns:
        ``{rank: [step | None, ...]}`` ready for the column-scan SEND/RECV
        insertion phase.
    """
    ranks = range(real_stage_num)
    remaining_fwd = {
        rank: sum(
            1 for s in padded[rank]
            if s is not None and s is not _ALIGN_PAD and s.type == MetaStepType.FWD
        )
        for rank in ranks
    }
    cursors = {r: 0 for r in ranks}
    aligned = {r: [] for r in ranks}
    done = {}
    last_was_cooldown_bwd = {r: False for r in ranks}
    # Upper bound on simulated time so an unsatisfiable dependency cannot
    # loop forever.
    max_t = sum(len(v) for v in padded.values()) + real_stage_num * 20

    def _finished():
        return all(cursors[r] >= len(padded[r]) for r in ranks)

    def _emit_bubble(rank):
        aligned[rank].append(None)
        last_was_cooldown_bwd[rank] = False

    def _emit_step(rank, step, t, in_cooldown):
        aligned[rank].append(step)
        done[(step.type, step.stage_index, step.micro_index)] = t
        cursors[rank] += 1
        if step.type == MetaStepType.FWD:
            remaining_fwd[rank] -= 1
        last_was_cooldown_bwd[rank] = in_cooldown and step.type == MetaStepType.BWD

    def _step_rank_at(t, rank):
        if cursors[rank] >= len(padded[rank]):
            return
        item = padded[rank][cursors[rank]]
        # Explicit bubbles (caller-provided ``None`` or alignment padding)
        # are consumed as-is; neither has a ``type`` to inspect.
        if item is None or item is _ALIGN_PAD:
            _emit_bubble(rank)
            cursors[rank] += 1
            return
        in_cooldown = remaining_fwd[rank] == 0
        # Cooldown rhythm: alternate None / BWD in pure-BWD phase.
        cooldown_skip = (
            in_cooldown
            and item.type == MetaStepType.BWD
            and last_was_cooldown_bwd[rank]
        )
        if cooldown_skip:
            _emit_bubble(rank)
            return
        if not _step_dep_ready(item, rank, t, done, stage_num, stage_to_rank):
            _emit_bubble(rank)
            return
        _emit_step(rank, item, t, in_cooldown)

    for t in range(max_t):
        if _finished():
            break
        for rank in ranks:
            _step_rank_at(t, rank)

    if not _finished():
        # Budget exhausted with steps still pending: the returned schedule
        # is incomplete. Surface it instead of truncating silently.
        logger.warning(
            "Pipeline alignment simulation stopped after %d time steps with "
            "unscheduled steps remaining (pending per rank: %s); the aligned "
            "schedule is incomplete.",
            max_t,
            {r: len(padded[r]) - cursors[r] for r in ranks if cursors[r] < len(padded[r])},
        )
    return aligned

1512

1513

def auto_align_and_add_send_recv(scheduler, stage_num, real_stage_num, style='loop'):
    """Align a compute-only schedule with bubbles, then add P2P send/recv steps.

    In contrast to :func:`add_send_recv`, the caller does not have to place
    ``None`` bubble slots by hand: the input is a **pure compute order**
    (``FWD`` / ``BWD``), and bubble placement is derived by simulating the
    execution.

    The alignment honours three rules:

    1. **Data dependency** — ``FWD(stage_k)`` waits for ``FWD(stage_{k-1})``
       on the producing rank (and likewise for ``BWD``).
    2. **1F1B transition alignment** — ``real_stage_num - 1 - rank`` padding
       slots are placed at the warmup → 1F1B boundary, i.e. in front of the
       first ``FWD`` that is directly followed by a ``BWD``, so every rank
       enters the steady state in lockstep.
    3. **Cooldown rhythm** — after a rank has no ``FWD`` left, consecutive
       ``BWD`` steps are separated by a ``None`` slot, so no rank runs ``BWD``
       in a time step where another rank runs ``FWD``.

    Once aligned, a column scan inserts ``FWD_SEND`` / ``FWD_RECV`` and
    ``BWD_SEND`` / ``BWD_RECV`` with the prefetch semantics of
    :func:`add_send_recv`.

    Args:
        scheduler: ``{rank: [MetaStep, ...]}`` — pure compute schedule.
            ``None`` entries are dropped before processing.
        stage_num: Total number of virtual pipeline stages.
        real_stage_num: Number of physical ranks.
        style: Topology mapping — ``'loop'`` or ``'v'``.

    Returns:
        ``{rank: [MetaStep, ...]}`` — aligned schedule with bubbles and
        communication ops inserted.

    Raises:
        ValueError: If ``style`` is neither ``'loop'`` nor ``'v'`` (raised
            the first time a stage has to be mapped to a rank).
    """

    # ---- topology helpers (also used by the column-scan phase) ----

    def stage_to_rank(stage_index: int) -> int:
        """Return the rank hosting ``stage_index`` under the chosen topology."""
        if style == 'v':
            # First leg goes down the ranks, second leg comes back up.
            if stage_index < real_stage_num:
                return stage_index
            return stage_num - 1 - stage_index
        if style == 'loop':
            return stage_index % real_stage_num
        raise ValueError(f"Argument 'style' must be 'loop' or 'v', but got {style!r}.")

    def _remote_rank(stage_index, neighbour_stage):
        """Rank of ``neighbour_stage``, or ``None`` if it shares this stage's rank."""
        neighbour_rank = stage_to_rank(neighbour_stage)
        if neighbour_rank == stage_to_rank(stage_index):
            return None
        return neighbour_rank

    def _fwd_peer(stage_index: int):
        """Rank receiving this stage's activations (``None`` if no P2P is needed)."""
        if stage_index >= stage_num - 1:
            return None
        return _remote_rank(stage_index, stage_index + 1)

    def _bwd_peer(stage_index: int):
        """Rank receiving this stage's input gradients (``None`` if no P2P is needed)."""
        if stage_index <= 0:
            return None
        return _remote_rank(stage_index, stage_index - 1)

    # ---- Phase 1: drop None, locate the 1F1B boundary, add transition padding ----

    def _find_1f1b_boundary(order):
        """Index of the first FWD followed by BWD; ``len(order)`` if absent."""
        for position, (current, following) in enumerate(zip(order, order[1:])):
            if current.type == MetaStepType.FWD and following.type == MetaStepType.BWD:
                return position
        return len(order)

    padded = {}
    for rank in range(real_stage_num):
        compute_order = [step for step in scheduler[rank] if step is not None]
        split_at = _find_1f1b_boundary(compute_order)
        transition_pad = [_ALIGN_PAD] * (real_stage_num - 1 - rank)
        padded[rank] = compute_order[:split_at] + transition_pad + compute_order[split_at:]

    # ---- Phase 2: simulate execution with data deps + cooldown rhythm ----

    aligned = _simulate_aligned_schedule(padded, stage_num, real_stage_num, stage_to_rank)

    # ---- Phase 3: column-scan SEND/RECV insertion (same as add_send_recv) ----

    def _insert_comms_for_step(step, rank, new_schedule):
        """Append the SEND on ``rank`` and the matching RECV on the peer rank."""
        if step is None:
            return
        kind = step.type
        if kind in (MetaStepType.OVERLAP_F_B, MetaStepType.OVERLAP_B_F):
            # Composite step: handle each sub-step as if it ran on its own.
            for sub in step.sub_steps or ():
                _insert_comms_for_step(sub, rank, new_schedule)
            return
        if kind == MetaStepType.FWD:
            peer = _fwd_peer(step.stage_index)
            send_kind, recv_kind, stage_shift = MetaStepType.FWD_SEND, MetaStepType.FWD_RECV, 1
        elif kind == MetaStepType.BWD:
            peer = _bwd_peer(step.stage_index)
            send_kind, recv_kind, stage_shift = MetaStepType.BWD_SEND, MetaStepType.BWD_RECV, -1
        else:
            return
        if peer is None:
            return
        new_schedule[rank].append(
            MetaStep(step.micro_index, send_kind, step.stage_index))
        new_schedule[peer].append(
            MetaStep(step.micro_index, recv_kind, step.stage_index + stage_shift))

    # OVERLAP steps are widened to 2 virtual slots before the column scan so
    # that the RECV caused by an overlap's second sub-step lands one slot
    # later on the receiver: the sender can only emit the second sub-step
    # after the first one has completed.
    expanded = _expand_overlap_slots(aligned, real_stage_num)
    return _column_scan_insert_comms(expanded, real_stage_num, _insert_comms_for_step)

1625

1626

class ScheduleGPipe(PipelineScheduleRuntime):
    """
    The GPipe schedule.

    Every rank first runs the forward pass of all micro batches and only then
    the backward pass of all micro batches.
    """

    def __init__(self,
                 stages,
                 micro_batch_num,
                 args_batch_dim=None,
                 kwargs_batch_dim=None,
                 output_concat_dim=None,
                 swap=False):
        """Initialise the base runtime and immediately build the execution order."""
        super().__init__(stages,
                         micro_batch_num,
                         args_batch_dim=args_batch_dim,
                         kwargs_batch_dim=kwargs_batch_dim,
                         output_concat_dim=output_concat_dim,
                         swap=swap)
        self.build_exec_order()

    def _build_stage_to_rank_index(self) -> None:
        """Populate the stage-to-rank mapping using the ``'loop'`` topology."""
        self._stage_to_rank_index = generate_stage_to_rank_mapping(
            self.real_stage_num, self._stage_num, style='loop'
        )

    def construct_exec_order(self):
        """construct_exec_order of Gpipe.

        For each stage, fills ``self.exec_order[stage_index]`` with all forward
        micro batches (RECV, FWD, SEND) followed by all backward micro batches
        (RECV, BWD, SEND). The first stage has no forward RECV / backward SEND
        and the last stage has no forward SEND / backward RECV.
        """
        final_stage = self.real_stage_num - 1
        for stage_index in range(self.real_stage_num):
            has_upstream = stage_index != 0
            has_downstream = stage_index != final_stage

            # Step kinds emitted per micro batch, in execution order.
            forward_kinds = [MetaStepType.FWD]
            backward_kinds = [MetaStepType.BWD]
            if has_upstream:
                forward_kinds.insert(0, MetaStepType.FWD_RECV)
                backward_kinds.append(MetaStepType.BWD_SEND)
            if has_downstream:
                forward_kinds.append(MetaStepType.FWD_SEND)
                backward_kinds.insert(0, MetaStepType.BWD_RECV)

            self.exec_order[stage_index] = [
                MetaStep(mb_index, kind, stage_index)
                for phase_kinds in (forward_kinds, backward_kinds)
                for mb_index in range(self.micro_batch_num)
                for kind in phase_kinds
            ]

1669

1670

class Schedule1F1B(PipelineScheduleRuntime):
    """
    The 1F1B schedule.
    It will perform one forward and one backward on the micro batches in steady state.

    The per-stage order is built in three phases (see
    :meth:`construct_exec_order`): a forward-only warmup, a steady phase that
    alternates one backward with one forward, and a backward-only cooldown.
    """
    def __init__(self,
                 stages,
                 micro_batch_num,
                 args_batch_dim=None,
                 kwargs_batch_dim=None,
                 output_concat_dim=None,
                 swap=False):
        """Forward all arguments to the base runtime, then build the execution order."""
        super().__init__(stages,
                         micro_batch_num,
                         args_batch_dim=args_batch_dim,
                         kwargs_batch_dim=kwargs_batch_dim,
                         output_concat_dim=output_concat_dim,
                         swap=swap)
        self.build_exec_order()

    def _build_stage_to_rank_index(self) -> None:
        """Populate the stage-to-rank mapping using the ``'loop'`` topology."""
        self._stage_to_rank_index = generate_stage_to_rank_mapping(
            self.real_stage_num, self._stage_num, style='loop'
        )

    def construct_exec_order(self):
        """construct_exec_order of 1F1B.

        Fills ``self.exec_order[stage_index]`` for every stage with the ordered
        list of compute and P2P ``MetaStep`` entries.

        ``fwd_index`` / ``bwd_index`` are the next micro batch to run forward /
        backward on this stage. The ``FWD_SEND`` of the last warmup forward is
        deliberately not emitted during warmup; it is emitted later as
        ``FWD_SEND(fwd_index - 1)`` by exactly one of the three sites below
        (under-filled warmup, first steady iteration, or first cooldown
        iteration).
        """
        for stage_index in range(self.real_stage_num):
            order_list = []
            fwd_index = 0
            bwd_index = 0
            # warmup phase: earlier stages run more forwards before their first
            # backward, capped by the number of micro batches.
            warmup_micro_batches = min(self.real_stage_num - stage_index, self.micro_batch_num)
            for _ in range(warmup_micro_batches):
                # Only the first stage has no upstream activation to receive.
                if stage_index != 0:
                    order_list.append(MetaStep(fwd_index, MetaStepType.FWD_RECV, stage_index))
                if stage_index % 2 == 0:
                    # Even stages: send right after computing, except for the
                    # last warmup forward whose send is deferred.
                    order_list.append(MetaStep(fwd_index, MetaStepType.FWD, stage_index))
                    if fwd_index != warmup_micro_batches - 1:
                        order_list.append(MetaStep(fwd_index, MetaStepType.FWD_SEND, stage_index))
                else:
                    # Odd stages: send the previous micro batch's output before
                    # computing the current one, so the last warmup forward's
                    # send is likewise left pending.
                    if fwd_index > 0:
                        order_list.append(MetaStep(fwd_index - 1, MetaStepType.FWD_SEND, stage_index))
                    order_list.append(MetaStep(fwd_index, MetaStepType.FWD, stage_index))
                fwd_index += 1

            # If there are too few micro batches to fill the warmup phase, there
            # is no steady phase to carry the pending forward send, so issue it
            # now. Bumping fwd_index past micro_batch_num makes the cooldown
            # check below (fwd_index <= micro_batch_num) skip a second send.
            if self.real_stage_num - stage_index > self.micro_batch_num:
                order_list.append(MetaStep(fwd_index - 1, MetaStepType.FWD_SEND, stage_index))
                fwd_index += 1
            # steady phase: one backward followed by one forward per iteration.
            steady_micro_batches = self.micro_batch_num - warmup_micro_batches
            for _ in range(steady_micro_batches):
                # The last stage has no downstream peer: nothing to receive for
                # backward and no forward output to send.
                if stage_index != self.real_stage_num - 1:
                    order_list.append(MetaStep(bwd_index, MetaStepType.BWD_RECV, stage_index))
                    # Send the output of the most recent forward.
                    order_list.append(MetaStep(fwd_index - 1, MetaStepType.FWD_SEND, stage_index))
                order_list.append(MetaStep(bwd_index, MetaStepType.BWD, stage_index))

                # The first stage has no upstream peer: no gradient to send and
                # no activation to receive.
                if stage_index != 0:
                    order_list.append(MetaStep(bwd_index, MetaStepType.BWD_SEND, stage_index))
                    order_list.append(MetaStep(fwd_index, MetaStepType.FWD_RECV, stage_index))
                order_list.append(MetaStep(fwd_index, MetaStepType.FWD, stage_index))
                fwd_index += 1
                bwd_index += 1

            # cooldown phase: drain the backwards that match the warmup forwards.
            cooldown_micro_batches = warmup_micro_batches
            for _ in range(cooldown_micro_batches):
                if stage_index != self.real_stage_num - 1:
                    order_list.append(MetaStep(bwd_index, MetaStepType.BWD_RECV, stage_index))
                    # First cooldown iteration only (bwd_index equals the steady
                    # count): send the still-pending last forward output unless
                    # the under-filled warmup branch above already sent it.
                    if bwd_index == self.micro_batch_num - warmup_micro_batches and fwd_index <= self.micro_batch_num:
                        order_list.append(MetaStep(fwd_index - 1, MetaStepType.FWD_SEND, stage_index))
                order_list.append(MetaStep(bwd_index, MetaStepType.BWD, stage_index))

                if stage_index != 0:
                    order_list.append(MetaStep(bwd_index, MetaStepType.BWD_SEND, stage_index))
                bwd_index += 1
            self.exec_order[stage_index] = order_list

1749

1750

class ScheduleInterleaved1F1B(PipelineScheduleRuntime):
    """The Interleaved 1F1B schedule.

    Supports multiple stages per rank. In steady state, performs one
    forward followed by one backward on each micro-batch. Handles the
    cases where ``micro_batch_num`` is less than, equal to, or greater
    than the stage count, including non-evenly-divisible micro counts.

    Overlap-related behaviour is controlled by three constructor flags:

    * ``overlap_p2p=True``: defer P2P recv ``handle.wait()`` until the
      consuming FWD/BWD step (or the OVERLAP_B_F callback when
      ``overlap_b_f=True``), letting recv overlap with prior compute.
    * ``overlap_b_f=True``: in the 1F1B steady state, pair consecutive
      ``(B_i, F_{i+1})`` steps into ``OVERLAP_B_F`` composite steps so
      a registered callback can drive comm/compute overlap (typically
      via :class:`CommComputeOverlap` for MoE EP A2A). Users register
      the callback through :meth:`register_custom_function`.
    * ``enable_dxdw_split=True`` (requires ``overlap_b_f``): each
      steady-state pair becomes ``(BWD_INPUT_i, F_{i+1})`` and the
      ``BWD_WEIGHT_i`` runs as its own step after the pair's P2P gap
      (see :func:`split_overlap_dxdw`), so the input-grad send is
      issued once dx and the paired forward finish instead of waiting
      out the full backward.

    ``overlap_p2p`` and ``overlap_b_f`` are independent and can be combined;
    ``enable_dxdw_split`` is a refinement of ``overlap_b_f`` and cannot be
    used together with ``swap``.

    Example:
        >>> # Plain interleaved 1F1B
        >>> sched = ScheduleInterleaved1F1B(stages, 8)
        >>> # With B/F overlap (dual-pipe-style comm/compute overlap)
        >>> sched = ScheduleInterleaved1F1B(stages, 8, overlap_b_f=True)
        >>> sched.register_custom_function(MetaStepType.OVERLAP_B_F, callback)
    """
    def __init__(self,
                 stages,
                 micro_batch_num,
                 args_batch_dim=None,
                 kwargs_batch_dim=None,
                 output_concat_dim=None,
                 overlap_p2p=False,
                 overlap_b_f=False,
                 swap=False,
                 enable_dxdw_split=False,
                 p2p_transport="auto"):
        """Initialise the runtime, validate the flag combination and build the order.

        Raises:
            ValueError: If ``enable_dxdw_split`` is set without ``overlap_b_f``
                or together with ``swap``.
        """
        super().__init__(stages,
                         micro_batch_num,
                         args_batch_dim=args_batch_dim,
                         kwargs_batch_dim=kwargs_batch_dim,
                         output_concat_dim=output_concat_dim,
                         overlap_p2p=overlap_p2p,
                         swap=swap,
                         p2p_transport=p2p_transport)
        # _overlap_b_f selects between plain F/B emission and OVERLAP_B_F
        # pairing in the 1F1B steady-state phase. Must be set before
        # ``construct_stage_exec_order`` is called below.
        self._overlap_b_f = overlap_b_f
        # dx/dw split: ``construct_exec_order`` rewrites each steady-state
        # OVERLAP_B_F pair to ``(BWD_INPUT, FWD)`` and re-emits the matching
        # BWD_WEIGHT after the pair's P2P gap (``split_overlap_dxdw``), so the
        # input-grad send leaves at ``max(dx, fwd)`` and dw overlaps with the
        # in-flight P2P instead of delaying it.
        self._enable_dxdw_split = enable_dxdw_split
        if enable_dxdw_split and not overlap_b_f:
            raise ValueError(
                "enable_dxdw_split=True requires overlap_b_f=True; the split "
                "is only applied to BWD sub-steps inside OVERLAP_B_F composite steps."
            )
        if enable_dxdw_split and swap:
            raise ValueError(
                "enable_dxdw_split=True is incompatible with swap=True: "
                "pipeline-swap injection anchors on unified BWD steps and "
                "does not recognize the split BWD_INPUT/BWD_WEIGHT steps."
            )

        self._init_round_layout()
        self.build_exec_order()

    def _init_round_layout(self):
        """Compute per-round micro-batch counts used by stage-order emission.

        Populates ``n_rounds``, ``n_microbatch_per_round`` and its prefix-sum
        ``n_microbatch_per_round_accu`` from ``micro_batch_num``,
        ``real_stage_num`` and ``n_local_stages``. Factored out of
        ``__init__`` so the pure schedule-construction path (used by offline
        unit tests) can be exercised without instantiating stages.
        """
        self.n_rounds = max(1, self.micro_batch_num // self.real_stage_num)
        if self.micro_batch_num < self.real_stage_num:
            # Short-micro regime: a single round that holds exactly
            # ``micro_batch_num`` micro-batches (``base`` is negative here).
            base = self.micro_batch_num - self.real_stage_num
            remainder = 0
        else:
            # Spread the micro-batches that do not fill a whole round evenly
            # over the rounds; the first ``remainder`` rounds get one more.
            n_extra_microbatch = self.micro_batch_num % self.real_stage_num
            base = n_extra_microbatch // self.n_rounds
            remainder = n_extra_microbatch % self.n_rounds
        self.n_microbatch_per_round = \
            [self.real_stage_num + base + 1 if i < remainder else
             self.real_stage_num + base for i in range(self.n_rounds)]
        # Prefix sums in units of ops (micro-batches x local stages), with a
        # leading 0 so ``bisect_right(...) - 1`` yields the round of an op.
        self.n_microbatch_per_round_accu = \
            [x * self.n_local_stages for x in itertools.accumulate(self.n_microbatch_per_round)]
        self.n_microbatch_per_round_accu.insert(0, 0)

    def construct_exec_order(self):
        """Build the compute order per stage, then add P2P steps and optional dx/dw split."""
        for stage_index in range(self.real_stage_num):
            self.exec_order[stage_index] = self.construct_stage_exec_order(stage_index)
        self.exec_order = add_send_recv(self.exec_order, self._stage_num, self.real_stage_num, style='loop')
        # Read the attribute that ``__init__`` of this class actually stores.
        if self._enable_dxdw_split:
            self.exec_order = split_overlap_dxdw(self.exec_order)

    def _build_stage_to_rank_index(self) -> None:
        """Populate the stage-to-rank mapping using the ``'loop'`` topology."""
        self._stage_to_rank_index = generate_stage_to_rank_mapping(
            self.real_stage_num, self._stage_num, style='loop'
        )

    def warmup_ops(self, stage_index):
        """warmup phase.

        Returns the number of forward-only ops ``stage_index`` runs before its
        first backward, capped by the total number of forward ops on a rank.
        """
        warmup_ops_last_stage = (self.n_local_stages - 1) * self.n_microbatch_per_round[0]
        warmup_ops = warmup_ops_last_stage + 2 * (self.real_stage_num - 1 - stage_index)
        return min(warmup_ops, self.micro_batch_num * self.n_local_stages)

    def forward_stage_index(self, op_index, stage_index):
        """obtain forward stage_index based on op_index."""
        # Round that contains this op, then the local chunk inside the round.
        accu_index = bisect.bisect_right(self.n_microbatch_per_round_accu, op_index) - 1
        local_index = (op_index - self.n_microbatch_per_round_accu[accu_index]) // \
            self.n_microbatch_per_round[accu_index]
        return (local_index * self.real_stage_num) + stage_index

    def backward_stage_index(self, op_index, stage_index):
        """obtain backward stage_index based on op_index."""
        accu_index = bisect.bisect_right(self.n_microbatch_per_round_accu, op_index) - 1
        local_index = (op_index - self.n_microbatch_per_round_accu[accu_index]) // \
            self.n_microbatch_per_round[accu_index]
        # Backward visits the local chunks in the reverse order of forward.
        local_index = self.n_local_stages - 1 - local_index
        return (local_index * self.real_stage_num) + stage_index

    def _short_micro(self) -> bool:
        """True when ``micro_batch_num < real_stage_num`` (extra-bubble regime)."""
        return self.micro_batch_num < self.real_stage_num

    def _trailing_bubble(self) -> int:
        """Bubble count appended after a BWD with ``micro == micro_batch_num - 1``
        in the short-micro regime.
        """
        return self.real_stage_num - self.micro_batch_num

    def _emit_warmup_ops(self, stage_index, warmup_ops, fwd_stage_micro_index):
        """Emit pure-FWD warmup ops with optional short-micro bubble padding.

        ``fwd_stage_micro_index`` maps a stage index to its next forward
        micro-batch and is advanced in place.
        """
        ops = []
        short = self._short_micro()
        last_micro = self.micro_batch_num - 1
        last_stage = self.real_stage_num - 1
        bubble = self._trailing_bubble()
        for op_idx in range(warmup_ops):
            fwd_stage_idx = self.forward_stage_index(op_idx, stage_index)
            fwd_micro_idx = fwd_stage_micro_index[fwd_stage_idx]
            ops.append(MetaStep(fwd_micro_idx, MetaStepType.FWD, fwd_stage_idx))
            # Pad after a chunk's last micro-batch in the short-micro regime,
            # except after the final warmup op on ranks other than the last.
            need_pad = (
                short
                and fwd_micro_idx == last_micro
                and (op_idx != warmup_ops - 1 or stage_index == last_stage)
            )
            if need_pad:
                ops.extend([None] * bubble)
            fwd_stage_micro_index[fwd_stage_idx] += 1
        return ops

    def _emit_cooldown_ops(self, stage_index, warmup_ops, fwd_bwd_ops, total_ops,
                           bwd_stage_micro_index):
        """Emit pure-BWD cooldown ops (each preceded by a bubble) with
        optional short-micro trailing padding.
        """
        ops = []
        short = self._short_micro()
        last_micro = self.micro_batch_num - 1
        # Double the bubble at each chunk's last-micro BWD: one ``bubble`` covers
        # the missing ``rs - micro`` micros, the second offsets the next chunk
        # by 2 slots so the wrap-around grad (rank 0 stage ``rs`` -> rank
        # last_stage stage ``rs - 1``) lands AFTER its producer in column-scan
        # time. Matches the +2 cooldown-rhythm offset that non-short Interleaved
        # 1F1B naturally has from extra 1F1B ops on rank last_stage.
        bubble = 2 * self._trailing_bubble()
        for op_idx in range(warmup_ops + fwd_bwd_ops, total_ops):
            ops.append(None)
            bwd_stage_idx = self.backward_stage_index(op_idx - warmup_ops, stage_index)
            bwd_micro_idx = bwd_stage_micro_index[bwd_stage_idx]
            ops.append(MetaStep(bwd_micro_idx, MetaStepType.BWD, bwd_stage_idx))
            if short and bwd_micro_idx == last_micro:
                ops.extend([None] * bubble)
            bwd_stage_micro_index[bwd_stage_idx] += 1
        return ops

    def _emit_1f1b_ops(self, stage_index, warmup_ops, fwd_bwd_ops,
                       fwd_stage_micro_index, bwd_stage_micro_index):
        """Emit interleaved (FWD, BWD) pairs for the 1F1B steady-state phase."""
        ops = []
        short = self._short_micro()
        last_micro = self.micro_batch_num - 1
        last_stage = self.real_stage_num - 1
        # Double the bubble at the 1F1B->cooldown chunk boundary on rank
        # last_stage; see :meth:`_emit_cooldown_ops` for the alignment rationale.
        bubble = 2 * self._trailing_bubble()
        for op_idx in range(warmup_ops, warmup_ops + fwd_bwd_ops):
            fwd_stage_idx = self.forward_stage_index(op_idx, stage_index)
            fwd_micro_idx = fwd_stage_micro_index[fwd_stage_idx]
            ops.append(MetaStep(fwd_micro_idx, MetaStepType.FWD, fwd_stage_idx))
            fwd_stage_micro_index[fwd_stage_idx] += 1
            # Backward ops are indexed from the start of the 1F1B phase.
            bwd_stage_idx = self.backward_stage_index(op_idx - warmup_ops, stage_index)
            bwd_micro_idx = bwd_stage_micro_index[bwd_stage_idx]
            ops.append(MetaStep(bwd_micro_idx, MetaStepType.BWD, bwd_stage_idx))
            need_pad = (
                short
                and bwd_micro_idx == last_micro
                and stage_index == last_stage
            )
            if need_pad:
                ops.extend([None] * bubble)
            bwd_stage_micro_index[bwd_stage_idx] += 1
        return ops

    @staticmethod
    def _collect_fwd_bwd_steps(emit_fwd, emit_bwd, fwd_bwd_ops, warmup_ops):
        """Walk the 1F1B range collecting parallel ``fwd_steps`` / ``bwd_steps``.

        ``emit_fwd(op_idx)`` and ``emit_bwd(op_idx)`` build a single
        :class:`MetaStep` and advance their respective per-stage micro
        counters as a side effect.
        """
        fwd_steps = []
        bwd_steps = []
        # Keep the fwd/bwd calls interleaved: both emitters mutate counters.
        for op_idx in range(warmup_ops, warmup_ops + fwd_bwd_ops):
            fwd_steps.append(emit_fwd(op_idx))
            bwd_steps.append(emit_bwd(op_idx))
        return fwd_steps, bwd_steps

    @staticmethod
    def _pair_into_overlap_b_f(fwd_steps, bwd_steps):
        """Build ``F₁, [B_i, F_{i+1}], B_n`` ordering with OVERLAP_B_F pairs.

        ``sub_steps`` carry the ``(bwd, fwd)`` tuple — callbacks access
        them via ``step.sub_steps`` to recover per-direction stage /
        micro info.
        """
        ops = []
        if fwd_steps:
            ops.append(fwd_steps[0])  # F₁ runs alone
        for i in range(len(bwd_steps) - 1):
            ops.append(MetaStep(
                None, MetaStepType.OVERLAP_B_F, None,
                sub_steps=(bwd_steps[i], fwd_steps[i + 1]),
            ))
        if bwd_steps:
            ops.append(bwd_steps[-1])  # B_n runs alone
        return ops

    def _emit_1f1b_overlap_ops(self, stage_index, warmup_ops, fwd_bwd_ops,
                               fwd_stage_micro_index, bwd_stage_micro_index):
        """Emit ``F₁, [B_i, F_{i+1}], B_n`` for the 1F1B phase under
        ``overlap_b_f=True``. Each ``[B_i, F_{i+1}]`` becomes an
        ``OVERLAP_B_F`` composite step; a registered callback drives the
        actual concurrent execution. Short-micro extra-bubble padding
        on the last rank is appended after ``B_n``.
        """
        def emit_fwd(op_idx):
            fwd_si = self.forward_stage_index(op_idx, stage_index)
            fwd_mi = fwd_stage_micro_index[fwd_si]
            fwd_stage_micro_index[fwd_si] += 1
            return MetaStep(fwd_mi, MetaStepType.FWD, fwd_si)

        def emit_bwd(op_idx):
            bwd_si = self.backward_stage_index(op_idx - warmup_ops, stage_index)
            bwd_mi = bwd_stage_micro_index[bwd_si]
            bwd_stage_micro_index[bwd_si] += 1
            return MetaStep(bwd_mi, MetaStepType.BWD, bwd_si)

        fwd_steps, bwd_steps = self._collect_fwd_bwd_steps(
            emit_fwd, emit_bwd, fwd_bwd_ops, warmup_ops,
        )
        ops = self._pair_into_overlap_b_f(fwd_steps, bwd_steps)

        last_stage = self.real_stage_num - 1
        if self._short_micro() and stage_index == last_stage and bwd_steps:
            if bwd_steps[-1].micro_index == self.micro_batch_num - 1:
                # Double the bubble at the 1F1B->cooldown chunk boundary;
                # see :meth:`_emit_cooldown_ops` for the alignment rationale.
                ops.extend([None] * (2 * self._trailing_bubble()))
        return ops

    def construct_stage_exec_order(self, stage_index):
        """Construct the execution order for ``stage_index``.

        Builds: warmup → bubbles → 1F1B steady state → cooldown. The
        1F1B segment switches between :meth:`_emit_1f1b_ops` (plain) and
        :meth:`_emit_1f1b_overlap_ops` (OVERLAP_B_F pairing) based on
        the ``overlap_b_f`` constructor flag.

        Returns:
            A list of ``MetaStep`` entries and ``None`` bubble slots.
        """
        warmup_ops = self.warmup_ops(stage_index)
        fwd_bwd_ops = self.n_local_stages * self.micro_batch_num - warmup_ops
        total_ops = 2 * warmup_ops + fwd_bwd_ops
        # Rank ``stage_index`` starts ``stage_index`` slots after rank 0.
        order_list = [None for _ in range(stage_index)]
        # The two counters are shared by all phases and advanced in place.
        fwd_stage_micro_index = defaultdict(int)
        bwd_stage_micro_index = defaultdict(int)
        order_list.extend(self._emit_warmup_ops(stage_index, warmup_ops, fwd_stage_micro_index))
        bubbles_before_1f1b = max(
            0,
            2 * (self.real_stage_num - stage_index - 1) - self.micro_batch_num,
        )
        order_list.extend([None] * bubbles_before_1f1b)
        order_list.extend([None] * (self.real_stage_num - 1 - stage_index))
        if self._overlap_b_f:
            order_list.extend(self._emit_1f1b_overlap_ops(
                stage_index, warmup_ops, fwd_bwd_ops,
                fwd_stage_micro_index, bwd_stage_micro_index,
            ))
        else:
            order_list.extend(self._emit_1f1b_ops(
                stage_index, warmup_ops, fwd_bwd_ops,
                fwd_stage_micro_index, bwd_stage_micro_index,
            ))
        order_list.extend(self._emit_cooldown_ops(
            stage_index, warmup_ops, fwd_bwd_ops, total_ops, bwd_stage_micro_index,
        ))
        return order_list

2073

2074

def detect_cycle_in_graph(ranks_map):
    """
    Search the directed graph induced by ``ranks_map`` for a cycle.

    Each pair of consecutive nodes in a rank's list contributes one edge
    ``nodes[i] -> nodes[i + 1]``. If several ranks contribute the same edge,
    the rank seen last is the one reported for it.

    Args:
        ranks_map: A dictionary where keys are rank names and values are lists of nodes.

    Returns:
        tuple: ``(cycle_path, cycle_ranks)``. ``cycle_path`` lists the nodes of the
        first cycle found, ending with a repeat of its first node; ``cycle_ranks``
        holds one ``"<rank> <u> -> <v>"`` string per edge of that cycle.
        ``(None, None)`` is returned when the graph has no cycle.
    """
    adjacency = defaultdict(list)
    edge_owner = {}
    for rank, nodes in ranks_map.items():
        for src, dst in zip(nodes, nodes[1:]):
            adjacency[src].append(dst)
            edge_owner[(src, dst)] = rank

    seen = set()
    trail = []      # nodes on the current DFS path, root first
    position = {}   # node -> index in ``trail`` while it is on the path

    # Snapshot the keys: looking up a sink node below adds it to ``adjacency``.
    for root in list(adjacency):
        if root in seen:
            continue
        # Entries are (node, leaving); ``leaving`` marks the post-visit pop.
        pending = [(root, False)]
        while pending:
            node, leaving = pending.pop()
            if leaving:
                trail.pop()
                del position[node]
            elif node in position:
                # Back edge to a node that is still on the path: cycle found.
                loop_nodes = trail[position[node]:] + [node]
                loop_edges = [
                    f"{edge_owner[(u, v)]} {u} -> {v}"
                    for u, v in zip(loop_nodes, loop_nodes[1:])
                ]
                return loop_nodes, loop_edges
            elif node not in seen:
                seen.add(node)
                position[node] = len(trail)
                trail.append(node)
                pending.append((node, True))
                # Reversed so the first listed neighbour is explored first.
                pending.extend((nxt, False) for nxt in reversed(adjacency[node]))

    return None, None

2134

2135

def output_cycle_results(cycle_path, cycle_ranks):
    """
    Log the outcome of a cycle check.

    Args:
        cycle_path (list): Nodes forming a cycle, or an empty/``None`` value if
            no cycle was found. The path may already be closed (its last node
            repeats the first, as returned by ``detect_cycle_in_graph``) or
            open; it is logged as a closed loop either way.
        cycle_ranks (list): One description per edge of the cycle. ``None`` is
            treated as an empty list.

    Returns:
        None: Results are written to the module logger (errors when a cycle
        exists, a warning-level success message otherwise).
    """
    if not cycle_path:
        logger.warning("Cycle Check succeeded. There is no cycle in the graph.")
        return

    logger.error("Cycle detected:")
    rendered = [str(node) for node in cycle_path]
    # Append the closing node only when the caller passed an open path;
    # otherwise the start node would be printed twice at the end.
    already_closed = len(cycle_path) > 1 and cycle_path[0] == cycle_path[-1]
    if not already_closed:
        rendered.append(rendered[0])
    logger.error("%s", " -> ".join(rendered))
    logger.error("Involving ranks:")
    for rank in cycle_ranks or ():
        logger.error("%s", rank)

2156

2157

def parse_and_validate(data: dict, all_rank: bool = True):
    """
    Check that every operation string is listed by all the keys it references.

    Each value string is scanned for up to two ``(<digits>)`` groups; the digit
    strings are treated as keys of ``data``. A value is consistent when every
    referenced key exists and that key's own list contains the same value.
    All problems are reported through the module logger; nothing is raised for
    inconsistent data.

    Args:
        data (dict): A dictionary where keys are string identifiers and values are lists of strings.
        all_rank (bool): If True, every referenced key must be present in ``data``; a value with a
            missing key is reported and not checked further. If False, only referenced keys that
            are present in ``data`` are checked.

    Returns:
        None: Log error messages if validation fails, otherwise completes silently.
    """

    def is_string_list(candidate) -> bool:
        """True when ``candidate`` is a list whose items are all ``str``."""
        return isinstance(candidate, list) and all(isinstance(entry, str) for entry in candidate)

    def parse_elements(value: str, max_groups: int = 2) -> set:
        """Extract unique elements inside the first one or two parentheses from a string."""
        leading_groups = re.findall(r'\((\d+)\)', value)[:max_groups]
        return {group.strip() for group in leading_groups}

    if not isinstance(data, dict):
        logger.error("Input must be a dictionary with string keys and lists of strings as values.")
        return

    # Only well-formed entries take part in the cross-reference lookup.
    key_to_values = {
        key: set(values) for key, values in data.items() if is_string_list(values)
    }

    for key, values in data.items():
        if not is_string_list(values):
            logger.error("Values for key '%s' must be a list of strings.", key)
            continue

        for value in values:
            try:
                elements = parse_elements(value)
            except (ValueError, TypeError, AttributeError) as e:
                logger.error("Unable to parse elements from value '%s' in key '%s'. Error: %s", value, key, e)
                continue

            if all_rank:
                # Strict mode: every referenced key has to exist.
                missing_keys = elements - key_to_values.keys()
                if missing_keys:
                    logger.error("The following keys are missing for value '%s': %s", value, missing_keys)
                    continue
                targets = elements
            else:
                # Lenient mode: only look at the keys we actually have.
                targets = elements & key_to_values.keys()

            # The value must also appear in each referenced key's own list.
            for element in targets:
                if value not in key_to_values[element]:
                    logger.error("Key '%s' is missing the value '%s'.", element, value)

2218

2219

def generate_operations(order_list: dict[int, list[MetaStep]],
                        chunk_num: int,
                        com_type: str = 'loop') -> dict[str, list[str]]:
    """
    Generate formatted operations dictionary from pipeline execution order.

    Only P2P steps (``FWD_SEND`` / ``BWD_SEND`` / ``FWD_RECV`` / ``BWD_RECV``)
    produce an entry; every other step, and any ``None`` bubble slot, is
    skipped. A send and its matching receive render to the same string
    ``Send_Receive_(<src>)->(<dst>)_micro<m>_<k>th``, where ``k`` counts how
    often the same ``(src, dst, micro)`` triple has occurred so far on the
    rank being processed.

    Args:
        order_list (dict): Dictionary where keys are rank IDs and values are MetaStep execution sequences
        chunk_num (int): Number of chunks (virtual pipeline stages)
        com_type (str): Stage-to-rank mapping type ('loop' for cyclic, 'v' for V-shaped)

    Returns:
        Dictionary where keys are rank IDs (as strings) and values are lists of formatted operation strings.
        Ranks without any P2P step have no entry.

    Raises:
        ValueError: If ``com_type`` is neither ``'loop'`` nor ``'v'`` and at
            least one P2P step has to be mapped to a rank.
    """

    def stage_to_rank(stage_index, style, stage_num, real_stage_num):
        """Map stage index to rank"""
        if style == 'loop':
            return stage_index % real_stage_num
        if style == 'v':
            if stage_index < real_stage_num:
                return stage_index
            return stage_num - 1 - stage_index
        raise ValueError("Invalid style")

    real_stage = len(order_list)
    total_stages = real_stage * chunk_num

    # Communication rules: activations flow stage i -> i + 1, gradients
    # flow stage i -> i - 1.
    forward_comm = {i: i + 1 for i in range(total_stages - 1)}
    backward_comm = {i: i - 1 for i in range(1, total_stages)}

    # A SEND looks up its destination stage; a RECV needs the inverse map
    # (destination -> source), built once instead of scanned per step.
    send_target_by_type = {
        MetaStepType.FWD_SEND: forward_comm,
        MetaStepType.BWD_SEND: backward_comm,
    }
    recv_source_by_type = {
        MetaStepType.FWD_RECV: {dst: src for src, dst in forward_comm.items()},
        MetaStepType.BWD_RECV: {dst: src for src, dst in backward_comm.items()},
    }

    formatted_operations = defaultdict(list)

    for rank, steps in order_list.items():
        operation_counter = defaultdict(int)

        for step in steps:
            if step is None:
                # Bubble slot: carries no communication.
                continue

            if step.type in send_target_by_type:
                peer_stage = send_target_by_type[step.type].get(step.stage_index)
                if peer_stage is None:
                    continue
                src_rank = rank
                dst_rank = stage_to_rank(peer_stage, com_type, total_stages, real_stage)
            elif step.type in recv_source_by_type:
                peer_stage = recv_source_by_type[step.type].get(step.stage_index)
                if peer_stage is None:
                    continue
                src_rank = stage_to_rank(peer_stage, com_type, total_stages, real_stage)
                dst_rank = rank
            else:
                continue

            comm_pair = (src_rank, dst_rank, step.micro_index)
            operation_counter[comm_pair] += 1
            count = operation_counter[comm_pair]
            formatted_op = f"Send_Receive_({src_rank})->({dst_rank})_micro{step.micro_index}_{count}th"
            formatted_operations[str(rank)].append(formatted_op)

    # Convert defaultdict to dict
    return dict(formatted_operations)

2306

2307

def validate_pipeline_execution(order_list: dict[int, list[MetaStep]],
                                chunk_num: int,
                                com_type: str = 'loop') -> dict[str, object]:
    """
    Validate the communication order of a pipeline-parallel schedule.

    The per-rank execution order is rendered into formatted send/receive
    operation strings by ``generate_operations``. Those strings are passed to
    ``parse_and_validate`` and then to ``detect_cycle_in_graph``; the cycle
    search outcome is reported through ``output_cycle_results`` and returned.

    Args:
        order_list: Dictionary where keys are rank IDs and values are MetaStep execution sequences
        chunk_num: Number of chunks (virtual pipeline stages)
        com_type: Stage-to-rank mapping type ('loop' for cyclic, 'v' for V-shaped)

    Returns:
        Dictionary with exactly the following keys:
            - formatted_operations: Per-rank formatted operation strings from ``generate_operations``
            - cycle_path: Cycle path returned by ``detect_cycle_in_graph``
            - cycle_ranks: Cycle ranks returned by ``detect_cycle_in_graph``
            - has_cycle: ``bool(cycle_path)``, True when the cycle path is non-empty
    """
    # Render every rank's SEND/RECV steps into comparable operation strings.
    formatted_operations = generate_operations(order_list, chunk_num, com_type)

    # The return value is not captured, so nothing from this call reaches the
    # result dictionary.
    # NOTE(review): confirm that parse_and_validate surfaces problems on its own
    # (logging or raising), because its result is dropped here.
    parse_and_validate(formatted_operations, True)

    # Search the communication graph built from the operation strings for cycles.
    cycle_path, cycle_ranks = detect_cycle_in_graph(formatted_operations)

    # Report the cycle search outcome.
    output_cycle_results(cycle_path, cycle_ranks)

    return {
        'formatted_operations': formatted_operations,
        'cycle_path': cycle_path,
        'cycle_ranks': cycle_ranks,
        'has_cycle': bool(cycle_path),
    }

2353

2354

# Step types treated as compute work (forward, full backward, and the split
# input/weight backward passes). Every other MetaStepType is ignored by
# _next_active_stage_indices when it builds its lookahead window.
_COMPUTE_META_STEP_TYPES = frozenset((
    MetaStepType.FWD,
    MetaStepType.BWD,
    MetaStepType.BWD_INPUT,
    MetaStepType.BWD_WEIGHT,
))

2361

2362

def _next_active_stage_indices(actions, start_index, max_active_stages, managed_stage_indices):
    """Collect the upcoming distinct managed stages that run compute work.

    Scans ``actions`` from ``start_index`` onwards and looks at every leaf step
    yielded by ``iter_leaf_meta_steps``. Only leaf steps whose type is in
    ``_COMPUTE_META_STEP_TYPES`` are considered, so communication-only and
    FSDP control steps do not use up the lookahead budget.

    Args:
        actions: Sequence of schedule actions to scan.
        start_index: Position in ``actions`` where the scan begins.
        max_active_stages: The scan stops as soon as exactly this many distinct
            stages have been collected. A value that is never reached (for
            example a non-positive one) means the whole tail is scanned.
        managed_stage_indices: Container of stage indices that are eligible.

    Returns:
        List of stage indices in order of first appearance, without duplicates.
    """
    # A dict is used as an insertion-ordered set: keys keep first-seen order.
    ordered_stages = {}
    for pending in actions[start_index:]:
        for leaf in iter_leaf_meta_steps(pending):
            if leaf.type not in _COMPUTE_META_STEP_TYPES:
                continue
            stage = leaf.stage_index
            if stage in managed_stage_indices and stage not in ordered_stages:
                ordered_stages[stage] = None
                if len(ordered_stages) == max_active_stages:
                    return list(ordered_stages)
    return list(ordered_stages)

2383

2384

def add_fsdp_unshard_reshard(actions, managed_stage_indices, max_active_stages=3):
    """Interleave FSDP unshard/reshard steps with the given schedule.

    Before each action, the window of upcoming compute stages is recomputed
    with ``_next_active_stage_indices``. Stages that were active but are no
    longer in the window get an ``FSDP_RESHARD`` step, and stages that newly
    entered the window get an ``FSDP_UNSHARD`` step. After the last action,
    every stage still active is resharded in the order it was unsharded.

    Args:
        actions: Original sequence of schedule actions; it is not modified.
        managed_stage_indices: Stage indices handled locally. When falsy,
            ``actions`` is returned unchanged (the same object).
        max_active_stages: Cap forwarded to ``_next_active_stage_indices``.

    Returns:
        A new list with the FSDP steps inserted, or ``actions`` itself when
        there is nothing to manage.
    """
    if not managed_stage_indices:
        return actions

    scheduled = []
    resident = []  # stages currently unsharded, oldest first
    for position, step in enumerate(actions):
        window = _next_active_stage_indices(
            actions, position, max_active_stages, managed_stage_indices
        )
        # Both lists are derived from the state before it is mutated below.
        leaving = [stage for stage in resident if stage not in window]
        entering = [stage for stage in window if stage not in resident]

        scheduled.extend(
            MetaStep(None, MetaStepType.FSDP_RESHARD, stage) for stage in leaving
        )
        resident = [stage for stage in resident if stage in window]

        scheduled.extend(
            MetaStep(None, MetaStepType.FSDP_UNSHARD, stage) for stage in entering
        )
        resident.extend(entering)

        scheduled.append(step)

    # Release whatever is still unsharded once the schedule is exhausted.
    scheduled.extend(
        MetaStep(None, MetaStepType.FSDP_RESHARD, stage) for stage in resident
    )
    return scheduled

2409

2410

def add_fsdp_reduce_grad(actions, managed_stage_indices, micro_batch_num):
    """Append an FSDP reduce-grad step after each qualifying backward action.

    An action qualifies for a stage when one of its leaf steps (as yielded by
    ``iter_leaf_meta_steps``) belongs to a managed stage, has type ``BWD`` or
    ``BWD_WEIGHT``, and carries micro index ``micro_batch_num - 1``. One
    ``FSDP_REDUCE_GRAD`` step per such stage is placed directly after the
    action, in order of first appearance within that action.

    Args:
        actions: Original sequence of schedule actions; it is not modified.
        managed_stage_indices: Stage indices handled locally. When falsy,
            ``actions`` is returned unchanged (the same object).
        micro_batch_num: Number of micro batches; the last index is
            ``micro_batch_num - 1``.

    Returns:
        A new list with the reduce-grad steps inserted, or ``actions`` itself
        when there is nothing to manage.
    """
    if not managed_stage_indices:
        return actions

    augmented = []
    for step in actions:
        augmented.append(step)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        finished_stages = dict.fromkeys(
            leaf.stage_index
            for leaf in iter_leaf_meta_steps(step)
            if leaf.stage_index in managed_stage_indices
            and leaf.type in (MetaStepType.BWD, MetaStepType.BWD_WEIGHT)
            and leaf.micro_index == micro_batch_num - 1
        )
        augmented.extend(
            MetaStep(None, MetaStepType.FSDP_REDUCE_GRAD, stage_index)
            for stage_index in finished_stages
        )
    return augmented

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/pipeline_parallel/scheduler.py: 29%

1120 statements