Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""framework platform api"""

16import os

17from datetime import timedelta

18from enum import auto, Enum

19from typing import Optional, Any, Union

21import numpy as np

# Name of the environment variable that selects which AI framework platform is used.
HYPER_PARALLEL_PLATFORM = "HYPER_PARALLEL_PLATFORM"

# Value of that variable which selects the MindSpore framework.
HYPER_PARALLEL_PLATFORM_MINDSPORE = "mindspore"

# Value of that variable which selects the PyTorch framework.
HYPER_PARALLEL_PLATFORM_TORCH = "torch"

class AsyncHandle:
    """Wait handle around the result of an asynchronous collective.

    Holds the async tensor returned by
    :meth:`Platform.differentiable_all_to_all_single_async`. :meth:`wait`
    may be called repeatedly; once a wait has completed, further calls
    return the tensor without contacting the platform again.
    """

    def __init__(self, async_tensor) -> None:
        # Nothing has been waited on yet when the handle is created.
        self._waited = False
        self._tensor = async_tensor

    def wait(self):
        """Block until the wrapped collective has finished.

        The first call delegates to ``wait_async_tensor`` of the active
        platform; later calls are no-ops.

        Returns:
            The wrapped tensor, materialised after the wait.
        """
        if self._waited:
            return self._tensor
        get_platform().wait_async_tensor(self._tensor)
        # Flip the flag only after the platform call returned without raising.
        self._waited = True
        return self._tensor

class PlatformType(Enum):
    """Supported AI framework platform types.

    Each member identifies one deep learning framework backend.
    """
    # Member values are generated automatically in declaration order.
    MINDSPORE = auto()
    PYTORCH = auto()

# Module-level cache for the platform object; stays None until one is created.
platform = None

def get_mindspore_platform():
    """Instantiate the MindSpore platform and cache it globally.

    A new ``MindSporePlatform`` is built on every call and stored in the
    module-level ``platform`` variable before being returned.

    Returns:
        MindSporePlatform: The newly created MindSpore platform instance.
    """
    global platform
    # Function-scope import, hence the pylint suppression below.
    # pylint: disable=C0415
    from hyper_parallel.platform.mindspore.platform import MindSporePlatform
    instance = MindSporePlatform()
    platform = instance
    return instance

def get_torch_platform():
    """Instantiate the PyTorch platform and cache it globally.

    A new ``TorchPlatform`` is built on every call and stored in the
    module-level ``platform`` variable before being returned.

    Returns:
        TorchPlatform: The newly created PyTorch platform instance.
    """
    global platform
    # Function-scope import, hence the pylint suppression below.
    # pylint: disable=C0415
    from hyper_parallel.platform.torch.platform import TorchPlatform
    instance = TorchPlatform()
    platform = instance
    return instance

def get_platform():
    """Return the active framework platform, creating it on first use.

    Selection order:
        1. The cached module-level ``platform`` instance, if one exists.
        2. The platform named by the ``HYPER_PARALLEL_PLATFORM`` environment
           variable, matched case-insensitively and ignoring surrounding
           whitespace.
        3. The MindSpore platform (default preferred choice).
        4. The PyTorch platform, when creating the MindSpore platform raises
           ``ImportError`` (fallback option).

    An environment value that matches neither known identifier is ignored and
    the default order (MindSpore, then PyTorch) applies. A platform selected
    explicitly through the environment variable has no fallback.

    Returns:
        Platform: An instance of the framework platform.

    Raises:
        ImportError: If the selected platform, or the PyTorch fallback, cannot
            be imported.
    """
    if platform is not None:
        return platform

    requested = os.environ.get(HYPER_PARALLEL_PLATFORM)
    if requested is not None:
        # Environment values are always str. Normalise case and whitespace so
        # that values such as "Torch " or "mindspore\n" still select a platform
        # instead of silently falling through to the default order.
        requested = requested.strip().lower()
        if requested == HYPER_PARALLEL_PLATFORM_MINDSPORE:
            return get_mindspore_platform()
        if requested == HYPER_PARALLEL_PLATFORM_TORCH:
            return get_torch_platform()

    try:
        return get_mindspore_platform()
    except ImportError:
        return get_torch_platform()

127

128

# Module-level mapping that starts out empty.
# NOTE(review): presumably a registry of already-created communication groups;
# no reader or writer is visible in this part of the file - confirm its keys.
EXISTING_COMM_GROUPS = {}

130

131

132class Platform:

133 """Platform api"""

134 current_grad_handle = None

135 post_grad_handle_process = None

136 grad_sync_stream = None

137

@property
def custom_ops(self):
    """Platform-specific custom operator interface.

    The base class provides no implementation: every subclass has to
    override this property and return the object that exposes its custom
    operator implementations.

    Returns:
        object: Instance of the platform-specific custom ops class.

    Raises:
        NotImplementedError: Always, in the base class.
    """
    raise NotImplementedError("Platform subclasses must implement custom_ops")

151

152 @staticmethod

153 def get_rank():

154 """Get the rank of the current process in the default process group.

155

156 Returns:

157 int: The rank of the current process.

158 """

159 raise NotImplementedError("Platform subclasses must implement get_rank")

160

161 @staticmethod

162 def get_global_rank(group, group_rank):

163 """Convert a group rank to its global rank.

164

165 Args:

166 group: The process group to query.

167 group_rank (int): The rank within the group.

168

169 Returns:

170 int: The global rank corresponding to the group rank.

171 """

172 raise NotImplementedError("Platform subclasses must implement get_global_rank")

173

174 @staticmethod

175 def get_world_size():

176 """Get the total number of processes in the default process group.

177

178 Returns:

179 int: The world size (total number of processes).

180 """

181 raise NotImplementedError("Platform subclasses must implement get_world_size")

182

183 @staticmethod

184 def get_op_name(func):

185 """Get the canonical name of an operator function.

186

187 Args:

188 func: The operator function to query.

189

190 Returns:

191 str: The canonical name of the operator.

192 """

193 raise NotImplementedError("Platform subclasses must implement get_op_name")

194

195 @staticmethod

196 def differentiable_all_gather_concat(data, group, concat_size, concat_dim, rank_list=None):

197 """Perform differentiable all-gather and concatenate tensors along a dimension.

198

199 Args:

200 data: The input tensor to gather.

201 group: The process group for collective communication.

202 concat_size (int): The size to concatenate along concat_dim.

203 concat_dim (int): The dimension along which to concatenate.

204 rank_list: Optional rank order expected by the logical layout.

205

206 Returns:

207 The concatenated tensor after all-gather operation.

208 """

209 raise NotImplementedError("Platform subclasses must implement differentiable_all_gather_concat")

210

211 @staticmethod

212 def chunk(data, split_dim, split_size, index):

213 """Split tensor along a dimension and return the chunk at the given index.

214

215 Args:

216 data: The input tensor to split.

217 split_dim (int): The dimension along which to split.

218 split_size (int): The size of each split chunk.

219 index (int): The index of the chunk to return.

220

221 Returns:

222 The tensor chunk at the specified index.

223 """

224 raise NotImplementedError("Platform subclasses must implement chunk")

225

226 @staticmethod

227 def differentiable_all_to_all(input_data, output_shape, group):

228 """Perform differentiable all-to-all communication.

229

230 Args:

231 input_data: The input tensor to redistribute.

232 output_shape: The shape of the output tensor.

233 group: The process group for collective communication.

234

235 Returns:

236 The output tensor after all-to-all operation.

237 """

238 raise NotImplementedError("Platform subclasses must implement differentiable_all_to_all")

239

240 @staticmethod

241 def tensor_type_cast(input_data, cast_type):

242 """Cast tensor to a specified dtype.

243

244 Args:

245 input_data: The input tensor to cast.

246 cast_type: The target dtype to cast to.

247

248 Returns:

249 The tensor cast to the specified dtype.

250 """

251 raise NotImplementedError("Platform subclasses must implement tensor_type_cast")

252

253 @staticmethod

254 def is_tensor(obj: Any) -> bool:

255 """Return True if ``obj`` is this framework's tensor type."""

256 raise NotImplementedError("Platform subclasses must implement is_tensor")

257

258 @staticmethod

259 def get_tensor_storage_size(tensor: Any) -> int:

260 """Return serialized byte size (numel * element size) for this framework's tensor."""

261 raise NotImplementedError("Platform subclasses must implement get_tensor_storage_size")

262

263 @staticmethod

264 def differentiable_all_reduce(data, op, group):

265 """Perform differentiable all-reduce operation.

266

267 Args:

268 data: The input tensor to reduce.

269 op: The reduction operation (e.g., sum, max, min).

270 group: The process group for collective communication.

271

272 Returns:

273 The reduced tensor with gradients supported.

274 """

275 raise NotImplementedError("Platform subclasses must implement differentiable_all_reduce")

276

277 @staticmethod

278 def differentiable_reduce_scatter(data, dev_num, axis, op, group):

279 """Perform differentiable reduce-scatter operation.

280

281 Args:

282 data: The input tensor to reduce and scatter.

283 dev_num (int): The number of devices to scatter across.

284 axis (int): The axis along which to scatter.

285 op: The reduction operation (e.g., sum, max, min).

286 group: The process group for collective communication.

287

288 Returns:

289 The scattered tensor chunk with gradients supported.

290 """

291 raise NotImplementedError("Platform subclasses must implement differentiable_reduce_scatter")

292

293 @staticmethod

294 def init_parameters(module, stage_index):

295 """Initialize parameters for a module at a specific pipeline stage.

296

297 This method is primarily needed for MindSpore platform which requires

298 explicit parameter initialization interface.

299

300 Args:

301 module: The module whose parameters need to be initialized.

302 stage_index (int): The pipeline stage index for the module.

303

304 Raises:

305 ValueError: If module is None or stage_index is negative.

306 """

307 if module is None:

308 raise ValueError("input module must not be none.")

309 if stage_index < 0:

310 raise ValueError("input stage_index must be positive.")

311

312 @staticmethod

313 def get_cell_construct(cell):

314 """Get the construct (forward) function of a cell/module.

315

316 Args:

317 cell: The cell or module to get the construct function from.

318

319 Returns:

320 The construct/forward callable of the cell.

321 """

322 raise NotImplementedError("Platform subclasses must implement get_cell_construct")

323

324 @staticmethod

325 def get_cells_and_names(cell):

326 """Get all nested cells/modules and their names.

327

328 Args:

329 cell: The root cell or module to traverse.

330

331 Returns:

332 list: A list of tuples containing (name, cell) pairs.

333 """

334 raise NotImplementedError("Platform subclasses must implement get_cells_and_names")

335

336 @staticmethod

337 def get_modules(module):

338 """Return all sub-modules contained in the given module."""

339 raise NotImplementedError("Platform subclasses must implement get_modules")

340

341 @staticmethod

342 def search_parameter_by_name(cell, param_name: str):

343 """Search for a parameter by name within a cell/module.

344

345 Args:

346 cell: The cell or module to search in.

347 param_name (str): The name of the parameter to find.

348

349 Returns:

350 The parameter if found, otherwise None.

351 """

352 raise NotImplementedError("Platform subclasses must implement search_parameter_by_name")

353

354 @staticmethod

355 def update_parameter_by_name(cell, result: tuple, new_param) -> bool:

356 """Update a parameter by name within a cell/module.

357

358 Args:

359 cell: The cell or module containing the parameter.

360 result (tuple): A tuple containing (param_name, parameter) to update.

361 new_param: The new parameter value to set.

362

363 Returns:

364 bool: True if update was successful, False otherwise.

365 """

366 raise NotImplementedError("Platform subclasses must implement update_parameter_by_name")

367

368 @staticmethod

369 def set_layout_into_parameter(param, layout):

370 """Attach a DTensor layout to a parameter.

371

372 Args:

373 param: The parameter to attach the layout to.

374 layout: The DTensor layout describing tensor distribution.

375 """

376 raise NotImplementedError("Platform subclasses must implement set_layout_into_parameter")

377

378 @staticmethod

379 def get_param_local_shape(param):

380 """Get the local shape of a distributed parameter.

381

382 Args:

383 param: The parameter to query.

384

385 Returns:

386 tuple: The local shape of the parameter shard.

387 """

388 raise NotImplementedError("Platform subclasses must implement get_param_local_shape")

389

390 @staticmethod

391 def get_param_local_data(param):

392 """Get the local data tensor of a distributed parameter.

393

394 Args:

395 param: The parameter to query.

396

397 Returns:

398 The local tensor data of the parameter shard.

399 """

400 raise NotImplementedError("Platform subclasses must implement get_param_local_data")

401

402 @staticmethod

403 def update_param_data(param, data):

404 """Update the data of a parameter with new tensor data.

405

406 Args:

407 param: The parameter to update.

408 data: The new tensor data to assign.

409 """

410 raise NotImplementedError("Platform subclasses must implement update_param_data")

411

412 @staticmethod

413 def get_param_type_size(param):

414 """Get the size in bytes of a parameter's dtype.

415

416 Args:

417 param: The parameter to query.

418

419 Returns:

420 int: The size in bytes of the parameter's data type.

421 """

422 raise NotImplementedError("Platform subclasses must implement get_param_type_size")

423

424 @staticmethod

425 def new_zero_parameter(param_shape, param_type, requires_grad, device):

426 """Create a new parameter initialized with zeros.

427

428 Args:

429 param_shape (tuple): The shape of the parameter.

430 param_type: The dtype of the parameter.

431 requires_grad (bool): Whether the parameter requires gradients.

432 device: The device on which to create the parameter.

433

434 Returns:

435 A new parameter tensor filled with zeros.

436 """

437 raise NotImplementedError("Platform subclasses must implement new_zero_parameter")

438

439 @staticmethod

440 def new_tensor(tensor_shape, tensor_type, device):

441 """Create a new tensor with the specified shape, dtype, and device.

442

443 Args:

444 tensor_shape (tuple): The shape of the tensor.

445 tensor_type: The dtype of the tensor.

446 device: The device on which to create the tensor.

447

448 Returns:

449 A new tensor with uninitialized values.

450 """

451 raise NotImplementedError("Platform subclasses must implement new_tensor")

452

453 @staticmethod

454 def full_like(tensor, fill_value, dtype=None):

455 """Create a tensor filled with a value, with same shape as input.

456

457 Args:

458 tensor: The input tensor to copy shape from.

459 fill_value: The value to fill the new tensor with.

460 dtype: Optional dtype for the new tensor. If None, uses input tensor's dtype.

461

462 Returns:

463 A new tensor filled with the specified value.

464 """

465 raise NotImplementedError("Platform subclasses must implement full_like")

466

467 @staticmethod

468 def set_tensor_requires_grad(input_tensor):

469 """Enable gradient tracking for a tensor in-place.

470

471 Args:

472 input_tensor: The tensor to enable gradients for.

473

474 Returns:

475 The same tensor with requires_grad set to True.

476 """

477 raise NotImplementedError("Platform subclasses must implement set_tensor_requires_grad")

478

479 @staticmethod

480 def all_gather_into_tensor(data, group_info, async_op=False):

481 """Gather tensors from all ranks into a single output tensor.

482

483 Args:

484 data: The input tensor to gather.

485 group_info: The process group for collective communication.

486 async_op (bool): If True, returns a work handle for async operation.

487

488 Returns:

489 The gathered tensor, or a tuple of (tensor, handle) if async_op is True.

490 """

491 raise NotImplementedError("Platform subclasses must implement all_gather_into_tensor")

492

493 @staticmethod

494 def all_reduce(data, group_info, async_op=False):

495 """Reduce tensors across all ranks using specified operation.

496

497 Args:

498 data: The input tensor to reduce.

499 group_info: The process group for collective communication.

500 async_op (bool): If True, returns a work handle for async operation.

501

502 Returns:

503 The reduced tensor, or a tuple of (tensor, handle) if async_op is True.

504 """

505 raise NotImplementedError("Platform subclasses must implement all_reduce")

506

507 @staticmethod

508 def broadcast(data, src, group, async_op=False):

509 """Broadcast tensor from source rank to all ranks in group.

510

511 Args:

512 data: The tensor to broadcast (only valid on source rank).

513 src (int): The source rank to broadcast from.

514 group: The process group for collective communication.

515 async_op (bool): If True, returns a work handle for async operation.

516

517 Returns:

518 The broadcasted tensor, or a tuple of (tensor, handle) if async_op is True.

519 """

520 raise NotImplementedError("Platform subclasses must implement broadcast")

521

522 @staticmethod

523 def isend(tensor, dst=None, group=None, tag=0):

524 """Send tensor asynchronously to destination rank.

525

526 Args:

527 tensor: The tensor to send.

528 dst (int, optional): The destination rank. Defaults to None.

529 group: The process group for communication. Defaults to None.

530 tag (int): A tag to identify the send operation. Defaults to 0.

531

532 Returns:

533 A work handle that can be waited on.

534 """

535 raise NotImplementedError("Platform subclasses must implement isend")

536

537 @staticmethod

538 def irecv(tensor, src=None, group=None, tag=0):

539 """Receive tensor asynchronously from source rank.

540

541 Args:

542 tensor: The tensor buffer to receive data into.

543 src (int, optional): The source rank. Defaults to None.

544 group: The process group for communication. Defaults to None.

545 tag (int): A tag to identify the receive operation. Defaults to 0.

546

547 Returns:

548 A work handle that can be waited on.

549 """

550 raise NotImplementedError("Platform subclasses must implement irecv")

551

552 @staticmethod

553 def p2p_op(op_type, tensor, peer, group=None):

554 """Build a batched-P2P descriptor (no launch).

555

556 Returns an opaque object understood by :meth:`batch_isend_irecv`.

557 Lets callers assemble a mixed send/recv batch that the backend can

558 run concurrently (e.g. TX/RX duplex on one link) in a single op.

559

560 Args:

561 op_type (str): ``"isend"`` or ``"irecv"``.

562 tensor: Tensor to send, or the buffer to receive into.

563 peer (int): Global rank of the peer.

564 group: Process group. ``None`` uses the default group.

565

566 Returns:

567 A backend P2P-op descriptor.

568 """

569 raise NotImplementedError("Platform subclasses must implement p2p_op")

570

571 @staticmethod

572 def batch_isend_irecv(p2p_ops):

573 """Launch a batch of :meth:`p2p_op` descriptors as one async op.

574

575 The whole batch shares a single completion handle (the backend runs

576 the items concurrently on one comm stream), so a send and a recv to

577 the same peer overlap on the duplex link.

578

579 Args:

580 p2p_ops (list): Descriptors from :meth:`p2p_op`.

581

582 Returns:

583 A single work handle covering the whole batch, or ``None`` when

584 ``p2p_ops`` is empty.

585 """

586 raise NotImplementedError("Platform subclasses must implement batch_isend_irecv")

587

588 @staticmethod

589 def p2p_exchange(tensor, peer_rank: int, group=None):

590 """Differentiable symmetric P2P exchange (send local tensor, receive peer's tensor).

591

592 Sends ``tensor`` to ``peer_rank`` and simultaneously receives the peer's

593 tensor. The operation is differentiable: the backward pass performs the

594 same symmetric exchange on the upstream gradient.

595

596 Args:

597 tensor: Local tensor to send.

598 peer_rank (int): Global rank of the communication peer.

599 group: Process group. ``None`` uses the default group.

600

601 Returns:

602 Tensor received from ``peer_rank``, with the same shape and dtype as

603 the input ``tensor``.

604 """

605 raise NotImplementedError("Platform subclasses must implement p2p_exchange")

606

607 @staticmethod

608 def send_object_list(obj_list, dst=None, group=None):

609 """Send a list of Python objects to destination rank.

610

611 Args:

612 obj_list (list): The list of Python objects to send.

613 dst (int, optional): The destination rank. Defaults to None.

614 group: The process group for communication. Defaults to None.

615 """

616 raise NotImplementedError("Platform subclasses must implement send_object_list")

617

618 @staticmethod

619 def recv_object_list(obj_list, src=None, group=None):

620 """Receive a list of Python objects from source rank.

621

622 Args:

623 obj_list (list): The list buffer to receive objects into.

624 src (int, optional): The source rank. Defaults to None.

625 group: The process group for communication. Defaults to None.

626 """

627 raise NotImplementedError("Platform subclasses must implement recv_object_list")

628

629 @staticmethod

630 def reduce_scatter_tensor(data, group_info, async_op=False):

631 """Reduce and scatter tensor across all ranks in group.

632

633 Args:

634 data: The input tensor to reduce and scatter.

635 group_info: The process group for collective communication.

636 async_op (bool): If True, returns a work handle for async operation.

637

638 Returns:

639 The scattered tensor chunk, or a tuple of (tensor, handle) if async_op is True.

640 """

641 raise NotImplementedError("Platform subclasses must implement reduce_scatter_tensor")

642

643 @staticmethod

644 def all_gather_single(input_tensor, output_shape, group, async_op=False):

645 """All-gather tensor shards with optional async execution.

646

647 Args:

648 input_tensor: Input tensor whose leading dimension is gathered.

649 output_shape: Shape of the gathered output tensor.

650 group: Process group (ProcessGroup for torch, group name string for mindspore).

651 async_op: If True, returns an async work handle.

652

653 Returns:

654 Tuple ``(output, work)`` where *output* is the gathered tensor and

655 *work* is the async handle (``None`` when ``async_op=False``).

656 """

657 raise NotImplementedError("Platform subclasses must implement all_gather_single")

658

659 @staticmethod

660 def reduce_scatter_single(input_tensor, output_shape, group, async_op=False):

661 """Reduce-scatter a tensor with optional async execution.

662

663 Args:

664 input_tensor: Input tensor whose leading dimension is split across ranks.

665 output_shape: Shape of the local reduced output tensor.

666 group: Process group (ProcessGroup for torch, group name string for mindspore).

667 async_op: If True, returns an async work handle.

668

669 Returns:

670 Tuple ``(output, work)`` where *output* is the local shard and

671 *work* is the async handle (``None`` when ``async_op=False``).

672 """

673 raise NotImplementedError("Platform subclasses must implement reduce_scatter_single")

674

675 @staticmethod

676 def all_to_all_single(input_tensor, output_shape, group, async_op=False):

677 """All-to-all single collective with optional async execution.

678

679 Args:

680 input_tensor: Input tensor to scatter.

681 output_shape: Shape of the pre-allocated output tensor.

682 group: Process group (ProcessGroup for torch, group name string for mindspore).

683 async_op: If True, returns a work handle; the output tensor is

684 filled only after ``work.wait()`` is called.

685

686 Returns:

687 Tuple ``(output, work)`` where *output* is the result tensor and

688 *work* is the async handle (``None`` when ``async_op=False``).

689

690 Raises:

691 NotImplementedError: Must be implemented by platform subclasses.

692 """

693 raise NotImplementedError("Platform subclasses must implement all_to_all_single")

694

695 @staticmethod

696 def differentiable_async_allgather_wait(x, work, out_perm, group, world_size, gather_dim,

697 handle_box=None):

698 """Differentiable wrapper that waits for a pre-launched async all-gather.

699

700 Forward waits for the all-gather handle and reconstructs the tensor by

701 moving the gathered leading dimension back to ``gather_dim``.

702

703 Backward launches the reverse reduce-scatter. If ``handle_box`` is a

704 mutable list, the reduce-scatter handle is appended there and a zero

705 gradient is returned to be replaced by the caller's backward pre-hook.

706 If ``handle_box`` is ``None``, the reduce-scatter is waited immediately

707 and its local result is returned, preserving composability with an

708 upstream autograd communication op.

709

710 Args:

711 x: Original input tensor; anchors the op in the autograd graph.

712 work: Async work handle from all-gather.

713 out_perm: Output buffer filled by all-gather.

714 group: Communication group for backward reduce-scatter.

715 world_size: Group size.

716 gather_dim: Dimension gathered in forward.

717 handle_box: Optional mutable list for deferred backward wait.

718

719 Returns:

720 Gathered tensor connected to the autograd graph through *x*.

721 """

722 raise NotImplementedError("Platform subclasses must implement differentiable_async_allgather_wait")

723

724 @staticmethod

725 def differentiable_async_a2a_wait(x, work, out_perm, group, world_size, concat_dim, split_dim,

726 handle_box=None):

727 """Differentiable wrapper that waits for a pre-launched async A2A.

728

729 Wraps the wait-and-reconstruct step in the platform autograd mechanism

730 so gradients flow correctly through the all-to-all communication.

731

732 The A2A direction is seq→head (forward): the output gathers along

733 ``concat_dim`` (sequence grows from S/cp to S) and scatters along

734 ``split_dim`` (heads shrink from H to H/ws).

735

736 In backward, launches an async head→seq A2A on the incoming gradient

737 and appends ``(work, out_perm)`` to ``handle_box`` so the caller can

738 wait just before the projection GEMM, achieving GEMM–A2A overlap.

739

740 Args:

741 x: Original projection output tensor; anchors the op

742 in the autograd graph.

743 work: Async work handle from ``all_to_all_single(async_op=True)``.

744 out_perm: Output buffer filled once ``work.wait()`` completes

745 (shape ``[ws, ...]``).

746 group: Process group for the reverse A2A in backward.

747 world_size: CP/Ulysses degree.

748 concat_dim: Dimension that is gathered (concatenated) in forward;

749 typically the sequence dimension.

750 split_dim: Dimension that is scattered (split) in forward;

751 typically the head dimension.

752 handle_box: Optional mutable list ``[]``. In backward, ``(work, out_perm)``

753 for the reverse A2A is appended here so the pre-hook can wait.

754

755 Returns:

756 Result tensor with ``concat_dim`` gathered and ``split_dim`` split,

757 connected to the autograd graph through *x*.

758

759 Raises:

760 NotImplementedError: Must be implemented by platform subclasses.

761 """

762 raise NotImplementedError("Platform subclasses must implement differentiable_async_a2a_wait")

763

764 @staticmethod

765 def differentiable_sync_hook(x, hook_name: str, coordinator):

766 """Identity operation that intercepts both forward and backward to call

767 coordinator rendezvous, enabling deterministic comm/compute overlap.

768

769 This is the differentiable building block for dual-pipe schedules.

770 In the forward pass the coordinator is invoked with the forward-side

771 roles for ``hook_name``; in the backward pass it is invoked with the

772 backward-side roles. The tensor value and gradient flow through

773 unchanged.

774

775 Args:

776 x: Input tensor. Returned as-is; gradients flow through.

777 hook_name: One of ``"A"``, ``"B"``, ``"C"``, ``"D"`` identifying

778 the position relative to MoE dispatch/combine.

779 coordinator: A :class:`HookCoordinator` instance shared between the

780 forward and backward threads.

781

782 Returns:

783 The same tensor *x*, attached to the autograd graph so that the

784 backward hook will fire.

785 """

786 raise NotImplementedError("Platform subclasses must implement differentiable_sync_hook")

787

788 @staticmethod

789 def differentiable_all_to_all_single(input_tensor, input_splits, output_splits, group):

790 """Variable-split all-to-all single that supports gradient flow.

791

792 Unlike ``all_to_all_single`` (which is not differentiable), this method

793 wraps the collective in an autograd function so gradients are correctly

794 routed back through the reverse all-to-all in the backward pass.

795 Intended for Expert Parallelism token dispatch / combine.

796

797 Args:

798 input_tensor: Input tensor to scatter. Shape ``[sum(input_splits), *feature_dims]``.

799 input_splits: Per-rank sizes of data sent from this rank (list of ints,

800 length equal to ep_degree).

801 output_splits: Per-rank sizes of data received by this rank (list of ints,

802 length equal to ep_degree).

803 group: Process group (ProcessGroup for torch, group name str for mindspore).

804

805 Returns:

806 Output tensor of shape ``[sum(output_splits), *feature_dims]``.

807

808 Raises:

809 NotImplementedError: Must be implemented by platform subclasses.

810 """

811 raise NotImplementedError("Platform subclasses must implement differentiable_all_to_all_single")

812

813 @staticmethod

814 def differentiable_all_to_all_single_async(input_tensor, input_splits, output_splits, group):

815 """Async variant of :meth:`differentiable_all_to_all_single`.

816

817 Same semantics but launches the collective with ``async_op=True`` and

818 only performs a stream-level ``wait`` — the host returns immediately

819 after dispatching the kernel. Intended for dual-pipe comm/compute

820 overlap paths where the paired COMPUTE side's rendezvous notify must

821 fire right after kernel launch (not after the collective actually

822 completes on device).

823

824 Args:

825 input_tensor: Input tensor to scatter. Shape ``[sum(input_splits), *feature_dims]``.

826 input_splits: Per-rank sizes of data sent from this rank.

827 output_splits: Per-rank sizes of data received by this rank.

828 group: Process group.

829

830 Returns:

831 Output tensor of shape ``[sum(output_splits), *feature_dims]``.

832

833 Raises:

834 NotImplementedError: Must be implemented by platform subclasses.

835 """

836 raise NotImplementedError(

837 "Platform subclasses must implement differentiable_all_to_all_single_async"

838 )

839

840 @staticmethod

841 def wait_async_tensor(tensor):

842 """Wait for an async collective tensor to become materialised.

843

844 Intended for use with :class:`AsyncHandle` so that callers can

845 wait on an async all-to-all result without importing framework-specific

846 modules directly. The call is **idempotent** — waiting on an already-

847 completed tensor is a no-op.

848

849 Args:

850 tensor: An async collective tensor (e.g. PyTorch

851 ``AsyncCollectiveTensor``) whose values have not yet been

852 fully written by the remote ranks.

853

854 Returns:

855 The same *tensor*, now guaranteed to be fully materialised.

856

857 Raises:

858 NotImplementedError: Must be implemented by platform subclasses.

859 """

860 raise NotImplementedError(

861 "Platform subclasses must implement wait_async_tensor"

862 )

863

@staticmethod
def arange(start, end=None, step=1, dtype=None, device=None):
    """Build a 1-D tensor of evenly spaced values.

    Args:
        start: Inclusive lower bound. When *end* is ``None`` this value is
            interpreted as the stop value and the range starts at 0.
        end: Exclusive upper bound. Defaults to ``None``.
        step: Distance between consecutive values. Defaults to ``1``.
        dtype: Element type; ``None`` selects the framework default (int64).
        device: Device on which to place the tensor.

    Returns:
        1-D tensor ``[start, start+step, ..., end)``.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement arange"
    raise NotImplementedError(message)

883

@staticmethod
def zeros(size, dtype=None, device=None):
    """Build a tensor of the requested shape filled with zeros.

    Args:
        size: Tensor shape, given as one tuple or list.
        dtype: Element type; ``None`` selects the framework default (float32).
        device: Target device; ``None`` selects the framework default.

    Returns:
        A zero-filled tensor with shape *size*.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement zeros"
    raise NotImplementedError(message)

900

@staticmethod
def parameters_dict(cell):
    """Return the name-to-parameter mapping of a cell/module.

    Args:
        cell: Cell or module whose parameters are requested.

    Returns:
        dict: Parameter names mapped to the parameters themselves.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement parameters_dict"
    raise NotImplementedError(message)

912

@staticmethod
def get_model_state_dict(model, *, options=None):
    """Extract the state dictionary of a model.

    Args:
        model: Model whose state is extracted.
        options: Optional, keyword-only configuration for the extraction.

    Returns:
        dict: State dictionary holding the model's parameters and buffers.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    raise NotImplementedError("Platform subclasses must implement get_model_state_dict")

927

@staticmethod
def save_checkpoint(
    cell,
    file_path: str,
    ckpt_format: str = "safetensors",
) -> None:
    """Write a checkpoint of a cell/module to a file.

    Args:
        cell: Cell or module to persist.
        file_path (str): Destination path of the checkpoint.
        ckpt_format (str): On-disk file format. Defaults to
            ``"safetensors"``.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement save_checkpoint"
    raise NotImplementedError(message)

938

@staticmethod
def load_checkpoint(
    file_path: str,
    ckpt_format: str = "safetensors",
) -> dict:
    """Read a checkpoint from a file.

    Args:
        file_path (str): Path of the checkpoint to read.
        ckpt_format (str): On-disk file format. Defaults to
            ``"safetensors"``.

    Returns:
        dict: State dictionary loaded from the checkpoint.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement load_checkpoint"
    raise NotImplementedError(message)

951

952 def _create_group(self, rank_list):

953 """Create a new process group with the specified ranks.

954

955 Internal method to be implemented by subclasses.

956

957 Args:

958 rank_list (list): List of ranks to include in the group.

959

960 Returns:

961 The newly created process group.

962 """

963 raise NotImplementedError("Platform subclasses must implement _create_group")

964

def new_stream(self):
    """Create a fresh compute stream for asynchronous work.

    Returns:
        A new stream object for the current device.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement new_stream"
    raise NotImplementedError(message)

972

def get_stream_context(self):
    """Return the context manager used to run operations on a given stream.

    Returns:
        A context manager usable in a ``with`` statement to select a stream.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement get_stream_context"
    raise NotImplementedError(message)

980

@staticmethod
def get_tensor_transform():
    """Return the current framework's tensor transformation utilities.

    Returns:
        A module or object that exposes tensor transformation functions.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement get_tensor_transform"
    raise NotImplementedError(message)

989

@staticmethod
def construct_strided_slice(x, begin, end, stride):
    """Take a strided slice of a tensor.

    Args:
        x: Tensor to slice.
        begin: Start index for every dimension.
        end: End index for every dimension.
        stride: Step for every dimension.

    Returns:
        The sliced tensor.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement construct_strided_slice"
    raise NotImplementedError(message)

1004

@staticmethod
def micro_batch(micro_batch_num, args_batch_dim=None, kwargs_batch_dim=None):
    """Return a decorator that splits inputs into micro-batches.

    Used for pipeline parallelism.

    Args:
        micro_batch_num (int): How many micro-batches to produce.
        args_batch_dim (list, optional): Batch dimension of each
            positional argument.
        kwargs_batch_dim (dict, optional): Batch dimension of each
            keyword argument.

    Returns:
        A decorator that splits a function's inputs into micro-batches.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement micro_batch"
    raise NotImplementedError(message)

1018

@staticmethod
def get_symmetric_memory_handler():
    """Return the platform's symmetric memory handler instance.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement get_symmetric_memory_handler"
    raise NotImplementedError(message)

1023

@staticmethod
def load_into_param(param, data):
    """Copy *data* into *param* using framework-specific semantics.

    Args:
        param: Parameter that receives the data.
        data: Data to load into the parameter.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement load_into_param"
    raise NotImplementedError(message)

1028

def create_group(self, rank_list):
    """Return the communication group for *rank_list*, creating it on demand.

    Groups are cached in ``EXISTING_COMM_GROUPS`` under a key built from the
    sorted rank list, so a second request for the same set of ranks returns
    the cached group rather than building another one.

    Args:
        rank_list (list): Ranks that make up the group.

    Returns:
        The process group for the specified ranks.
    """
    # The key is order-insensitive: the ranks are sorted before stringifying.
    cache_key = str(tuple(sorted(rank_list)))
    if cache_key not in EXISTING_COMM_GROUPS:
        EXISTING_COMM_GROUPS[cache_key] = self._create_group(rank_list)
    return EXISTING_COMM_GROUPS[cache_key]

1048

@staticmethod
def _process_current_handle():
    """Wait on the pending gradient handle, then run its post-process callback.

    Internal helper used to synchronise outstanding gradient operations.
    Does nothing when no handle is pending; the callback is skipped when
    none was registered.
    """
    pending = Platform.current_grad_handle
    if pending is None:
        return

    pending.wait()
    # Look the callback up only after the wait has finished.
    callback = Platform.post_grad_handle_process
    if callback is not None:
        callback()

1063

def set_grad_reduce_handle(self, handle, post_process=None):
    """Install a new gradient reduction handle after draining the old one.

    Any pending handle is waited on (and its callback run) on the gradient
    sync stream before *handle* and *post_process* replace the stored state.
    The sync stream is created the first time it is needed.

    Args:
        handle: Async work handle of the gradient reduction.
        post_process (callable, optional): Callback to run once the handle
            has completed.
    """
    if Platform.grad_sync_stream is None:
        Platform.grad_sync_stream = self.new_stream()
    enter_stream = self.get_stream_context()
    with enter_stream(Platform.grad_sync_stream):
        Platform._process_current_handle()
    Platform.current_grad_handle = handle
    Platform.post_grad_handle_process = post_process

1081

def wait_grad_handle(self):
    """Wait for the pending gradient handle and reset the handle state.

    Returns immediately when no handle is pending. Otherwise the handle is
    processed on the gradient sync stream, an event recorded on that stream
    is waited on, and both the stored handle and its post-process callback
    are cleared.
    """
    if Platform.current_grad_handle is None:
        return
    if Platform.grad_sync_stream is None:
        Platform.grad_sync_stream = self.new_stream()
    enter_stream = self.get_stream_context()
    # NOTE(review): the listing this was taken from has no indentation, so the
    # nesting under ``with`` is reconstructed -- confirm that the event is
    # recorded inside the stream context and waited on outside it.
    with enter_stream(Platform.grad_sync_stream):
        Platform._process_current_handle()
        done_event = Platform.grad_sync_stream.record_event()
    done_event.wait()
    Platform.current_grad_handle = None
    Platform.post_grad_handle_process = None

1099

@staticmethod
def all_gather_object(object_list, obj, group=None) -> None:
    """Collect one Python object from every rank into a list.

    Every rank contributes *obj* and every rank receives the full list.

    Args:
        object_list (list): Output list that receives the gathered objects.
        obj: This rank's Python object.
        group: Process group to communicate over. ``None`` means the
            default group.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement all_gather_object"
    raise NotImplementedError(message)

1112

@staticmethod
def barrier(group=None, async_op: bool = False, device_ids=None) -> Any:
    """Synchronise every process in a process group.

    With ``async_op`` False, each rank blocks until all ranks of the group
    have entered the collective. With ``async_op`` True, an async handle is
    returned and must be completed before proceeding.

    Args:
        group: Process or communication group; ``None`` means the default
            group.
        async_op (bool): Return a backend-specific async work handle when
            True. Default: False.
        device_ids: Optional list of device ids; meaning is backend-dependent.

    Returns:
        The async work handle when ``async_op`` is True, otherwise ``None``
        (the backend may also return ``None`` for a rank outside the group).

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement barrier"
    raise NotImplementedError(message)

1130

@staticmethod
def init_process_group(
        backend: Optional[str] = None,
        *,
        init_method: Optional[str] = None,
        timeout: Optional[timedelta] = None,
        world_size: int = -1,
        rank: int = -1,
        store: Any = None,
        pg_options: Any = None,
        device_id: Any = None
) -> None:
    """Set up the default distributed process group.

    Args:
        backend: Backend used for distributed communication.
        init_method: URL describing how the process group is initialised.
        timeout: Timeout applied to operations run against the group.
        world_size: Number of processes taking part in the job.
        rank: Rank of the calling process.
        store: Key/value store used to exchange connection information.
        pg_options: Backend-specific process group options.
        device_id: Device this process will work on.

    Raises:
        NotImplementedError: Always in this base class; subclasses supply
            the implementation.
    """
    message = "Platform subclasses must implement init_process_group"
    raise NotImplementedError(message)

1160

@staticmethod
def destroy_process_group(group=None) -> None:
    """Tear down a process group.

    Args:
        group: Process group to destroy; ``None`` destroys the default group.

    Raises:
        NotImplementedError: Always in this base class; subclasses supply
            the implementation.
    """
    message = "Platform subclasses must implement destroy_process_group"
    raise NotImplementedError(message)

1173

@staticmethod
def get_process_group_ranks(group=None) -> list[int]:
    """Return the ranks that belong to a process group.

    Args:
        group: Process group to inspect; ``None`` means the default group.

    Returns:
        List of ranks in the specified process group.

    Raises:
        NotImplementedError: Always in this base class; subclasses supply
            the implementation.
    """
    message = "Platform subclasses must implement get_process_group_ranks"
    raise NotImplementedError(message)

1189

@staticmethod
def get_backend(group=None):
    """Return the backend of a process group.

    Args:
        group: Process group to inspect; ``None`` means the default group.

    Returns:
        The backend name of the specified process group.

    Raises:
        NotImplementedError: Always in this base class; subclasses supply
            the implementation.
    """
    message = "Platform subclasses must implement get_backend"
    raise NotImplementedError(message)

1204

@staticmethod
def split_group(
        parent_pg: Any = None,
        split_ranks: Optional[list] = None,
        timeout: Optional[timedelta] = None,
        pg_options: Optional[Any] = None,
        group_desc: Optional[str] = None,
) -> Any:
    """Derive a split group from a parent process group.

    Args:
        parent_pg: Parent process group that is being split.
        split_ranks (list, optional): Ranks to place in the split group.
        timeout (timedelta, optional): Timeout for operations.
        pg_options: Backend-specific process group options.
        group_desc (str, optional): Human-readable description of the group.

    Returns:
        The new split process group.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement split_group"
    raise NotImplementedError(message)

1225

@staticmethod
def get_group_local_rank(group=None) -> int:
    """Return the caller's local rank inside a process group.

    Args:
        group: Process group to query; ``None`` means the default group.

    Returns:
        int: The local rank within the group.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement get_group_local_rank"
    raise NotImplementedError(message)

1237

@staticmethod
def no_grad():
    """Return a context manager that turns off gradient computation.

    Returns:
        A context manager that disables gradient tracking.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement no_grad"
    raise NotImplementedError(message)

1246

@staticmethod
def preserve_version_counter(tensor):
    """Return a context manager that keeps a tensor's version unchanged.

    Intended for internal tensor updates.

    Args:
        tensor: Tensor whose version is to be preserved.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement preserve_version_counter"
    raise NotImplementedError(message)

1251

@staticmethod
def relu(tensor):
    """Apply the ReLU activation to every element of a tensor.

    Args:
        tensor: Input tensor.

    Returns:
        Tensor holding ``max(0, x)`` for each input element.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement relu"
    raise NotImplementedError(message)

1263

@staticmethod
def cat(tensors, dim=0):
    """Join tensors along one dimension.

    Args:
        tensors: Tensors to concatenate.
        dim: Dimension along which to concatenate. Defaults to ``0``.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement cat"
    raise NotImplementedError(message)

1268

@staticmethod
def empty_like(tensor, *, dtype=None, device=None, pin_memory=False):
    """Allocate an uninitialised tensor shaped like *tensor*.

    Args:
        tensor: Tensor whose shape is copied.
        dtype: Element type of the result; ``None`` reuses the input's dtype.
        device: Device of the result; ``None`` reuses the input's device.
        pin_memory (bool): Allocate pinned memory for faster CPU-GPU
            transfer when True.

    Returns:
        An uninitialised tensor with the same shape as the input.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement empty_like"
    raise NotImplementedError(message)

1283

def get_current_stream(self):
    """Return the device's current compute stream.

    Returns:
        The current stream object.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement get_current_stream"
    raise NotImplementedError(message)

1291

def new_event(self):
    """Create an event used to synchronise streams.

    Returns:
        A new event object.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement new_event"
    raise NotImplementedError(message)

1299

def tree_map(self, fn, tree):
    """Map a function over every tensor in a nested structure.

    Args:
        fn (callable): Function applied to each tensor.
        tree: Nested structure (list, tuple, dict) holding tensors.

    Returns:
        The same nested structure with *fn* applied to all tensors.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement tree_map"
    raise NotImplementedError(message)

1311

@staticmethod
def is_linear_module(module) -> bool:
    """Tell whether *module* is the current framework's linear/dense layer.

    Args:
        module: Module instance to check.

    Returns:
        True if *module* is the framework's linear layer type.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement is_linear_module"
    raise NotImplementedError(message)

1323

@staticmethod
def is_embedding_module(module) -> bool:
    """Tell whether *module* is the current framework's embedding layer.

    Args:
        module: Module instance to check.

    Returns:
        True if *module* is the framework's embedding layer type.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement is_embedding_module"
    raise NotImplementedError(message)

1335

@staticmethod
def register_forward_pre_hook(module, hook, prepend=False, with_kwargs=False):
    """Attach a forward pre-hook to a module.

    Delegates to ``module.register_forward_pre_hook``, passing *prepend*
    and *with_kwargs* as keyword arguments.

    Args:
        module: Module that receives the hook.
        hook (callable): Hook function to register.
        prepend (bool): Put the hook ahead of existing hooks when True.
        with_kwargs (bool): Give the hook both args and kwargs when True.

    Returns:
        A handle that can be used to remove the hook.
    """
    handle = module.register_forward_pre_hook(
        hook,
        prepend=prepend,
        with_kwargs=with_kwargs,
    )
    return handle

1350

@staticmethod
def register_full_backward_hook(module, hook, prepend=False):
    """Attach a full backward hook to a module.

    Delegates to ``module.register_full_backward_hook``; *prepend* is
    forwarded positionally.

    Args:
        module: Module that receives the hook.
        hook (callable): Hook function to register.
        prepend (bool): Put the hook ahead of existing hooks when True.

    Returns:
        A handle that can be used to remove the hook.
    """
    handle = module.register_full_backward_hook(hook, prepend)
    return handle

1364

@staticmethod
def register_full_backward_pre_hook(module, hook, prepend=False):
    """Attach a full backward pre-hook to a module.

    Delegates to ``module.register_full_backward_pre_hook``; *prepend* is
    forwarded positionally.

    Args:
        module: Module that receives the hook.
        hook (callable): Hook function to register.
        prepend (bool): Put the hook ahead of existing hooks when True.

    Returns:
        A handle that can be used to remove the hook.
    """
    handle = module.register_full_backward_pre_hook(hook, prepend)
    return handle

1378

@property
def checkpoint(self):
    """Checkpoint function used for activation checkpointing.

    Returns:
        The checkpoint function for the current framework.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement checkpoint"
    raise NotImplementedError(message)

1387

@staticmethod
def checkpoint_wrapper(module, **checkpoint_kwargs):
    """Wrap a module so that it uses activation checkpointing.

    Args:
        module: Module or callable to wrap.
        **checkpoint_kwargs: Keyword arguments passed on to the framework's
            checkpoint wrapper implementation.

    Returns:
        The wrapped module with activation checkpointing enabled.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement checkpoint_wrapper"
    raise NotImplementedError(message)

1401

@staticmethod
def swap_wrapper(module, policy_fn=None, group_swap=False):
    """Wrap a module so that it uses activation swap.

    Args:
        module: Module to wrap.
        policy_fn: Optional swap policy function applied per tensor.
        group_swap (bool, optional): Whether tensors take part in group
            copy fusion. Default: ``False``.

    Returns:
        The wrapped module with activation swap enabled.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement swap_wrapper"
    raise NotImplementedError(message)

1415

@staticmethod
def swap_tensor_wrapper(target, tag=None, group_swap=False):
    """Add target tensors to the current swap group.

    Args:
        target: A tensor, or a nested container of tensors, to register.
        tag: Optional debug tag attached to the wrapped tensors.
        group_swap (bool, optional): Whether tensors take part in group
            copy fusion. Default: ``False``.

    Returns:
        The original target structure, unchanged semantically.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement swap_tensor_wrapper"
    raise NotImplementedError(message)

1429

@staticmethod
def get_class_activation_wrapper():
    """Return the activation wrapper class of the current platform.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement get_class_activation_wrapper"
    raise NotImplementedError(message)

1434

@property
def noop_context_fn(self):
    """No-op context function for use with checkpointing.

    Returns:
        A context function that performs no operation.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement noop_context_fn"
    raise NotImplementedError(message)

1443

@staticmethod
def create_selective_checkpoint_contexts(
        policy_fn_or_list,
        allow_cache_entry_mutation=False,
        group_swap=False,
):
    """Build the contexts used for selective activation checkpointing.

    Args:
        policy_fn_or_list: Policy function, or list of layer names, that
            selects what is checkpointed.
        allow_cache_entry_mutation (bool): Whether cache entries may be
            mutated.
        group_swap (bool, optional): Whether MUST_SWAP tensors take part in
            group copy fusion. Default: ``False``.

    Returns:
        Context functions for selective checkpointing.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement create_selective_checkpoint_contexts"
    raise NotImplementedError(message)

1457

@staticmethod
def async_save_on_cpu(policy_fn=None, group_swap: bool = False):
    """Build an async CPU-offload context for activation checkpointing.

    Args:
        policy_fn: Optional policy function deciding which activations are
            offloaded.
        group_swap (bool): Whether swapped tensors take part in group copy
            fusion. Default: ``False``.

    Returns:
        Context manager for async CPU offloading during checkpointing.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement async_save_on_cpu"
    raise NotImplementedError(message)

1471

@staticmethod
def recompute_handle_collector_ctx():
    """Context manager gathering the recompute handles created inside it.

    Yields:
        A list that receives one opaque recompute handle for each
        checkpointed block executed during the forward pass inside the
        context. A handle can be fired later through
        :meth:`recompute_handle`.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement recompute_handle_collector_ctx"
    raise NotImplementedError(message)

1482

@staticmethod
def recompute_handle(handle, session_id):
    """Eagerly trigger the forward re-run of one checkpointed block.

    The block's activations are materialised and cached under
    ``session_id`` so that a later backward in the same session reuses them
    rather than re-running the block.

    Args:
        handle: Opaque recompute handle obtained from
            :meth:`recompute_handle_collector_ctx`.
        session_id: Stable key shared by the producing re-run and the
            consuming backward.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement recompute_handle"
    raise NotImplementedError(message)

1497

@staticmethod
def recompute_session_ctx(session_id, retain_on_unpack=False):
    """Context manager tying recompute unpack to a caller-supplied session.

    Args:
        session_id: Stable session key. Recompute caches are keyed by it
            rather than by the transient autodiff engine id, which lets a
            re-run fired under one engine be reused by another.
        retain_on_unpack (bool): When ``True``, unpack hands back recomputed
            tensors without popping them so that a later backward can
            consume them. Default: ``False``.

    Returns:
        A context manager activating the session for its scope.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement recompute_session_ctx"
    raise NotImplementedError(message)

1514

@staticmethod
def clear_recompute_session(session_id):
    """Drop the recompute data retained for a session.

    Args:
        session_id: Session key whose cached recompute data is cleared.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement clear_recompute_session"
    raise NotImplementedError(message)

1523

@staticmethod
def get_element_size(tensor):
    """Return the element size of a tensor.

    Args:
        tensor: Tensor to inspect.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement get_element_size"
    raise NotImplementedError(message)

1528

@staticmethod
def alloc_tensor_buffer(numel: int, dtype, device, pin_memory: bool = False):
    """Allocate a 1-D tensor buffer without initialising it.

    Args:
        numel (int): Number of elements in the buffer.
        dtype: Element type of the buffer.
        device: Device on which the buffer is allocated.
        pin_memory (bool): Pinned-memory flag. Defaults to ``False``.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement alloc_tensor_buffer"
    raise NotImplementedError(message)

1533

@staticmethod
def tensor_to_numpy(tensor) -> np.ndarray:
    """Turn a framework tensor into a NumPy array.

    Args:
        tensor: Tensor to convert.

    Returns:
        np.ndarray: The tensor's data as a NumPy array.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement tensor_to_numpy"
    raise NotImplementedError(message)

1545

@staticmethod
def from_numpy(np_array):
    """Build a host-resident tensor from a NumPy array.

    This is the inverse of ``tensor_to_numpy``. The result is kept on the
    host whatever device context is active, so it can still be converted
    back with asnumpy when created under ``ms.DeviceCtx("meta")`` (for
    instance while ``fully_shard`` lazily builds a default device mesh).
    Meant for rank/mesh bookkeeping tensors, which are only ever read back
    through ``tensor_to_numpy``.

    Args:
        np_array: NumPy array to convert.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement from_numpy"
    raise NotImplementedError(message)

1556

@staticmethod
def profiler_record(name):
    """Record a named profiler event.

    Args:
        name (str): Name of the profiler event.

    Returns:
        A context manager or decorator for profiling a code region.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement profiler_record"
    raise NotImplementedError(message)

1568

def cast_fp_tensor(self, dtype, x):
    """Cast a floating-point tensor to *dtype* where applicable.

    Args:
        dtype: Target dtype of the cast.
        x: Input tensor.

    Returns:
        The tensor cast to the target dtype, or left unchanged when it is
        not floating-point.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement cast_fp_tensor"
    raise NotImplementedError(message)

1580

def apply_to_tensors(self, fn, container):
    """Apply a function recursively to every tensor in a container.

    Nested structures built from lists, tuples and dicts are supported.

    Args:
        fn (callable): Function applied to each tensor.
        container: Nested structure holding tensors.

    Returns:
        The same structure with *fn* applied to all tensors.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement apply_to_tensors"
    raise NotImplementedError(message)

1594

@staticmethod
def clip_grad_norm_(
        parameters,
        max_norm: float,
        norm_type: float = 2.0,
        error_if_nonfinite: bool = False,
        foreach=None,
):
    """Compute gradient norms of a distributed model and clip them.

    The required communication is derived from each parameter's DTensor
    spec. Subclasses must implement this method.

    Args:
        parameters: An ``nn.Module``, one ``Tensor``, or an iterable of
            ``Tensor`` s whose gradients are clipped.
        max_norm: Largest gradient norm allowed.
        norm_type: Kind of norm to use (default ``2.0``).
        error_if_nonfinite: Raise when the total norm is non-finite if
            ``True``. Default ``False``.
        foreach: Unused; accepted for API compatibility.

    Returns:
        The total (unclipped) gradient norm.

    Raises:
        NotImplementedError: Always in this base class.
    """
    raise NotImplementedError("Platform subclasses must implement clip_grad_norm_")

1620

@staticmethod
def get_created_group(rank_list: Union[list[int], tuple[int]]):
    """Look up an already-registered process group by its rank list.

    The lookup key is built from the sorted ranks, so the order of
    *rank_list* does not matter.

    Args:
        rank_list (Union[list[int], tuple[int]]): Tuple or list of ranks.

    Returns:
        The process group registered for these ranks, or ``None`` when no
        such group exists.
    """
    cache_key = str(tuple(sorted(rank_list)))
    return EXISTING_COMM_GROUPS[cache_key] if cache_key in EXISTING_COMM_GROUPS else None

1635

@classmethod
def mark_created_groups(cls, process_group: Union[Any, list[Any]]) -> None:
    """Record process groups in the global cache so they can be reused.

    Each group is stored in ``EXISTING_COMM_GROUPS`` under a key built from
    its sorted rank list, replacing any entry already stored for that key.

    Args:
        process_group (Union[Any, list[Any]]): One process group or a list
            of process groups.
    """
    # A lone group is treated as a one-element list.
    groups = process_group if isinstance(process_group, list) else [process_group]
    for pg in groups:
        ranks = cls.get_process_group_ranks(pg)
        cache_key = str(tuple(sorted(ranks)))
        EXISTING_COMM_GROUPS[cache_key] = pg

1649

@property
def meta_device(self):
    """Framework-specific meta device used for tensor shape inference.

    Tensors on the meta device are created without allocating real
    storage, which is useful for shape inference and model initialisation.

    Returns:
        The meta device object for the current framework.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement meta_device"
    raise NotImplementedError(message)

1661

def init_on_device(self, device, include_buffers=False):
    """Return a context manager that initialises module parameters on a device.

    Args:
        device: Device on which parameters are initialised.
        include_buffers (bool): Also initialise buffers on the device when
            True.

    Returns:
        A context manager for device-specific initialization.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement init_on_device"
    raise NotImplementedError(message)

1673

def str_to_dtype(self, dtype_str: str) -> Any:
    """Translate a framework-style dtype string into the backend dtype object.

    An example of such a string is ``torch.float32``.

    Args:
        dtype_str (str): Serialized dtype identifier produced by checkpoint
            metadata.

    Returns:
        Framework dtype object (e.g. ``torch.dtype`` or MindSpore dtype).

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement str_to_dtype"
    raise NotImplementedError(message)

1685

def list_to_size(self, size_list: list[int]) -> Any:
    """Turn a shape list from checkpoint metadata into the framework's size type.

    An example of such a size type is ``torch.Size``.

    Args:
        size_list (list[int]): Global tensor shape as a list of ints.

    Returns:
        Framework-specific size object.

    Raises:
        NotImplementedError: Always in this base class; platform
            subclasses supply the implementation.
    """
    message = "Platform subclasses must implement list_to_size"
    raise NotImplementedError(message)

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/platform.py: 66%

410 statements