Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/activation

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""Swap tensor and swap manager implementation for activation checkpointing"""

16# pylint: disable=W0212

18import functools

19import threading

20import warnings

22from collections import defaultdict

23from typing import Any, Dict, List, Optional, Set

25from hyper_parallel.platform import get_platform

27platform = get_platform()

29# ---------------------------------------------------------------------------

30# Module-level buffer pools — process-local, no locking needed for single-

31# stream training. Each GPU process owns its own Python interpreter, so

32# these dicts are never shared across processes.

33#

34# _CPU_PINNED_POOL: a list of available pinned CPU tensors per dtype_key.

35# Created via alloc_tensor_buffer(pin_memory=True) on the first miss; the

36# base tensor is returned here after wait_load and reused in the next

37# launch_offload, avoiding repeated cudaHostAlloc / cudaFreeHost calls.

38# ---------------------------------------------------------------------------

39_CPU_PINNED_POOL: Dict[str, List[Any]] = defaultdict(list)

40# Cap each group-swap staging allocation. 32 MiB keeps DMA chunks large

41# while avoiding one huge per-dtype staging tensor in large models.

42_GROUP_SWAP_MAX_BULK_COPY_BYTES = 32 * 1024 * 1024

def _get_cpu_pinned_buf(dtype_key: str, total_numel: int, dtype):
    """Fetch a pinned CPU buffer holding at least ``total_numel`` elements.

    The pool entry for ``dtype_key`` is searched for the smallest buffer whose
    ``numel()`` is large enough (best fit); that buffer is removed from the
    pool and returned. If nothing fits, the most recently pooled (undersized)
    buffer is dropped and a new pinned buffer is requested from
    ``platform.alloc_tensor_buffer``.

    The value returned is the whole buffer, so callers copy through
    ``buf[:total_numel]`` and later hand the same object back to
    :func:`_return_cpu_pinned_buf`.
    """
    free_bufs = _CPU_PINNED_POOL[dtype_key]
    fitting = [idx for idx, cand in enumerate(free_bufs) if cand.numel() >= total_numel]
    if fitting:
        # min() returns the earliest index among equally sized candidates,
        # which matches a first-strictly-smaller scan.
        chosen = min(fitting, key=lambda idx: free_bufs[idx].numel())
        return free_bufs.pop(chosen)
    if free_bufs:
        # Everything pooled is too small: evict one entry before allocating.
        free_bufs.pop()
    return platform.alloc_tensor_buffer(total_numel, dtype, device='cpu', pin_memory=True)

def _return_cpu_pinned_buf(buf):
    """Put a pinned CPU buffer back into the pool keyed by ``str(buf.dtype)``.

    ``None`` is accepted and ignored.
    """
    if buf is not None:
        _CPU_PINNED_POOL[str(buf.dtype)].append(buf)

def _collect_device_storage_ptrs(tensors: Any) -> Set[int]:
    """Gather the storage ``data_ptr()`` of every non-CPU tensor in ``tensors``.

    ``tensors`` may be any nested structure understood by
    ``platform.tree_map``; leaves that are not ``platform.Tensor`` or that
    live on the CPU are ignored.
    """
    found: Set[int] = set()

    def _visit(leaf):
        on_device = isinstance(leaf, platform.Tensor) and str(leaf.device).lower() != "cpu"
        if on_device:
            found.add(leaf.untyped_storage().data_ptr())
        return leaf

    platform.tree_map(_visit, tensors)
    return found

class SwapTensor:
    """Wrapper around one value that can be moved between device and host.

    A wrapped device tensor cycles through the states
    ``device -> d2h -> host -> h2d -> device``. Any other value (non-tensor or
    CPU tensor) is tagged ``non_tensor`` and every swap operation is a no-op
    for it.
    """
    STATE_DEVICE = "device"
    STATE_HOST = "host"
    STATE_D2H = "d2h"
    STATE_H2D = "h2d"
    STATE_NON_TENSOR = "non_tensor"

    def __init__(self, val: Any, funcname: str, group_swap: bool = False) -> None:
        self.val = val
        self.funcname = funcname
        self._keep_on_device = False
        self._duplicate_swap = False
        # True once SwapGroup has taken over this tensor for bulk copy.
        self._group_managed = False
        # Opt-in flag for group copy fusion.
        self.group_swap = group_swap
        self.val_cpu = None

        on_device = isinstance(val, platform.Tensor) and str(val.device).lower() != 'cpu'
        if not on_device:
            self.ver = None
            self._state = self.STATE_NON_TENSOR
            self.is_slice_tensor = False
            self.storage_size = 0
            return

        # Snapshot version and storage size so later offloads can detect
        # in-place modification or storage resizing.
        self.ver = val._version
        self._state = self.STATE_DEVICE
        self.storage_size = val.untyped_storage().size()
        logical_bytes = val.numel() * platform.get_element_size(val)
        # A "slice" tensor is one whose storage size differs from its own data size.
        self.is_slice_tensor = self.storage_size != logical_bytes

    def _is_passthrough(self) -> bool:
        """Return True when swap operations must leave this wrapper untouched."""
        return (
            self._state == self.STATE_NON_TENSOR
            or self._keep_on_device
            or self._duplicate_swap
        )

    def dedup_key(self):
        """Build the identity tuple used to detect duplicate registrations.

        Returns ``None`` for non-tensor wrappers; otherwise a tuple of device,
        storage pointer, storage offset, storage size and stride.
        """
        if self._state == self.STATE_NON_TENSOR:
            return None
        tensor = self.val
        backing = tensor.untyped_storage()
        return (
            str(tensor.device),
            backing.data_ptr(),
            tensor.storage_offset(),
            backing.size(),
            tuple(tensor.stride()),
        )

    def mark_duplicate_swap(self) -> None:
        """Flag this wrapper as a repeated registration within its swap group."""
        self._duplicate_swap = True

    def protect_if_aliases(self, alias_storage_ptrs: Set[int]) -> None:
        """Pin the tensor on device when its storage pointer is in ``alias_storage_ptrs``."""
        if self._state == self.STATE_NON_TENSOR:
            return
        if self.val.untyped_storage().data_ptr() in alias_storage_ptrs:
            self._keep_on_device = True

    def get_val(self) -> Any:
        """Return the wrapped value.

        Non-tensor values are always returned. A tensor is returned only in
        the ``device`` state.

        Raises:
            RuntimeError: the tensor is in any state other than ``device``.
        """
        if self._state in (self.STATE_NON_TENSOR, self.STATE_DEVICE):
            return self.val
        raise RuntimeError(
            f"Cannot call get_val(): tensor is in '{self._state}' state. "
            f"Must be in 'device' state."
        )

    def resize_device_storage(self):
        """Grow the device storage back to its recorded size before a load.

        Only acts on tensors in the ``host`` state that are neither duplicates
        nor group-managed.
        """
        if self._duplicate_swap or self._group_managed or self._state != self.STATE_HOST:
            return
        backing = self.val.untyped_storage()
        if backing.size() != self.storage_size:
            backing.resize_(self.storage_size)

    def async_load(self):
        """Start a non-blocking host-to-device copy and enter the ``h2d`` state.

        Raises:
            ValueError: no host copy (``val_cpu``) exists.
        """
        if self._is_passthrough() or self._group_managed:
            return

        if self._state != self.STATE_HOST:
            warnings.warn(
                f"[SwapTensor.async_load] Invalid state: current={self._state}, "
                f"expected 'host'. Operation skipped."
            )
            return

        if self.val_cpu is None:
            raise ValueError("val_cpu must not be None during async_load")
        with platform.preserve_version_counter(self.val):
            if not self.is_slice_tensor:
                # Whole-storage copy when the tensor owns its entire storage.
                self.val.untyped_storage().copy_(self.val_cpu.untyped_storage(), non_blocking=True)
            else:
                self.val.data.copy_(self.val_cpu, non_blocking=True)
        self._state = self.STATE_H2D

    def wait_load(self):
        """Finish a load: move from ``h2d`` to ``device``."""
        if self._is_passthrough() or self._state == self.STATE_DEVICE:
            return
        if self._state == self.STATE_H2D:
            self._state = self.STATE_DEVICE
            return
        warnings.warn(
            f"[SwapTensor.wait_load] Called in invalid state: {self._state}. "
            f"Expected 'h2d'. Skipped."
        )

    def async_offload(self):
        """Start a non-blocking device-to-host copy and enter the ``d2h`` state.

        Raises:
            RuntimeError: the storage size or the version counter changed
                since this wrapper was created.
        """
        if self._is_passthrough() or self._group_managed:
            return

        if self._state != self.STATE_DEVICE:
            warnings.warn(
                f"[SwapTensor.async_offload] Invalid state: current={self._state}, "
                f"expected 'device'. Operation skipped."
            )
            return

        if self.storage_size != self.val.untyped_storage().size():
            raise RuntimeError(
                f"There is a tensor from {self.funcname} cannot be SWAPPED! Its storage has been resized "
                f"presize:{self.storage_size}, current size:{self.val.untyped_storage().size()}"
            )
        if self.ver != self.val._version:
            raise RuntimeError(
                f"There is a tensor from {self.funcname} cannot be SWAPPED! In-place modification happened "
                f"preversion:{self.ver}, current version:{self.val._version}"
            )

        if self.val_cpu is None:
            # Lazily create the pinned host mirror on first offload.
            self.val_cpu = platform.empty_like(
                self.val, device="cpu", pin_memory=True
            )
        if not self.is_slice_tensor:
            self.val_cpu.untyped_storage().copy_(self.val.untyped_storage(), non_blocking=True)
        else:
            self.val_cpu.copy_(self.val, non_blocking=True)
        self._state = self.STATE_D2H

    def wait_offload(self):
        """Finish an offload: shrink device storage to zero and enter ``host``."""
        if self._is_passthrough() or self._state == self.STATE_HOST:
            return
        if self._state != self.STATE_D2H:
            warnings.warn(
                f"[SwapTensor.wait_offload] Called in invalid state: {self._state}. "
                f"Expected 'd2h'. Skipped."
            )
            return
        backing = self.val.untyped_storage()
        if backing.size() != 0:
            backing.resize_(0)
        self._state = self.STATE_HOST

    @property
    def state(self) -> str:
        """Current swap state: device, host, d2h, h2d, or non_tensor."""
        return self._state

    def __repr__(self):
        if self._state == self.STATE_NON_TENSOR:
            return f"<SwapTensor state=non_tensor, val_type={type(self.val).__name__}>"
        device_val = 'exists' if self.val is not None else 'None'
        return (
            f"<SwapTensor state={self._state}, duplicate={self._duplicate_swap}, "
            f"device_val={device_val}>"
        )

276

277

class Storage:
    """Keyed collection of swap items with bulk swap operations.

    Behaves like a small dict of lists: ``storage[key].append(item)``,
    ``storage.clear()``, ``for batch in storage.values(): ...``. Items may be
    nested structures; every ``SwapTensor`` leaf found by
    ``platform.tree_map`` takes part in the bulk operations.
    """

    def __init__(self):
        self._data: Dict[Any, List[Any]] = defaultdict(list)

    def __getitem__(self, key: Any) -> List[Any]:
        return self._data[key]

    def values(self):
        """Return a view over all stored lists."""
        return self._data.values()

    def clear(self):
        """Drop every stored entry."""
        self._data.clear()

    def _visit_swap_tensors(self, action) -> None:
        """Call ``action(swap_tensor)`` for each SwapTensor leaf, in storage order."""
        def _on_leaf(leaf):
            if isinstance(leaf, SwapTensor):
                action(leaf)
            return leaf

        for batch in self.values():
            for entry in batch:
                platform.tree_map(_on_leaf, entry)

    def iter_swap_tensors(self):
        """Return a list of every SwapTensor held by this storage."""
        found = []
        self._visit_swap_tensors(found.append)
        return found

    def mark_duplicate_swaps(self, seen_keys) -> int:
        """Flag tensors whose dedup key is already in ``seen_keys``.

        Keys seen for the first time are added to ``seen_keys`` in place.
        Returns the number of tensors flagged as duplicates.
        """
        flagged = 0
        for candidate in self.iter_swap_tensors():
            key = candidate.dedup_key()
            if key is None:
                continue
            if key not in seen_keys:
                seen_keys.add(key)
            else:
                candidate.mark_duplicate_swap()
                flagged += 1
        return flagged

    def protect_alias_storage_ptrs(self, alias_storage_ptrs: Set[int]):
        """Keep on device every tensor whose storage pointer is in ``alias_storage_ptrs``."""
        if alias_storage_ptrs:
            self._visit_swap_tensors(
                lambda swap_tensor: swap_tensor.protect_if_aliases(alias_storage_ptrs)
            )

    def launch_load(self):
        """Start the async host-to-device load of every tensor."""
        self._visit_swap_tensors(lambda swap_tensor: swap_tensor.async_load())

    def resize_device_storage(self):
        """Restore device storage size for every tensor ahead of a load."""
        self._visit_swap_tensors(lambda swap_tensor: swap_tensor.resize_device_storage())

    def wait_load(self):
        """Complete the load of every tensor, then empty this storage."""
        self._visit_swap_tensors(lambda swap_tensor: swap_tensor.wait_load())
        self.clear()

    def wait_offload(self):
        """Complete the offload of every tensor."""
        self._visit_swap_tensors(lambda swap_tensor: swap_tensor.wait_offload())

    def launch_offload(self):
        """Start the async device-to-host offload of every tensor."""
        self._visit_swap_tensors(lambda swap_tensor: swap_tensor.async_offload())

396

397

class SwapGroup:
    """Manager for a group of storages to coordinate swap operations.

    Eligible tensors within the group are packed into bounded contiguous
    device buffers before the D2H transfer, and loaded back through bounded
    H2D buffers. After the load each packed tensor aliases its slice of the
    relevant buffer via ``Tensor.set_()``.

    Tensors that are not eligible for packing (see
    :meth:`_collect_packable_tensors`) use the per-tensor copy path of
    ``SwapTensor``.
    """

    def __init__(self, group_name: str):
        self.group_name = group_name
        self.is_last_group: bool = False
        self._storages: List[Storage] = []
        self._load_event: Optional[Any] = None
        self._offload_event: Optional[Any] = None
        # Group-level contiguous buffers for packed tensors.
        self._packed_tensor_info: List = []  # [(SwapTensor, bucket_key, element_offset), ...]
        self._packed_buckets: Dict[str, Dict[str, Any]] = {}
        self._group_cpu_buf = None  # {bucket_key: pinned CPU buf}; lives from offload to load
        self._group_device_buf = None  # {bucket_key: device buf}; dropped after each phase
        # Dedup keys accumulated across add() calls; mark_duplicate_swaps
        # mutates the set in place. Reset in wait_load() so stale data_ptrs
        # do not carry over into the next iteration.
        self._seen_dedup_keys: set = set()
        # Per-bucket SwapTensor lists built in _collect_packable_tensors and
        # consumed in launch_offload.
        self._packed_by_bucket: Dict[str, List] = {}

    def add(self, storage):
        """Register a storage with the group and flag duplicate tensors in it."""
        duplicate_count = storage.mark_duplicate_swaps(self._seen_dedup_keys)
        if duplicate_count > 0:
            warnings.warn(
                f"SwapGroup '{self.group_name}' skipped {duplicate_count} duplicate tensor swap registration(s)."
            )
        self._storages.append(storage)

    def protect_alias_tensors(self, tensors: Any):
        """Keep on device every swap tensor whose storage is also used by ``tensors``."""
        alias_storage_ptrs = _collect_device_storage_ptrs(tensors)
        if not alias_storage_ptrs:
            return
        for storage in self._storages:
            storage.protect_alias_storage_ptrs(alias_storage_ptrs)

    def _map_swap_items(self, fn) -> None:
        """Apply ``fn`` to every leaf of every item held by the group's storages."""
        for storage in self._storages:
            for storage_list in storage.values():
                for item in storage_list:
                    platform.tree_map(fn, item)

    def _collect_storage_owners(self) -> Dict[int, Set[int]]:
        """Map each live storage pointer to the ids of the tensor objects using it.

        Only wrappers in the ``device`` state are inspected, so every storage
        looked at here is still allocated. Duplicate registrations and
        keep-on-device tensors are included on purpose: they are exactly the
        aliases that would be left behind if their storage were packed away.
        """
        owners: Dict[int, Set[int]] = {}

        def _record(x):
            if isinstance(x, SwapTensor) and x._state == SwapTensor.STATE_DEVICE:
                ptr = x.val.untyped_storage().data_ptr()
                owners.setdefault(ptr, set()).add(id(x.val))
            return x

        self._map_swap_items(_record)
        return owners

    def _collect_packable_tensors(self) -> int:
        """Identify tensors eligible for group packing and mark them for bulk copy.

        A tensor is eligible only when it has ``group_swap=True``, is in the
        ``device`` state, is not kept on device, is not a slice tensor, is not
        a duplicate, is contiguous, has a storage smaller than
        ``_GROUP_SWAP_MAX_BULK_COPY_BYTES``, and does not share its storage
        with a different tensor object registered in this group.

        The shared-storage rule exists because a packed tensor's original
        storage is resized to zero in ``wait_offload`` and the tensor is
        re-pointed at a new buffer with ``set_()`` in ``wait_load``. Any other
        tensor object still referencing the old storage (a view, or a
        duplicate registration that skips its own load) would be left with
        empty storage. Such tensors therefore all stay on the per-tensor
        path, which restores the shared storage in place.

        Dtype buckets are split before their staging allocation would exceed
        ``_GROUP_SWAP_MAX_BULK_COPY_BYTES``. A bucket with fewer than two
        tensors is left on the per-tensor path because it has no batch-copy
        benefit.

        Side effects: marks each packed tensor with ``_group_managed=True``
        and ``_state=STATE_D2H``, and replaces ``_packed_tensor_info``,
        ``_packed_buckets`` and ``_packed_by_bucket``.

        Returns:
            Total byte count of all packed tensors.

        Raises:
            RuntimeError: a candidate's storage was resized or modified in
                place since it was wrapped.
        """
        candidate_buckets: Dict[str, List[Dict[str, Any]]] = {}
        packed_info: List = []
        packed_buckets: Dict[str, Dict[str, Any]] = {}
        packed_by_bucket: Dict[str, List] = {}
        total_bytes = 0
        storage_owners = self._collect_storage_owners()

        def _try_pack(x):
            if not isinstance(x, SwapTensor):
                return x
            no_pack = (not x.group_swap or x._state != SwapTensor.STATE_DEVICE or x._keep_on_device
                       or x.is_slice_tensor or x._duplicate_swap
                       or x.storage_size >= _GROUP_SWAP_MAX_BULK_COPY_BYTES
                       or not x.val.is_contiguous())
            if no_pack:
                return x
            # Storage referenced by more than one tensor object must not be
            # freed by packing; async_offload still validates these tensors.
            if len(storage_owners.get(x.val.untyped_storage().data_ptr(), ())) > 1:
                return x
            if x.storage_size != x.val.untyped_storage().size():
                raise RuntimeError(
                    f"There is a tensor from {x.funcname} cannot be SWAPPED! Its storage has been resized "
                    f"presize:{x.storage_size}, current size:{x.val.untyped_storage().size()}"
                )
            if x.ver != x.val._version:
                raise RuntimeError(
                    f"There is a tensor from {x.funcname} cannot be SWAPPED! In-place modification happened "
                    f"preversion:{x.ver}, current version:{x.val._version}"
                )
            dtype_key = str(x.val.dtype)
            dtype_buckets = candidate_buckets.setdefault(dtype_key, [])
            if (not dtype_buckets or
                    dtype_buckets[-1]["total_bytes"] + x.storage_size > _GROUP_SWAP_MAX_BULK_COPY_BYTES):
                # Open a new bucket when none exists or the current one is full.
                dtype_buckets.append({
                    "bucket_key": f"{dtype_key}#{len(dtype_buckets)}",
                    "dtype": x.val.dtype,
                    "dtype_key": dtype_key,
                    "device": x.val.device,
                    "tensors": [],
                    "total_bytes": 0,
                    "total_numel": 0,
                })
            bucket = dtype_buckets[-1]
            bucket["tensors"].append(x)
            bucket["total_bytes"] += x.storage_size
            bucket["total_numel"] += x.val.numel()
            return x

        self._map_swap_items(_try_pack)

        for dtype_bucket_list in candidate_buckets.values():
            for candidate_bucket in dtype_bucket_list:
                tensors = candidate_bucket["tensors"]
                if len(tensors) < 2:
                    continue
                bucket_key = candidate_bucket["bucket_key"]
                packed_buckets[bucket_key] = {
                    "dtype": candidate_bucket["dtype"],
                    "dtype_key": candidate_bucket["dtype_key"],
                    "device": candidate_bucket["device"],
                    "total_numel": candidate_bucket["total_numel"],
                }
                element_offset = 0
                for tensor in tensors:
                    tensor._group_managed = True
                    tensor._state = SwapTensor.STATE_D2H
                    packed_info.append((tensor, bucket_key, element_offset))
                    element_offset += tensor.val.numel()
                packed_by_bucket[bucket_key] = tensors
                total_bytes += candidate_bucket["total_bytes"]

        self._packed_tensor_info = packed_info
        self._packed_buckets = packed_buckets
        self._packed_by_bucket = packed_by_bucket
        return total_bytes

    def launch_offload(self, copy_stream):
        """Launch async offload for all storages in the group.

        Packed tensors are first concatenated into one device buffer per
        bucket on the current stream, then copied to pinned CPU memory on
        ``copy_stream``. All remaining tensors are offloaded individually via
        the per-tensor path.
        """
        total_bytes = self._collect_packable_tensors()
        group_device_bufs = {}
        group_cpu_bufs = {}
        with platform.no_grad():
            if total_bytes > 0:
                for bucket_key, swap_tensors in self._packed_by_bucket.items():
                    group_device_bufs[bucket_key] = platform.cat(
                        [st.val.reshape(-1) for st in swap_tensors], dim=0
                    )

        # The copy stream must not start before the packing above has run.
        compute_event = platform.new_event()
        compute_event.record(platform.get_current_stream())
        self._offload_event = platform.new_event()
        stream_context = platform.get_stream_context()
        with platform.no_grad(), stream_context(copy_stream):
            compute_event.wait(copy_stream)

            if total_bytes > 0:
                # One-shot D2H per packed bucket. MindSpore requires tensor/storage dtype consistency.
                for bucket_key, bucket in self._packed_buckets.items():
                    dtype_key = bucket["dtype_key"]
                    numel = bucket["total_numel"]
                    cpu_buf = _get_cpu_pinned_buf(dtype_key, numel, bucket["dtype"])
                    group_cpu_bufs[bucket_key] = cpu_buf
                    # The pooled buffer may be larger than needed; copy into its prefix.
                    cpu_buf[:numel].copy_(group_device_bufs[bucket_key], non_blocking=True)
                self._group_device_buf = group_device_bufs
                self._group_cpu_buf = group_cpu_bufs

            # Per-tensor path for everything else. Group-managed tensors
            # return early from async_offload.
            for storage in self._storages:
                storage.launch_offload()
            self._offload_event.record(copy_stream)

    def wait_offload(self):
        """Wait for offload to complete for all storages in the group.

        Raises:
            RuntimeError: called before ``launch_offload``.
        """
        if self._offload_event is None:
            raise RuntimeError(
                f"SwapGroup '{self.group_name}' wait_offload() called before launch_offload()."
            )
        compute_stream = platform.get_current_stream()
        stream_context = platform.get_stream_context()
        with platform.no_grad(), stream_context(compute_stream):
            self._offload_event.wait(compute_stream)
            self._offload_event = None
            for storage in self._storages:
                storage.wait_offload()
        # Release the temporary device packing buffers; _group_cpu_buf is
        # kept because launch_load reads from it.
        self._group_device_buf = None

    def launch_load(self, copy_stream):
        """Prepare storage and launch async load for all storages in the group.

        Packed tensors are loaded from pinned CPU memory into one device
        buffer per bucket; they start aliasing that buffer in ``wait_load``.
        All remaining tensors use the per-tensor path.
        """
        # Restore device storage for per-tensor-path tensors on the current
        # stream. Group-managed tensors skip resize_device_storage.
        with platform.no_grad():
            for storage in self._storages:
                storage.resize_device_storage()

        compute_event = platform.new_event()
        compute_event.record(platform.get_current_stream())
        self._load_event = platform.new_event()
        stream_context = platform.get_stream_context()
        with platform.no_grad(), stream_context(copy_stream):
            compute_event.wait(copy_stream)

            if self._packed_tensor_info and self._group_cpu_buf is not None:
                group_device_bufs = {}
                for bucket_key, bucket in self._packed_buckets.items():
                    cpu_buf = self._group_cpu_buf.get(bucket_key)
                    if cpu_buf is None:
                        continue
                    numel = bucket["total_numel"]
                    # NOTE(review): this buffer is allocated while copy_stream
                    # is current but consumed later on the compute stream;
                    # confirm the platform allocator handles that hand-over.
                    group_device_bufs[bucket_key] = platform.alloc_tensor_buffer(
                        numel, bucket["dtype"], bucket["device"]
                    )
                    # One-shot H2D per packed bucket.
                    group_device_bufs[bucket_key].copy_(cpu_buf[:numel], non_blocking=True)
                self._group_device_buf = group_device_bufs
                # Mirror async_load's STATE_H2D transition: H2D is in flight.
                for st, _, _ in self._packed_tensor_info:
                    st._state = SwapTensor.STATE_H2D

            # Per-tensor path for everything else. Group-managed tensors
            # return early from async_load.
            for storage in self._storages:
                storage.launch_load()  # Only copy, no resize
            self._load_event.record(copy_stream)

    def wait_load(self):
        """Wait for load to complete for all storages in the group.

        After waiting on the load event, each group-managed tensor is made to
        alias its slice of the bucket's device buffer via ``Tensor.set_()``.
        All per-iteration bookkeeping of the group is reset afterwards.

        Raises:
            RuntimeError: called before ``launch_load``.
        """
        if self._load_event is None:
            raise RuntimeError(
                f"SwapGroup '{self.group_name}' wait_load() called before launch_load()."
            )
        compute_stream = platform.get_current_stream()
        stream_context = platform.get_stream_context()
        with platform.no_grad(), stream_context(compute_stream):
            self._load_event.wait(compute_stream)
            self._load_event = None
            # Restore group-managed tensors: alias into the contiguous device buffer.
            if self._group_device_buf is not None:
                prev_key = None
                group_storage = None
                for st, bucket_key, element_offset in self._packed_tensor_info:
                    if bucket_key != prev_key:
                        # Entries of one bucket are adjacent, so look the
                        # storage up only when the bucket changes.
                        group_device_buf = self._group_device_buf.get(bucket_key)
                        group_storage = group_device_buf.untyped_storage() if group_device_buf is not None else None
                        prev_key = bucket_key
                    if group_storage is None:
                        continue
                    with platform.preserve_version_counter(st.val):
                        st.val.set_(group_storage, element_offset, st.val.shape, st.val.stride())
                    st._state = SwapTensor.STATE_DEVICE
            for storage in self._storages:
                storage.wait_load()
            self._storages.clear()
        # Hand the pinned CPU buffers back to the module-level pool now that
        # the load event has been waited on; a later launch_offload can reuse them.
        if self._group_cpu_buf is not None:
            for buf in self._group_cpu_buf.values():
                _return_cpu_pinned_buf(buf)
            self._group_cpu_buf = None
        # Drop the group's own reference to the device buffers. Restored
        # tensors were pointed at that storage with set_() above.
        self._group_device_buf = None
        self._packed_tensor_info = []
        self._packed_buckets = {}
        self._packed_by_bucket = {}
        self._seen_dedup_keys = set()

697

698

699class SwapManager:

700 """Singleton manager for swap groups and their operations."""

701 _instance: Optional["SwapManager"] = None

702 _lock = threading.Lock()

703

704 def __init__(self):

705 if hasattr(self, '_groups'):

706 return

707 self._groups: Dict[str, SwapGroup] = {}

708 self._current_group_name: str = ""

709 self._layer_count: int = 0

710 self._copy_stream: Optional[Any] = None

711

712 def __new__(cls):

713 if cls._instance is None:

714 with cls._lock:

715 if cls._instance is None:

716 cls._instance = super().__new__(cls)

717 return cls._instance

718

def add_storage(self, group_name: str, storage: Storage) -> None:
    """Register ``storage`` with the named group, creating the group on demand."""
    self.ensure_group(group_name)
    target_group = self._groups[group_name]
    target_group.add(storage)

723

def ensure_group(self, group_name: str) -> None:
    """Make sure a SwapGroup named ``group_name`` exists; existing groups are kept."""
    if group_name in self._groups:
        return
    self._groups[group_name] = SwapGroup(group_name)

728

def launch_offload(self, group_name: str, copy_stream=None):
    """Start the async offload of the named group.

    When ``copy_stream`` is None the manager's own copy stream is used.

    Raises:
        RuntimeError: no group with this name exists.
    """
    group = self._groups.get(group_name)
    if group is None:
        raise RuntimeError(f"Group {group_name} does not exist.")
    stream = copy_stream if copy_stream is not None else self._get_copy_stream()
    group.launch_offload(stream)

737

def protect_alias_tensors(self, group_name: str, tensors: Any):
    """Ask the named group to keep tensors aliasing ``tensors`` on device.

    Raises:
        RuntimeError: no group with this name exists.
    """
    target_group = self._groups.get(group_name)
    if target_group is not None:
        target_group.protect_alias_tensors(tensors)
        return
    raise RuntimeError(f"Group {group_name} does not exist.")

744

def wait_offload(self, group_name: str):
    """Block the current stream until the named group's offload has finished.

    Raises:
        RuntimeError: no group with this name exists.
    """
    target_group = self._groups.get(group_name)
    if target_group is not None:
        target_group.wait_offload()
        return
    raise RuntimeError(f"Group {group_name} does not exist.")

751

def launch_load(self, group_name: str, copy_stream=None):
    """Start the async load of the named group.

    When ``copy_stream`` is None the manager's own copy stream is used.

    Raises:
        RuntimeError: no group with this name exists.
    """
    group = self._groups.get(group_name)
    if group is None:
        raise RuntimeError(f"Group {group_name} does not exist.")
    stream = copy_stream if copy_stream is not None else self._get_copy_stream()
    group.launch_load(stream)

760

def wait_load(self, group_name: str):
    """Block the current stream until the named group's load has finished.

    Raises:
        RuntimeError: no group with this name exists.
    """
    target_group = self._groups.get(group_name)
    if target_group is not None:
        target_group.wait_load()
        return
    raise RuntimeError(f"Group {group_name} does not exist.")

767

def release_group_storage(self, group_name: str) -> None:
    """Release live storage references held by the swap group.

    Called at the end of backward to free Storage objects that were never
    released via wait_load (e.g. the last layer, which has no next layer
    and therefore never goes through the offload-load cycle).

    The group's dedup-key set is reset together with the storages. Its keys
    describe the tensors that are being dropped here; leaving them behind
    would make registrations in the next iteration that land on the same
    addresses look like duplicates. An unknown ``group_name`` is ignored.
    """
    group = self._groups.get(group_name)
    if group is None:
        return
    group._storages.clear()
    # Same reset SwapGroup.wait_load performs for groups that do get loaded.
    group._seen_dedup_keys = set()

778

def get_current_group_name(self) -> str:
    """Name of the swap group most recently set as current ("" until one is set)."""
    current = self._current_group_name
    return current

782

def set_current_group_name(self, group_name: str) -> None:
    """Record ``group_name`` as the currently active swap group."""
    setattr(self, "_current_group_name", group_name)

786

def is_last_group(self, group_name: Optional[str] = None) -> bool:
    """Tell whether a group is flagged as the terminal group of the chain.

    ``group_name`` defaults to the current group. Unknown groups yield False.
    """
    lookup_name = group_name if group_name is not None else self._current_group_name
    group = self._groups.get(lookup_name)
    return False if group is None else group.is_last_group

794

def set_forward_prefetch_layer(self, first_layer, second_layer):
    """
    Link two consecutive layers into the swap chain and install swap hooks.

    Usage:
        for i in range(len(model.layers) - 1):
            set_forward_prefetch_layer(model.layers[i], model.layers[i + 1])

    Each layer receives a swap group name (created on first sight), a
    ``prev``/``next`` ordering record, and four hooks (forward-pre, forward,
    backward-pre, backward). Names, links and hook handles are only written
    when absent, so calling this repeatedly on the same pair is safe.

    Passing the same object for both layers only emits a ``UserWarning``;
    the remaining steps still run.
    """
    if first_layer is second_layer:
        message = (
            "set_forward_prefetch_layer: "
            "Prefetching between identical layers has no effect."
        )
        warnings.warn(message, UserWarning, stacklevel=2)

    def _group_name_of(layer):
        """Return the layer's swap group name, assigning a fresh one if missing."""
        if hasattr(layer, "_swap_group_name"):
            return layer._swap_group_name
        fresh_name = f"swap_group_{self._layer_count}"
        self._layer_count += 1
        layer._swap_group_name = fresh_name
        layer._swap_group_order = {"prev": None, "next": None}
        return layer._swap_group_name

    first_name = _group_name_of(first_layer)
    second_name = _group_name_of(second_layer)

    # Make sure both names are backed by a SwapGroup entry.
    for name in (first_name, second_name):
        if name not in self._groups:
            self._groups[name] = SwapGroup(name)

    # Only fill links that are still empty; existing links are never rewritten.
    first_order = first_layer._swap_group_order
    if first_order["next"] is None:
        first_order["next"] = second_name
    second_order = second_layer._swap_group_order
    if second_order["prev"] is None:
        second_order["prev"] = first_name

    # A group is terminal exactly when its layer has no successor recorded.
    for layer, name in ((first_layer, first_name), (second_layer, second_name)):
        self._groups[name].is_last_group = layer._swap_group_order["next"] is None

    def _forward_pre_hook(group_name, module, _):  # pylint: disable=W0613
        """Mark ``group_name`` as the active group unless the module is in 'pre_backward'."""
        if getattr(module, "_swap_state", None) != "pre_backward":
            SwapManager().set_current_group_name(group_name)

    def _forward_hook(group_name, module, args, output):  # pylint: disable=W0613
        """
        Forward post-hook, run right after the layer's forward pass.

        Swap rules:
            1. If the layer has a successor, protect tensors aliasing the
               output and launch the offload of this layer's group
               (launch_offload).
            2. If the layer has a predecessor, wait for the predecessor's
               offload to finish (wait_offload).

        Notes:
            - The hook does nothing while the module is in the
              'pre_backward' state, so no offload is issued in that state.
        """
        if getattr(module, "_swap_state", None) == "pre_backward":
            return
        order = module._swap_group_order
        if order.get("next"):
            SwapManager().protect_alias_tensors(group_name, output)
            SwapManager().launch_offload(group_name)
        previous = order.get("prev")
        if previous:
            SwapManager().wait_offload(previous)

    def _backward_pre_hook(group_name, module, grad_input):  # pylint: disable=W0613
        """
        Backward pre-hook, run right before the layer's backward pass.

        Swap rules:
            1. Mark the module as 'pre_backward' so the forward hooks above
               become no-ops.
            2. If the layer has a predecessor, launch the load of the
               predecessor's group (launch_load).
            3. If the layer has a successor (the only case in which the
               forward hook offloaded this group), wait for this group's
               load (wait_load) and then release its storage
               (release_group_storage).
        """
        module._swap_state = "pre_backward"
        order = module._swap_group_order
        previous = order.get("prev")
        if previous:
            SwapManager().launch_load(previous)

        if order.get("next"):
            SwapManager().wait_load(group_name)
            SwapManager().release_group_storage(group_name)

    def _backward_hook(group_name, module, grad_input, grad_output):  # pylint: disable=W0613
        """Backward post-hook: move the module out of the 'pre_backward' state."""
        module._swap_state = "backward"

    def _register_hooks_once(module, group_name):
        """Install each swap hook whose handle attribute is not yet on ``module``."""
        # handle attribute -> (registration callable, hook bound to group_name);
        # dict order is the registration order.
        hook_table = {
            "_swap_forward_pre_hook_handle": (
                lambda h: platform.register_forward_pre_hook(module, h, prepend=True),
                functools.partial(_forward_pre_hook, group_name),
            ),
            "_swap_forward_hook_handle": (
                module.register_forward_hook,
                functools.partial(_forward_hook, group_name),
            ),
            "_swap_backward_pre_hook_handle": (
                lambda h: platform.register_full_backward_pre_hook(module, h, prepend=True),
                functools.partial(_backward_pre_hook, group_name),
            ),
            "_swap_backward_hook_handle": (
                lambda h: platform.register_full_backward_hook(module, h),
                functools.partial(_backward_hook, group_name),
            ),
        }

        for handle_attr, (register, bound_hook) in hook_table.items():
            if hasattr(module, handle_attr):
                continue
            setattr(module, handle_attr, register(bound_hook))

    # Register for both layers
    _register_hooks_once(first_layer, first_name)
    _register_hooks_once(second_layer, second_name)

961

def _get_copy_stream(self):
    """Return the manager's copy stream, creating it via the platform on first use."""
    stream = self._copy_stream
    if stream is None:
        # First access: create the stream and cache it for later calls.
        stream = platform.new_stream()
        self._copy_stream = stream
    return stream

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/activation_checkpoint/swap.py: 95%

521 statements