Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/torch/fully

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

15# Adapted from https://github.com/pytorch/pytorch/blob/release/2.6/torch/distributed/fsdp/_fully_shard/_fsdp_param.py

16# enhanced with fully_shard parameter management

17# ============================================================================

18"""HSDP parameter"""

19# pylint: disable=W0212

20from typing import Callable, List, Optional, Tuple, Union, cast

22import torch

23import torch.distributed as dist

24from torch import nn

25from torch._prims_common import make_contiguous_strides_for

27from hyper_parallel.core.dtensor.device_mesh import DeviceMesh

28from hyper_parallel.core.dtensor.dtensor import DTensor

29from hyper_parallel.core.dtensor.layout import Layout

30from hyper_parallel.core.dtensor.placement_types import Replicate, Shard, StridedShard

31from hyper_parallel.core.fully_shard.hsdp_param import HSDPParamV2

32from hyper_parallel.core.fully_shard.hsdp_utils import (

33 FullyShardParamMode,

34 GroupInfo,

35 ParamModuleInfo,

36 ShardedState,

37 apply_gradient_scaling_factor,

38 get_rank_list_for_axes,

39 get_split_rank_lists_for_axes,

40)

41from hyper_parallel.core.fully_shard.utils import (

42 CPUOffloadPolicy,

43 DDPMeshInfo,

44 FSDPMeshInfo,

45 MixedPrecisionPolicy,

46 OffloadPolicy,

47)

48from hyper_parallel.platform import get_platform

49from hyper_parallel.platform.torch.fully_shard.pack_utils import (

50 build_rs_plan,

51 pack_for_reduce_scatter,

52 unpack_from_all_gather,

53)

# Groups created from explicit rank lists, keyed by the sorted rank tuple so that
# repeated requests for the same ranks reuse one entry.
_GROUP_INFO_CACHE: dict = {}
# Platform backend, resolved once when this module is imported.
platform = get_platform()

def _copy_without_bumping_version(dst: torch.Tensor, src: torch.Tensor) -> None:
    """Overwrite ``dst`` in place with ``src`` while keeping ``dst``'s version counter unchanged.

    Args:
        dst: Tensor that receives the data.
        src: Tensor whose data is copied into ``dst``.
    """
    # pylint: disable=W0212
    version_guard = torch.autograd._unsafe_preserve_version_counter(dst)
    with version_guard:
        dst.copy_(src)

def _build_group_info_from_rank_list(
    group_name: str,
    rank_list,
) -> GroupInfo:
    """Build group metadata for an explicit list of ranks.

    The ranks are converted to ``int`` and sorted; the resulting tuple is both the
    cache key and (stringified) the name of the returned group.

    Args:
        group_name: Base name, used only for the ``"<group_name>_invalid"`` placeholder.
        rank_list: Iterable of ranks that should form the group.

    Returns:
        GroupInfo: A placeholder with no group and size 1 when at most one rank is
        given; otherwise the cached or newly created group with its rank count.
    """
    ranks = tuple(sorted(int(rank) for rank in rank_list))
    rank_count = len(ranks)
    if rank_count <= 1:
        return GroupInfo(f"{group_name}_invalid", None, 1)
    if ranks not in _GROUP_INFO_CACHE:
        try:
            created = platform.create_group(list(ranks))
        except (RuntimeError, ValueError):  # pragma: no cover - UT may run without dist init
            created = None
        # The outcome is cached even when creation failed (stored as None).
        _GROUP_INFO_CACHE[ranks] = created
    return GroupInfo(str(ranks), _GROUP_INFO_CACHE[ranks], rank_count)

def _build_group_info_from_process_group(
    group_name: str,
    process_group,
    rank_size: int,
) -> GroupInfo:
    """Wrap an already-created process group into group metadata.

    Args:
        group_name: Fallback name, also the prefix of the invalid placeholder name.
        process_group: Existing process group, or ``None``.
        rank_size: Number of ranks in ``process_group``.

    Returns:
        GroupInfo: A ``"<group_name>_invalid"`` placeholder of size 1 when there is no
        group or ``rank_size <= 1``; otherwise metadata named after the sorted rank
        tuple of the group, or after ``group_name`` if the ranks cannot be queried.
    """
    unusable = process_group is None or rank_size <= 1
    if unusable:
        return GroupInfo(f"{group_name}_invalid", None, 1)
    resolved_group_name = group_name
    try:
        member_ranks = dist.get_process_group_ranks(process_group)
        resolved_group_name = str(tuple(sorted(member_ranks)))
    except (AssertionError, AttributeError, KeyError, RuntimeError, TypeError, ValueError):
        # pragma: no cover - best-effort naming / mocked process groups in UT
        pass
    return GroupInfo(resolved_group_name, process_group, rank_size)

100

101

102class TorchHSDPParamV2(HSDPParamV2):

103 """

104 Torch HSDP parameter.

105 """

106

def __init__(
    self,
    param: nn.Parameter,
    module_info: ParamModuleInfo,
    mesh_info: FSDPMeshInfo,
    shard_placement_fn: Optional[Callable[[nn.Parameter], Optional[Shard]]] = None,
    mp_policy: Optional[MixedPrecisionPolicy] = None,
    offload_policy: Optional[OffloadPolicy] = None,
    device: Optional[torch.device] = None,
    param_mode: Optional[FullyShardParamMode] = None,
    enable_fsdp_shard: bool = True,
):
    """
    Set up bookkeeping for one parameter and replace it on its module with a sharded one.

    Args:
        param (nn.Parameter): The original full parameter to shard.
        module_info (ParamModuleInfo): Owning module, parameter name and shared-weight metadata.
        mesh_info (FSDPMeshInfo): Mesh topology for shard/replicate dimensions.
        shard_placement_fn (Callable, optional): Returns a Shard placement for the parameter,
            or None to use the default (Shard(0)).
        mp_policy (MixedPrecisionPolicy, optional): Mixed precision dtype policy.
        offload_policy (OffloadPolicy, optional): CPU offload is enabled only when this is a
            ``CPUOffloadPolicy``.
        device (torch.device, optional): Target device for the sharded parameter.
        param_mode (FullyShardParamMode, optional): Already-resolved parameter mode; required.
        enable_fsdp_shard (bool): Value later reported by ``uses_param_shard``.

    Raises:
        AssertionError: If ``param_mode`` is None.
    """
    self._module_info: ParamModuleInfo = module_info
    self.mesh_info = mesh_info
    self.mp_policy = mp_policy
    self.device = device
    if param_mode is None:
        raise AssertionError("param_mode must be resolved before TorchHSDPParamV2 initialization.")
    self.param_mode = param_mode
    self.enable_fsdp_shard = enable_fsdp_shard

    # Dtype attributes start unset; ``init_dtype_attrs`` fills them in.
    self.orig_dtype = None
    self.param_dtype = None
    self.reduce_dtype = None

    cpu_offload = isinstance(offload_policy, CPUOffloadPolicy)
    self.offload_to_cpu: bool = cpu_offload
    self.pin_memory = cpu_offload and cast(CPUOffloadPolicy, offload_policy).pin_memory

    # Must exist before ``_init_sharded_param``: installing the parameter on the
    # module goes through ``_save_backward_hooks``, which appends to this list.
    self._orig_param_hooks: List[Callable] = []
    self.grad_offload_event: Optional[torch.Event] = None

    # Remember the source DTensor layout (if any) so it can be rebuilt later.
    came_as_dtensor = isinstance(param, DTensor)
    self._orig_param_is_dtensor = came_as_dtensor
    if came_as_dtensor:
        self._orig_dtensor_mesh = param.device_mesh
        self._orig_dtensor_placements = tuple(param.placements)
    else:
        self._orig_dtensor_mesh = None
        self._orig_dtensor_placements = None
    self._spmd_shard_mesh_dim = self.mesh_info.shard_mesh_dim
    self._spmd_replicate_mesh_dim = self.mesh_info.replicate_mesh_dim

    # Group setup reads ``is_sharded`` and ``_spmd_placements`` produced by sharding.
    self._init_sharded_param(param, shard_placement_fn)
    self._init_group_infos()

    self.all_gather_outputs: List[torch.Tensor] = []
    self.unsharded_accumulated_grad = None
    self._param_fqn: Optional[str] = None
    # Communication attributes for prefetch pattern
    self.prefetch_handle: Optional[dist.Work] = None
    owner_module = module_info.module
    self._post_load_hook_handle = owner_module.register_load_state_dict_post_hook(
        lambda *_args, **_kwargs: self.reset_sharded_param()
    )
    self._reduce_scatter_output = None
    self.reduce_scatter_handle = None
    self._all_reduce_output = None
    self.all_reduce_handle = None
    self._save_backward_hooks(param)
    self._grad = None
    self._accumulated_allreduced_grad = True
    self.gradient_scaling_factor = None

174

@property
def uses_param_shard(self) -> bool:
    """Report whether parameter storage is physically sharded for this param.

    This is the ``enable_fsdp_shard`` flag given at construction.
    """
    shard_enabled = self.enable_fsdp_shard
    return shard_enabled

179

@property
def is_dtensor_compat_mode(self) -> bool:
    """Report whether ``param_mode`` is ``FullyShardParamMode.DTENSOR_COMPAT``."""
    compat_mode = FullyShardParamMode.DTENSOR_COMPAT
    return self.param_mode == compat_mode

184

def _get_base_spmd_placements(self) -> tuple:
    """Pick the SPMD mesh for this parameter and return its starting placements.

    Side effect: assigns ``self._spmd_mesh``.

    Returns:
        tuple: One placement per dimension of the chosen mesh.
    """
    if self._orig_param_is_dtensor:
        if self.param_mode == FullyShardParamMode.DTENSOR_UNIFIED:
            # DTENSOR_UNIFIED keeps the original distributed layout and prefixes
            # explicit DP/FSDP mesh dimensions ahead of it on the unified mesh.
            self._spmd_mesh = DeviceMesh.concatenate([self.mesh_info.mesh, self._orig_dtensor_mesh])
            dp_prefix = [Replicate() for _ in range(self.mesh_info.mesh.ndim)]
            return tuple(dp_prefix) + tuple(self._orig_dtensor_placements)

        if self.is_dtensor_compat_mode:
            # Compat mode reuses the source DTensor's mesh and placements unchanged.
            self._spmd_mesh = self._orig_dtensor_mesh
            return tuple(self._orig_dtensor_placements)

    # Every other case starts fully replicated on the data-parallel mesh.
    self._spmd_mesh = self.mesh_info.mesh
    replicated = [Replicate() for _ in range(self._spmd_mesh.ndim)]
    return tuple(replicated)

199

def _apply_data_parallel_placements(self, placements: list, shard_placement: Shard) -> tuple:
    """Write the data-parallel placements into ``placements`` and return them as a tuple.

    ``placements`` is modified in place.

    Args:
        placements: Mutable list with one placement per dimension of ``self._spmd_mesh``.
        shard_placement: Shard placement requested for the fully_shard mesh dimension.

    Returns:
        tuple: The updated placements.

    Raises:
        AssertionError: If ``placements`` does not have one entry per mesh dimension.
    """
    if self._spmd_mesh.ndim != len(placements):
        raise AssertionError(
            f"Expected {self._spmd_mesh.ndim} unified placements, got {len(placements)}: {placements}"
        )

    replicate_axis = self._spmd_replicate_mesh_dim
    if (
        replicate_axis is not None
        and not self._orig_param_is_dtensor
        and isinstance(self.mesh_info, DDPMeshInfo)
    ):
        placements[replicate_axis] = Replicate()

    shard_axis = self._spmd_shard_mesh_dim
    if (
        self.uses_param_shard
        and shard_axis is not None
        and isinstance(self.mesh_info, FSDPMeshInfo)
    ):
        # If TP/EP already shards the same tensor dimension, fully_shard must
        # use StridedShard so the unified placement preserves the intended
        # shard order on the concatenated mesh.
        split_factor = 1
        for mesh_idx, placement in enumerate(placements):
            if mesh_idx != shard_axis and placement.is_shard(shard_placement.dim):
                split_factor *= self._spmd_mesh.mesh_shape[mesh_idx]
        if split_factor > 1:
            placements[shard_axis] = StridedShard(shard_placement.dim, split_factor=split_factor)
        else:
            placements[shard_axis] = shard_placement
    return tuple(placements)

231

def _init_group_infos(self) -> None:
    """Resolve the sharded and unsharded communication groups and derived sizes.

    Sets ``sharded_group_info``, ``unsharded_group_info``, ``shard_size``,
    ``dp_size`` and ``rank_size``.
    """
    shards_over_mesh = (
        self.uses_param_shard
        and self.is_sharded
        and isinstance(self.mesh_info, FSDPMeshInfo)
    )
    if not shards_over_mesh:
        self.sharded_group_info = GroupInfo("fully_shard_sharded_group_invalid", None, 1)
    else:
        self.sharded_group_info = _build_group_info_from_process_group(
            "fully_shard_sharded_group",
            self.mesh_info.shard_process_group,
            self.mesh_info.shard_mesh_size,
        )

    # The all-reduce group is always derived from the final materialized layout.
    # This keeps replicate_params, DTensor compat, and unified multi-dim layouts
    # on a single source of truth.
    self.unsharded_group_info = self._build_layout_driven_group_info()

    shard_ranks = self.sharded_group_info.rank_size
    replica_ranks = self.unsharded_group_info.rank_size
    self.shard_size = shard_ranks
    self.dp_size = replica_ranks
    self.rank_size = max(1, shard_ranks * replica_ranks)

250

def _build_layout_driven_group_info(self):
    """Build the group spanning the mesh axes on which this parameter is replicated.

    The axes are those whose entry in ``self._spmd_placements`` is a replicate
    placement, minus the fully_shard shard axis when parameter sharding is in use.

    Returns:
        GroupInfo: An invalid placeholder when no axis qualifies; otherwise a group
        taken from the mesh (single named axis), from ``platform.split_group``, or,
        as a fallback, from the explicit rank list of the selected axes.
    """
    excluded_axis = self._spmd_shard_mesh_dim if self.uses_param_shard else None
    group_axes = [
        axis
        for axis, placement in enumerate(self._spmd_placements)
        if placement.is_replicate() and axis != excluded_axis
    ]
    if not group_axes:
        return GroupInfo("fully_shard_unsharded_group_invalid", None, 1)

    group_dim_names = getattr(self._spmd_mesh, "mesh_dim_names", None)
    if group_dim_names:
        try:
            mesh_axis_names = tuple(group_dim_names[axis] for axis in group_axes)
            if len(mesh_axis_names) == 1:
                # A single named axis can reuse the mesh's own group for that axis.
                (axis_name,) = mesh_axis_names
                process_group = self._spmd_mesh.get_group(axis_name)
                if process_group is not None:
                    axis_size = self._spmd_mesh.mesh_shape[group_dim_names.index(axis_name)]
                    return _build_group_info_from_process_group(
                        "fully_shard_unsharded_group",
                        process_group,
                        axis_size,
                    )

            # Otherwise ask the platform to split out a group covering the axes.
            split_rank_lists = get_split_rank_lists_for_axes(self._spmd_mesh, group_axes)
            process_group = platform.split_group(split_ranks=split_rank_lists)
            if process_group is not None:
                combined_size = 1
                for axis in group_axes:
                    combined_size *= self._spmd_mesh.mesh_shape[axis]
                return _build_group_info_from_process_group(
                    "fully_shard_unsharded_group",
                    process_group,
                    combined_size,
                )
        except (
            AssertionError,
            AttributeError,
            KeyError,
            RuntimeError,
            TypeError,
            ValueError,
        ):
            # Fall back to the explicit rank-list path for mocked meshes in UT
            # or when a mesh implementation cannot materialize a reusable group.
            pass

    fallback_ranks = get_rank_list_for_axes(self._spmd_mesh, group_axes)
    return _build_group_info_from_rank_list("fully_shard_unsharded_group", fallback_ranks)

301

def _to_local_unsharded_grad(self, grad):
    """Turn a pending gradient into the local tensor used by fully_shard collectives.

    Args:
        grad: Gradient to normalize. Anything that is not a ``DTensor`` (including
            ``None``) is returned unchanged.

    Returns:
        The input itself for non-DTensor values; otherwise the local tensor of the
        gradient after reducing partial placements and, if its mesh or placements
        differ from the original parameter's, redistributing to that layout.
    """
    if not isinstance(grad, DTensor):
        return grad

    has_partial = any(placement.is_partial() for placement in grad.placements)
    if has_partial:
        grad = grad.reduce_partial()

    target_mesh = self._orig_dtensor_mesh
    target_placements = self._orig_dtensor_placements
    wrong_mesh = (
        target_mesh is not None
        and grad.device_mesh.to_hash() != target_mesh.to_hash()
    )
    wrong_placements = (
        target_placements is not None
        and tuple(grad.placements) != tuple(target_placements)
    )
    if wrong_mesh or wrong_placements:
        grad = grad.redistribute(self._orig_dtensor_mesh, self._orig_dtensor_placements)
    return grad.to_local()

321

@property
def accumulated_allreduced_grad(self) -> bool:
    """Flag telling whether the parameter has accumulated an all-reduced gradient."""
    flag = self._accumulated_allreduced_grad
    return flag

@accumulated_allreduced_grad.setter
def accumulated_allreduced_grad(self, value: bool) -> None:
    """Store a new value for the accumulated-all-reduced-gradient flag."""
    self._accumulated_allreduced_grad = value

330

def _save_backward_hooks(self, param: nn.Parameter) -> None:
    """Record the backward hooks registered on ``param`` so they can be re-registered later.

    Hooks are appended to ``self._orig_param_hooks``; a hook object that was already
    recorded (tracked by ``id``) is not added again.
    """
    registered = getattr(param, '_backward_hooks', None)
    if registered is None:
        return

    # Lazily create the set of already-recorded hook ids used for deduplication.
    if not hasattr(self, '_saved_hook_ids'):
        object.__setattr__(self, '_saved_hook_ids', set())

    for hook_func in registered.values():
        hook_key = id(hook_func)
        if hook_key in self._saved_hook_ids:
            # Same function object seen before; skip it.
            continue
        self._orig_param_hooks.append(hook_func)
        self._saved_hook_ids.add(hook_key)

346

def _migrate_backward_hooks(self, new_param: nn.Parameter) -> None:
    """Register the recorded backward hooks on ``new_param``, at most once per parameter.

    Does nothing when no hooks were recorded or when ``new_param`` already carries
    the ``migrate_backward_hooks_run_once`` marker. The marker is set afterwards.
    """
    saved_hooks = self._orig_param_hooks
    if not saved_hooks:
        return
    if hasattr(new_param, "migrate_backward_hooks_run_once"):
        return

    # Go through register_hook so each hook is installed the supported way.
    for hook_func in saved_hooks:
        try:
            if new_param.requires_grad:
                new_param.register_hook(hook_func)
        except RuntimeError:
            # Skip hook registration if the parameter does not require gradients
            continue
    new_param.migrate_backward_hooks_run_once = True

361

def reduce_scatter_output(self):
    """
    Return the reduce-scatter output, first waiting for a pending asynchronous operation.

    When ``self.reduce_scatter_handle`` is set, this waits on it, releases the storage
    held by ``self._grad`` and clears both attributes. ``self._grad`` may already be
    ``None`` at that point; in that case only the handle is cleared.

    Returns:
        torch.Tensor: The sharded gradient tensor stored in ``self._reduce_scatter_output``.
    """
    pending = self.reduce_scatter_handle
    if pending is not None:
        pending.wait()
        # NOTE(review): ``_grad`` is presumably the buffer that fed the reduce-scatter;
        # confirm against the code that assigns it. Guard against it being unset so a
        # pending handle never turns into an AttributeError on ``None``.
        if self._grad is not None:
            self._grad.untyped_storage().resize_(0)
            self._grad = None
        self.reduce_scatter_handle = None
    return self._reduce_scatter_output

375

def clear_reduce_scatter_output(self):
    """Drop the reference to the reduce-scatter output so its memory can be reclaimed."""
    setattr(self, "_reduce_scatter_output", None)

379

def all_reduce_output(self):
    """
    Return the all-reduce output, first waiting for a pending asynchronous operation.

    A set ``self.all_reduce_handle`` is waited on and then cleared.

    Returns:
        torch.Tensor: The reduced gradient tensor stored in ``self._all_reduce_output``.
    """
    pending = self.all_reduce_handle
    if pending is None:
        return self._all_reduce_output
    pending.wait()
    self.all_reduce_handle = None
    return self._all_reduce_output

391

def clear_all_reduce_output(self):
    """Drop the reference to the all-reduce output so its memory can be reclaimed."""
    setattr(self, "_all_reduce_output", None)

395

def apply_reduced_grad(self, reduced_grad, param_type):
    """
    Apply a reduced gradient to the sharded parameter.

    Reshapes ``reduced_grad`` to the local shard shape, optionally casts it and
    moves it to CPU, then assigns it to or accumulates it into
    ``self.sharded_param.grad`` (or ``self.sharded_param.main_grad`` when
    ``mp_policy.apply_grad_on_fp32_main_grad`` is set, in which case ``grad`` is
    reset to None). Finally the pending unsharded gradient is cleared.

    Note:
        Gradient scaling (``gradient_scaling_factor``) is applied earlier on
        the reduce input (see ``reduce_scatter_grad`` / ``foreach_reduce``),
        never here, so accumulation stays ``sum_i(g_i * factor)`` rather than
        scaling the already-accumulated grad again.

    Args:
        reduced_grad (torch.Tensor): Gradient after reduce-scatter
            and/or all-reduce.
        param_type (Optional[torch.dtype]): Target dtype for the gradient (if conversion
            is needed). Ignored on the ``main_grad`` path.

    Returns:
        bool: True when CPU offload is enabled (the gradient was moved to CPU),
        otherwise False.
    """
    use_main_grad = self.mp_policy.apply_grad_on_fp32_main_grad
    if not use_main_grad:
        sharded_grad = self.sharded_param.grad
    else:
        if not hasattr(self.sharded_param, "main_grad"):
            self.sharded_param.main_grad = None
        sharded_grad = self.sharded_param.main_grad

    sharded_param_local_shape = (
        self.sharded_param.local_shape
        if isinstance(self.sharded_param, DTensor)
        else self.sharded_param.shape
    )
    reduced_grad = reduced_grad.view(sharded_param_local_shape)
    if not use_main_grad and param_type is not None and reduced_grad.dtype != param_type:
        reduced_grad = reduced_grad.to(param_type)

    to_accumulate_grad = sharded_grad is not None
    need_synchronize = False
    if self.offload_to_cpu:
        # Only a first-time assignment into pinned memory may be copied asynchronously.
        non_blocking = self.pin_memory and not to_accumulate_grad
        reduced_grad = reduced_grad.to(
            torch.device("cpu"), non_blocking=non_blocking
        )
        need_synchronize = True

    if not to_accumulate_grad:
        if not use_main_grad:
            self.sharded_param.grad = self.to_sharded_dtensor(reduced_grad)
        else:
            self.sharded_param.main_grad = self.to_sharded_dtensor(reduced_grad)
            self.sharded_param.grad = None
    else:
        if not use_main_grad:
            self.sharded_param.grad._local_tensor += reduced_grad
        else:
            self.sharded_param.main_grad._local_tensor += reduced_grad
            self.sharded_param.grad = None

    # Test the raw attribute: going through ``unsharded_accumulated_grad_data`` would
    # convert a DTensor gradient to a local tensor (possibly reducing/redistributing
    # it) just to compare the result against None.
    if self.unsharded_accumulated_grad is not None:
        self.unsharded_accumulated_grad = None
    elif self.unsharded_param.grad is not None:
        self.unsharded_param.grad = None
    return need_synchronize

456

@torch.no_grad()
def _init_sharded_param(
    self,
    param: nn.Parameter,
    shard_placement_fn: Optional[Callable],
) -> None:
    """Cut this rank's shard out of ``param`` and install it on the owning module(s).

    Resolves the shard placement, derives the SPMD placements, chunks the local
    data along the shard dimension, optionally moves the shard to (pinned) CPU
    memory, wraps it in a DTensor-backed ``nn.Parameter`` and sets the state to
    ``ShardedState.SHARDED``.

    Args:
        param: The original parameter; must be on ``self.device`` or the meta device.
        shard_placement_fn: Optional callable returning the ``Shard`` placement for
            ``param``; ``None`` (or a ``None`` result) means ``Shard(0)``.

    Raises:
        AssertionError: If ``param`` is on an unexpected device, or if the resolved
            placement is not a ``Shard``.
        NotImplementedError: If the shard dimension is not evenly divisible by the
            shard world size.
    """
    if param.device != self.device and param.device.type != "meta":
        raise AssertionError(
            f"Expects the parameter to already be moved to device {self.device} but got {param.device}"
        )

    hsdp_placement = shard_placement_fn(param) if shard_placement_fn else None
    if hsdp_placement is None:
        hsdp_placement = Shard(0)

    # Validate the type before reading ``.dim``: otherwise a non-Shard result either
    # fails with an unrelated AttributeError or, if it happens to expose a negative
    # ``dim``, is silently rebuilt as a Shard and slips past this check.
    if not isinstance(hsdp_placement, Shard):
        raise AssertionError(
            f"Expected Shard, got {type(hsdp_placement)}: {hsdp_placement}"
        )
    if hsdp_placement.dim < 0:
        # if dim is negative, add the number of dimensions of the parameter
        hsdp_placement = Shard(hsdp_placement.dim + param.ndim)

    self.hsdp_placement = hsdp_placement
    base_placements = list(self._get_base_spmd_placements())
    self._spmd_placements = self._apply_data_parallel_placements(base_placements, hsdp_placement)
    param_data = param.to_local() if self._orig_param_is_dtensor else param

    shard_dim = hsdp_placement.dim
    self._orig_size = param_data.size()
    self._contiguous_orig_stride = make_contiguous_strides_for(self._orig_size)

    if self.uses_param_shard and isinstance(self.mesh_info, FSDPMeshInfo):
        shard_rank = self.mesh_info.shard_mesh_rank
        shard_world_size = self.mesh_info.shard_mesh_size
    else:
        # Without physical sharding the whole tensor is treated as a single chunk.
        shard_rank = 0
        shard_world_size = 1

    if isinstance(param_data, DTensor) and isinstance(self.mesh_info, DDPMeshInfo):
        param_data.data = param_data.full_tensor()

    self.is_sharded = bool(self.uses_param_shard and shard_world_size > 1)

    if param_data.size(shard_dim) % shard_world_size != 0:
        raise NotImplementedError(
            f"Uneven sharding on dim {shard_dim} not supported: "
            f"shape={param_data.shape}, world_size={shard_world_size}"
        )
    chunks = torch.chunk(param_data, shard_world_size, dim=shard_dim)
    sharded_param = chunks[shard_rank].clone().contiguous()
    self.sharded_size = sharded_param.size()
    self.contiguous_sharded_stride = make_contiguous_strides_for(self.sharded_size)
    if self.offload_to_cpu and not sharded_param.is_meta:
        sharded_param = sharded_param.cpu()
        if self.pin_memory:
            sharded_param = sharded_param.pin_memory()
    # Flat view of the local shard; used as the all-gather input.
    self._sharded_param_data = sharded_param.view(-1)

    self._sharding_spec = Layout.from_device_mesh(self._spmd_mesh)
    self._sharding_spec.set_placements(self._spmd_placements)
    self._sharding_spec.placement_to_tensor_map(param.ndim)

    self.sharded_param = nn.Parameter(DTensor.from_local(sharded_param, self._spmd_mesh, self._spmd_placements))
    self.sharded_param.requires_grad_(param.requires_grad)
    self._setattr_on_modules(self.sharded_param)
    # after init, self.sharded_param replaces original param, gradients must accumulate to this Parameter's grad
    self.sharded_param._hsdp_param_initialized = True
    self.sharded_state = ShardedState.SHARDED
    self.param_dtype = None

527

def init_dtype_attrs(self, mp_policy: MixedPrecisionPolicy):
    """Derive ``orig_dtype``, ``param_dtype`` and ``reduce_dtype`` from ``mp_policy``.

    ``reduce_dtype`` is stored as None when the policy's reduce and param dtypes
    are equal; ``param_dtype`` is stored as None when it equals the sharded
    parameter's own dtype.
    """
    policy_param_dtype = mp_policy.param_dtype
    policy_reduce_dtype = mp_policy.reduce_dtype
    current_dtype = self.sharded_param.dtype
    self.orig_dtype = current_dtype
    self.param_dtype = None if policy_param_dtype == current_dtype else policy_param_dtype
    self.reduce_dtype = None if policy_reduce_dtype == policy_param_dtype else policy_reduce_dtype

538

def init_all_gather_outputs(
    self,
    all_gather_input_numels: list[int],
    all_gather_input_dtypes: list[torch.dtype],
    world_size: int,
    device: torch.device,
    force_recreate: bool = False,
):
    """
    Allocate the flat output buffers that receive all-gather results.

    One buffer of ``numel * world_size`` elements is created per input. Existing
    buffers are kept unless ``force_recreate`` is set.

    Args:
        all_gather_input_numels: Number of elements per input shard.
        all_gather_input_dtypes: Dtype of each input shard.
        world_size: Number of ranks in the shard process group.
        device: Device on which to allocate the output buffers.
        force_recreate: If True, always recreate buffers even if already initialized.
    """
    if self.all_gather_outputs and not force_recreate:
        return  # already initialized
    buffers = []
    for numel, dtype in zip(all_gather_input_numels, all_gather_input_dtypes):
        gathered_size = torch.Size([numel * world_size])
        buffers.append(torch.empty(gathered_size, dtype=dtype, device=device))
    self.all_gather_outputs = buffers

563

def init_unsharded_param(self):
    """
    Create or refresh the unsharded parameter from the all-gather output.

    The first call wraps the unpacked tensor in a new ``nn.Parameter``. Later calls
    keep that Parameter object and only swap its ``.data``, re-apply
    ``requires_grad`` from the sharded parameter and clear ``.grad``.
    """
    fresh_data = self._get_unsharded_param_from_all_gather_output()
    if not hasattr(self, "_unsharded_param"):
        self._unsharded_param = nn.Parameter(
            fresh_data,
            requires_grad=self.sharded_param.requires_grad,
        )
        return
    # Always refresh the unsharded Parameter from the latest all-gather output.
    # Non-dim0 unpack currently materializes a contiguous tensor copy, so
    # keeping stale .data would otherwise reuse old weights after optimizer.step()
    # mutates only the sharded local shard. Preserve the Parameter object identity
    # so autograd-facing module state stays stable across unshard cycles.
    # pylint: disable=access-member-before-definition
    existing = self._unsharded_param
    existing.data = fresh_data
    existing.requires_grad_(self.sharded_param.requires_grad)
    existing.grad = None

587

def _get_unsharded_param_from_all_gather_output(self) -> torch.Tensor:
    """Unpack the single all-gather output buffer into the full local parameter.

    Returns:
        The unpacked tensor, wrapped as a DTensor with the original mesh and
        placements when the source parameter was a DTensor.

    Raises:
        AssertionError: If there is not exactly one all-gather output buffer.
    """
    if len(self.all_gather_outputs) != 1:
        raise AssertionError(
            f"Expected 1 all_gather_output, got {len(self.all_gather_outputs)}"
        )
    (gathered_buffer,) = self.all_gather_outputs
    # An unsharded parameter is unpacked as if gathered from a single rank.
    gather_ways = self.shard_world_size if self.is_sharded else 1
    plan = build_rs_plan(self, self._sharded_local_tensor, gather_ways)
    full_local = unpack_from_all_gather(gathered_buffer, plan)
    if not self._orig_param_is_dtensor:
        return full_local
    # Rebuild the original DTensor view after all-gather so gradient
    # consumers keep seeing the source DTensor layout.
    return DTensor.from_local(
        full_local,
        self._orig_dtensor_mesh,
        self._orig_dtensor_placements,
    )

610

def to_sharded(self) -> None:
    """Switch the module(s) back to the sharded parameter and free the gathered buffers.

    For parameters that are not physically sharded, the current unsharded values
    are first copied into the sharded parameter's local tensor. The copy is skipped
    when no unsharded parameter has been created yet.
    """
    # ``_unsharded_param`` only exists after ``init_unsharded_param`` has run, so
    # look it up defensively instead of assuming the attribute is present.
    unsharded = getattr(self, "_unsharded_param", None)
    if not self.uses_param_shard and unsharded is not None:
        # Replicate params keep the same local shape across shard/unshard,
        # so persist forward-time state updates before switching objects.
        src = unsharded.to_local() if isinstance(unsharded, DTensor) else unsharded
        dst = self.sharded_param.to_local() if isinstance(self.sharded_param, DTensor) else self.sharded_param
        _copy_without_bumping_version(dst, src)
    self._setattr_on_modules(self.sharded_param)
    self.free_unsharded_param()
    self.sharded_state = ShardedState.SHARDED

622

def to_unsharded(self) -> None:
    """Install the unsharded parameter on the module(s) and mark the state UNSHARDED."""
    sharded = self.sharded_param
    unsharded = self._unsharded_param
    set_requires_grad_if_needed(sharded, unsharded)
    self._setattr_on_modules(unsharded)
    self.sharded_state = ShardedState.UNSHARDED

627

def _setattr_on_modules(self, param: nn.Parameter) -> None:
    """Point the owning module and every sharing module at ``param``.

    Modules whose ``__setattr__`` is the stock ``nn.Module.__setattr__`` are updated
    by writing into ``_parameters`` directly; others go through ``setattr``. Between
    the owner and the shared modules, backward hooks of the sharded parameter are
    recorded and migrated onto ``param``.
    """
    info = self._module_info
    owner = info.module
    if getattr(owner.__setattr__, "__func__", None) is not nn.Module.__setattr__:
        # slow path: the module customizes attribute assignment
        setattr(owner, info.param_name, param)
    else:
        # fast path
        owner._parameters[info.param_name] = param
    self._save_backward_hooks(self.sharded_param)
    self._migrate_backward_hooks(param)
    # Iterate through all modules that share this parameter to prevent pointer desync.
    shared_pairs = zip(info.shared_modules, info.shared_param_names)
    for shared_module, shared_param_name in shared_pairs:
        if getattr(shared_module.__setattr__, "__func__", None) is not nn.Module.__setattr__:
            setattr(shared_module, shared_param_name, param)
        else:
            shared_module._parameters[shared_param_name] = param

646

def to_sharded_dtensor(self, tensor: torch.Tensor) -> DTensor:
    """
    Wrap a local tensor (the sharded parameter or its gradient) as a DTensor
    using the mesh and placements of ``self._sharding_spec``.
    """
    spec = self._sharding_spec
    return DTensor.from_local(tensor, spec.mesh, spec.placements)

657

def to_accumulated_grad_if_needed(self) -> None:
    """Move a pending ``_unsharded_param.grad`` into ``unsharded_accumulated_grad``.

    The gradient is detached from the parameter, cast to ``reduce_dtype`` when one
    is set and differs, then stored or added to the existing accumulation.
    """
    pending = self._unsharded_param.grad
    if pending is None:
        return
    # Keep local gradients alive across no-sync / delayed-sync steps even
    # after the parameter transitions back to the sharded view.
    self._unsharded_param.grad = None
    target_dtype = self.reduce_dtype
    if target_dtype is not None and pending.dtype != target_dtype:
        pending = pending.to(target_dtype)
    if self.unsharded_accumulated_grad is not None:
        self.unsharded_accumulated_grad += pending
    else:
        self.unsharded_accumulated_grad = pending

671

def accumulate_unsharded_grad_if_needed(self) -> None:
    """Fold ``unsharded_param.grad`` into an existing ``unsharded_accumulated_grad``.

    Does nothing unless both an accumulation and a new gradient are present. The
    new gradient is cast to ``reduce_dtype`` when needed and then cleared from the
    parameter.
    """
    if self.unsharded_accumulated_grad is None:
        return
    new_grad = self.unsharded_param.grad
    if new_grad is None:
        return
    target_dtype = self.reduce_dtype
    if target_dtype is not None and new_grad.dtype != target_dtype:
        new_grad = new_grad.to(target_dtype)
    self.unsharded_accumulated_grad += new_grad
    self.unsharded_param.grad = None

682

683 def alloc_all_gather_outputs(self) -> None:

684 """Resize all-gather output buffers to their full capacity for communication."""

685 for tensor in self.all_gather_outputs:

686 expected_size = tensor.numel() * tensor.itemsize

687 storage = tensor.untyped_storage()

688 if storage.size() != expected_size:

689 storage.resize_(expected_size)

690

def free_unsharded_param(self) -> None:
    """Shrink the storage of every all-gather output to zero bytes to release memory."""
    for buffer in self.all_gather_outputs:
        backing = buffer.untyped_storage()
        if backing.size() == 0:
            # Nothing left to release.
            continue
        backing.resize_(0)

697

@property
def all_gather_inputs(self) -> list[torch.Tensor]:
    """Return the flat local shard, as a one-element list, to feed into all-gather.

    Requires the SHARDED state. With CPU offload the shard is first copied to
    ``self.device`` (non-blocking); it is cast when ``param_dtype`` is set and differs.
    """
    self._assert_in_states(ShardedState.SHARDED)
    flat_shard = self._sharded_param_data
    if self.offload_to_cpu:
        flat_shard = flat_shard.to(self.device, non_blocking=True)
    cast_dtype = self.param_dtype
    if cast_dtype is None or cast_dtype == flat_shard.dtype:
        return [flat_shard]
    return [flat_shard.to(cast_dtype)]

710

@property
def unsharded_param(self) -> nn.Parameter:
    """The unsharded parameter object held in ``self._unsharded_param``."""
    full_param = self._unsharded_param
    return full_param

715

@property
def unsharded_grad_data(self) -> torch.Tensor:
    """
    Local-tensor form of ``unsharded_param.grad``.

    Raises:
        AssertionError: If the unsharded parameter has no gradient.
    """
    pending = self.unsharded_param.grad
    if pending is not None:
        return self._to_local_unsharded_grad(pending)
    raise AssertionError("Expects unsharded_param.grad to not be None")

725

@property
def unsharded_accumulated_grad_data(self) -> torch.Tensor:
    """
    Local-tensor form of ``unsharded_accumulated_grad``.

    The value is passed through ``_to_local_unsharded_grad`` without a None check.
    """
    return self._to_local_unsharded_grad(self.unsharded_accumulated_grad)

733

@property
def _sharded_local_tensor(self) -> torch.Tensor:
    """The ``_local_tensor`` of ``self.sharded_param``, which is treated as a DTensor."""
    as_dtensor = cast(DTensor, self.sharded_param)
    return as_dtensor._local_tensor

738

@property
def shard_world_size(self) -> int:
    """World size of the shard dimension (``self.shard_size``)."""
    size = self.shard_size
    return size

743

@property
def replicate_world_size(self) -> int:
    """World size of the replicate dimension, HSDP only (``self.dp_size``)."""
    size = self.dp_size
    return size

748

def _assert_in_states(self, *states: ShardedState) -> None:
    """Raise ``AssertionError`` unless ``self.sharded_state`` is one of ``states``."""
    if self.sharded_state in states:
        return
    raise AssertionError(
        f"Expected sharded_state in {states}, got {self.sharded_state}"
    )

755

def _resolve_reset_param(self):
    """Fetch the parameter currently installed on the module for ``reset_sharded_param``.

    When the module holds a different object than ``self.sharded_param`` and that
    object is a DTensor, ``self.sharded_param`` is re-pointed at it and its
    ``_hsdp_param_initialized`` flag is set if missing or falsy.

    Returns:
        The parameter currently found on the module, for the caller to re-shard.

    Raises:
        AssertionError: If the object changed although swap-on-conversion is
            enabled (which is expected to preserve identity).
    """
    info = self._module_info
    current = getattr(info.module, info.param_name)
    if current is not self.sharded_param:
        # Ensure object identity is preserved after parameter conversion.
        if torch.__future__.get_swap_module_params_on_conversion():
            raise AssertionError(
                f"Expects swap_tensors to preserve object but got {current} "
                f"instead of {self.sharded_param}"
            )
        if isinstance(current, DTensor):
            self.sharded_param = current
            flag_set = getattr(self.sharded_param, "_hsdp_param_initialized", None)
            if not flag_set:
                # reset _hsdp_param_initialized flag.
                self.sharded_param._hsdp_param_initialized = True
        # If the module holds a plain Tensor, keep the existing 'self.sharded_param' ref;
        # only its _local_tensor / _sharded_param_data are refreshed by the caller.
    return current

780

def reset_sharded_param(self) -> None:
    """Re-sync the cached shard state with the module's current parameter.

    Meant to run after ``load_state_dict``. Returns immediately when the
    local tensor is a meta tensor. Otherwise the local shard's size is
    validated, the shard is moved to pinned CPU memory when ``self.pin_memory``
    asks for it, ``_sharded_param_data`` and ``sharded_param._local_tensor``
    are refreshed when the backing storage changed, and
    ``self._setattr_on_modules`` is called with ``self.sharded_param``.

    Raises:
        AssertionError: If the local shard's size differs from
            ``self.sharded_size``, if ``self.sharded_param`` is not a
            ``DTensor``, or if the refreshed local tensor is not contiguous.
    """
    new_param = self._resolve_reset_param()
    if isinstance(new_param, DTensor):
        local_tensor = new_param._local_tensor
    else:
        local_tensor = new_param
    if local_tensor.is_meta:
        return

    # This method may run twice: once from fully_shard(model) and once from
    # lazy_init on the first model(input). The second run should be a no-op
    # when the parameters are unchanged, but must take effect when
    # model.load_state_dict(...) was called before lazy_init. That lets a
    # trainer grab `sd = model.state_dict()` before the training loop and
    # reuse `sd` without calling .state_dict() every iteration.
    cached_data = self._sharded_param_data
    same_storage = False
    if isinstance(cached_data, torch.Tensor):
        # When a (1, ...) param is sharded over 2 ranks, rank 1's local
        # tensor can have size 0 and a data_ptr() of 0, so a zero pointer
        # never counts as "same storage".
        cached_ptr = cached_data.untyped_storage().data_ptr()
        same_storage = (
            cached_ptr > 0
            and cached_ptr == local_tensor.untyped_storage().data_ptr()
        )

    sharded_size = self.sharded_size
    shard_dim = self.hsdp_placement.dim
    length = 0
    if local_tensor.numel() > 0:
        length = local_tensor.size(shard_dim)

    rebind_local = False
    if not same_storage:
        if local_tensor.size() != sharded_size:
            raise AssertionError(
                f"Expected sharded_size to be {sharded_size}, got {local_tensor.size()}"
            )
        rebind_local = True
    if self.pin_memory and not local_tensor.is_pinned():
        local_tensor = local_tensor.cpu().pin_memory()
        rebind_local = True
    if not same_storage:
        self._sharded_param_data = local_tensor.view(-1)

    if not isinstance(self.sharded_param, DTensor):
        raise AssertionError(f"Expected DTensor, got {type(self.sharded_param)}")

    if rebind_local:
        # Swap the local tensor object only when required. Detaching yields
        # an off-graph leaf that shares storage, so later grad-mode in-place
        # edits (weight load / upcast) do not trip the is_leaf check.
        local_view = local_tensor.narrow(dim=shard_dim, start=0, length=length).detach()
        set_requires_grad_if_needed(self.sharded_param, local_view)
        self.sharded_param._local_tensor = local_view
        if not self.sharded_param._local_tensor.is_contiguous():
            raise AssertionError(
                "Expected sharded_param._local_tensor to be contiguous"
            )

    self._sharding_spec = cast(DTensor, self.sharded_param).layout
    # ``to_empty`` can replace the module parameter with a plain tensor, so
    # put the DTensor ``nn.Parameter`` back for the optimizer and forward
    # hooks to see the right object. Idempotent when the module already
    # holds ``self.sharded_param`` (same data_ptr -> no-op in practice).
    self._setattr_on_modules(self.sharded_param)

837

def _get_unsharded_param_data(self, async_op: bool = False) -> Tuple[torch.Tensor, Optional[dist.Work]]:
    """
    Produce the unsharded parameter data, all-gathering when necessary.

    Args:
        async_op: Whether to execute the all-gather asynchronously.

    Returns:
        (unsharded_param, handle): The all-gather output buffer and the
        communication handle. The handle is ``None`` whenever the data was
        merely copied instead of gathered.
    """
    is_sharded = self.is_sharded
    gather_input = self.all_gather_inputs[0]

    # A parameter that is not sharded (below threshold) only needs room for
    # one rank's worth of data; otherwise size the buffer for the shard group.
    self.init_all_gather_outputs(
        all_gather_input_numels=[gather_input.numel()],
        all_gather_input_dtypes=[gather_input.dtype],
        world_size=self.shard_world_size if is_sharded else 1,
        device=self.device,
    )
    self.alloc_all_gather_outputs()
    gather_output = self.all_gather_outputs[0]

    # No communication needed: unsharded parameter, no process group, or a
    # shard group of a single rank. Just copy the input into the buffer.
    if (
        not is_sharded
        or self.sharded_group_info.group is None
        or self.shard_world_size <= 1
    ):
        _copy_without_bumping_version(gather_output, gather_input)
        return gather_output, None

    work = dist.all_gather_into_tensor(
        gather_output,
        gather_input,
        group=self.sharded_group_info.group,
        async_op=async_op,
    )
    return gather_output, work

887

def unshard(self, async_op: bool = False) -> None:
    """Start gathering the unsharded parameter data unless already pending.

    Args:
        async_op: Whether to execute the gather asynchronously.
    """
    # A non-None handle means HSDPState.prefetch() already triggered the
    # gather, so this call is a no-op.
    if self.prefetch_handle is None:
        _, self.prefetch_handle = self._get_unsharded_param_data(async_op=async_op)

895

def wait_for_unshard(self) -> None:
    """Finish a pending unshard and switch the parameter to the unsharded state.

    Waits on ``self.prefetch_handle`` when one is set and clears it, then
    calls ``init_unsharded_param`` followed by ``to_unsharded``.

    Raises:
        AssertionError: If the parameter is not in the ``SHARDED`` state.
    """
    self._assert_in_states(ShardedState.SHARDED)

    pending = self.prefetch_handle
    if pending is not None:
        pending.wait()
        self.prefetch_handle = None

    self.init_unsharded_param()
    self.to_unsharded()

905

def shard(self) -> None:
    """
    Move the parameter from the unsharded state back to the sharded state.

    Raises:
        AssertionError: If the parameter is not in the ``UNSHARDED`` state.
    """
    required_state = ShardedState.UNSHARDED
    self._assert_in_states(required_state)
    self.to_sharded()

912

def reduce_scatter_grad(
    self,
    async_op: bool = True,
    dtype: Optional[torch.dtype] = None,
    reduce_op: Optional[dist.ReduceOp] = dist.ReduceOp.AVG,
    output_buffer: Optional[torch.Tensor] = None,
) -> Union[None, Tuple[torch.Tensor, Optional[dist.Work]]]:
    """
    Perform reduce-scatter on gradient to reduce and shard the full gradient.

    The accumulated unsharded gradient is used when one is present, otherwise
    the regular unsharded gradient. The gradient is cast to the reduce dtype,
    packed, flattened and scaled by ``self.gradient_scaling_factor`` before
    any communication.

    Args:
        async_op: Whether to execute asynchronously.
        dtype: Reduce dtype. Defaults to the gradient's own dtype.
        reduce_op: Reduction passed to ``dist.reduce_scatter_tensor``
            (e.g. avg or sum).
        output_buffer: Optional pre-allocated output buffer for fused all-reduce.
            When provided, reduce_scatter writes directly into this buffer,
            enabling zero-copy fusion with subsequent all_reduce operations.
            The buffer must have the correct size (sharded_size.numel()) and dtype.

    Returns:
        (sharded_grad, handle): Sharded gradient and communication handle.
        ``sharded_grad`` is always ``self._reduce_scatter_output``: the
        supplied ``output_buffer`` when given, otherwise a tensor owned by
        this call. ``handle`` is ``None`` when no communication was needed.

    Raises:
        AssertionError: If the parameter is not in the ``UNSHARDED`` state.
        ValueError: If communication is needed and ``output_buffer`` has the
            wrong number of elements or the wrong dtype.
    """
    self._assert_in_states(ShardedState.UNSHARDED)

    # Prefer the accumulated gradient whenever one is present.
    if self.unsharded_accumulated_grad is not None:
        grad = self.unsharded_accumulated_grad_data
    else:
        grad = self.unsharded_grad_data
    reduce_dtype = dtype or grad.dtype
    self._grad = grad.to(reduce_dtype)

    # Communication only happens for a sharded parameter (above threshold)
    # that has a process group spanning more than one rank.
    needs_comm = (
        self.is_sharded
        and self.sharded_group_info.group is not None
        and self.shard_world_size > 1
    )
    plan_world_size = self.shard_world_size if needs_comm else 1
    plan = build_rs_plan(self, self._grad, plan_world_size)
    grad_flat = pack_for_reduce_scatter(self._grad, plan).reshape(-1)
    # apply gradient_scaling_factor (reduce-scatter leg)
    apply_gradient_scaling_factor(grad_flat, self.gradient_scaling_factor)

    if not needs_comm:
        if output_buffer is not None:
            output_buffer.copy_(grad_flat)
            self._reduce_scatter_output = output_buffer
        else:
            self._reduce_scatter_output = grad_flat
        self.reduce_scatter_handle = None
        # Return the stored output (the caller's buffer when one was given),
        # matching the communication path below, so that an in-place
        # all-reduce chained on the return value lands in the fused buffer.
        return self._reduce_scatter_output, None

    # Each rank receives an equal slice of the packed gradient.
    output_numel = grad_flat.numel() // self.shard_world_size
    if output_buffer is not None:
        if output_buffer.numel() != output_numel:
            raise ValueError(
                f"output_buffer size mismatch: expected {output_numel}, got {output_buffer.numel()}"
            )
        if output_buffer.dtype != reduce_dtype:
            raise ValueError(
                f"output_buffer dtype mismatch: expected {reduce_dtype}, got {output_buffer.dtype}"
            )
        self._reduce_scatter_output = output_buffer
    else:
        self._reduce_scatter_output = torch.empty(output_numel, dtype=reduce_dtype, device=self._grad.device)

    self.reduce_scatter_handle = dist.reduce_scatter_tensor(
        self._reduce_scatter_output,
        grad_flat,
        op=reduce_op,
        group=self.sharded_group_info.group,
        async_op=async_op,
    )
    return self._reduce_scatter_output, self.reduce_scatter_handle

999

def all_reduce_grad(
    self,
    grad: Optional[torch.Tensor] = None,
    dtype: Optional[torch.dtype] = None,
    async_op: bool = True,
    reduce_op: Optional[dist.ReduceOp] = dist.ReduceOp.AVG,
) -> Union[None, Tuple[torch.Tensor, Optional[dist.Work]]]:
    """
    All-reduce a gradient across the replicate dimension (HSDP mode).

    Args:
        grad: Gradient tensor to reduce. ``None`` selects the pure all-reduce
            path (no preceding reduce-scatter): the unsharded gradient is
            fetched here and ``gradient_scaling_factor`` is applied in this
            leg. A tensor passed in is taken to be the already-scaled output
            of ``reduce_scatter_grad`` (chained HSDP all-reduce) and is not
            scaled again. Whether the gradient is fetched here is therefore
            the signal for which leg owns the scaling -- no extra flag needed.
        dtype: Optional reduce dtype. When given and different from the
            gradient's dtype, the gradient is cast before the reduction.
        async_op: Whether to execute asynchronously.
        reduce_op: Reduction passed to ``dist.all_reduce``; defaults to
            ``dist.ReduceOp.AVG``.

    Returns:
        (reduced_grad, handle): Reduced gradient and communication handle.
        The handle is ``None`` when there is no replicate group or the
        replicate world size is at most 1, in which case no communication
        takes place.
    """
    # Fetching the gradient here means it never went through
    # reduce_scatter_grad, so this leg is responsible for the scaling.
    owns_scaling = grad is None
    if owns_scaling:
        grad = (
            self.unsharded_accumulated_grad_data
            if self.unsharded_accumulated_grad is not None
            else self.unsharded_grad_data
        )

    if dtype is not None and dtype != grad.dtype:
        grad = grad.to(dtype)

    if owns_scaling:
        # The all-reduce below works in place on grad, so in-place scaling
        # keeps the semantics: reduce(g_i * factor) == factor * reduce(g_i).
        apply_gradient_scaling_factor(grad, self.gradient_scaling_factor)

    replicate_group = self.unsharded_group_info.group
    if replicate_group is None or self.replicate_world_size <= 1:
        return grad, None

    self.all_reduce_handle = dist.all_reduce(
        grad,
        op=reduce_op,
        group=self.unsharded_group_info.group,
        async_op=async_op,
    )
    self._all_reduce_output = grad
    return grad, self.all_reduce_handle

1048

1049

def set_requires_grad_if_needed(
    src_tensor: torch.Tensor, dst_tensor: torch.Tensor
) -> None:
    """Make ``dst_tensor.requires_grad`` match ``src_tensor.requires_grad``.

    ``requires_grad_`` is only invoked on ``dst_tensor`` when the two flags
    actually differ.
    """
    if dst_tensor.requires_grad == src_tensor.requires_grad:
        return
    dst_tensor.requires_grad_(src_tensor.requires_grad)

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/torch/fully_shard/param.py: 76%

512 statements