Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/torch/fully

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""Torch HSDP cell state"""

16# pylint: disable=protected-access

18from typing import Optional, List

19from collections import defaultdict

20import torch

22from hyper_parallel.tools.logging import get_logger

23from hyper_parallel.core.fully_shard.hsdp_state import HSDPState

24from hyper_parallel.core.fully_shard.hsdp_utils import (

25 FullyShardParamMode,

26 _get_param_module_infos,

27 infer_fully_shard_param_mode,

28 apply_gradient_scaling_factor,

29)

30from hyper_parallel.core.fully_shard.utils import HSDPMeshInfo

31from hyper_parallel.core.fully_shard.utils import CPUOffloadPolicy

32from hyper_parallel.platform.torch.fully_shard.param import TorchHSDPParamV2

33from hyper_parallel.platform.torch.fully_shard.pack_utils import build_rs_plan

34from hyper_parallel.platform.torch.fully_shard.param_group import get_comm_ctx, HSDPParamGroup, AllReduceParamGroup

36logger = get_logger("FSDP")

39def _to_dtype_if_needed(

40 tensor: torch.Tensor, dtype: Optional[torch.dtype]

41) -> torch.Tensor:

42 """Cast tensor to the given dtype if it differs from current dtype.

44 Args:

45 tensor: The input tensor to potentially cast.

46 dtype: Target dtype. If None or same as tensor dtype, no-op.

47 """

48 if dtype is not None and tensor.dtype != dtype:

49 return tensor.to(dtype)

50 return tensor

class TorchHSDPStateV2(HSDPState):
    """Torch HSDP cell state.

    Holds the per-module bookkeeping used after backward to reduce gradients
    (reduce-scatter and/or all-reduce) and to reshard parameters.

    The three queues declared below are *class* attributes and are mutated
    through the class name, so every ``TorchHSDPStateV2`` instance in the
    process shares them.
    """

    # DTensor compat parameters in pure-TP mode can accumulate gradients
    # directly on ``sharded_param.grad`` without ever materializing an
    # ``_unsharded_param``. Track their async all-reduce work separately from
    # the standard unsharded-grad queues.
    # Each entry is ``(handle_or_None, reduced_grad, target_grad)``.
    pre_direct_all_reduce_grads: List[tuple] = []

    # AllReduceParamGroups whose reduce_scatter has been issued; they are
    # consumed by the next ``_wait_prev_reduce_scatter`` call.
    pre_all_reduce_groups: List[AllReduceParamGroup] = []

    # AllReduceParamGroups whose all_reduce has been issued; they are consumed
    # by ``delay_apply_reduce_grads``.
    pending_all_reduce_groups: List[AllReduceParamGroup] = []

    # ------------------------------------------------------------------
    # Small stateless helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _has_unsharded_param(hsdp_param) -> bool:
        """Whether ``hsdp_param`` has a materialized (non-None) unsharded parameter."""
        return (
            hasattr(hsdp_param, "_unsharded_param")
            and hsdp_param.unsharded_param is not None
        )

    @staticmethod
    def _synchronize_current_stream(device):
        """Synchronize the current stream of ``device`` (``npu`` or ``cuda``).

        Raises:
            NotImplementedError: If ``device.type`` is neither ``npu`` nor ``cuda``.
        """
        if device.type == "npu":
            torch.npu.current_stream().synchronize()
        elif device.type == "cuda":
            torch.cuda.current_stream().synchronize()
        else:
            raise NotImplementedError(
                f"Unsupported device type {device.type} for synchronization after CPU offload."
            )

    @staticmethod
    def _get_pending_unsharded_grad(hsdp_param):
        """Return the pending unsharded gradient tensor for all-reduce-based paths.

        The accumulated gradient data is preferred when an accumulated gradient
        exists; otherwise the plain unsharded gradient data is returned.
        """
        if hsdp_param.unsharded_accumulated_grad is not None:
            return hsdp_param.unsharded_accumulated_grad_data
        return hsdp_param.unsharded_grad_data

    @staticmethod
    def _has_pending_unsharded_grad(hsdp_param):
        """Whether the parameter currently has a gradient waiting for reduction."""
        if hsdp_param.unsharded_accumulated_grad is not None:
            return True
        if not TorchHSDPStateV2._has_unsharded_param(hsdp_param):
            return False
        return hsdp_param.unsharded_param.grad is not None

    @staticmethod
    def _get_local_sharded_grad(hsdp_param):
        """Return the local gradient tensor currently stored on ``sharded_param``.

        Returns:
            ``None`` when there is no gradient; ``grad.to_local()`` when the
            gradient exposes a callable ``to_local``; otherwise the gradient itself.
        """
        grad = hsdp_param.sharded_param.grad
        if grad is None:
            return None
        to_local = getattr(grad, "to_local", None)
        if callable(to_local):
            return to_local()
        return grad

    # ------------------------------------------------------------------
    # Construction / initialization
    # ------------------------------------------------------------------
    def __init__(self, cell, mesh_info, config, platform, device):
        """
        Initialize TorchHSDPStateV2.

        Args:
            cell (nn.Module): The module whose parameters are managed by this state.
            mesh_info: Mesh topology for shard/replicate dimensions.
            config (HSDPConfigV2): HSDP configuration.
            platform (TorchPlatform): Torch platform abstraction.
            device (torch.device): Target device.
        """
        super().__init__(cell, mesh_info, config, platform, device)
        self.comm_fusion = config.comm_fusion
        self.device = device
        self.mp_policy = config.mp_policy
        self.offload_policy = config.offload_policy
        # Do ReduceScatter/AllReduce for grad
        self.reduce_grads = True
        # Reshard parameter after backward
        self.reshard_after_backward = True
        # Requires AllReduce for grad when HSDP
        self.requires_all_reduce = True
        # Default reduce op is decided at the fully_shard-state level:
        # if any managed parameter is DTensor-backed, use SUM; otherwise AVG.
        self._user_reduce_op_type = None
        self.reduce_op_type = self._resolve_default_reduce_op()
        self._reset_sharded_params = False
        self._init_param_group()

    @staticmethod
    def _comm_fusion_unsupported_reason(hsdp_param) -> Optional[str]:
        """Return the reason why ``hsdp_param`` cannot participate in comm_fusion.

        Returns:
            A human-readable reason string, or ``None`` when the parameter is
            supported (i.e. a pack plan could be built for it).
        """
        if not hsdp_param.enable_fsdp_shard:
            return "non-sharded parameters such as replicate_params are not supported"
        if hsdp_param.param_mode not in (
            FullyShardParamMode.LOCAL_PARAM,
            FullyShardParamMode.DTENSOR_UNIFIED,
        ):
            return (
                "param_mode "
                f"{hsdp_param.param_mode} is not supported"
            )
        local_shard = getattr(hsdp_param, "_sharded_local_tensor", None)
        if local_shard is None:
            return "missing local shard tensor for comm_fusion plan validation"
        # Prefer ``shard_world_size``; fall back to ``shard_size`` (default 1).
        plan_world_size = getattr(hsdp_param, "shard_world_size", None)
        if plan_world_size is None:
            plan_world_size = getattr(hsdp_param, "shard_size", 1)
        # Building the plan is used purely as a validation step; the result is discarded.
        try:
            build_rs_plan(hsdp_param, local_shard, plan_world_size)
        except NotImplementedError as exc:
            return str(exc)
        except (AssertionError, ValueError) as exc:
            return f"cannot build comm_fusion pack plan: {exc}"
        return None

    def _init_param_group(self):
        """Initialize fused parameter group for communication fusion.

        When ``comm_fusion`` is enabled, validates every managed parameter and
        creates an ``HSDPParamGroup`` over ``self.hsdp_params`` (or leaves
        ``self.param_group`` as ``None`` when there are no such parameters).

        Raises:
            NotImplementedError: If any parameter cannot take part in comm_fusion.
        """
        # NOTE(review): the extracted source lost its indentation; the whole
        # body is assumed to be guarded by ``comm_fusion`` -- confirm.
        if not self.config.comm_fusion:
            return
        for hsdp_param in self.hsdp_params:
            # Evaluate the (potentially costly) validation once per parameter
            # and stop at the first unsupported one.
            reason = self._comm_fusion_unsupported_reason(hsdp_param)
            if reason is not None:
                param_fqn = getattr(hsdp_param, "_param_fqn", "<unknown>")
                raise NotImplementedError(
                    f"comm_fusion does not support parameter {param_fqn}: {reason}."
                )
        self.param_group = None
        if self.hsdp_params:
            # pylint: disable=E1128
            self.param_group = HSDPParamGroup(
                self.hsdp_params,
                self.mesh_info,
                self.device,
                self.mp_policy,
                self.config.comm_fusion_zero_copy,
            )

    def _move_states_to_device(self):
        """Move parameters and buffers of all managed modules to ``self.device``.

        Parameters already initialized by HSDP, and tensors that are already on
        the target device or on the ``meta`` device, are left untouched.
        """
        for mod in self.modules:
            for param in mod.parameters():
                if hasattr(param, "_hsdp_param_initialized") and param._hsdp_param_initialized:
                    continue
                if param.device == self.device or param.device.type == "meta":
                    continue
                param.data = param.to(self.device)
            for buffer in mod.buffers():
                if buffer.device == self.device or buffer.device.type == "meta":
                    continue
                buffer.data = buffer.to(self.device)

    def _init_hsdp_params(self):
        """Init hsdp parameters and replicate parameters for cell."""
        replicate_params = set(self.config.replicate_params or ())
        ignored_params = set(self.config.ignored_params or ())
        # All parameters in the module tree(s), deduplicated, excluding ignored
        # ones and ones that HSDP has already initialized.
        visited_params = set()
        filtered_params = []
        for mod in self.modules:
            for _, param in mod.named_parameters():
                if param in ignored_params:
                    continue
                if hasattr(param, "_hsdp_param_initialized") and param._hsdp_param_initialized:
                    continue
                if param in visited_params:
                    continue
                visited_params.add(param)
                filtered_params.append(param)

        module_infos = _get_param_module_infos(filtered_params, tuple(self.modules))
        for param, module_info in zip(filtered_params, module_infos):
            is_replicated = param in replicate_params
            hsdp_param = TorchHSDPParamV2(
                param,
                module_info,
                self.mesh_info,
                shard_placement_fn=self.config.shard_placement_fn,
                mp_policy=self.mp_policy,
                offload_policy=self.offload_policy,
                device=self.device,
                param_mode=infer_fully_shard_param_mode(self.config.mesh, [param]),
                enable_fsdp_shard=not is_replicated,
            )
            if is_replicated:
                self.replicate_params.append(hsdp_param)
            else:
                self.hsdp_params.append(hsdp_param)
                # NOTE(review): indentation was lost in extraction; this append
                # is assumed to apply to non-replicated params only -- confirm.
                self.sharded_hsdp_params.append(hsdp_param)

    def _init_mp_dtypes(self):
        """Init mixed-precision dtypes for hsdp parameters and replicate parameters.

        Sets ``self._orig_dtype`` and ``self._reduce_dtype`` from the trainable
        managed parameters (both ``None`` when nothing is trainable).

        Raises:
            AssertionError: If trainable parameters disagree on their original
                dtype or on their reduce dtype.
        """
        for hsdp_param in self.hsdp_params:
            hsdp_param.init_dtype_attrs(self.mp_policy)
        for replicate_param in self.replicate_params:
            replicate_param.init_dtype_attrs(self.mp_policy)
        trainable_params: List[TorchHSDPParamV2] = [
            p for p in self._iter_managed_params() if p.sharded_param.requires_grad
        ]
        orig_dtypes = {p.orig_dtype for p in trainable_params}
        reduce_dtypes = {p.reduce_dtype for p in trainable_params}
        if trainable_params and len(orig_dtypes) != 1:
            raise AssertionError(
                f"hsdp expects uniform original parameter dtype but got {orig_dtypes}"
            )
        self._orig_dtype = next(iter(orig_dtypes)) if trainable_params else None
        if trainable_params and len(reduce_dtypes) != 1:
            raise AssertionError(
                f"hsdp expects uniform reduce dtype but got {reduce_dtypes}"
            )
        self._reduce_dtype = next(iter(reduce_dtypes)) if trainable_params else None

    def _validate_cpu_offload_params(self):
        """Validate that all parameters are on CPU when CPU offload policy is enabled.

        Raises:
            RuntimeError: If the offload policy is a ``CPUOffloadPolicy`` and any
                managed sharded parameter is not on a ``cpu`` device.
        """
        if not isinstance(self.offload_policy, CPUOffloadPolicy):
            return
        hsdp_params_not_on_cpu = [
            hsdp_param
            for hsdp_param in self._iter_managed_params()
            if hsdp_param.sharded_param.device.type != "cpu"
        ]
        if hsdp_params_not_on_cpu:
            raise RuntimeError(
                "HSDP parameters should be materialized on CPU when enabling CPU offloading. "
                'For example, load a CPU state dict or call module.to_empty(device="cpu"). '
                "Found following parameters on non-CPU device: "
                f"{[(p._param_fqn, p.sharded_param.device) for p in hsdp_params_not_on_cpu]}\n"
            )

    def lazy_init(self):
        """Deferred initialization: reset sharded params, validate devices, and set mixed-precision dtypes."""
        if self.is_shard and not self._reset_sharded_params:
            for hsdp_param in self.hsdp_params:
                hsdp_param.reset_sharded_param()
            # Only reset once; later calls skip straight to validation.
            self._reset_sharded_params = True
        self._validate_no_meta_params()
        self._validate_cpu_offload_params()
        self._init_mp_dtypes()

    def _validate_no_meta_params(self):
        """Ensure no managed sharded parameter is still on the ``meta`` device.

        Raises:
            RuntimeError: Listing the names of parameters still on ``meta``.
        """
        param_names_on_meta = [
            hsdp_param._param_fqn
            for hsdp_param in self._iter_managed_params()
            if hsdp_param.sharded_param.device.type == "meta"
        ]
        if param_names_on_meta:
            raise RuntimeError(
                "HSDP parameters should be materialized from meta device before training, "
                f"but the following were still on meta device: {param_names_on_meta}\n"
                "For example, call module.to_empty(device) to materialize to device and "
                "call module.reset_parameters() on each module to initialize values."
            )

    # ------------------------------------------------------------------
    # Reduce-op resolution
    # ------------------------------------------------------------------
    def _resolve_default_reduce_op(self):
        """Resolve the default reduce op for the whole fully_shard state.

        Returns ``SUM`` as soon as any managed parameter is in a DTensor mode
        (``DTENSOR_COMPAT`` or ``DTENSOR_UNIFIED``); otherwise ``AVG``.
        """
        for hsdp_param in self._iter_managed_params():
            if hsdp_param.param_mode in (
                FullyShardParamMode.DTENSOR_COMPAT,
                FullyShardParamMode.DTENSOR_UNIFIED,
            ):
                return torch.distributed.ReduceOp.SUM
        return torch.distributed.ReduceOp.AVG

    def _resolve_reduce_op(self, hsdp_param=None):  # pylint: disable=unused-argument
        """Resolve the gradient reduction op for the current fully_shard state.

        A user-selected op (see ``set_reduce_op_type``) wins over the default.
        ``hsdp_param`` is accepted for call-site compatibility but is not used.
        """
        if self._user_reduce_op_type is not None:
            return self._user_reduce_op_type
        return self.reduce_op_type

    def _should_run_all_reduce(self, hsdp_param) -> bool:
        """Whether the current parameter should issue an all-reduce in this backward pass."""
        return self.requires_all_reduce and hsdp_param.dp_size > 1

    # ------------------------------------------------------------------
    # Per-parameter queueing helpers
    # ------------------------------------------------------------------
    def _queue_reduce_scatter_then_all_reduce(self, hsdp_param, reduce_op):
        """Queue the standard FSDP/HSDP reduction path.

        Issues a reduce-scatter and records the parameter in
        ``HSDPState.pre_reduce_scatter_params``. When an all-reduce is also
        required, the reduce-scatter output is fetched immediately, the record
        just added is removed again, the all-reduce is issued, and the parameter
        is recorded in ``HSDPState.pre_all_reduce_params`` instead.
        """
        logger.debug(
            "post_backward module=%s launch=reduce_scatter param=%s",
            self,
            hsdp_param,
        )
        hsdp_param.reduce_scatter_grad(
            dtype=self._reduce_dtype,
            reduce_op=reduce_op,
        )
        rs_queue = HSDPState.pre_reduce_scatter_params
        rs_queue.append((hsdp_param, self._orig_dtype))
        if not self._should_run_all_reduce(hsdp_param):
            return
        reduced_grad = hsdp_param.reduce_scatter_output()
        # The tail entry is the one appended above; identity is the intended test.
        if rs_queue and rs_queue[-1][0] is hsdp_param:
            rs_queue.pop()
        hsdp_param.all_reduce_grad(
            grad=reduced_grad,
            dtype=self._reduce_dtype,
            reduce_op=reduce_op,
        )
        logger.debug(
            "post_backward module=%s launch=all_reduce param=%s",
            self,
            hsdp_param,
        )
        HSDPState.pre_all_reduce_params.append((hsdp_param, self._orig_dtype))

    def _queue_compat_all_reduce(self, hsdp_param, reduce_op):
        """Queue the compatibility all-reduce path without FSDP sharding."""
        if not self._should_run_all_reduce(hsdp_param):
            return
        # Pure all-reduce path: pass grad=None so all_reduce_grad fetches the
        # unsharded grad itself and owns the scaling (no reduce-scatter here).
        hsdp_param.all_reduce_grad(
            dtype=self._reduce_dtype,
            reduce_op=reduce_op,
        )
        logger.debug(
            "post_backward module=%s launch=compat_all_reduce param=%s",
            self,
            hsdp_param,
        )
        HSDPState.pre_all_reduce_params.append((hsdp_param, self._orig_dtype))

    def _can_direct_all_reduce_compat_grad(self, hsdp_param) -> bool:
        """Whether ``hsdp_param`` should reduce its existing ``sharded_param.grad`` directly."""
        return (
            hsdp_param.param_mode == FullyShardParamMode.DTENSOR_COMPAT
            and hsdp_param.enable_fsdp_shard
            and not hsdp_param.is_sharded
            and hsdp_param.shard_size == 1
            and hsdp_param.sharded_param.requires_grad
            and self._should_run_all_reduce(hsdp_param)
            and self._get_local_sharded_grad(hsdp_param) is not None
        )

    def _queue_direct_compat_all_reduce(self, hsdp_param, reduce_op):
        """Queue all-reduce for DTENSOR_COMPAT params whose grad stays on ``sharded_param``.

        Appends ``(handle, reduced_grad, grad)`` to
        ``pre_direct_all_reduce_grads``; ``handle`` is ``None`` when no
        all-reduce was launched. ``reduce_params`` later waits on the handle and
        copies ``reduced_grad`` back into ``grad`` when they are different tensors.
        """
        grad = self._get_local_sharded_grad(hsdp_param)
        if grad is None:
            return
        # Work in the reduce dtype; this is ``grad`` itself when no cast is needed.
        reduced_grad = _to_dtype_if_needed(grad, self._reduce_dtype)
        # Pure all-reduce path (no reduce-scatter): this leg owns the scaling.
        # all-reduce below is in-place, so scale in-place before it.
        apply_gradient_scaling_factor(reduced_grad, hsdp_param.gradient_scaling_factor)
        handle = None
        group = hsdp_param.unsharded_group_info.group
        if group is not None and hsdp_param.dp_size > 1:
            logger.debug(
                "post_backward module=%s launch=direct_compat_all_reduce param=%s",
                self,
                hsdp_param,
            )
            handle = torch.distributed.all_reduce(
                reduced_grad,
                op=reduce_op,
                group=group,
                async_op=True,
            )
        TorchHSDPStateV2.pre_direct_all_reduce_grads.append((handle, reduced_grad, grad))

    # ------------------------------------------------------------------
    # Post-backward entry points
    # ------------------------------------------------------------------
    def post_backward_for_comm_fusion(self):
        """Post-backward gradient reduction when ``comm_fusion`` is enabled."""
        logger.debug("post_backward module=%s mode=comm_fusion enter", self)
        # Replicate-only params still use the non-fused compat all-reduce path.
        # Drain any pending side-path reductions before advancing the fused
        # param-group pipeline for sharded params.
        self.reduce_params()
        # Fused gradient reduction path: first apply any pending async reduction
        # from the previous module's backward (pipelined overlap), then issue
        # this module's fused reduce-scatter (+ all-reduce for HSDP).
        comm_ctx = get_comm_ctx()
        # Phase 2: apply grads for the param group whose all_reduce is done
        if comm_ctx.all_reduce_param_group is not None:
            logger.debug("post_backward module=%s wait=comm_fusion_all_reduce", self)
            comm_ctx.all_reduce_param_group.wait_all_reduce_and_apply_grad()
            comm_ctx.all_reduce_param_group = None
        # Phase 1: wait reduce_scatter, issue async all_reduce for previous layer
        if comm_ctx.pre_param_group is not None:
            logger.debug("post_backward module=%s wait=comm_fusion_reduce_scatter", self)
            comm_ctx.pre_param_group.wait_reduce_scatter_and_issue_all_reduce()
            comm_ctx.pre_param_group = None
        if self.param_group is not None:
            logger.debug("post_backward module=%s launch=comm_fusion_reduce_scatter", self)
            self.param_group.foreach_reduce(
                reduce_scatter_reduce_op=self.reduce_op_type,
            )
        # Replicated params are reduced one by one through the compat all-reduce.
        for hsdp_param in self.replicate_params:
            if not self._has_unsharded_param(hsdp_param):
                continue
            if not hsdp_param.sharded_param.requires_grad:
                continue
            if not self._has_pending_unsharded_grad(hsdp_param):
                continue
            reduce_op = self._resolve_reduce_op(hsdp_param)
            logger.debug(
                "post_backward module=%s launch=replicate_all_reduce param=%s",
                self,
                hsdp_param,
            )
            self._queue_compat_all_reduce(hsdp_param, reduce_op)

    def post_backward(self, *unused):  # pylint: disable=unused-argument
        """Reduce gradients and reshard parameters after backward."""
        logger.debug(
            "post_backward module=%s enter reduce_grads=%s comm_fusion=%s reshard_after_backward=%s",
            self,
            self.reduce_grads,
            self.comm_fusion,
            self.reshard_after_backward,
        )
        for hsdp_param in self._iter_managed_params():
            hsdp_param.accumulate_unsharded_grad_if_needed()

        if not self.reduce_grads:
            # Gradient sync is disabled: optionally reshard, stash grads, and stop.
            if self.reshard_after_backward:
                self.shard()
            for hsdp_param in self._iter_managed_params():
                hsdp_param.to_accumulated_grad_if_needed()
            return

        if self.comm_fusion:
            self.post_backward_for_comm_fusion()
        else:
            # Handle user config replicate params and mirror params.
            self.reduce_params()
            for hsdp_param in self._iter_managed_params():
                if self._has_unsharded_param(hsdp_param):
                    continue
                if self._can_direct_all_reduce_compat_grad(hsdp_param):
                    reduce_op = self._resolve_reduce_op(hsdp_param)
                    self._queue_direct_compat_all_reduce(hsdp_param, reduce_op)

            # Step 1: wait prev reduce_scatter (for params needing allreduce)
            prev_groups = self._wait_prev_reduce_scatter()

            # Step 2: wait and apply prev reduce_scatter (for params NOT needing allreduce)
            self._wait_and_apply_prev_no_allreduce_params()

            # Step 3: issue current reduce_scatter
            self._issue_reduce_scatter_for_current_module()

            # Step 4: issue prev fused allreduce (async) - using saved prev_groups
            self._issue_prev_fused_allreduce(prev_groups)

        if self.reshard_after_backward:
            self.shard()

    def _issue_reduce_scatter_for_current_module(self):
        """Issue reduce_scatter for current module's parameters with fused all-reduce support.

        Parameters are grouped by the ``id`` of their
        ``unsharded_group_info.group``:

        1. Params without all_reduce needs: reduce_scatter is issued directly
           and the param is recorded in ``HSDPState.pre_reduce_scatter_params``.
        2. Params with all_reduce needs: an ``AllReduceParamGroup`` is created,
           its fused buffer allocated, and each reduce_scatter writes into that
           param's buffer view. The group is recorded in ``pre_all_reduce_groups``.
        """
        # Collect parameters that need gradient reduction.
        params_to_reduce = []
        for hsdp_param in self._iter_managed_params():
            skip_param = (
                not self._has_unsharded_param(hsdp_param)
                or not hsdp_param.sharded_param.requires_grad
                or self._can_direct_all_reduce_compat_grad(hsdp_param)
                or (
                    hsdp_param.unsharded_param.grad is None
                    and hsdp_param.unsharded_accumulated_grad_data is None
                )
            )
            if not skip_param:
                params_to_reduce.append(hsdp_param)

        if not params_to_reduce:
            return

        # The reduce op does not depend on the parameter; resolve it once.
        reduce_op = self._resolve_reduce_op()

        # Key: id of process group, or None for params that don't need all_reduce.
        groups_by_comm = defaultdict(list)
        for hsdp_param in params_to_reduce:
            if self._should_run_all_reduce(hsdp_param):
                key = id(hsdp_param.unsharded_group_info.group)
            else:
                key = None
            groups_by_comm[key].append(hsdp_param)

        # Handle params that don't need all_reduce (FSDP or single replica).
        for hsdp_param in groups_by_comm.get(None, ()):
            logger.debug(
                "post_backward module=%s launch=reduce_scatter param=%s all_reduce=False",
                self,
                hsdp_param,
            )
            hsdp_param.reduce_scatter_grad(
                dtype=self._reduce_dtype,
                reduce_op=reduce_op,
            )
            HSDPState.pre_reduce_scatter_params.append((hsdp_param, self._orig_dtype))

        # Handle params that need all_reduce (HSDP with multiple replicas).
        for key, hsdp_params in groups_by_comm.items():
            if key is None:
                continue

            # Create AllReduceParamGroup for fused all-reduce.
            group = AllReduceParamGroup(
                replicate_group=hsdp_params[0].unsharded_group_info.group,
                hsdp_params=hsdp_params,
                orig_dtypes=[self._orig_dtype] * len(hsdp_params),
                reduce_dtype=self._reduce_dtype,
                reduce_op=reduce_op,
                mp_policy=self.mp_policy,
            )

            # Allocate the fused buffer before handing out per-param views.
            group.allocate_fused_buffer(self.device)

            # Issue reduce_scatter with output directly into fused buffer views.
            logger.debug(
                "post_backward module=%s launch=fused_reduce_scatter group_params=%s",
                self,
                hsdp_params,
            )
            for idx, hsdp_param in enumerate(hsdp_params):
                hsdp_param.reduce_scatter_grad(
                    dtype=self._reduce_dtype,
                    reduce_op=reduce_op,
                    output_buffer=group.get_param_buffer_view(idx),
                )

            # Save the group; the next ``_wait_prev_reduce_scatter`` picks it up.
            TorchHSDPStateV2.pre_all_reduce_groups.append(group)

    def _wait_prev_reduce_scatter(self) -> List[AllReduceParamGroup]:
        """Step 1: wait prev reduce_scatter.

        Drains ``pre_all_reduce_groups``; for every parameter of every drained
        group it fetches and then clears the reduce-scatter output and drops the
        unsharded gradient that fed it.

        Returns:
            List of previous AllReduceParamGroups (one per communication group);
            empty when nothing was queued.
        """
        if not TorchHSDPStateV2.pre_all_reduce_groups:
            return []
        # Snapshot then clear, so the shared queue is empty while we work.
        prev_groups = list(TorchHSDPStateV2.pre_all_reduce_groups)
        TorchHSDPStateV2.pre_all_reduce_groups.clear()
        for prev_group in prev_groups:
            logger.debug(
                "post_backward module=%s wait=fused_reduce_scatter group_params=%s",
                self,
                prev_group.hsdp_params,
            )
            for hsdp_param in prev_group.hsdp_params:
                hsdp_param.reduce_scatter_output()
                hsdp_param.clear_reduce_scatter_output()
                if hsdp_param.unsharded_accumulated_grad_data is not None:
                    hsdp_param.unsharded_accumulated_grad = None
                elif hsdp_param.unsharded_param.grad is not None:
                    hsdp_param.unsharded_param.grad = None
        return prev_groups

    def _issue_prev_fused_allreduce(self, prev_groups: List[AllReduceParamGroup]):
        """Step 4: issue previous module's fused allreduce (async).

        Each group is appended to ``pending_all_reduce_groups`` after its
        all-reduce is issued; ``delay_apply_reduce_grads`` consumes that queue.

        Args:
            prev_groups: List of previous AllReduceParamGroups to issue allreduce for.
        """
        for prev_group in prev_groups:
            prev_group.accumulate_existing_grads_to_buffer()
            logger.debug(
                "post_backward module=%s launch=fused_all_reduce group_params=%s",
                self,
                prev_group.hsdp_params,
            )
            prev_group.issue_async_allreduce()
            # Move to pending queue for delay_apply_reduce_grads to process.
            TorchHSDPStateV2.pending_all_reduce_groups.append(prev_group)

    def _apply_queued_reduce_scatter_grads(self):
        """Drain ``HSDPState.pre_reduce_scatter_params`` in FIFO order and apply the grads.

        For each queued ``(hsdp_param, orig_dtype)`` the reduce-scatter output is
        fetched, cleared, and passed to ``apply_reduced_grad``. If any of those
        calls returns a truthy value, the current device stream is synchronized
        once at the end.
        """
        need_synchronize = False
        while HSDPState.pre_reduce_scatter_params:
            pre_hsdp_param, pre_orig_dtype = HSDPState.pre_reduce_scatter_params.pop(0)
            logger.debug(
                "post_backward module=%s wait=reduce_scatter param=%s",
                self,
                pre_hsdp_param,
            )
            reduced_grad = pre_hsdp_param.reduce_scatter_output()
            pre_hsdp_param.clear_reduce_scatter_output()
            need_synchronize = pre_hsdp_param.apply_reduced_grad(reduced_grad, pre_orig_dtype) or need_synchronize
            pre_hsdp_param.accumulated_allreduced_grad = False

        if need_synchronize:
            self._synchronize_current_stream(self.device)

    def _wait_and_apply_prev_no_allreduce_params(self):
        """Step 2: wait and apply previous reduce_scatter for params NOT needing allreduce.

        These params were queued in ``HSDPState.pre_reduce_scatter_params`` by an
        earlier ``_issue_reduce_scatter_for_current_module`` call; they are
        waited on and applied here.
        """
        self._apply_queued_reduce_scatter_grads()

    @classmethod
    def delay_apply_reduce_grads(cls, device: torch.device):
        """Apply all pending allreduce gradients.

        Waits on every group in ``pending_all_reduce_groups``, applies its
        gradients, then clears the queue. Intended to run at the end of the root
        backward hook.

        Args:
            device: Device for CPU offload synchronization.
        """
        need_synchronize = False

        for group in cls.pending_all_reduce_groups:
            logger.debug(
                "post_backward wait=pending_fused_all_reduce group_params=%s",
                group.hsdp_params,
            )
            need_synchronize = group.wait_and_apply_grads() or need_synchronize

        cls.pending_all_reduce_groups.clear()

        if need_synchronize:
            cls._synchronize_current_stream(device)

    def reduce_scattered_params(self):
        """Wait for and apply every queued reduce-scatter gradient.

        Public entry point over the same queue drained by
        ``_wait_and_apply_prev_no_allreduce_params``.
        """
        self._apply_queued_reduce_scatter_grads()

    def reduce_params(self):
        """Apply reduced gradients from queued all-reduce operations.

        Two queues are drained, both in FIFO order (via ``pop(0)``):

        1. ``HSDPState.pre_all_reduce_params``: the all-reduce output is
           fetched, the cached output is cleared, and the gradient is handed to
           ``apply_reduced_grad``.
        2. ``pre_direct_all_reduce_grads``: the async handle (if any) is waited
           on; when the reduced tensor is a separate (dtype-cast) copy it is
           cast back to the target dtype and copied into the target gradient.

        Reduce-scatter results queued in ``HSDPState.pre_reduce_scatter_params``
        are NOT handled here; see ``reduce_scattered_params``.

        If any ``apply_reduced_grad`` call returns a truthy value, the current
        device stream is synchronized once at the end.
        """
        need_synchronize = False
        while HSDPState.pre_all_reduce_params:
            pre_hsdp_param, pre_orig_dtype = HSDPState.pre_all_reduce_params.pop(0)
            logger.debug(
                "post_backward module=%s wait=all_reduce param=%s",
                self,
                pre_hsdp_param,
            )
            reduced_grad = pre_hsdp_param.all_reduce_output()
            pre_hsdp_param.clear_all_reduce_output()
            need_synchronize = pre_hsdp_param.apply_reduced_grad(reduced_grad, pre_orig_dtype) or need_synchronize

        while TorchHSDPStateV2.pre_direct_all_reduce_grads:
            handle, reduced_grad, target_grad = TorchHSDPStateV2.pre_direct_all_reduce_grads.pop(0)
            if handle is not None:
                logger.debug("post_backward module=%s wait=direct_compat_all_reduce", self)
                handle.wait()
            # A distinct tensor means the grad was cast for reduction; write it back.
            if reduced_grad is not target_grad:
                if reduced_grad.dtype != target_grad.dtype:
                    reduced_grad = reduced_grad.to(target_grad.dtype)
                target_grad.copy_(reduced_grad)

        if need_synchronize:
            self._synchronize_current_stream(self.device)

    # ------------------------------------------------------------------
    # Public knobs
    # ------------------------------------------------------------------
    def set_requires_grad_sync(self, requires_grad_sync):
        """Set requires grad sync flag to control gradient sync."""
        self.reduce_grads = requires_grad_sync

    @property
    def _is_hsdp(self) -> bool:
        """Whether the mesh info describes an HSDP (``HSDPMeshInfo``) layout."""
        return isinstance(self.mesh_info, HSDPMeshInfo)

    def set_reduce_op_type(self, reduce_op_type: str):
        """Set reduce op type for gradient reduction.

        Args:
            reduce_op_type: ``"sum"`` or ``"avg"``; case and surrounding
                whitespace are ignored.

        Raises:
            ValueError: If ``reduce_op_type`` is not a supported string. This
                includes non-string values, which previously could surface as a
                ``TypeError`` from the dict lookup when unhashable.
        """
        fsdp_support_reduce_op = {
            "sum": torch.distributed.ReduceOp.SUM,
            "avg": torch.distributed.ReduceOp.AVG,
        }
        reduce_op_value = None
        if isinstance(reduce_op_type, str):
            reduce_op_value = fsdp_support_reduce_op.get(reduce_op_type.lower().strip())
        if reduce_op_value is None:
            raise ValueError(
                f"Unsupported reduce op type {reduce_op_type}, "
                f"supported types are {list(fsdp_support_reduce_op.keys())}"
            )
        self._user_reduce_op_type = reduce_op_value
        self.reduce_op_type = self._user_reduce_op_type

Coverage for / home / jenkins / .local / lib / python3.10 / site-packages / hyper_parallel / platform / torch / fully_shard / state.py: 81%

374 statements