Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/mindspore/fully_shard/state.py

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""MindSpore HSDP cell state"""

16from collections import defaultdict

17from typing import List, Optional

18import mindspore as ms

19from mindspore import ops

20import mindspore.mint.distributed as dist

21from hyper_parallel.tools.logging import get_logger

22from hyper_parallel.core.fully_shard.hsdp_state import HSDPState

23from hyper_parallel.core.fully_shard.hsdp_utils import (

24 _get_param_module_infos,

25 FullyShardParamMode,

26 infer_fully_shard_param_mode,

27 apply_gradient_scaling_factor,

28)

29from hyper_parallel.platform.mindspore.fully_shard.pack_utils import build_rs_plan

30from hyper_parallel.platform.mindspore.fully_shard.param import MindSporeHSDPParamV2

31from hyper_parallel.platform.mindspore.fully_shard._version_utils import copy_without_bumping_version

32from hyper_parallel.platform.mindspore.fully_shard.param_group import (

33 AllReduceParamGroup,

34 HSDPParamGroup,

35 get_comm_ctx,

36)

37from hyper_parallel.platform.mindspore.utils import normalize_runtime_device

38from hyper_parallel.core.fully_shard.utils import CPUOffloadPolicy

# Module-level logger shared by every debug message emitted in this file;
# all records are published under the "FSDP" logger name.
logger = get_logger("FSDP")

def _to_dtype_if_needed(
    tensor: ms.Tensor, dtype: Optional[ms.Type]
) -> ms.Tensor:
    """Return ``tensor`` converted to ``dtype`` only when a conversion is required.

    Args:
        tensor: Tensor that may need casting.
        dtype: Requested dtype. Anything that is not an ``ms.Type`` instance
            (``None`` included) leaves the tensor untouched.

    Returns:
        ``tensor.to(dtype)`` when ``dtype`` is an ``ms.Type`` that differs from
        ``tensor.dtype``; otherwise the very same ``tensor`` object.
    """
    cast_required = isinstance(dtype, ms.Type) and tensor.dtype != dtype
    return tensor.to(dtype) if cast_required else tensor

class MindSporeHSDPStateV2(HSDPState):
    """MindSpore HSDP cell state.

    Owns the managed parameters of one fully_shard cell (``hsdp_params`` and
    ``replicate_params``) and drives their post-backward gradient handling:
    reduce-scatter, all-reduce, local application and resharding.

    The pending-communication queues below are *class-level* lists, so they
    are shared by every instance of this class.
    """

    # DTensor compat parameters in pure-TP mode can accumulate gradients
    # directly on ``sharded_param.grad`` without materializing an
    # ``_unsharded_param``. Track those async all-reduces separately from the
    # standard unsharded-gradient queues.
    pre_direct_all_reduce_grads: List = []
    # Reserved for HSDP fused all-reduce pipeline (phase-2); kept for API parity with Torch.
    pre_all_reduce_groups: List = []
    pending_all_reduce_groups: List = []

    @staticmethod
    def _lacks_unsharded_param(hsdp_param) -> bool:
        """Return True when ``hsdp_param`` has no materialized unsharded parameter.

        That is the case when the private ``_unsharded_param`` attribute does
        not exist at all, or when ``unsharded_param`` is ``None``.
        """
        return (
            not hasattr(hsdp_param, "_unsharded_param")
            or hsdp_param.unsharded_param is None
        )

    @staticmethod
    def _get_pending_unsharded_grad(hsdp_param):
        """Return the pending unsharded gradient tensor for reduction paths.

        The accumulated gradient is preferred when one exists; otherwise the
        plain unsharded gradient data is returned.
        """
        if hsdp_param.unsharded_accumulated_grad is not None:
            return hsdp_param.unsharded_accumulated_grad_data
        return hsdp_param.unsharded_grad_data

    @staticmethod
    def _has_pending_unsharded_grad(hsdp_param):
        """Whether the parameter currently has a gradient waiting for reduction."""
        if hsdp_param.unsharded_accumulated_grad is not None:
            return True
        if MindSporeHSDPStateV2._lacks_unsharded_param(hsdp_param):
            return False
        return hsdp_param.unsharded_param.grad is not None

    @staticmethod
    def _get_local_sharded_grad(hsdp_param):
        """Return the local gradient tensor currently stored on ``sharded_param``.

        Returns ``None`` when there is no gradient. When the gradient exposes a
        callable ``to_local`` attribute, its result is returned instead of the
        gradient object itself.
        """
        grad = hsdp_param.sharded_param.grad
        if grad is None:
            return None
        to_local = getattr(grad, "to_local", None)
        if callable(to_local):
            return to_local()
        return grad

    @staticmethod
    def _synchronize_current_stream_if_needed(need_synchronize: bool) -> None:
        """Synchronize the current device stream after non-blocking CPU offload."""
        if not need_synchronize:
            return
        ms.runtime.current_stream().synchronize()

    def _apply_pending_unsharded_grad_locally(self, hsdp_param) -> bool:
        """Materialize pending unsharded grad onto ``sharded_param.grad`` without communication.

        The pending gradient is scaled by the parameter's
        ``gradient_scaling_factor`` before being applied.

        Returns:
            The value returned by ``hsdp_param.apply_reduced_grad``; callers
            pass it to ``_synchronize_current_stream_if_needed``.
        """
        pending_grad = self._get_pending_unsharded_grad(hsdp_param)
        apply_gradient_scaling_factor(
            pending_grad, hsdp_param.gradient_scaling_factor
        )
        return hsdp_param.apply_reduced_grad(pending_grad, self._orig_dtype)

    def __init__(self, cell, mesh_info, config, platform, device=None):
        """Build the state on top of ``HSDPState`` and read the policies from ``config``."""
        super().__init__(cell, mesh_info, config, platform, device)
        self.comm_fusion = config.comm_fusion
        self.mp_policy = config.mp_policy
        self.offload_policy = config.offload_policy
        # Do ReduceScatter/AllReduce for grad
        self.reduce_grads = True
        # Reshard parameter after backward
        self.reshard_after_backward = True
        # Requires AllReduce for grad when HSDP
        self.requires_all_reduce = True
        # Default reduce op is decided at the fully_shard-state level:
        # if any managed parameter is DTensor-backed, use SUM; otherwise AVG.
        self.reduce_op_type = self._resolve_default_reduce_op()
        self._reset_sharded_params = False
        self._init_param_group()

    def _iter_managed_params(self):
        """Return all fully_shard-managed parameters, including replicate_params.

        The result is a new list: ``hsdp_params`` first, then ``replicate_params``.
        """
        return [*self.hsdp_params, *self.replicate_params]

    def _resolve_default_reduce_op(self):
        """Resolve the default reduce op for the whole fully_shard state.

        Returns ``ops.ReduceOp.SUM`` as soon as one managed parameter is in
        ``DTENSOR_COMPAT`` or ``DTENSOR_UNIFIED`` mode, else ``ops.ReduceOp.AVG``.
        """
        dtensor_modes = (
            FullyShardParamMode.DTENSOR_COMPAT,
            FullyShardParamMode.DTENSOR_UNIFIED,
        )
        for hsdp_param in self._iter_managed_params():
            if hsdp_param.param_mode in dtensor_modes:
                return ops.ReduceOp.SUM
        return ops.ReduceOp.AVG

    def _resolve_reduce_op(self):
        """Resolve the gradient reduction op for the current fully_shard state."""
        return self.reduce_op_type

    @staticmethod
    def _comm_fusion_unsupported_reason(hsdp_param) -> Optional[str]:
        """Return the reason why ``hsdp_param`` cannot participate in comm_fusion.

        Returns:
            A human-readable reason string, or ``None`` when the parameter is
            supported (including a successful trial ``build_rs_plan`` call).
        """
        if not hsdp_param.enable_fsdp_shard:
            return "non-sharded parameters such as replicate_params are not supported"
        if hsdp_param.param_mode not in (
            FullyShardParamMode.LOCAL_PARAM,
            FullyShardParamMode.DTENSOR_UNIFIED,
        ):
            return f"param_mode {hsdp_param.param_mode} is not supported"
        local_shard = getattr(hsdp_param, "_sharded_local_tensor", None)
        if local_shard is None:
            return "missing local shard tensor for comm_fusion plan validation"
        # Prefer ``shard_world_size``; fall back to ``shard_size`` (default 1).
        plan_world_size = getattr(hsdp_param, "shard_world_size", None)
        if plan_world_size is None:
            plan_world_size = getattr(hsdp_param, "shard_size", 1)
        try:
            # The plan itself is discarded; only its failure modes matter here.
            build_rs_plan(hsdp_param, local_shard, plan_world_size)
        except NotImplementedError as exc:
            return str(exc)
        except (AssertionError, ValueError) as exc:
            return f"cannot build comm_fusion pack plan: {exc}"
        return None

    def _init_param_group(self):
        """Initialize fused parameter group when comm_fusion is enabled.

        Raises:
            NotImplementedError: If comm_fusion is enabled and any entry of
                ``hsdp_params`` is unsupported; the first offender is reported.
        """
        # NOTE(review): nothing is assigned here when comm_fusion is disabled;
        # ``param_group`` is then only defined if the base class sets it — confirm.
        if self.config.comm_fusion:
            for hsdp_param in self.hsdp_params:
                # Evaluate the reason once: it may build a reduce-scatter plan.
                reason = self._comm_fusion_unsupported_reason(hsdp_param)
                if reason is not None:
                    param_fqn = getattr(hsdp_param, "_param_fqn", "<unknown>")
                    raise NotImplementedError(
                        f"comm_fusion does not support parameter {param_fqn}: {reason}."
                    )
            self.param_group = None
            if self.hsdp_params:
                self.param_group = HSDPParamGroup(
                    self.hsdp_params,
                    self.mesh_info,
                    self.device,
                    self.mp_policy,
                    self.config.comm_fusion_zero_copy,
                )

    def zero_grad(self):
        """Call ``zero_grad()`` on every managed parameter (sharded, then replicate)."""
        for hsdp_param in self._iter_managed_params():
            hsdp_param.zero_grad()

    def _move_states_to_device(self):
        """Move parameters and buffers of the managed modules onto ``self.device``.

        Parameters already flagged ``_hsdp_param_initialized`` are skipped, as
        is anything already on ``self.device`` or on the ``"meta"`` device.
        """
        for mod in self.modules:
            for param in mod.get_parameters():
                if hasattr(param, "_hsdp_param_initialized") and param._hsdp_param_initialized:
                    continue
                param_device = normalize_runtime_device(param.device)
                if param_device in (self.device, "meta"):
                    continue
                param.data = param.to(self.device)
            for buffer in mod.buffers():
                # NOTE(review): unlike parameters, ``buffer.device`` is compared
                # without ``normalize_runtime_device`` — confirm this is intended.
                if buffer.device in (self.device, "meta"):
                    continue
                buffer.data = buffer.to(self.device)

    def _init_hsdp_params(self):
        """Init hsdp parameters for cell and replicate parameters for cell.

        Walks every module once, dropping already-initialized, ignored and
        duplicate parameters, then wraps each survivor in a
        ``MindSporeHSDPParamV2`` and files it under ``replicate_params`` or
        ``hsdp_params``/``sharded_hsdp_params``.
        """
        # all parameters in the module tree(s), deduplicated
        visited_params = set()
        replicate_params = set(self.config.replicate_params or ())
        ignored_params = set(self.config.ignored_params or ())
        filtered_params = []
        for mod in self.modules:
            for _, param in mod.parameters_and_names():
                if hasattr(param, "_hsdp_param_initialized") and param._hsdp_param_initialized:
                    continue
                if param in ignored_params:
                    continue
                if param in visited_params:
                    continue
                visited_params.add(param)
                filtered_params.append(param)

        module_infos = _get_param_module_infos(filtered_params, tuple(self.modules))
        for param, module_info in zip(filtered_params, module_infos):
            param_mode = infer_fully_shard_param_mode(self.config.mesh, [param])
            is_replicate = param in replicate_params
            hsdp_param = MindSporeHSDPParamV2(
                param,
                module_info,
                self.mesh_info,
                shard_placement_fn=self.config.shard_placement_fn,
                mp_policy=self.mp_policy,
                offload_policy=self.offload_policy,
                device=self.device,
                param_mode=param_mode,
                enable_fsdp_shard=not is_replicate,
            )
            if is_replicate:
                self.replicate_params.append(hsdp_param)
            else:
                self.hsdp_params.append(hsdp_param)
                # NOTE(review): assumed to belong to the non-replicate branch only — confirm.
                self.sharded_hsdp_params.append(hsdp_param)

    def _init_mp_dtypes(self):
        """Init mp dtypes for hsdp parameters and replicate parameters.

        Also records the single original dtype and single reduce dtype shared
        by all trainable parameters (``None`` for both when nothing is trainable).

        Raises:
            AssertionError: If trainable parameters disagree on either dtype.
        """
        for managed_param in self._iter_managed_params():
            managed_param.init_dtype_attrs(self.mp_policy)
        trainable_params: list[MindSporeHSDPParamV2] = [
            p for p in self._iter_managed_params() if p.sharded_param.requires_grad
        ]
        orig_dtypes = {p.orig_dtype for p in trainable_params}
        reduce_dtypes = {p.reduce_dtype for p in trainable_params}
        if len(trainable_params) > 0 and len(orig_dtypes) != 1:
            raise AssertionError(
                f"hsdp expects uniform original parameter dtype but got {orig_dtypes}"
            )
        self._orig_dtype = next(iter(orig_dtypes)) if trainable_params else None
        if len(trainable_params) > 0 and len(reduce_dtypes) != 1:
            raise AssertionError(
                f"hsdp expects uniform reduce dtype but got {reduce_dtypes}"
            )
        self._reduce_dtype = next(iter(reduce_dtypes)) if trainable_params else None

    def lazy_init(self):
        """Refresh parameter views and validate runtime state before first execution."""
        # The sharded-parameter reset runs at most once per state.
        if self.is_shard and not self._reset_sharded_params:
            for hsdp_param in self.hsdp_params:
                if hsdp_param.is_sharded:
                    hsdp_param.reset_sharded_param()
            self._reset_sharded_params = True
        self._validate_no_meta_params()
        self._validate_cpu_offload_params()
        self._init_mp_dtypes()

    def _validate_cpu_offload_params(self):
        """Validate that all parameters are on CPU when CPU offload policy is enabled.

        Raises:
            RuntimeError: If the offload policy is a ``CPUOffloadPolicy`` and
                some managed parameter's device string does not start with "cpu".
        """
        if not isinstance(self.offload_policy, CPUOffloadPolicy):
            return
        hsdp_params_not_on_cpu = [
            hsdp_param
            for hsdp_param in self._iter_managed_params()
            if not str(hsdp_param.sharded_param.device).lower().startswith("cpu")
        ]
        if hsdp_params_not_on_cpu:
            raise RuntimeError(
                "HSDP parameters should be materialized on CPU when enabling CPU offloading. "
                "Found following parameters on non-CPU device: "
                f"{[(p._param_fqn, p.sharded_param.device) for p in hsdp_params_not_on_cpu]}\n"
                "MindSpore backend will support this feature in future version."
            )

    def _validate_no_meta_params(self):
        """Validate that all parameters have been materialized from meta device.

        Raises:
            RuntimeError: If any managed parameter still reports device ``"meta"``.
        """
        param_names_on_meta = [
            hsdp_param._param_fqn
            for hsdp_param in self._iter_managed_params()
            if hsdp_param.sharded_param.device == "meta"
        ]
        if param_names_on_meta:
            raise RuntimeError(
                "HSDP parameters should be materialized from meta device before training, "
                f"but the following were still on meta device: {param_names_on_meta}\n"
                "For example, initialize the module weights on a real device before running training."
            )

    def _queue_replicate_params_allreduce(self) -> None:
        """Queue async all-reduce for config.replicate_params (aligned with Torch).

        Replicate parameters that need no all-reduce get their pending gradient
        applied locally instead.
        """
        for hsdp_param in self.replicate_params:
            if self._lacks_unsharded_param(hsdp_param):
                continue
            if not hsdp_param.sharded_param.requires_grad:
                continue
            if not self._has_pending_unsharded_grad(hsdp_param):
                continue
            if self._should_run_all_reduce(hsdp_param):
                self._queue_compat_all_reduce(hsdp_param)
            else:
                need_synchronize = self._apply_pending_unsharded_grad_locally(hsdp_param)
                self._synchronize_current_stream_if_needed(need_synchronize)

    def _drain_reduce_scatter_params(self) -> bool:
        """Wait pending reduce-scatter ops and apply sharded grads.

        Returns:
            True when at least one ``apply_reduced_grad`` call returned a truthy
            value, i.e. the caller should synchronize the current stream.
        """
        need_synchronize = False
        while HSDPState.pre_reduce_scatter_params:
            hsdp_param, pre_orig_dtype = HSDPState.pre_reduce_scatter_params.pop(0)
            logger.debug(
                "post_backward module=%s wait=reduce_scatter param=%s",
                self,
                hsdp_param,
            )
            reduced_grad = hsdp_param.reduce_scatter_output()
            hsdp_param.clear_reduce_scatter_output()
            need_synchronize = (
                hsdp_param.apply_reduced_grad(reduced_grad, pre_orig_dtype)
                or need_synchronize
            )
            hsdp_param.accumulated_allreduced_grad = False
        return need_synchronize

    def reduce_scattered_params(self):
        """Wait pending reduce-scatter ops and apply sharded grads (FSDP pipeline step 2)."""
        need_synchronize = self._drain_reduce_scatter_params()
        self._synchronize_current_stream_if_needed(need_synchronize)

    def reduce_params(self):
        """Apply reduced gradients from pre-staged all-reduce queues (aligned with Torch).

        Drains ``pre_all_reduce_params`` and ``pre_direct_all_reduce_grads``. For
        pending reduce-scatter work, call ``reduce_scattered_params()`` separately.
        """
        need_synchronize = False
        while HSDPState.pre_all_reduce_params:
            hsdp_param, pre_orig_dtype = HSDPState.pre_all_reduce_params.pop(0)
            logger.debug(
                "post_backward module=%s wait=all_reduce param=%s",
                self,
                hsdp_param,
            )
            reduced_grad = hsdp_param.all_reduce_output()
            hsdp_param.clear_all_reduce_output()
            need_synchronize = (
                hsdp_param.apply_reduced_grad(reduced_grad, pre_orig_dtype)
                or need_synchronize
            )
        while MindSporeHSDPStateV2.pre_direct_all_reduce_grads:
            # Entries are queued by ``_queue_direct_compat_all_reduce``; only
            # the first four tuple fields are consumed here.
            hsdp_param, handle, reduced_grad, target_grad, *_ = (
                MindSporeHSDPStateV2.pre_direct_all_reduce_grads.pop(0)
            )
            if handle is not None:
                logger.debug("post_backward module=%s wait=direct_compat_all_reduce", self)
                handle.wait()
            # all-reduce already applied SUM/AVG via _resolve_reduce_op(); skip legacy manual AVG div.
            if hsdp_param.mp_policy.apply_grad_on_fp32_main_grad:
                need_synchronize = (
                    hsdp_param.apply_reduced_grad(reduced_grad, self._orig_dtype)
                    or need_synchronize
                )
            elif reduced_grad is not target_grad:
                # The reduction ran on a separate buffer: write it back into
                # the original grad, casting to the grad's dtype first.
                if reduced_grad.dtype != target_grad.dtype:
                    reduced_grad = reduced_grad.to(target_grad.dtype)
                copy_without_bumping_version(target_grad, reduced_grad)
        self._synchronize_current_stream_if_needed(need_synchronize)

    def _wait_prev_reduce_scatter(self) -> List:
        """Step 1: wait previous module RS for HSDP fused all-reduce groups.

        Returns:
            The groups that were queued in ``pre_all_reduce_groups`` (the queue
            is emptied), or an empty list when nothing was queued.
        """
        if MindSporeHSDPStateV2.pre_all_reduce_groups:
            prev_groups = list(MindSporeHSDPStateV2.pre_all_reduce_groups)
            MindSporeHSDPStateV2.pre_all_reduce_groups.clear()
            for prev_group in prev_groups:
                for hsdp_param in prev_group.hsdp_params:
                    # Result is discarded; presumably the call waits on the
                    # pending reduce-scatter — verify against the param class.
                    hsdp_param.reduce_scatter_output()
                    hsdp_param.clear_reduce_scatter_output()
                    # Drop the unsharded gradient that has now been reduced.
                    if hsdp_param.unsharded_accumulated_grad_data is not None:
                        hsdp_param.unsharded_accumulated_grad = None
                    elif hsdp_param.unsharded_param.grad is not None:
                        hsdp_param.unsharded_param.grad = None
            return prev_groups
        return []

    def _wait_and_apply_prev_no_allreduce_params(self):
        """Step 2: wait/apply previous reduce-scatter for pure FSDP params."""
        self.reduce_scattered_params()

    def _should_skip_reduce_scatter_issue(self, hsdp_param) -> bool:
        """Return True when a parameter should not enter the HSDP RS/fused-AR pipeline."""
        return (
            self._lacks_unsharded_param(hsdp_param)
            or not hasattr(hsdp_param, "sharded_param")
            or not hsdp_param.sharded_param.requires_grad
            or hsdp_param.shard_size <= 1
            or self._can_direct_all_reduce_compat_grad(hsdp_param)
            or not self._has_pending_unsharded_grad(hsdp_param)
        )

    def _collect_params_for_reduce_scatter(self):
        """Collect parameters that need the HSDP RS/fused-AR overlap pipeline."""
        return [
            hsdp_param
            for hsdp_param in self._iter_managed_params()
            if not self._should_skip_reduce_scatter_issue(hsdp_param)
        ]

    def _needs_overlap_post_backward_steps(self) -> bool:
        """Whether the 4-step RS/AR overlap pipeline has pending work this hook."""
        if MindSporeHSDPStateV2.pre_all_reduce_groups:
            return True
        if HSDPState.pre_reduce_scatter_params:
            return True
        return bool(self._collect_params_for_reduce_scatter())

    def _run_overlap_post_backward_steps(self) -> None:
        """Run the 4-step HSDP RS/AR overlap pipeline for the current module."""
        prev_group = self._wait_prev_reduce_scatter()
        self._wait_and_apply_prev_no_allreduce_params()
        self._issue_reduce_scatter_for_current_module()
        self._issue_prev_fused_allreduce(prev_group)

    def _issue_reduce_scatter_for_current_module(self):
        """Issue reduce_scatter for current module with fused all-reduce when needed.

        Parameters are bucketed by replicate communication group. The ``None``
        bucket (no all-reduce needed, or no replicate group) gets a plain async
        reduce-scatter; every other bucket reduce-scatters into a fused
        ``AllReduceParamGroup`` buffer that is queued for a later all-reduce.
        """
        params_to_reduce = self._collect_params_for_reduce_scatter()
        if not params_to_reduce:
            return

        groups_by_comm = defaultdict(list)
        for hsdp_param in params_to_reduce:
            if self._should_run_all_reduce(hsdp_param):
                replicate_group = hsdp_param.unsharded_group_info.group
                key = id(replicate_group) if replicate_group is not None else None
                groups_by_comm[key].append(hsdp_param)
            else:
                groups_by_comm[None].append(hsdp_param)

        if None in groups_by_comm:
            for hsdp_param in groups_by_comm[None]:
                hsdp_param.reduce_scatter_grad(
                    async_op=True,
                    dtype=self._reduce_dtype,
                    reduce_op=self._resolve_reduce_op(),
                )
                HSDPState.pre_reduce_scatter_params.append(
                    (hsdp_param, self._orig_dtype)
                )

        for key, hsdp_params in groups_by_comm.items():
            if key is None:
                continue
            # All params in a bucket share the group, so the first one's info is used.
            group_info = hsdp_params[0].unsharded_group_info
            group = AllReduceParamGroup(
                replicate_group=group_info.group,
                hsdp_params=hsdp_params,
                orig_dtypes=[self._orig_dtype] * len(hsdp_params),
                reduce_dtype=self._reduce_dtype,
                reduce_op=self._resolve_reduce_op(),
                mp_policy=self.mp_policy,
                replicate_world_size=group_info.rank_size,
            )
            group.allocate_fused_buffer(self.device)
            for idx, hsdp_param in enumerate(hsdp_params):
                buffer_view = group.get_param_buffer_view(idx)
                hsdp_param.reduce_scatter_grad(
                    async_op=True,
                    dtype=self._reduce_dtype,
                    reduce_op=self._resolve_reduce_op(),
                    output_buffer=buffer_view,
                )
            MindSporeHSDPStateV2.pre_all_reduce_groups.append(group)

    def _issue_prev_fused_allreduce(self, prev_groups: List) -> None:
        """Step 4: issue async all-reduce for previous HSDP groups (no-op without fusion groups)."""
        for prev_group in prev_groups:
            prev_group.accumulate_existing_grads_to_buffer()
            prev_group.issue_async_allreduce()
            MindSporeHSDPStateV2.pending_all_reduce_groups.append(prev_group)

    @classmethod
    def delay_apply_reduce_grads(cls) -> None:
        """Wait pending fused all-reduce groups at root backward.

        Every group in ``pending_all_reduce_groups`` is waited on and applied,
        the queue is cleared, and the current stream is synchronized when any
        group asked for it.
        """
        need_synchronize = False
        for group in cls.pending_all_reduce_groups:
            need_synchronize = group.wait_and_apply_grads() or need_synchronize
        cls.pending_all_reduce_groups.clear()
        cls._synchronize_current_stream_if_needed(need_synchronize)

    def post_backward_for_comm_fusion(self):
        """Drive the fused gradient-reduction pipeline for sharded params."""
        logger.debug("post_backward module=%s mode=comm_fusion enter", self)
        self.reduce_params()
        comm_ctx = get_comm_ctx()
        if comm_ctx.all_reduce_param_group is not None:
            logger.debug("post_backward module=%s wait=comm_fusion_all_reduce", self)
            comm_ctx.all_reduce_param_group.wait_all_reduce_and_apply_grad()
            comm_ctx.all_reduce_param_group = None
        if comm_ctx.pre_param_group is not None:
            logger.debug("post_backward module=%s wait=comm_fusion_reduce_scatter", self)
            comm_ctx.pre_param_group.wait_reduce_scatter_and_issue_all_reduce()
            comm_ctx.pre_param_group = None
        if self.param_group is not None:
            logger.debug("post_backward module=%s launch=comm_fusion_reduce_scatter", self)
            self.param_group.foreach_reduce(
                reduce_scatter_reduce_op=self._resolve_reduce_op(),
            )
        self._queue_replicate_params_allreduce()

    def _post_backward_without_reduce(self):
        """Finish backward when gradient communication is disabled."""
        if self.reshard_after_backward:
            self.shard()
        for hsdp_param in self._iter_managed_params():
            hsdp_param.to_accumulated_grad_if_needed()

    def _should_run_all_reduce(self, hsdp_param) -> bool:
        """Whether the current parameter should issue an all-reduce in this backward pass."""
        return self.requires_all_reduce and hsdp_param.dp_size > 1

    def _queue_compat_all_reduce(self, hsdp_param):
        """Queue the compatibility all-reduce path without FSDP sharding.

        Does nothing when ``_should_run_all_reduce`` is false for the parameter.
        """
        if not self._should_run_all_reduce(hsdp_param):
            return
        # Pure all-reduce path: pass grad=None so all_reduce_grad fetches the
        # unsharded grad itself and owns the scaling (no reduce-scatter here).
        hsdp_param.all_reduce_grad(
            dtype=self._reduce_dtype,
            async_op=True,
            reduce_op=self._resolve_reduce_op(),
        )
        logger.debug(
            "post_backward module=%s launch=compat_all_reduce param=%s",
            self,
            hsdp_param,
        )
        HSDPState.pre_all_reduce_params.append((hsdp_param, self._orig_dtype))

    def _can_direct_all_reduce_compat_grad(self, hsdp_param) -> bool:
        """Whether ``hsdp_param`` should reduce its existing ``sharded_param.grad`` directly."""
        if not hasattr(hsdp_param, "param_mode"):
            return False
        return (
            hsdp_param.param_mode == FullyShardParamMode.DTENSOR_COMPAT
            and hsdp_param.enable_fsdp_shard
            and not hsdp_param.is_sharded
            and hsdp_param.shard_size == 1
            and hsdp_param.sharded_param.requires_grad
            and self._should_run_all_reduce(hsdp_param)
            and self._get_local_sharded_grad(hsdp_param) is not None
        )

    def _queue_direct_compat_all_reduce(self, hsdp_param):
        """Queue all-reduce for DTENSOR_COMPAT params whose grad stays on ``sharded_param``.

        Does nothing when the parameter has no local gradient.

        Raises:
            RuntimeError: If the unsharded group reports ``rank_size > 1`` but
                carries no communication group.
        """
        grad = self._get_local_sharded_grad(hsdp_param)
        if grad is None:
            return
        reduced_grad = _to_dtype_if_needed(grad, self._reduce_dtype)
        # All-reduce needs a contiguous buffer; the local sharded grad may be a
        # non-contiguous view. No-op when already contiguous; the copy is written
        # back to grad in reduce_params().
        reduced_grad = reduced_grad.contiguous()
        # Pure all-reduce path (no reduce-scatter): this leg owns the scaling.
        # all-reduce below is in-place, so scale in-place before it.
        apply_gradient_scaling_factor(reduced_grad, hsdp_param.gradient_scaling_factor)
        reduce_group_info = getattr(hsdp_param, "unsharded_group_info", None)
        reduce_group = reduce_group_info.group if reduce_group_info is not None else None
        reduce_group_size = reduce_group_info.rank_size if reduce_group_info is not None else 1
        handle = None
        if reduce_group_size > 1:
            if reduce_group is None:
                raise RuntimeError("Expected a valid unsharded all-reduce group when rank_size > 1")
            handle = dist.all_reduce(
                reduced_grad,
                group=reduce_group,
                op=self._resolve_reduce_op(),
                async_op=True,
            )
        # Queued even without a handle so reduce_params() still writes the
        # scaled/cast buffer back to the original grad.
        MindSporeHSDPStateV2.pre_direct_all_reduce_grads.append(
            (hsdp_param, handle, reduced_grad, grad, reduce_group_size, False)
        )

    def post_backward(self, *_):
        """Post-backward hook that accumulates, reduces, and reshards gradients for all managed parameters."""
        for hsdp_param in self._iter_managed_params():
            hsdp_param.accumulate_unsharded_grad_if_needed()
        if not self.reduce_grads:
            self._post_backward_without_reduce()
            return
        if not self.comm_fusion:
            self.reduce_params()
            for hsdp_param in self._iter_managed_params():
                # replicate_params are queued once by _queue_replicate_params_allreduce().
                if not getattr(hsdp_param, "enable_fsdp_shard", True):
                    continue
                if self._lacks_unsharded_param(hsdp_param):
                    if self._can_direct_all_reduce_compat_grad(hsdp_param):
                        self._queue_direct_compat_all_reduce(hsdp_param)
                    continue
                if not hasattr(hsdp_param, "sharded_param") or not hsdp_param.sharded_param.requires_grad:
                    continue
                if not self._has_pending_unsharded_grad(hsdp_param):
                    continue
                # Params with shard_size > 1 are left to the overlap pipeline below.
                if hsdp_param.shard_size <= 1:
                    if self._should_run_all_reduce(hsdp_param):
                        self._queue_compat_all_reduce(hsdp_param)
                    else:
                        logger.debug(
                            "post_backward module=%s apply=no_comm_grad param=%s",
                            self,
                            hsdp_param,
                        )
                        # No-communication path (shard_size == 1, no all-reduce):
                        # this leg owns the scaling since the grad never goes through
                        # reduce_scatter_grad / all_reduce_grad.
                        need_synchronize = self._apply_pending_unsharded_grad_locally(
                            hsdp_param
                        )
                        self._synchronize_current_stream_if_needed(need_synchronize)

            if self._needs_overlap_post_backward_steps():
                self._run_overlap_post_backward_steps()
            self._queue_replicate_params_allreduce()
        else:
            self.post_backward_for_comm_fusion()
        if self.reshard_after_backward:
            self.shard()

    def set_requires_grad_sync(self, requires_grad_sync):
        """Set requires grad sync flag to control gradient sync."""
        self.reduce_grads = requires_grad_sync

    def set_reduce_op_type(self, reduce_op_type: str):
        """Set reduce op type for gradient reduction.

        Args:
            reduce_op_type: ``"sum"`` or ``"avg"``. Matching ignores case and
                surrounding whitespace.

        Raises:
            ValueError: If the normalized value is not a supported type.
        """
        fsdp_support_reduce_op = {
            "sum": ops.ReduceOp.SUM,
            "avg": ops.ReduceOp.AVG,
        }
        # Normalize *before* validating; otherwise inputs such as "SUM" are
        # rejected and the normalization would never take effect.
        reduce_op = (
            reduce_op_type.lower().strip()
            if isinstance(reduce_op_type, str)
            else reduce_op_type
        )
        if reduce_op not in fsdp_support_reduce_op:
            raise ValueError(
                f"Unsupported reduce op type {reduce_op_type}, "
                f"supported types are {list(fsdp_support_reduce_op.keys())}")
        self.reduce_op_type = fsdp_support_reduce_op[reduce_op]

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/mindspore/fully_shard/state.py: 86%

379 statements