Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/mindspore/pipeline

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""parallel grad helper (dx/dw split)"""

16from __future__ import absolute_import

17from collections import deque, defaultdict

18import warnings

19import logging

20from mindspore.utils._pytree import tree_flatten, tree_leaves, tree_unflatten

21from mindspore.common.api import _pynative_executor, _GradientEdge

22from mindspore._c_expression import run_backward

23from mindspore.common.tensor import Tensor

24from mindspore import ops

def _fill_grads(output_tensor):
    """Build all-ones sensitivities matching the given output(s).

    Args:
        output_tensor: A single Tensor, or a list/tuple of tensors.

    Returns:
        ``ops.ones_like`` of the tensor, a tuple of ``ops.ones_like`` results for a
        list/tuple input, or ``None`` for any other input type.
    """
    if isinstance(output_tensor, Tensor):
        return ops.ones_like(output_tensor)
    if not isinstance(output_tensor, (list, tuple)):
        # Neither a tensor nor a sequence of tensors: nothing to fill.
        return None
    return tuple(map(ops.ones_like, output_tensor))

36def _validate_grad_config(grad_position, weights):

37 """Validate gradient configuration."""

38 if grad_position is None and not weights:

39 raise ValueError("grad_position and weights cannot both be None!")

42def _set_requires_grad(inputs, kwargs, grad_position):

43 """Set requires_grad flag for specified inputs."""

44 if grad_position is None:

45 return

46 if grad_position == -1:

47 flatten_inputs, _ = tree_flatten(inputs, tensors_only_leaf=True)

48 for inp in flatten_inputs:

49 inp._requires_grad = True # pylint: disable=protected-access

50 flatten_kwargs, _ = tree_flatten(kwargs, tensors_only_leaf=True)

51 for kwarg in flatten_kwargs:

52 kwarg._requires_grad = True # pylint: disable=protected-access

53 return

54 if isinstance(grad_position, int) and isinstance(inputs[grad_position], Tensor):

55 inputs[grad_position]._requires_grad = True # pylint: disable=protected-access

56 return

57 if isinstance(grad_position, tuple):

58 for idx in grad_position:

59 if isinstance(inputs[idx], Tensor):

60 inputs[idx]._requires_grad = True # pylint: disable=protected-access

63def _get_grad_node(tensor):

64 """Get the grad_fn or grad accumulator for a tensor."""

65 return tensor._grad_node # pylint: disable=protected-access

68def _get_node_id(node):

69 """Return stable integer id for BackwardNode (C++ `_unique_id`)."""

70 return int(node._unique_id()) # pylint: disable=protected-access

73def _accumulate_grads(target_tensors, grads):

74 """Accumulate returned grads onto leaf tensors' internal grad storage."""

75 if grads is None:

76 return

77 for tensor, grad in zip(target_tensors, grads):

78 if tensor is None or grad is None:

79 continue

80 current_grad = getattr(tensor, "_grad", None)

81 if current_grad is None:

82 tensor._grad = grad # pylint: disable=protected-access

83 else:

84 current_grad += grad

def _compute_nodes_out_degree(output_grad_fns):
    """Build reverse edges: child_id -> [(parent_node, slot_index)].

    Walks the backward graph breadth-first from the output nodes along
    ``next_functions`` (towards the inputs) and records, for every child node id,
    which parent points at it and through which slot of the parent's
    ``next_functions``.

    Every node is expanded exactly once, including an output node that is also
    reachable from another output, so each (parent, slot) edge is recorded once.

    Args:
        output_grad_fns: Iterable of backward nodes of the outputs; ``None`` entries
            are ignored.

    Returns:
        defaultdict(list): Mapping from child node id to a list of
        ``(parent_node, slot_index)`` pairs.
    """
    queue = deque()
    # Ids of every node that has already been queued (roots and children alike).
    visited = set()
    # child_node_id -> list[(parent_node, slot_index_in_parent_next_functions)]
    backward_edges = defaultdict(list)

    # Seed from output grad_fns.
    for node in output_grad_fns:
        if node is None:
            continue
        node_id = _get_node_id(node)
        if node_id not in visited:
            visited.add(node_id)
            queue.append(node)

    # Follow next_functions (towards inputs) while recording reverse edges.
    while queue:
        node = queue.popleft()
        for slot, (child_fn, _) in enumerate(node.next_functions):
            if child_fn is None:
                continue
            child_id = _get_node_id(child_fn)
            if child_id not in visited:
                visited.add(child_id)
                queue.append(child_fn)
            # The edge itself is always recorded, even when the child was seen before.
            backward_edges[child_id].append((node, slot))

    return backward_edges

115

116

def _compute_reachable_nodes(roots, boundary_nodes, backward_graph):
    """Breadth-first walk over reverse edges that stops at boundary nodes.

    Args:
        roots: Iterable of start nodes; ``None`` entries are ignored.
        boundary_nodes: Container of node ids at which the walk must not continue.
        backward_graph: Mapping child_id -> [(parent_node, slot_index)].

    Returns:
        tuple: ``(reachable_ids, boundary_hits)`` where ``reachable_ids`` is the set of
        visited node ids (roots included) and ``boundary_hits`` is the set of parent
        node objects whose id is in `boundary_nodes`.
    """
    visited_ids = set()
    hits = set()
    pending = deque()

    for root in roots:
        if root is None:
            continue
        root_id = _get_node_id(root)
        if root_id in visited_ids:
            continue
        visited_ids.add(root_id)
        pending.append(root)

    while pending:
        current = pending.popleft()
        for parent_fn, _slot in backward_graph.get(_get_node_id(current), []):
            if parent_fn is None:
                continue
            parent_id = _get_node_id(parent_fn)
            if parent_id in visited_ids:
                continue
            if parent_id in boundary_nodes:
                # Record the boundary node but do not walk past it.
                hits.add(parent_fn)
            else:
                visited_ids.add(parent_id)
                pending.append(parent_fn)

    return visited_ids, hits

148

149

def _find_boundary_intermediates_with_edge_slots(weight_root, boundary_node_ids, backward_graph):
    """Collect the boundary nodes above one weight root and the slots leading to it.

    Walks the reverse edges upwards from `weight_root`. Whenever a parent's id is in
    `boundary_node_ids`, the parent is recorded together with the slot through which
    it was reached, and the walk does not continue past it.

    Args:
        weight_root: Backward node of a weight, or ``None``.
        boundary_node_ids: Container of node ids treated as boundary.
        backward_graph: Mapping child_id -> [(parent_node, slot_index)].

    Returns:
        defaultdict(set): Mapping boundary node -> set of slot indices. Empty when
        `weight_root` is ``None``.
    """
    slots_by_boundary = defaultdict(set)
    if weight_root is None:
        return slots_by_boundary

    visited_ids = {_get_node_id(weight_root)}
    pending = deque([weight_root])

    while pending:
        current = pending.popleft()
        for parent, slot in backward_graph.get(_get_node_id(current), ()):
            if parent is None:
                continue
            parent_id = _get_node_id(parent)
            if parent_id in boundary_node_ids:
                # The reverse graph already carries the slot; no rescan of
                # parent.next_functions is needed.
                slots_by_boundary[parent].add(slot)
            elif parent_id not in visited_ids:
                visited_ids.add(parent_id)
                pending.append(parent)

    return slots_by_boundary

175

176

def _group_weights_by_intermediate(input_grad_fns, weight_grad_fns, backward_graph):
    """Group weight nodes by the boundary intermediates they share.

    For every weight node the boundary intermediates (nodes of the input subgraph
    that the weight feeds into) are looked up together with the edge slots used.
    Weights sharing at least one intermediate, directly or through a chain of other
    weights, end up in the same group. Groups are pairwise disjoint: every
    intermediate and every weight belongs to exactly one returned group.

    A weight for which no boundary intermediate is found is not part of any group.

    Args:
        input_grad_fns: Backward nodes of the inputs.
        weight_grad_fns: Backward nodes of the weights.
        backward_graph: Mapping child_id -> [(parent_node, slot_index)].

    Returns:
        list[dict]: One dict per group with keys

        - ``'params'``: tuple of weight nodes, sorted by node id.
        - ``'intermediates'``: tuple of boundary nodes, sorted by node id.
        - ``'edge_slots'``: dict boundary node id -> sorted tuple of slot indices.
    """
    # Step 1: Build input subgraph - all nodes reachable from inputs.
    input_subgraph_ids, _ = _compute_reachable_nodes(input_grad_fns, set(), backward_graph)

    # intermediate_id -> the single group that currently owns that intermediate.
    group_by_intermediate = {}

    # Step 2: For each weight, find its boundary nodes and merge with every group
    # that already owns one of them.
    for weight_fn in weight_grad_fns:
        inter_to_slots = _find_boundary_intermediates_with_edge_slots(
            weight_fn, input_subgraph_ids, backward_graph)

        merged = {
            'params': {weight_fn},
            'intermediates': set(inter_to_slots),
            'edge_slots': {},
        }
        for node, slots in inter_to_slots.items():
            merged['edge_slots'].setdefault(_get_node_id(node), set()).update(slots)

        own_ids = list(merged['edge_slots'])
        for intermediate_id in own_ids:
            existing = group_by_intermediate.get(intermediate_id)
            if existing is None or existing is merged:
                continue
            # Absorb the whole existing group ...
            merged['params'] |= existing['params']
            merged['intermediates'] |= existing['intermediates']
            for nid, slots in existing['edge_slots'].items():
                merged['edge_slots'].setdefault(nid, set()).update(slots)
            # ... and re-point all of its intermediates, so the absorbed group is no
            # longer referenced and cannot show up as a second, overlapping group.
            for nid in existing['edge_slots']:
                group_by_intermediate[nid] = merged
        for intermediate_id in own_ids:
            group_by_intermediate[intermediate_id] = merged

    # Return unique weight groups (normalize to deterministic order for hooks/consumption).
    seen_groups = set()
    unique_groups = []
    for weight_group in group_by_intermediate.values():
        group_id = id(weight_group)
        if group_id in seen_groups:
            continue
        seen_groups.add(group_id)
        weight_group['intermediates'] = tuple(sorted(weight_group['intermediates'], key=_get_node_id))
        weight_group['params'] = tuple(sorted(weight_group['params'], key=_get_node_id))
        weight_group['edge_slots'] = {
            nid: tuple(sorted(slots)) for nid, slots in weight_group['edge_slots'].items()
        }
        unique_groups.append(weight_group)

    return unique_groups

226

227

class GradFunction:
    """
    A wrapper class used to build forward output and gradient functions.
    This class supports separate computation of input gradients (dx) and weight gradients (dw),
    which is essential for pipeline parallelism.

    Args:
        output (Any): Forward output value of the network.
        inputs (tuple[Tensor, ...] | Tensor | Any): Original inputs used for forward computation.
        kwargs (dict): Keyword arguments used in forward computation.
        weights (tuple[Parameter, ...] | list[Parameter] | Parameter | None): Parameters used for
            weight gradient calculation.
        has_aux (bool): Whether the forward output contains auxiliary values.
        grad_position (int | tuple[int, ...] | list[int] | None): Positions of inputs to compute
            gradients for. ``-1`` selects every tensor found in `inputs` and `kwargs`.

    Raises:
        TypeError: If `output` is not a list or tuple when `has_aux` is ``True``. Raised when a
            gradient is requested, not at construction time.

    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``
    """

    def __init__(self, output, inputs, kwargs, weights, has_aux, grad_position):
        self.output = output
        self.inputs = inputs
        # Number of tensor leaves in `inputs`; filled in by _collect_input_tensors when
        # grad_position == -1.
        self.flatten_input_size = 0
        self.kwargs = kwargs
        self.weights = weights
        self.has_aux = has_aux
        self.grad_position = grad_position
        # Storage for intermediate gradients captured during dx computation
        self._saved_intermediates = []
        # Tree structure descriptors returned by tree_flatten, used to unflatten gradients.
        self.aux_inputs_data = None
        self.aux_kwargs_data = None
        self.aux_weights_data = None

    def _clear_res(self):
        """Drop the references to output, inputs, weights and saved intermediates."""
        self.output = None
        self.inputs = None
        self.weights = None
        self._saved_intermediates = []

    def _collect_weight_tensors(self):
        """Collect weight tensors into a new list.

        Returns:
            list: Empty when `weights` is ``None``; the elements of `weights` when it is a list or
            tuple; otherwise a one-element list holding `weights` itself.
        """
        if self.weights is None:
            return []
        # Lists must be unpacked as well: _collect_input_tensors passes list weights to
        # run_backward element by element, so the two views have to line up.
        if isinstance(self.weights, (list, tuple)):
            return list(self.weights)
        return [self.weights]

    @staticmethod
    def _make_capture_hook(weight_group, idx):
        """Create a prehook that stores the incoming grads in ``weight_group['grads'][idx]``."""
        def prehook_fn(grad_inputs):
            if weight_group.get('grads', None) is None:
                weight_group['grads'] = [None] * len(weight_group['intermediates'])
            weight_group['grads'][idx] = grad_inputs
            # The gradient is passed through unchanged.
            return grad_inputs
        return prehook_fn

    def _setup_intermediate_hooks(self, output_tensors, input_tensors, weight_tensors):
        """Setup prehooks on intermediate nodes to capture gradients for dw computation.

        The weight groups found are stored in ``self._saved_intermediates``.

        Returns:
            list: Handles of the registered prehooks (empty when there is no output or weight
            backward node).
        """
        hook_handles = []
        output_grad_fns = [_get_grad_node(t) for t in output_tensors if isinstance(t, Tensor)]
        input_grad_fns = [_get_grad_node(t) for t in input_tensors]
        weight_grad_fns = [_get_grad_node(w) for w in weight_tensors]

        # Filter out None values
        output_grad_fns = [fn for fn in output_grad_fns if fn is not None]
        input_grad_fns = [fn for fn in input_grad_fns if fn is not None]
        weight_grad_fns = [fn for fn in weight_grad_fns if fn is not None]

        if not output_grad_fns or not weight_grad_fns:
            return hook_handles

        backward_graph = _compute_nodes_out_degree(output_grad_fns)
        weight_groups = _group_weights_by_intermediate(input_grad_fns, weight_grad_fns, backward_graph)

        for weight_group in weight_groups:
            for i, intermediate in enumerate(weight_group['intermediates']):
                handle = intermediate.register_prehook(self._make_capture_hook(weight_group, i))
                hook_handles.append(handle)

        self._saved_intermediates = weight_groups
        return hook_handles

    def _format_input_grads(self, input_grads, input_size):
        """Format input gradients based on grad_position configuration.

        With ``grad_position == -1`` the flat gradients are unflattened back into the structure of
        `inputs` (and, when kwargs contributed tensors, a second structure for `kwargs`).
        Otherwise a single gradient is returned bare and several are returned as given.
        """
        if isinstance(self.grad_position, int) and self.grad_position == -1:
            if self.flatten_input_size == input_size:
                # Only positional inputs contributed tensors.
                return tree_unflatten(self.aux_inputs_data, input_grads[:input_size])
            return (tree_unflatten(self.aux_inputs_data, input_grads[:self.flatten_input_size]),
                    tree_unflatten(self.aux_kwargs_data, input_grads[self.flatten_input_size:input_size]))
        return input_grads[0] if len(input_grads) == 1 else input_grads

    def _prune_intermediate_edges(self):
        """Prune intermediate next_edges based on recorded edge slots.

        Every slot of an intermediate that was not recorded in the group's ``edge_slots`` is set
        to ``None``.
        """
        for weight_group in self._saved_intermediates:
            edge_slots = weight_group.get('edge_slots', {})
            for intermediate in weight_group.get('intermediates', ()):
                if intermediate is None:
                    continue
                keep_slots = set(edge_slots.get(_get_node_id(intermediate), ()))
                for slot in range(len(intermediate.next_functions)):
                    if slot not in keep_slots:
                        intermediate._set_next_edge(slot, None)  # pylint: disable=protected-access

    @staticmethod
    def _process_weight_group(weight_group, grad_node_to_weight, keep_graph):
        """Process a single weight group and compute gradients.

        Args:
            weight_group (dict): Group with ``'grads'``, ``'intermediates'`` and ``'params'``.
            grad_node_to_weight (dict): Mapping weight node id -> weight tensor.
            keep_graph (bool): When ``False`` the group's ``'intermediates'`` and ``'grads'``
                entries are deleted once used.

        Returns:
            dict: Mapping weight -> gradient (zeros when the backward returned ``None`` for it);
            empty when no gradient was captured or no weight of the group is known.
        """
        grad_edges = []
        grad_outputs = []

        for captured_grads, intermediate in zip(weight_group['grads'], weight_group['intermediates']):
            if captured_grads is None:
                continue
            if isinstance(captured_grads, (tuple, list)):
                # One edge per non-None captured gradient, indexed by its position.
                for slot_idx, grad_item in enumerate(captured_grads):
                    if grad_item is not None:
                        grad_edges.append(_GradientEdge(
                            grad_node=intermediate, output_index=slot_idx, keep_alive_token=None))
                        grad_outputs.append(grad_item)
            else:
                grad_edges.append(_GradientEdge(
                    grad_node=intermediate, output_index=0, keep_alive_token=None))
                grad_outputs.append(captured_grads)

        if not keep_graph:
            del weight_group['intermediates']

        if not grad_edges:
            return {}

        group_weights = [grad_node_to_weight.get(_get_node_id(weight_fn))
                         for weight_fn in weight_group.get('params', set())]
        group_weights = [w for w in group_weights if w is not None]

        if not group_weights:
            return {}

        weight_grads = run_backward(
            tuple(grad_edges), tuple(grad_outputs),
            keep_graph, False,
            tuple(group_weights), allow_unreachable=True, accumulate_grad=False
        )
        _accumulate_grads(group_weights, weight_grads)
        if not keep_graph:
            del weight_group['grads']

        collected = {}
        for weight, grad in zip(group_weights, weight_grads):
            collected[weight] = grad if grad is not None else ops.zeros_like(weight)
        return collected

    def _prepare_output_and_sens(self, sens):
        """
        Prepare output tensor and sensitivity based on has_aux flag.

        With `has_aux` only the first element of the output (and of a list/tuple `sens`) is used.
        Without it the output is flattened to its tensor leaves and `sens` is either flattened the
        same way or, when ``None``, filled with ones.

        Args:
            sens: Gradient of output tensor for gradient computation.

        Returns:
            Tuple of (output_tensors, processed_sens), both tuples.

        Raises:
            TypeError: If `has_aux` is ``True`` and the output is not a list or tuple.
        """
        output_tensor = self.output
        if self.has_aux:
            if not isinstance(self.output, (tuple, list)):
                raise TypeError(
                    f"The output of fn should be list or tuple when has_aux=True, "
                    f"but got {type(self.output)}"
                )
            output_tensor = output_tensor[0]
            if isinstance(sens, (tuple, list)):
                sens = sens[0]
            return (output_tensor,), (sens,)
        flatten_outputs = tree_leaves(output_tensor, tensors_only_leaf=True)
        if sens is None:
            sens = _fill_grads(flatten_outputs)
        else:
            sens = tree_leaves(sens, tensors_only_leaf=True)
        return tuple(flatten_outputs), tuple(sens)

    def _collect_input_tensors(self, collect_weights=True):
        """Collect input tensors based on grad_position and weights.

        Args:
            collect_weights (bool): Whether the weights are appended after the inputs.

        Returns:
            tuple: ``(tensors, input_size)`` where ``tensors`` is a tuple of the selected inputs
            followed by the weights (if requested) and ``input_size`` counts the inputs only.
        """
        input_tensors = []
        if isinstance(self.grad_position, int) and self.grad_position == -1:
            flatten_inputs, self.aux_inputs_data = tree_flatten(self.inputs, tensors_only_leaf=True)
            flatten_kwargs, self.aux_kwargs_data = tree_flatten(self.kwargs, tensors_only_leaf=True)
            self.flatten_input_size = len(flatten_inputs)
            input_tensors.extend(flatten_inputs)
            input_tensors.extend(flatten_kwargs)
        elif isinstance(self.grad_position, int) and isinstance(self.inputs[self.grad_position], Tensor):
            input_tensors.append(self.inputs[self.grad_position])
        elif isinstance(self.grad_position, (list, tuple)):
            input_tensors.extend(self.inputs[idx] for idx in self.grad_position if isinstance(self.inputs[idx], Tensor))

        input_size = len(input_tensors)
        if collect_weights:
            # Shares the list/tuple/single handling with every other consumer of the weights.
            input_tensors.extend(self._collect_weight_tensors())

        return tuple(input_tensors), input_size

    def __call__(self, sens=None, keep_graph=False):
        """
        Compute gradients with respect to both inputs and weights.

        The gradients are also accumulated onto the ``_grad`` attribute of the inputs and weights.
        Unless `keep_graph` is set, the stored output, inputs and weights are released afterwards.

        Args:
            sens: gradient of output tensor for gradient computation.
            keep_graph: Whether to keep the computation graph.

        Returns:
            Gradients with respect to inputs and/or weights.
        """
        weights = self.weights
        input_tensors, input_size = self._collect_input_tensors()
        output_tensors, sens = self._prepare_output_and_sens(sens)

        grads = run_backward(
            output_tensors, sens, keep_graph, keep_graph,
            input_tensors, allow_unreachable=True, accumulate_grad=False
        )
        if input_size > 0:
            _accumulate_grads(input_tensors[:input_size], grads[:input_size])
        weight_tensors = self._collect_weight_tensors()
        if weight_tensors:
            _accumulate_grads(weight_tensors, grads[input_size:])
        if not keep_graph:
            self._clear_res()
        if input_size == 0:
            # Only weights were differentiated.
            return grads
        if weights is None:
            # Only inputs were differentiated: same formatting as compute_input_grad.
            return self._format_input_grads(grads, input_size)
        if isinstance(self.grad_position, int) and self.grad_position == -1:
            if self.flatten_input_size == input_size:
                return tree_unflatten(self.aux_inputs_data, grads[:input_size]), grads[input_size:]
            return (tree_unflatten(self.aux_inputs_data, grads[:self.flatten_input_size]),
                    tree_unflatten(self.aux_kwargs_data, grads[self.flatten_input_size:input_size]),
                    grads[input_size:])
        return grads[:input_size], grads[input_size:]

    def compute_input_grad(self, sens=None):
        """
        Compute gradients with respect to inputs only (dx).

        This is the first stage of dx/dw split computation. It computes input gradients
        while capturing intermediate gradients at the boundaries between input and weight subgraphs.

        Implementation Strategy:
            1. Compute graph out degree from output grad_fns
            2. Find intermediate nodes (boundaries between input/weight subgraphs)
            3. Register prehooks on these intermediate nodes to capture gradients
            4. Save captured intermediate gradients for later dw computation

        Args:
            sens: gradient of output tensor for gradient computation. If None, will use ones_like.

        Returns:
            Gradients with respect to inputs. Returns single tensor if one input,
            tuple of tensors if multiple inputs. Returns an empty tuple when no input
            tensor is selected.

            When grad_position=-1 (default), the return type matches the input structure,
            supporting complex input types (tuple, dict, etc.). The gradients are automatically
            unflattened to preserve the original input structure.

        Raises:
            ValueError: If grad_position is None (no inputs specified for differentiation).
        """
        if self.grad_position is None:
            raise ValueError(
                "compute_input_grad requires grad_position to be specified. "
                "Cannot compute input gradients when grad_position is None."
            )
        input_tensors, input_size = self._collect_input_tensors(collect_weights=False)
        if not input_tensors:
            logging.info("No valid input tensors found for gradient computation.")
            return ()

        weight_tensors = self._collect_weight_tensors()
        self._saved_intermediates = []
        output_tensors, sens = self._prepare_output_and_sens(sens)

        hook_handles = []
        if weight_tensors:
            hook_handles = self._setup_intermediate_hooks(output_tensors, input_tensors, weight_tensors)

        try:
            input_grads = run_backward(
                output_tensors, sens, True, False,
                tuple(input_tensors), allow_unreachable=True, accumulate_grad=False
            )
        finally:
            # Remove the capture hooks even when the backward pass raises, so they do not
            # stay registered on the graph nodes.
            for handle in hook_handles:
                handle.remove()
        _accumulate_grads(input_tensors, input_grads)

        return self._format_input_grads(input_grads, input_size)

    def compute_weight_grad(self, keep_graph=False):
        """
        Compute gradients with respect to weights only (dw).

        This is the second stage of dx/dw split computation. It uses the intermediate
        gradients captured during compute_input_grad to compute weight gradients efficiently,
        starting from intermediate nodes rather than recomputing from the output.

        Implementation Strategy:
            1. Use saved intermediate gradients and GradientEdges from compute_input_grad
            2. Start backward computation from these intermediate points (not from output)
            3. This avoids recomputing the portion of the graph already computed in dx

        Args:
            keep_graph: Whether to keep the computation graph after this computation.
                Default is False as this is typically the final gradient computation.

        Returns:
            tuple: One entry per weight, in the order of the weights. An entry is ``None``
            for a weight that is not covered by any saved weight group. When `weights` is
            ``None`` a warning is issued, the stored results are released and an empty
            tuple is returned.

        Raises:
            RuntimeError: If there are no saved intermediates (compute_input_grad was not
                called before, or it found no weight group).
            ValueError: If `weights` is an empty list or tuple.

        Note:
            This function must be called after compute_input_grad, as it relies on
            the intermediate gradients captured during that computation. The computation
            graph must still be available (retained by keep_graph=True in compute_input_grad).
        """

        if self.weights is None:
            warnings.warn(
                "compute_weight_grad requires weights to be specified. "
                "Cannot compute weight gradients when weights is None."
            )
            self._clear_res()
            return ()
        if not self._saved_intermediates:
            raise RuntimeError("Before calling compute_weight_grad, you need first call compute_input_grad!")

        weight_tensors = self._collect_weight_tensors()
        if not weight_tensors:
            raise ValueError("No valid weight tensors found for gradient computation.")

        grad_node_to_weight = {_get_node_id(_get_grad_node(w)): w
                               for w in weight_tensors if _get_grad_node(w) is not None}

        self._prune_intermediate_edges()

        collected_grads = {}
        for weight_group in self._saved_intermediates:
            group_grads = self._process_weight_group(weight_group, grad_node_to_weight, keep_graph)
            collected_grads.update(group_grads)

        if not keep_graph:
            self._clear_res()

        result_grads = [collected_grads.get(weight) for weight in weight_tensors]
        return tuple(result_grads)

588

589

def forward_and_gradfn(fn, *inputs, weights=None, has_aux=False, grad_position=-1, **kwargs):
    """
    A wrapper function to generate the function to calculate forward output and gradient function.

    As for gradient, three typical cases are included:

    1. gradient with respect to inputs. In this case, `grad_position` is not None while `weights` is ``None``.
    2. gradient with respect to weights. In this case, `grad_position` is None while `weights` is not ``None``.
    3. gradient with respect to inputs and weights. In this case, `grad_position` and `weights` are not ``None``.

    The forward function runs with the pynative grad flag switched on; the previous flag
    value is restored afterwards, whether or not `fn` raises.

    Args:
        fn (Union[Cell, Function]): Function to do GradOperation.
        *inputs: Variable length argument list of inputs to the function `fn`.
        weights (Union[ParameterTuple, Parameter, list[Parameter]], optional):
            The parameters of the training network that need to
            calculate the gradient. `weights` can be got through `weights = net.trainable_params()` .
            Default: ``None`` .
        has_aux (bool, optional): If ``True`` , only the first output of `fn` contributes the gradient of `fn`,
            while the other outputs will be returned straightly. It means the `fn` must return more than one outputs
            in this case.
            Default: ``False`` .
        grad_position (Union[NoneType, int, tuple[int]], optional): Index to specify which inputs
            to be differentiated. Default: ``-1`` means all inputs are differentiated.

            - If int, get the gradient with respect to single input.
            - If tuple, get the gradients with respect to selected inputs. `grad_position` begins with 0.
            - If None, none derivative of any input will be solved, and in this case, `weights` is required.

        **kwargs: Variable length keyword argument dictionary. Additional keyword arguments passed to the function `fn`.

    Returns:
        Tuple of (output, grad_fn):

        - output: The output value of function `fn`.
        - grad_fn: A :class:`GradFunction` instance used to compute gradients.

        The :class:`GradFunction` class provides methods for gradient computation:

        - :meth:`__call__`: Compute gradients with respect to both inputs and weights at once.
        - :meth:`compute_input_grad`: Compute gradients with respect to inputs only (dx).
          This is the first stage of dx/dw split computation, which captures intermediate
          gradients at weight nodes for efficient dw computation.
        - :meth:`compute_weight_grad`: Compute gradients with respect to weights only (dw).
          This is the second stage of dx/dw split computation, which uses the intermediate
          gradients captured during :meth:`compute_input_grad` to compute weight gradients
          efficiently without recomputing from the output.

    Examples:
        When grad_position=-1 (default), the gradient return type matches the input structure,
        supporting complex input types (tuple, dict, etc.):

        >>> def fn(x, tuple_input, scale=None):
        ...     a, b = tuple_input
        ...     return x * a + x * b + scale * x
        >>> x = Tensor([2.0])
        >>> a, b = Tensor([3.0]), Tensor([4.0])
        >>> scale = Tensor([0.5])
        >>> _, grad_fn = forward_and_gradfn(fn, x, (a, b), grad_position=-1, scale=scale)
        >>> dx_grads = grad_fn.compute_input_grad()
        >>> # When both args and kwargs have tensors, returns (args_grads, kwargs_grads)
        >>> # args_grads structure matches (x, (a, b)) -> (Tensor, (Tensor, Tensor))
        >>> # kwargs_grads structure matches {'scale': scale} -> {'scale': Tensor}
        >>> print(type(dx_grads))
        <class 'tuple'>
        >>> print(len(dx_grads))
        2
        >>> print(type(dx_grads[0]))
        <class 'tuple'>
        >>> print(type(dx_grads[0][1]))
        <class 'tuple'>
        >>> print(type(dx_grads[1]))
        <class 'dict'>

    Raises:
        ValueError: If `grad_position` is ``None`` and `weights` is ``None`` or empty.
        Exception: Any exception raised by `fn` is re-raised after the pynative executor
            resources have been cleared.

    Supported Platforms:
        ``Ascend`` ``GPU`` ``CPU``
    """
    _validate_grad_config(grad_position, weights)
    _set_requires_grad(inputs, kwargs, grad_position)
    prev_grad_flag = _pynative_executor.grad_flag()
    _pynative_executor.set_grad_flag(True)
    try:
        res = fn(*inputs, **kwargs)
    except Exception:
        _pynative_executor.clear_res()
        # Bare raise re-raises the active exception with its original traceback.
        raise
    finally:
        _pynative_executor.set_grad_flag(prev_grad_flag)
    grad_fn = GradFunction(res, inputs, kwargs, weights, has_aux, grad_position)
    return res, grad_fn

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/mindspore/pipeline_parallel/backward.py: 10%

352 statements