Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/mindspore/custom_ops/custom_op

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""MindSpore custom kernel implementations and DFunction wrappers."""

16from dataclasses import dataclass

17import importlib

18import os

19import sys

21import mindspore as ms # pylint: disable=C0415

23from hyper_parallel.core.shard.dfunction import DFunction

# Directory that contains this module together with the kernel .cc sources.
_CC_DIR = os.path.dirname(os.path.abspath(__file__))
# Import name of the compiled extension module.
_MS_EXTENSION_NAME = "hyper_parallel_custom_ops_ms"
# Local build output directory, searched first when importing the extension.
_BUILD_LIB = os.path.join(_CC_DIR, "build", "lib")

if _BUILD_LIB not in sys.path:
    sys.path.insert(0, _BUILD_LIB)

# Absolute paths of the sources handed to the custom-op builder, in the
# order listed here.
_CUSTOM_OP_SOURCES = [
    os.path.join(_CC_DIR, source_file)
    for source_file in (
        "module.cc",
        "dense_lightning_indexer_grad_kl_loss.cc",
        "dense_lightning_indexer_softmax_lse.cc",
        "sparse_lightning_indexer_grad_kl_loss.cc",
        "mhc_post.cc",
        "mhc_post_backward.cc",
        "mhc_pre_sinkhorn.cc",
        "mhc_pre_sinkhorn_backward.cc",
        "mhc_pre_clamp_sinkhorn.cc",
        "mhc_pre_clamp_sinkhorn_backward.cc",
    )
]
# Trailing ``None`` gradients appended after the four tensor gradients in the
# npu_mhc_pre_clamp_sinkhorn backward (one per remaining forward argument).
_MHC_PRE_CLAMP_NONE_GRADS = (None, None, None, None, None, None, None)

@dataclass(frozen=True)
class _MhcPreClampArgs:
    """Bound arguments for npu_mhc_pre_clamp_sinkhorn.

    Immutable record built by ``_bind_mhc_pre_clamp_args``. The field order
    is significant: the binder constructs this record positionally, and the
    forward wrapper passes the fields to the kernel in this same order.
    """

    # Tensor inputs; the binder treats these four as required.
    x: ms.Tensor
    phi: ms.Tensor
    alpha: ms.Tensor
    bias: ms.Tensor
    # Scalar options; the binder supplies defaults for all of the following.
    # NOTE(review): meanings presumably match the same-named arguments of
    # npu_mhc_pre_sinkhorn (HC dimension size, Sinkhorn iteration count,
    # H_pre sigmoid eps, RmsNorm eps, intermediate-output flag) — confirm.
    hc_mult: int
    num_iters: int
    hc_eps: float
    norm_eps: float
    out_flag: bool
    # Clamp bounds forwarded to the kernel and, via ctx, to its backward.
    clamp_min: float
    clamp_max: float

def _bind_mhc_pre_clamp_args(args, kwargs):
    """Resolve the call arguments of npu_mhc_pre_clamp_sinkhorn into a record.

    Mimics Python's own argument binding for an 11-parameter signature whose
    first four parameters (x, phi, alpha, bias) are required and whose
    remaining seven carry defaults.

    Args:
        args: Positional arguments as a sequence, in signature order.
        kwargs: Keyword arguments as a mapping of parameter name to value.

    Returns:
        _MhcPreClampArgs: All eleven parameters, with defaults filled in.

    Raises:
        TypeError: On too many positional arguments, a parameter supplied
            both positionally and by keyword, an unknown keyword, or a
            missing required parameter.
    """
    names = (
        "x", "phi", "alpha", "bias", "hc_mult", "num_iters",
        "hc_eps", "norm_eps", "out_flag", "clamp_min", "clamp_max",
    )
    # Start from the defaults of the optional parameters.
    bound = dict(
        hc_mult=4,
        num_iters=20,
        hc_eps=1e-6,
        norm_eps=1e-6,
        out_flag=True,
        clamp_min=0.0,
        clamp_max=0.0,
    )

    if len(args) > len(names):
        raise TypeError(f"npu_mhc_pre_clamp_sinkhorn expected at most {len(names)} arguments")

    # Parameters already satisfied positionally; a keyword may not repeat them.
    positional = names[:len(args)]
    bound.update(zip(positional, args))

    for name, value in kwargs.items():
        if name in positional:
            raise TypeError(f"npu_mhc_pre_clamp_sinkhorn got multiple values for argument '{name}'")
        if name not in names:
            raise TypeError(f"npu_mhc_pre_clamp_sinkhorn got an unexpected keyword argument '{name}'")
        bound[name] = value

    # Only the four leading tensor parameters have no default.
    missing = [name for name in names[:4] if name not in bound]
    if missing:
        raise TypeError(f"npu_mhc_pre_clamp_sinkhorn missing required arguments: {missing}")

    return _MhcPreClampArgs(*map(bound.__getitem__, names))

def _build_custom_ops():
    """Build the custom-op extension from the local sources and load it.

    Returns:
        The object returned by ``CustomOpBuilder.load()`` for the Ascend
        backend, used by this module as the kernel namespace.
    """
    builder = ms.ops.CustomOpBuilder(
        _MS_EXTENSION_NAME,
        _CUSTOM_OP_SOURCES,
        backend="Ascend",
    )
    return builder.load()

102

103

# Resolve ``_custom_ops``, the extension module whose attributes are the
# kernel entry points called by the DFunction wrappers in this file.
# Prefer an already-built extension importable by name; fall back to building.
try:
    _custom_ops = importlib.import_module(_MS_EXTENSION_NAME)
except ImportError:
    # Source-tree development: .so not pre-built; JIT-compile from local .cc files.
    _custom_ops = _build_custom_ops()
else:
    # Rebuild stale source-tree extensions that predate newly added symbols.
    # Only npu_mhc_pre_clamp_sinkhorn is probed as the staleness marker.
    if not hasattr(_custom_ops, "npu_mhc_pre_clamp_sinkhorn"):
        _custom_ops = _build_custom_ops()

113

114

def _ensure_contiguous(*tensors):
    """Return the inputs as a tuple in which every element is contiguous.

    An element that already reports ``is_contiguous()`` is passed through
    unchanged (same object); any other element is replaced by the result of
    its ``contiguous()`` method.
    """
    return tuple(
        tensor if tensor.is_contiguous() else tensor.contiguous()
        for tensor in tensors
    )

118

119

def _to_list_int64(val):
    """Turn a MindSpore tensor into a plain list of int64 values.

    Used to convert Tensor(int32) sequence lengths to List[int64] for aclnn
    kernel consumption. Anything that is not an ``ms.Tensor`` (for example
    ``None`` or an existing list) is returned unchanged.
    """
    if not isinstance(val, ms.Tensor):
        return val
    as_int64 = val.asnumpy().astype("int64")
    return as_int64.tolist()

125

126

class NpuDenseLightningIndexerSoftmaxLseDFunction(DFunction):  # pylint: disable=W0221
    """MindSpore DFunction wrapper around npu_dense_lightning_indexer_softmax_lse.

    Plain-tensor calls go directly to the MindSpore custom kernel, while
    DTensor calls are routed through the distributed dispatch framework using
    the DistributedOp registered under the same op_name.

    ``forward`` takes 9 arguments after ``ctx``, all positional, to stay
    compatible with MindSpore autograd function conventions.

    The operator does not require gradients, so ``backward`` only returns
    ``None`` placeholders.
    """

    _op_name = "npu_dense_lightning_indexer_softmax_lse"

    @staticmethod
    def forward(ctx, query_index, key_index, weights,
                actual_seq_qlen, actual_seq_klen,
                layout, sparse_mode, pre_tokens, next_tokens):
        """Run the MindSpore Ascend custom kernel.

        Args:
            ctx: Autograd context (unused here).
            query_index: Lightning Indexer query input (Q̃).
            key_index: Lightning Indexer key input (K̃).
            weights: Lightning Indexer weight coefficient (W).
            actual_seq_qlen: Cumulative query sequence lengths; None for BSND.
                A tensor is converted to a list of int64 before the call.
            actual_seq_klen: Cumulative key sequence lengths; None for BSND.
                A tensor is converted to a list of int64 before the call.
            layout: Data layout format, 'BSND' or 'TND'.
            sparse_mode: Sparse computation mode (only mode 3 supported).
            pre_tokens: Number of preceding tokens for sparse attention.
            next_tokens: Number of following tokens for sparse attention.

        Returns:
            tuple[Tensor, Tensor]: (softmax_max_index, softmax_sum_index), both float32.
        """
        kernel = _custom_ops.npu_dense_lightning_indexer_softmax_lse
        seq_qlen = _to_list_int64(actual_seq_qlen)
        seq_klen = _to_list_int64(actual_seq_klen)
        return kernel(
            query_index, key_index, weights,
            seq_qlen, seq_klen,
            layout, sparse_mode, pre_tokens, next_tokens,
        )

    @staticmethod
    def backward(ctx, *grad_outputs):
        """No-op backward: one ``None`` per forward input (9 in total)."""
        return (None, None, None, None, None, None, None, None, None)

173

174

class NpuDenseLightningIndexerGradKlLossDFunction(DFunction):  # pylint: disable=W0221
    """MindSpore DFunction wrapper around npu_dense_lightning_indexer_grad_kl_loss.

    Plain-tensor calls go directly to the MindSpore custom kernel, while
    DTensor calls are routed through the distributed dispatch framework using
    the DistributedOp registered under the same op_name.

    ``forward`` takes 18 arguments after ``ctx``, all positional, to stay
    compatible with MindSpore autograd function conventions.
    """

    _op_name = "npu_dense_lightning_indexer_grad_kl_loss"

    @staticmethod
    def forward(ctx, query, key, query_index, key_index, weights,
                softmax_max, softmax_sum, softmax_max_index, softmax_sum_index,
                scale_value, query_rope, key_rope,
                actual_seq_qlen, actual_seq_klen,
                layout, sparse_mode, pre_tokens, next_tokens):
        """Run the MindSpore Ascend custom kernel and stash its gradients.

        Args:
            ctx: Autograd context; receives the first three kernel outputs.
            query: Main attention query (Q). dtype bfloat16/float16.
            key: Main attention key (K). dtype bfloat16/float16.
            query_index: Lightning Indexer query input (Q̃). dtype bfloat16/float16.
            key_index: Lightning Indexer key input (K̃). dtype bfloat16/float16.
            weights: Lightning Indexer weight coefficient (W).
            softmax_max: Attention softmax max values. dtype float32.
            softmax_sum: Attention softmax sum values. dtype float32.
            softmax_max_index: Index attention softmax max (from softmax_lse). dtype float32.
            softmax_sum_index: Index attention softmax sum (from softmax_lse). dtype float32.
            scale_value: Scaling factor. dtype float32.
            query_rope: Optional MLA query rope tensor.
            key_rope: Optional MLA key rope tensor.
            actual_seq_qlen: Cumulative query sequence lengths; None for BSND.
            actual_seq_klen: Cumulative key sequence lengths; None for BSND.
            layout: Data layout format, 'BSND' or 'TND'.
            sparse_mode: Sparse computation mode (only mode 3 supported).
            pre_tokens: Number of preceding tokens for sparse attention.
            next_tokens: Number of following tokens for sparse attention.

        Returns:
            tuple[Tensor, Tensor, Tensor, Tensor]:
                (d_query_index, d_key_index, d_weights, loss).
        """
        kernel = _custom_ops.npu_dense_lightning_indexer_grad_kl_loss
        outputs = kernel(
            query, key, query_index, key_index, weights,
            softmax_max, softmax_sum, softmax_max_index, softmax_sum_index,
            scale_value, query_rope, key_rope,
            _to_list_int64(actual_seq_qlen), _to_list_int64(actual_seq_klen),
            layout, sparse_mode, pre_tokens, next_tokens,
        )
        # The kernel already produces the gradients; keep them for backward.
        d_query_index, d_key_index, d_weights = outputs[0], outputs[1], outputs[2]
        ctx.save_for_backward(d_query_index, d_key_index, d_weights)
        return outputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        """Return the gradients computed in forward.

        The upstream ``grad_outputs`` are not used. The saved tensors are
        placed at the positions of query_index, key_index and weights; the
        other 15 of the 18 forward inputs receive ``None``.
        """
        saved_grads = _ensure_contiguous(*ctx.saved_tensors)
        d_query_index, d_key_index, d_weights = saved_grads
        leading = (None, None)  # query, key
        trailing = (None,) * 13  # softmax_max .. next_tokens
        return leading + (d_query_index, d_key_index, d_weights) + trailing

236

237

class NpuSparseLightningIndexerGradKlLossDFunction(DFunction):  # pylint: disable=W0221
    """MindSpore DFunction wrapper around npu_sparse_lightning_indexer_grad_kl_loss.

    Plain-tensor calls go directly to the MindSpore custom kernel, while
    DTensor calls are routed through the distributed dispatch framework using
    the DistributedOp registered under the same op_name.

    ``forward`` takes 17 arguments after ``ctx``, all positional, to stay
    compatible with MindSpore autograd function conventions.
    """

    _op_name = "npu_sparse_lightning_indexer_grad_kl_loss"

    @staticmethod
    def forward(ctx, query, key, query_index, key_index, weights,
                sparse_indices, softmax_max, softmax_sum, scale_value,
                query_rope, key_rope,
                actual_seq_qlen, actual_seq_klen,
                layout, sparse_mode, pre_tokens, next_tokens):
        """Run the MindSpore Ascend custom kernel and stash its gradients.

        Args:
            ctx: Autograd context; receives the first three kernel outputs.
            query: Main attention query (q_t). dtype bfloat16/float16.
            key: Main attention key (K_t). dtype bfloat16/float16.
            query_index: Lightning Indexer query input (q̃_t). dtype bfloat16/float16.
            key_index: Lightning Indexer key input (K̃_t). dtype bfloat16/float16.
            weights: Lightning Indexer weight coefficient (W_t).
            sparse_indices: Sorted token indices for key/key_index. Previously
                documented as dtype bfloat16/float16.
                NOTE(review): an index tensor is normally integer-typed — confirm.
            softmax_max: Attention softmax max values.
            softmax_sum: Attention softmax sum values.
            scale_value: Scaling factor. dtype float.
            query_rope: Optional MLA query rope tensor.
            key_rope: Optional MLA key rope tensor.
            actual_seq_qlen: Cumulative query sequence lengths; None for BSND.
            actual_seq_klen: Cumulative key sequence lengths; None for BSND.
            layout: Data layout format, 'BSND' or 'TND'.
            sparse_mode: Sparse computation mode (only mode 3 supported).
            pre_tokens: Number of preceding tokens for sparse attention.
            next_tokens: Number of following tokens for sparse attention.

        Returns:
            tuple[Tensor, Tensor, Tensor, Tensor]:
                (d_query_index, d_key_index, d_weights, loss).
        """
        kernel = _custom_ops.npu_sparse_lightning_indexer_grad_kl_loss
        outputs = kernel(
            query, key, query_index, key_index, weights,
            sparse_indices, softmax_max, softmax_sum, scale_value,
            query_rope, key_rope,
            _to_list_int64(actual_seq_qlen), _to_list_int64(actual_seq_klen),
            layout, sparse_mode, pre_tokens, next_tokens,
        )
        # The kernel already produces the gradients; keep them for backward.
        d_query_index, d_key_index, d_weights = outputs[0], outputs[1], outputs[2]
        ctx.save_for_backward(d_query_index, d_key_index, d_weights)
        return outputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        """Return the gradients computed in forward.

        The upstream ``grad_outputs`` are not used. The saved tensors are
        placed at the positions of query_index, key_index and weights; the
        other 14 of the 17 forward inputs receive ``None``.
        """
        saved_grads = _ensure_contiguous(*ctx.saved_tensors)
        d_query_index, d_key_index, d_weights = saved_grads
        leading = (None, None)  # query, key
        trailing = (None,) * 12  # sparse_indices .. next_tokens
        return leading + (d_query_index, d_key_index, d_weights) + trailing

298

299

class NpuMhcPostDFunction(DFunction):  # pylint: disable=W0221
    """MindSpore DFunction wrapper around npu_mhc_post.

    Plain-tensor calls go directly to the MindSpore custom kernel, while
    DTensor calls are routed through the distributed dispatch framework using
    the DistributedOp registered under the same op_name.

    ``forward`` takes 4 arguments after ``ctx``, all positional, to stay
    compatible with MindSpore autograd function conventions.
    """

    _op_name = "npu_mhc_post"

    @staticmethod
    def forward(ctx, x, h_res, h_out, h_post):
        """Run the MindSpore Ascend custom kernel.

        Args:
            ctx: Autograd context; all four inputs are saved on it.
            x: Input tensor of shape [B,S,N,D] or [T,N,D]. dtype bfloat16/float16.
            h_res: mHC h_res transformation matrix. dtype float32.
            h_out: Attention/MLP layer output. dtype bfloat16/float16.
            h_post: mHC h_post transformation matrix. dtype float32.

        Returns:
            Tensor: Output tensor with same shape and dtype as x.
        """
        forward_inputs = (x, h_res, h_out, h_post)
        ctx.save_for_backward(*forward_inputs)
        return _custom_ops.npu_mhc_post(*forward_inputs)

    @staticmethod
    def backward(ctx, *grad_outputs):
        """Compute input gradients with the npu_mhc_post_backward kernel.

        Args:
            ctx: Autograd context holding the four forward inputs.
            grad_outputs: Upstream gradients; grad_outputs[0] is grad_y.

        Returns:
            tuple: (grad_x, grad_h_res, grad_h_out, grad_h_post).
        """
        x, h_res, h_out, h_post = ctx.saved_tensors
        # The kernel is fed contiguous tensors only.
        kernel_inputs = _ensure_contiguous(grad_outputs[0], x, h_res, h_out, h_post)
        grad_y, x, h_res, h_out, h_post = kernel_inputs
        grads = _custom_ops.npu_mhc_post_backward(grad_y, x, h_res, h_out, h_post)
        grad_x, grad_h_res, grad_h_out, grad_h_post = grads[0], grads[1], grads[2], grads[3]
        return grad_x, grad_h_res, grad_h_out, grad_h_post

347

348

class NpuMhcPreSinkhornDFunction(DFunction):  # pylint: disable=W0221
    """DFunction wrapper for npu_mhc_pre_sinkhorn on MindSpore.

    Routes plain-tensor calls directly to the MindSpore custom kernel, and
    DTensor calls through the distributed dispatch framework using the
    registered DistributedOp with the same op_name.

    All 9 forward arguments after ``ctx`` are positional to stay compatible
    with MindSpore autograd function conventions.
    """

    _op_name = "npu_mhc_pre_sinkhorn"

    @staticmethod
    def forward(ctx, x, phi, alpha, bias, hc_mult, num_iters, hc_eps, norm_eps, out_flag):
        """Forward pass: delegates to the MindSpore Ascend custom kernel.

        Args:
            ctx: Autograd context; receives the tensor inputs, the five
                intermediate outputs and ``hc_eps`` for use in backward.
            x: Input tensor. dtype bfloat16/float16.
            phi: mHC parameter matrix. dtype float32.
            alpha: mHC scaling parameters. dtype float32.
            bias: mHC bias parameters. dtype float32.
            hc_mult: HC dimension size (currently only 4 supported).
            num_iters: Sinkhorn iteration count.
            hc_eps: H_pre sigmoid eps parameter.
            norm_eps: RmsNorm eps parameter.
            out_flag: Whether to output intermediate gradients.

        Returns:
            tuple[Tensor, ...]: 8 output tensors
                (h_in, h_post, h_res, h_pre, hc_before_norm, inv_rms, sum_out, norm_out).
        """
        result = _custom_ops.npu_mhc_pre_sinkhorn(
            x, phi, alpha, bias, hc_mult, num_iters, hc_eps, norm_eps, out_flag
        )
        # The first three outputs are the user-facing results; the remaining
        # five are intermediates that the backward kernel consumes.
        _, _, _, h_pre, hc_before_norm, inv_rms, sum_out, norm_out = result
        ctx.save_for_backward(x, phi, alpha, bias,
                              h_pre, hc_before_norm, inv_rms, sum_out, norm_out)
        ctx.hc_eps = hc_eps
        return result

    @staticmethod
    def backward(ctx, *grad_outputs):
        """Backward pass: calls npu_mhc_pre_sinkhorn_backward kernel.

        Args:
            ctx: Autograd context.
            grad_outputs: Upstream gradients for the 8 forward outputs.
                grad_outputs[0]=grad_h_in, [1]=grad_h_post, [2]=grad_h_res;
                [3..7] correspond to saved intermediates and are None.

        Returns:
            tuple: (grad_x, grad_phi, grad_alpha, grad_bias, None×5) —
                gradients for the 9 forward inputs.
        """
        x, phi, alpha, bias, h_pre, hc_before_norm, inv_rms, sum_out, norm_out = ctx.saved_tensors
        (grad_h_in, grad_h_post, grad_h_res,
         x, phi, alpha, bias,
         h_pre, hc_before_norm, inv_rms, sum_out, norm_out) = _ensure_contiguous(
             grad_outputs[0], grad_outputs[1], grad_outputs[2],
             x, phi, alpha, bias,
             h_pre, hc_before_norm, inv_rms, sum_out, norm_out)
        # Reshape grad_h_res to (*leading_dims, n, n), taking the leading
        # dimensions and n from grad_h_post. For a 3-D grad_h_post this is
        # exactly the former (b, s, n, n); deriving it from the shape instead
        # of unpacking three values avoids failing on other ranks, in line
        # with the rank-agnostic reshape of the clamp variant.
        n = grad_h_post.shape[-1]
        grad_h_res = grad_h_res.reshape(tuple(grad_h_post.shape[:-1]) + (n, n))
        grads = _custom_ops.npu_mhc_pre_sinkhorn_backward(
            grad_h_in, grad_h_post, grad_h_res,
            x, phi, alpha, bias,
            h_pre, hc_before_norm, inv_rms, sum_out, norm_out,
            ctx.hc_eps)
        # Only the four tensor inputs get gradients; the five scalar options do not.
        return grads[0], grads[1], grads[2], grads[3], None, None, None, None, None

420

421

class NpuMhcPreClampSinkhornDFunction(DFunction):  # pylint: disable=W0221
    """MindSpore DFunction wrapper around npu_mhc_pre_clamp_sinkhorn.

    This matches the static-graph aclnnMhcPreClampSinkhorn integration:
    forward has 11 arguments and returns 9 tensors, and backward consumes
    h_res_logits plus clamp_min/clamp_max.
    """

    _op_name = "npu_mhc_pre_clamp_sinkhorn"

    @staticmethod
    def forward(ctx, *args, **kwargs):
        """Run the clamp-enabled Ascend custom kernel.

        Positional and keyword arguments are first bound to the 11-parameter
        signature (with defaults) by ``_bind_mhc_pre_clamp_args``. The four
        tensor inputs, the six trailing kernel outputs and the scalars
        hc_eps/clamp_min/clamp_max are stored on ``ctx`` for backward.

        Returns:
            The 9-element result of the kernel, unchanged.
        """
        bound = _bind_mhc_pre_clamp_args(args, kwargs)
        tensor_inputs = (bound.x, bound.phi, bound.alpha, bound.bias)
        result = _custom_ops.npu_mhc_pre_clamp_sinkhorn(
            *tensor_inputs,
            bound.hc_mult, bound.num_iters, bound.hc_eps, bound.norm_eps,
            bound.out_flag, bound.clamp_min, bound.clamp_max
        )
        # The first three outputs are not needed by backward.
        _, _, _, h_pre, hc_before_norm, inv_rms, sum_out, norm_out, h_res_logits = result
        intermediates = (h_pre, hc_before_norm, inv_rms, sum_out, norm_out, h_res_logits)
        ctx.save_for_backward(*tensor_inputs, *intermediates)
        ctx.hc_eps = bound.hc_eps
        ctx.clamp_min = bound.clamp_min
        ctx.clamp_max = bound.clamp_max
        return result

    @staticmethod
    def backward(ctx, *grad_outputs):
        """Compute input gradients with npu_mhc_pre_clamp_sinkhorn_backward.

        Uses the first three upstream gradients together with the ten tensors
        saved in forward. Returns the first four kernel gradients followed by
        seven ``None`` entries for the remaining forward arguments.
        """
        tensors = _ensure_contiguous(
            grad_outputs[0], grad_outputs[1], grad_outputs[2],
            *ctx.saved_tensors
        )
        grad_h_in, grad_h_post, grad_h_res_flat = tensors[:3]
        x, phi, alpha, bias = tensors[3:7]
        h_pre, hc_before_norm, inv_rms, sum_out, norm_out, h_res_logits = tensors[7:13]

        # Expand the last axis of the third gradient into an (n, n) pair,
        # with n taken from the last axis of the second gradient.
        n = grad_h_post.shape[-1]
        target_shape = tuple(grad_h_res_flat.shape[:-1]) + (n, n)
        grad_h_res = ms.ops.reshape(grad_h_res_flat, target_shape)

        grads = _custom_ops.npu_mhc_pre_clamp_sinkhorn_backward(
            grad_h_in, grad_h_post, grad_h_res,
            x, phi, alpha, bias,
            h_pre, hc_before_norm, inv_rms, sum_out, norm_out, h_res_logits,
            ctx.hc_eps, ctx.clamp_min, ctx.clamp_max)
        return tuple(grads[:4]) + _MHC_PRE_CLAMP_NONE_GRADS

Coverage for / home / jenkins / .local / lib / python3.10 / site-packages / hyper_parallel / platform / mindspore / custom_ops / custom_op_impl.py: 53%

135 statements