Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/shard/ops/parallel_npu_sparse_flash_attention.py

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""Distributed implementation for npu_sparse_flash_attention operator."""

16import copy

17from typing import Callable, Optional, Tuple

19from hyper_parallel.core.dtensor.dtensor import DTensor

20from hyper_parallel.core.dtensor.layout import Layout

21from .parallel_ops import DistributedOp

22from .parallel_npu_dense_lightning_indexer_softmax_lse import _adjust_bsnd_key, _adjust_tnd_seq_lens

# Largest signed 64-bit integer; used as the default for pre_tokens / next_tokens.
_MAX_INT64 = 2 ** 63 - 1

# Lookup table for the replicated-dimension checks:
#   layout string -> tensor role -> {dimension index: dimension label}.
# Roles: 'q' = query, 'k' = key, 'v' = value, 'si' = sparse_indices.
# Every dimension listed here must NOT be sharded. N1 (the query head count)
# is included because sharding it has a severe performance impact.
_REPLICATED_DIMS = {
    'BSND': dict(
        q={2: 'N1', 3: 'D'},
        k={1: 'S2', 2: 'N2', 3: 'D'},
        v={1: 'S2', 2: 'N2', 3: 'D'},
        si={2: 'N2', 3: 'sparse_size'},
    ),
    'TND': dict(
        q={1: 'N1', 2: 'D'},
        k={1: 'N2', 2: 'D'},
        v={1: 'N2', 2: 'D'},
        si={1: 'N2', 2: 'sparse_size'},
    ),
}

def _normalize_sfa_args(
        query,
        key,
        value,
        sparse_indices,
        scale_value,
        block_table=None,
        actual_seq_lengths_query=None,
        actual_seq_lengths_kv=None,
        query_rope=None,
        key_rope=None,
        sparse_block_size=1,
        layout_query='BSND',
        layout_kv='BSND',
        sparse_mode=3,
        pre_tokens=_MAX_INT64,
        next_tokens=_MAX_INT64,
        attention_mode=2,
        return_softmax_lse=False,
):
    """Flatten any mix of positional/keyword arguments into one fixed-order tuple.

    Binding the call through this signature fills in every default, so callers
    can afterwards address each argument by a stable index (0..17).

    Args:
        query: Query tensor.
        key: Key tensor.
        value: Value tensor.
        sparse_indices: Sparse index tensor (int32).
        scale_value: Scaling factor (float).
        block_table: Optional PageAttention block mapping table.
        actual_seq_lengths_query: Actual query sequence lengths per batch.
        actual_seq_lengths_kv: Actual KV sequence lengths per batch.
        query_rope: Optional MLA query rope tensor.
        key_rope: Optional MLA key rope tensor.
        sparse_block_size: Block size for sparse computation.
        layout_query: Query layout string ('BSND' or 'TND').
        layout_kv: KV layout string ('BSND', 'TND', or 'PA_BSND').
        sparse_mode: Sparse attention mode.
        pre_tokens: Preceding token window size.
        next_tokens: Following token window size.
        attention_mode: Attention mode (0 or 2 for MLA-absorb).
        return_softmax_lse: Whether to return softmax max/sum.

    Returns:
        tuple: (positional_args_tuple, empty_kwargs_dict), where the first
        element lists all 18 arguments in signature order.
    """
    # Indices 0-4: the always-required inputs.
    required = (query, key, value, sparse_indices, scale_value)
    # Indices 5-9: optional tensor inputs.
    optional_tensors = (
        block_table,
        actual_seq_lengths_query,
        actual_seq_lengths_kv,
        query_rope,
        key_rope,
    )
    # Indices 10-17: scalar / string attributes.
    attributes = (
        sparse_block_size,
        layout_query,
        layout_kv,
        sparse_mode,
        pre_tokens,
        next_tokens,
        attention_mode,
        return_softmax_lse,
    )
    return required + optional_tensors + attributes, {}

def _to_local(t):
    """Return ``t.to_local()``, or None when ``t`` is None.

    All inputs other than actual_seq_lengths are required to be DTensors so
    that a layout can be inferred for them. No type check is done on purpose:
    an object without ``to_local`` (e.g. a plain tensor) fails with
    AttributeError right here.
    """
    return None if t is None else t.to_local()

107

108

def _to_local_seq_len(t):
    """Return the local view of an actual_seq_lengths input.

    Sequence-length inputs are created inside the network, so they may or may
    not be DTensors. Anything that is not a DTensor (a plain tensor, or None)
    is handed back untouched; a DTensor is unwrapped with ``to_local()``.
    """
    if not isinstance(t, DTensor):
        return t
    return t.to_local()

118

119

class SparseFlashAttentionDistributedOp(DistributedOp):
    """Distributed operator for npu_sparse_flash_attention.

    Supports BSND and TND input layouts on both MindSpore
    and PyTorch / torch_npu backends.

    Both frameworks provide built-in forward and backward implementations;
    this class handles only the distributed dispatch (layout inference and
    optional context-parallel input adjustment).

    Output shapes relative to inputs:
      - BSND: query (B, S1, N1, D) → attention_out (B, S1, N1, D),
              softmax_max/sum (B, N2, S1, N1/N2)
      - TND:  query (T1, N1, D)    → attention_out (T1, N1, D),
              softmax_max/sum (N2, T1, N1/N2)

    Sharding constraints:
      - N1 (query head dim) must be replicated — TP on this dim is forbidden
        due to severe performance impact.
      - Key/value S2 (BSND only), N2, and D dims must be replicated.
      - sparse_indices N2 and sparse_size dims must be replicated.
      - PA_BSND is not supported in distributed mode: it is rejected both as
        ``layout_kv`` (in ``preprocess``) and as ``layout_query`` (in
        ``_validate_input_layouts``).
      - ``layout_query`` must be 'BSND' or 'TND'; any other value is rejected.

    Context parallelism:
      - BSND+CP: k, v, and key_rope are sliced to the causal window
        ``[:, :S1_local*(split_id+1), :, :]`` before calling the kernel, matching the
        MindFormers adjust_bsnd_input logic. sparse_indices from lightning_indexer are
        generated with the same truncation, so they remain valid for the sliced k.
      - TND+CP: adjusts actual_seq_lengths_query/kv per rank using
        _adjust_tnd_seq_lens (same logic as dsa_attention.py).
    """

    @staticmethod
    def _infer_softmax_layout(q_layout: Layout, layout_str: str) -> Layout:
        """Build the output layout for softmax_max and softmax_sum.

        BSND: query (B, S1, N1, D) → softmax (B, N2, S1, N1/N2)
              tensor_map: (q_tm[0], -1, q_tm[1], -1)
        TND:  query (T1, N1, D)    → softmax (N2, T1, N1/N2)
              tensor_map: (-1, q_tm[0], -1)

        N2 and N1/N2 are always replicated because N2=1 and N1 is forbidden
        from sharding.

        Args:
            q_layout: Layout of the query input.
            layout_str: 'BSND' or 'TND'. Any value other than 'BSND' takes the
                TND branch, so callers must validate it beforehand.

        Returns:
            Layout for softmax_max / softmax_sum, built on the query's mesh.
        """
        q_tm = q_layout.tensor_map
        out_layout = Layout.from_device_mesh(q_layout.mesh)
        if layout_str == 'BSND':
            out_tm = (q_tm[0], -1, q_tm[1], -1)
        else:
            out_tm = (-1, q_tm[0], -1)
        out_layout.set_tensor_map(out_tm)
        out_layout.tensor_map_to_placement()
        return out_layout

    def preprocess(self, args: tuple, kwargs: dict) -> tuple:
        """Extract local tensors and build the layout cache.

        Args:
            args: Positional arguments (may contain DTensors).
            kwargs: Keyword arguments.

        Returns:
            tuple: (local_args, local_kwargs, cache_values) where
                local_args = (query_local, key_local, value_local,
                              sparse_indices_local, scale_value),
                local_kwargs contains all remaining arguments,
                cache_values = [q_layout, k_layout, v_layout, si_layout, layout_query_str].

        Raises:
            ValueError: If ``layout_kv`` is 'PA_BSND', which is not supported
                in distributed mode.
            AttributeError: If query, key, value, sparse_indices, block_table,
                query_rope or key_rope is a non-None object without
                ``to_local`` (i.e. not a DTensor).
        """
        norm_args, _ = _normalize_sfa_args(*args, **kwargs)
        query = norm_args[0]
        key = norm_args[1]
        value = norm_args[2]
        sparse_indices = norm_args[3]
        scale_value = norm_args[4]
        layout_query_str = norm_args[11]
        layout_kv_str = norm_args[12]

        # PA_BSND is a layout_kv value, and cache_values carries only
        # layout_query, so the layout validation never sees it. Reject it here,
        # before any tensor is unwrapped.
        if layout_kv_str == 'PA_BSND':
            raise ValueError(
                "For npu_sparse_flash_attention, PA_BSND layout is not supported "
                "in distributed mode."
            )

        local_args = (
            _to_local(query),
            _to_local(key),
            _to_local(value),
            _to_local(sparse_indices),
            scale_value,
        )
        local_kwargs = {
            'block_table': _to_local(norm_args[5]),
            # Sequence lengths may be plain tensors; they pass through unchanged.
            'actual_seq_lengths_query': _to_local_seq_len(norm_args[6]),
            'actual_seq_lengths_kv': _to_local_seq_len(norm_args[7]),
            'query_rope': _to_local(norm_args[8]),
            'key_rope': _to_local(norm_args[9]),
            'sparse_block_size': norm_args[10],
            'layout_query': layout_query_str,
            'layout_kv': layout_kv_str,
            'sparse_mode': norm_args[13],
            'pre_tokens': norm_args[14],
            'next_tokens': norm_args[15],
            'attention_mode': norm_args[16],
            'return_softmax_lse': norm_args[17],
        }

        cache_values = [
            query.layout,
            key.layout,
            value.layout,
            sparse_indices.layout,
            layout_query_str,
        ]
        return local_args, local_kwargs, cache_values

    @staticmethod
    def _validate_input_layouts(
            q_layout: Layout,
            k_layout: Layout,
            v_layout: Layout,
            si_layout: Layout,
            layout_str: str,
    ) -> None:
        """Validate sharding constraints for all input tensors.

        BSND rules (shapes: (B,S1,N1,D) / (B,S2,N2,D) / (B,S2,N2,D) / (B,S1,N2,sparse_size)):
          - N1 (dim 2) and D (dim 3) of query must be replicated.
          - S2 (dim 1), N2 (dim 2), D (dim 3) of key and value must be replicated.
          - N2 (dim 2) and sparse_size (dim 3) of sparse_indices must be replicated.
          - B sharding of key, value, and sparse_indices must match query.
          - S1 sharding of sparse_indices must match query.

        TND rules (shapes: (T1,N1,D) / (T2,N2,D) / (T2,N2,D) / (T1,N2,sparse_size)):
          - N1 (dim 1) and D (dim 2) of query must be replicated.
          - N2 (dim 1) and D (dim 2) of key and value must be replicated.
          - N2 (dim 1) and sparse_size (dim 2) of sparse_indices must be replicated.
          - T2 sharding of key and value must match.
          - T1 sharding of sparse_indices must match query.

        PA_BSND is not supported in distributed mode.

        Args:
            q_layout: Layout of query.
            k_layout: Layout of key.
            v_layout: Layout of value.
            si_layout: Layout of sparse_indices.
            layout_str: 'BSND' or 'TND'.

        Raises:
            ValueError: If layout_str is 'PA_BSND' or any other unsupported
                layout, if any required dimension is sharded, or if
                batch/sequence consistency constraints are violated.
        """
        if layout_str == 'PA_BSND':
            raise ValueError(
                "For npu_sparse_flash_attention, PA_BSND layout is not supported "
                "in distributed mode."
            )

        op = "npu_sparse_flash_attention"
        # An unknown layout would otherwise skip every replicated-dim check and
        # be validated as if it were TND.
        if layout_str not in _REPLICATED_DIMS:
            raise ValueError(
                f"For {op}, layout_query should be one of "
                f"{sorted(_REPLICATED_DIMS)}, but got {layout_str!r}"
            )

        q_tm = q_layout.tensor_map
        k_tm = k_layout.tensor_map
        v_tm = v_layout.tensor_map
        si_tm = si_layout.tensor_map
        tms = {
            'q': (q_tm, 'query'),
            'k': (k_tm, 'key'),
            'v': (v_tm, 'value'),
            'si': (si_tm, 'sparse_indices'),
        }
        # A tensor_map entry of -1 means "not sharded along any mesh axis".
        for role, dims in _REPLICATED_DIMS[layout_str].items():
            tm_entry = tms.get(role)
            if tm_entry is None:
                continue
            tm, tensor_name = tm_entry
            for dim, label in dims.items():
                if tm[dim] != -1:
                    raise ValueError(
                        f"For {op}, {label} (dim {dim}) of {tensor_name} should be replicated, "
                        f"but got tensor_map={tm}"
                    )

        # Each entry: (dim label, dim index, reference name, reference map,
        #              checked name, checked map). Checked in listed order.
        if layout_str == 'BSND':
            consistency_rules = (
                ('B', 0, 'query', q_tm, 'key', k_tm),
                ('B', 0, 'query', q_tm, 'value', v_tm),
                ('B', 0, 'query', q_tm, 'sparse_indices', si_tm),
                ('S1', 1, 'query', q_tm, 'sparse_indices', si_tm),
            )
        else:  # TND
            consistency_rules = (
                ('T2', 0, 'key', k_tm, 'value', v_tm),
                ('T1', 0, 'query', q_tm, 'sparse_indices', si_tm),
            )
        for label, dim, ref_name, ref_tm, other_name, other_tm in consistency_rules:
            if ref_tm[dim] != other_tm[dim]:
                raise ValueError(
                    f"For {op}, {label} (dim {dim}) sharding of {other_name} "
                    f"should match {ref_name}, "
                    f"but got {ref_name}={ref_tm[dim]}, {other_name}={other_tm[dim]}"
                )

    def infer_layout(self, cache_values: list) -> Tuple[tuple, None]:
        """Infer output layouts for all three outputs.

        Rules:
          1. PA_BSND and unknown layout strings are rejected.
          2. Partial inputs are not allowed on any of the four primary tensors.
          3. Sharding constraints are validated (see _validate_input_layouts).
          4. attention_out inherits query layout (deep copy).
          5. softmax_max and softmax_sum share the same layout derived from
             query layout with N2 and N1/N2 dims always replicated.
          6. All three output layouts are independent objects.

        Args:
            cache_values: [q_layout, k_layout, v_layout, si_layout, layout_str]

        Returns:
            tuple: ((attn_layout, softmax_max_layout, softmax_sum_layout), None)

        Raises:
            ValueError: If the layout string is unsupported, any input has
                Partial status, or sharding constraints are violated.
        """
        q_layout = cache_values[0]
        k_layout = cache_values[1]
        v_layout = cache_values[2]
        si_layout = cache_values[3]
        layout_str = cache_values[4]

        self._check_partial_inputs([q_layout, k_layout, v_layout, si_layout])
        self._validate_input_layouts(q_layout, k_layout, v_layout, si_layout, layout_str)

        attn_layout = copy.deepcopy(q_layout)
        softmax_layout = self._infer_softmax_layout(q_layout, layout_str)
        return (attn_layout, softmax_layout, copy.deepcopy(softmax_layout)), None

    def get_expand_impl(  # pylint: disable=W0237
            self,
            func: Optional[Callable],
            infer_result: tuple,
            cache_values: list,
            extra_args: Optional[tuple] = None,
    ) -> Optional[Callable]:
        """Return a custom callable if context-parallel adjustment is needed.

        BSND (S1 not sharded): returns None — k/v are Replicated; sparse_indices
            reference the full k directly.
        BSND+CP (S1 sharded): wraps func to slice k, v, and key_rope to the
            causal window ``k[:, :S1_local*(split_id+1), :, :]`` before calling
            the kernel.  Mirrors MindFormers adjust_bsnd_input logic, ensuring
            that sparse_indices produced by lightning_indexer (which applies the
            same truncation) remain valid.
        TND+CP: wraps func to adjust actual_seq_lengths_query/kv per rank,
            using the same algorithm as dsa_attention._sparse_flash_attention_forward.
        TND (no CP): same wrapper with cp_rank=0; it still passes the sequence
            lengths through _adjust_tnd_seq_lens.

        Args:
            func: The underlying op callable.
            infer_result: Output from infer_layout (not read here).
            cache_values: [q_layout, k_layout, v_layout, si_layout, layout_str].
            extra_args: Unused; kept for interface compatibility.

        Returns:
            Callable wrapper or None.
        """
        q_layout = cache_values[0]
        k_layout = cache_values[1]
        layout_str = cache_values[4]

        if layout_str == 'BSND':
            if q_layout.tensor_map[1] == -1:
                # S1 not sharded: pure DP or fully replicated.
                # k/v are Replicate on the CP dimension, so sparse_indices reference
                # the full k directly; no truncation needed.
                return None
            split_id = q_layout.get_split_id(1)

            def _bsnd_cp_impl(*args, **kwargs):
                local_q, local_k, local_v = args[0], args[1], args[2]
                s1_local = local_q.shape[1]
                sliced_k = _adjust_bsnd_key(local_k, s1_local, split_id)
                sliced_v = _adjust_bsnd_key(local_v, s1_local, split_id)
                key_rope = kwargs.get('key_rope')
                # key_rope is optional; slice it only when it was supplied.
                new_kwargs = (
                    {**kwargs, 'key_rope': _adjust_bsnd_key(key_rope, s1_local, split_id)}
                    if key_rope is not None else kwargs
                )
                return func(local_q, sliced_k, sliced_v, *args[3:], **new_kwargs)

            return _bsnd_cp_impl

        # TND: CP applies when q's T1 is sharded more finely than k's T2.
        q_split = q_layout.get_dim_split_num(0)
        k_split = k_layout.get_dim_split_num(0)
        split_id = q_layout.get_split_id(0) if q_split > k_split else 0
        # Guard against a zero split count before dividing.
        cp_size = q_split // k_split if k_split > 0 else 1
        cp_rank = split_id % cp_size if cp_size > 1 else 0

        def _tnd_cp_impl(*args, **kwargs):
            local_q, local_k = args[0], args[1]
            qlen_tensor = kwargs.get('actual_seq_lengths_query')
            klen_tensor = kwargs.get('actual_seq_lengths_kv')
            # Without both sequence-length inputs there is nothing to adjust.
            if qlen_tensor is None or klen_tensor is None:
                return func(*args, **kwargs)
            adj_q, adj_k = _adjust_tnd_seq_lens(
                local_q, local_k, qlen_tensor, klen_tensor,
                cp_rank=cp_rank,
            )
            return func(*args, **{
                **kwargs,
                'actual_seq_lengths_query': adj_q,
                'actual_seq_lengths_kv': adj_k,
            })

        return _tnd_cp_impl

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/shard/ops/parallel_npu_sparse_flash_attention.py: 74%

114 statements