Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/shard/ops/parallel_lightning_indexer.py

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""Distributed implementation for lightning_indexer operator."""

16import copy

17from typing import Callable, Optional, Tuple

19from hyper_parallel.core.dtensor.layout import Layout

20from hyper_parallel.platform import get_platform

21from .parallel_ops import DistributedOp

22from .parallel_npu_dense_lightning_indexer_softmax_lse import (

23 _adjust_bsnd_key,

24 _adjust_tnd_seq_lens,

25 _to_local_seq_len,

26)

# Platform handle resolved once, at import time. It is not referenced anywhere
# else in the visible part of this module; kept because importing code may rely
# on the attribute (or on the side effect of the call) -- TODO confirm.
platform = get_platform()

30_MAX_INT64 = 9223372036854775807

32# Maps layout_str -> tensor role -> {dim_index: dim_label} for replicated-dim checks.

33# 'q' = query, 'k' = key, 'w' = weights.

34_REPLICATED_DIMS = {

35 'BSND': {

36 'q': {2: 'N1', 3: 'D'},

37 'k': {1: 'S2', 2: 'N2', 3: 'D'},

38 'w': {2: 'N1'},

39 },

40 'TND': {

41 'q': {1: 'N1', 2: 'D'},

42 'k': {1: 'N2', 2: 'D'},

43 'w': {1: 'N1'},

44 },

45}

def _normalize_lightning_indexer_args(
        query,
        key,
        weights,
        actual_seq_lengths_query=None,
        actual_seq_lengths_key=None,
        block_table=None,
        layout_query='BSND',
        layout_key='BSND',
        sparse_count=2048,
        sparse_mode=3,
        pre_tokens=_MAX_INT64,
        next_tokens=_MAX_INT64,
        return_value=False):
    """Split a lightning_indexer call into tensor positionals and option keywords.

    Whatever mix of positional and keyword arguments the caller used, the three
    tensor inputs come back as a tuple and every remaining option comes back in
    a dict keyed by its parameter name, with defaults filled in.

    Args:
        query: Query tensor.
        key: Key tensor.
        weights: Weight tensor.
        actual_seq_lengths_query: Cumulative query sequence lengths (TND only).
        actual_seq_lengths_key: Cumulative key sequence lengths (TND only).
        block_table: Block table for PageAttention (optional).
        layout_query: Input layout string for query, 'BSND' or 'TND'.
        layout_key: Input layout string for key, 'BSND', 'TND', or 'PA_BSND'.
        sparse_count: Number of top-k blocks to retain.
        sparse_mode: Sparse attention mode (0=defaultMask, 3=rightDownCausal).
        pre_tokens: Sparse pre-tokens count.
        next_tokens: Sparse next-tokens count.
        return_value: Whether to output sparse_values.

    Returns:
        tuple: ``((query, key, weights), options)`` where ``options`` is a dict
        holding the ten non-tensor arguments in declaration order.
    """
    tensors = (query, key, weights)
    options = dict(
        actual_seq_lengths_query=actual_seq_lengths_query,
        actual_seq_lengths_key=actual_seq_lengths_key,
        block_table=block_table,
        layout_query=layout_query,
        layout_key=layout_key,
        sparse_count=sparse_count,
        sparse_mode=sparse_mode,
        pre_tokens=pre_tokens,
        next_tokens=next_tokens,
        return_value=return_value,
    )
    return tensors, options

class LightningIndexerDistributedOp(DistributedOp):
    """Distributed operator for MindSpore built-in lightning_indexer.

    LightningIndexer computes the top-k most relevant key positions for each query token
    in sparse attention. It is a MindSpore built-in op (accessed via
    ``ops.lightning_indexer``), not a custom op, so only the distributed sharding
    logic is implemented here.

    Supports BSND and TND input layouts on both MindSpore and PyTorch platforms.
    Any other ``layout_query`` value is rejected with ``ValueError`` during
    layout inference instead of being silently handled as TND.

    Output shapes:
        - BSND: query (B, S1, N1, D) → outputs (B, S1, N2, sparse_count)
        - TND:  query (T1, N1, D)    → outputs (T1, N2, sparse_count)

    Context parallelism (CP) is handled in ``get_expand_impl``:
        - BSND+CP: key S2 is sliced to the causal window for each rank.
        - TND+CP:  actual_seq_qlen / actual_seq_klen are adjusted per rank.
    """

    @staticmethod
    def _infer_output_layout(q_layout: Layout, layout_str: str) -> Layout:
        """Build the output layout for both sparse outputs from the query layout.

        BSND: input (B, S1, N1, D) → output (B, S1, N2, sparse_count)
              tensor_map: (q_tm[0], q_tm[1], -1, -1)
        TND:  input (T1, N1, D)    → output (T1, N2, sparse_count)
              tensor_map: (q_tm[0], -1, -1)

        N2 is always replicated (key's head dimension constraint).
        sparse_count is always replicated (int scalar attribute).

        Args:
            q_layout: Layout of the query input.
            layout_str: 'BSND' or 'TND'. Callers are expected to have validated
                this already; anything other than 'BSND' takes the TND branch.

        Returns:
            Layout for the output tensors.
        """
        q_tm = q_layout.tensor_map
        out_layout = Layout.from_device_mesh(q_layout.mesh)
        if layout_str == 'BSND':
            out_tm = (q_tm[0], q_tm[1], -1, -1)
        else:
            out_tm = (q_tm[0], -1, -1)
        out_layout.set_tensor_map(out_tm)
        out_layout.tensor_map_to_placement()
        return out_layout

    def preprocess(self, args: tuple, kwargs: dict) -> tuple:
        """Extract local tensors and build the layout cache.

        Args:
            args: Positional arguments (may contain DTensors).
            kwargs: Keyword arguments.

        Returns:
            tuple: (local_args, local_kwargs, cache_values) where cache_values is
            [q_layout, k_layout, w_layout, layout_str].
        """
        norm_args, local_kwargs = _normalize_lightning_indexer_args(*args, **kwargs)

        query_index, key_index, weights = norm_args[0], norm_args[1], norm_args[2]
        # Only the query layout string drives sharding decisions; layout_key is
        # forwarded to the op untouched.
        layout_str = local_kwargs['layout_query']

        # Both sequence-length arguments go through the same conversion helper
        # (presumably DTensor -> local tensor, None passed through -- TODO confirm).
        for name in ('actual_seq_lengths_query', 'actual_seq_lengths_key'):
            local_kwargs[name] = _to_local_seq_len(local_kwargs.get(name))

        local_args = (query_index.to_local(), key_index.to_local(), weights.to_local())

        cache_values = [query_index.layout, key_index.layout, weights.layout, layout_str]
        return local_args, local_kwargs, cache_values

    @staticmethod
    def _validate_input_layouts(
            q_layout: Layout,
            k_layout: Layout,
            w_layout: Layout,
            layout_str: str,
    ) -> None:
        """Validate sharding constraints for all input tensors.

        ``layout_str`` itself is checked first: it must be a key of
        ``_REPLICATED_DIMS`` ('BSND' or 'TND').

        BSND rules (query/key/weights shapes: (B,S1,N1,D) / (B,S2,N2,D) / (B,S1,N1)):
          - N1 (dim 2) and D (dim 3) of query must be replicated.
          - S2 (dim 1), N2 (dim 2), D (dim 3) of key must be replicated.
          - B sharding of query and key must be identical.
          - B and S1 sharding of weights must match query; N1 must be replicated.

        TND rules (query/key/weights shapes: (T1,N1,D) / (T2,N2,D) / (T1,N1)):
          - N1 (dim 1) and D (dim 2) of query must be replicated.
          - N2 (dim 1) and D (dim 2) of key must be replicated.
          - T1 sharding of weights must match query; N1 must be replicated.

        Args:
            q_layout: Layout of query.
            k_layout: Layout of key.
            w_layout: Layout of weights.
            layout_str: 'BSND' or 'TND'.

        Raises:
            ValueError: If ``layout_str`` is unsupported or any constraint is violated.
        """
        op = "lightning_indexer"

        # Reject unknown layouts up front. Without this, an unsupported string
        # skipped every replicated-dim check and was then validated as TND.
        replicated_dims = _REPLICATED_DIMS.get(layout_str)
        if replicated_dims is None:
            raise ValueError(
                f"For {op}, layout_query should be one of {sorted(_REPLICATED_DIMS)}, "
                f"but got {layout_str!r}"
            )

        q_tm = q_layout.tensor_map
        k_tm = k_layout.tensor_map
        w_tm = w_layout.tensor_map
        tms = {'q': (q_tm, 'query'), 'k': (k_tm, 'key'), 'w': (w_tm, 'weights')}

        # Table-driven pass: every listed dim must be replicated (-1).
        for role, dims in replicated_dims.items():
            tm_entry = tms.get(role)
            if tm_entry is None:
                continue
            tm, tensor_name = tm_entry
            for dim, label in dims.items():
                if tm[dim] != -1:
                    raise ValueError(
                        f"For {op}, {label} (dim {dim}) of {tensor_name} should be replicated, "
                        f"but got tensor_map={tm}"
                    )

        # Cross-tensor consistency checks that the table cannot express.
        if layout_str == 'BSND':
            if q_tm[0] != k_tm[0]:
                raise ValueError(
                    f"For {op}, B (dim 0) sharding of query and key should match, "
                    f"but got query={q_tm[0]}, key={k_tm[0]}"
                )
            if w_tm[0] != q_tm[0]:
                raise ValueError(
                    f"For {op}, B (dim 0) sharding of weights should match query, "
                    f"but got weights={w_tm[0]}, query={q_tm[0]}"
                )
            if w_tm[1] != q_tm[1]:
                raise ValueError(
                    f"For {op}, S1 (dim 1) sharding of weights should match query, "
                    f"but got weights={w_tm[1]}, query={q_tm[1]}"
                )
        else:  # TND
            if w_tm[0] != q_tm[0]:
                raise ValueError(
                    f"For {op}, T1 (dim 0) sharding of weights should match query, "
                    f"but got weights={w_tm[0]}, query={q_tm[0]}"
                )

    def infer_layout(self, cache_values: list) -> Tuple[tuple, None]:
        """Infer output layouts for sparse_indices and sparse_values outputs.

        Rules:
            1. No Partial inputs are allowed on any of the three input tensors.
            2. Input sharding constraints are validated per layout_str (see
               ``_validate_input_layouts`` for the full rule set); an
               unsupported layout_str is rejected there.
            3. Output tensor shape depends on layout_str:
               - BSND: query (B, S1, N1, D) → outputs (B, S1, N2, sparse_count).
                 B and S1 sharding are inherited from query;
                 N2 and sparse_count are always replicated.
               - TND:  query (T1, N1, D)    → outputs (T1, N2, sparse_count).
                 T1 sharding is inherited from query;
                 N2 and sparse_count are always replicated.
            4. Both sparse_indices and sparse_values outputs share the same layout
               (independent deep copies so callers can mutate them safely).

        Args:
            cache_values: [q_layout, k_layout, w_layout, layout_str]

        Returns:
            tuple: ((indices_layout, values_layout), None)

        Raises:
            ValueError: If any input has Partial status, layout_str is
                unsupported, or sharding constraints are violated.
        """
        q_layout = cache_values[0]
        k_layout = cache_values[1]
        w_layout = cache_values[2]
        layout_str = cache_values[3]

        self._check_partial_inputs([q_layout, k_layout, w_layout])
        self._validate_input_layouts(q_layout, k_layout, w_layout, layout_str)

        out_layout = self._infer_output_layout(q_layout, layout_str)
        return (out_layout, copy.deepcopy(out_layout)), None

    def get_expand_impl(  # pylint: disable=W0237
            self,
            func: Optional[Callable],
            infer_result: tuple,
            cache_values: list,
            extra_args: Optional[tuple] = None,
    ) -> Optional[Callable]:
        """Return a wrapper around ``func`` when per-rank input adjustment is needed.

        BSND, S1 sharded:  wraps ``func`` to slice key's S2 to the causal window.
        BSND, S1 replicated: returns None (dispatcher calls ``func`` directly).
        Otherwise (TND): always returns a wrapper. It adjusts
            actual_seq_lengths_query / actual_seq_lengths_key per rank, and
            forwards the call unchanged when either of them is None.

        Args:
            func: The underlying op callable.
            infer_result: Output from ``infer_layout`` (unused here).
            cache_values: [q_layout, k_layout, w_layout, layout_str].
            extra_args: Unused; kept for interface compatibility.

        Returns:
            Callable wrapper or None.
        """
        q_layout = cache_values[0]
        k_layout = cache_values[1]
        layout_str = cache_values[3]

        if layout_str == 'BSND':
            # S1 is dim 1 of query; if not sharded, no CP adjustment needed.
            if q_layout.tensor_map[1] == -1:
                return None
            split_id = q_layout.get_split_id(1)

            def _bsnd_cp_impl(*args, **kwargs):
                local_q, local_k = args[0], args[1]
                # local_q.shape[1] is this rank's S1 length.
                sliced_k = _adjust_bsnd_key(local_k, local_q.shape[1], split_id)
                return func(local_q, sliced_k, *args[2:], **kwargs)

            return _bsnd_cp_impl

        # TND: DP always requires seq_len adjustment; CP additionally
        # requires token-level offset adjustment.
        dp_size = k_layout.get_dim_split_num(0)  # DP splits on k's T2
        split_id = q_layout.get_split_id(0)
        # Query T1 is split dp_size * cp_size ways; guard against a zero divisor.
        cp_size = (q_layout.get_dim_split_num(0) // dp_size
                   if dp_size > 0 else 1)
        cp_rank = split_id % cp_size if cp_size > 1 else 0

        def _tnd_impl(*args, **kwargs):
            local_q, local_k = args[0], args[1]

            qlen_tensor = kwargs.get('actual_seq_lengths_query')
            klen_tensor = kwargs.get('actual_seq_lengths_key')

            # Nothing to adjust unless both sequence-length tensors are given.
            if qlen_tensor is None or klen_tensor is None:
                return func(*args, **kwargs)

            adj_q, adj_k = _adjust_tnd_seq_lens(
                local_q, local_k, qlen_tensor, klen_tensor,
                cp_rank=cp_rank,
            )

            return func(*args, **{**kwargs, 'actual_seq_lengths_query': adj_q,
                                  'actual_seq_lengths_key': adj_k})

        return _tnd_impl

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/shard/ops/parallel_lightning_indexer.py: 88%

92 statements