Coverage for  / home / jenkins / .local / lib / python3.10 / site-packages / hyper_parallel / core / multicore / platform / torch / __init__.py: 0%

51 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-05-20 07:18 +0800

1# Copyright 2026 Huawei Technologies Co., Ltd 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================ 

15""" 

16hyper_parallel.core.multicore.platform.torch 

17============================================= 

18Out-of-tree PyTorch operator registration for MoE-FFN operators. 

19 

20Registers into the ``hyper_parallel`` PyTorch namespace — does NOT modify 

21op-plugin or any PyTorch source. The operators are accessible via: 

22 

23 torch.ops.hyper_parallel.moe_ffn_fwd(...) 

24 torch.ops.hyper_parallel.moe_ffn_bwd(...) 

25 

26Or via the Python wrappers in this module: 

27 

28 from hyper_parallel.core.multicore.platform.torch import moe_ffn_fwd, moe_ffn_bwd 

29 

30Build prerequisites 

31------------------- 

32No manual env-var setup is required if ``prebuild/multicore_moe_ffn.tar.gz`` 

33exists alongside this package. The tarball is auto-extracted and the correct 

34``CANN_VENDOR_*`` paths are derived automatically. 

35 

36Alternatively, set one of the following environment variables before importing: 

37 

38 # Preferred: per-op explicit paths 

39 export CANN_VENDOR_FWD_LIBDIR=/path/to/multicore_moe_ffn_nn/op_api/lib 

40 export CANN_VENDOR_BWD_LIBDIR=/path/to/multicore_moe_ffn_grad_nn/op_api/lib 

41 

42 # Or: legacy single-lib (used for both fwd and bwd) 

43 export CANN_VENDOR_LIBDIR=/path/to/multicore_moe_ffn_nn/op_api/lib 

44 

45Symbol lookup (new op-plugin API) 

46---------------------------------- 

47GetOpApiFuncAddr (in op_api_common.cpp) resolves aclnnMulticoreMoeFfn* at 

48runtime via dlopen. It searches each entry in ``g_custom_lib_path`` (a C++ 

49file-scope ``const`` global in libopapi.so) for 

50``op_api/lib/libcust_opapi.so``. ``g_custom_lib_path`` is initialised 

51**once** from ``ASCEND_CUSTOM_OPP_PATH`` when ``libopapi.so`` is first loaded 

52(triggered by ``import torch_npu``); after that it is frozen and cannot be 

53updated from Python. 

54 

55Our extension links against ``libopapi.so`` and shares its 

56``g_custom_lib_path`` — the extension does **not** carry its own copy. 

57 

58This module therefore sets ``ASCEND_CUSTOM_OPP_PATH`` **before** its own 

59``import torch`` call so that, when the later ``import torch_npu`` causes 

60``libopapi.so`` to load, the variable is already present. 

61 

62**Limitation — if** ``torch_npu`` **was imported before this module**: 

63``libopapi.so`` will have been loaded (and ``g_custom_lib_path`` frozen) 

64before we can set ``ASCEND_CUSTOM_OPP_PATH``. In that scenario the vendor 

65libs will not be found via ``GetOpApiFuncAddr``. The only reliable fix is to 

66set ``ASCEND_CUSTOM_OPP_PATH`` at the shell level (or at the very top of the 

67Python entry-point script) **before** any ``import torch_npu``. 

68""" 

69import ctypes 

70import os 

71import re as _re 

72 

# ---------------------------------------------------------------------------
# Step 0: locate both CANN vendor lib dirs (forward + backward packages).
#
# Two vendor packages provide separate libcust_opapi.so files:
#   Forward : multicore_moe_ffn_nn      → aclnnMulticoreMoeFfn*
#   Backward: multicore_moe_ffn_grad_nn → aclnnMulticoreMoeFfnGrad*
#
# Resolution order (highest priority first), applied to EACH direction
# independently so that setting only one per-op variable does not leave the
# other direction unresolved:
#   1. CANN_VENDOR_FWD_LIBDIR / CANN_VENDOR_BWD_LIBDIR  (explicit per-op)
#   2. CANN_VENDOR_LIBDIR  (legacy single-lib; fills whichever is still empty)
#   3. Auto-detect from prebuild tarball / directory (no env vars needed)
#
# This block uses only pure-Python (os.path / tarfile) so it is safe to run
# before any C++ .so is loaded.
# ---------------------------------------------------------------------------
_ASCEND_HOME = os.environ.get("ASCEND_HOME_PATH",
                              "/usr/local/Ascend/ascend-toolkit/latest")

_CANN_VENDOR_FWD_LIBDIR = os.environ.get("CANN_VENDOR_FWD_LIBDIR", "")
_CANN_VENDOR_BWD_LIBDIR = os.environ.get("CANN_VENDOR_BWD_LIBDIR", "")

# Fall back whenever at least one direction is still missing; an explicitly
# configured direction is never overridden by a lower-priority source.
if not (_CANN_VENDOR_FWD_LIBDIR and _CANN_VENDOR_BWD_LIBDIR):
    _legacy = os.environ.get("CANN_VENDOR_LIBDIR", "")
    if _legacy:
        _CANN_VENDOR_FWD_LIBDIR = _CANN_VENDOR_FWD_LIBDIR or _legacy
        _CANN_VENDOR_BWD_LIBDIR = _CANN_VENDOR_BWD_LIBDIR or _legacy
    else:
        # Auto-detect: prebuild directory is 2 levels up from platform/torch/
        _PREBUILD_DIR = os.path.normpath(
            os.path.join(os.path.dirname(__file__), "../../prebuild/multicore_moe_ffn"))
        _TARBALL = _PREBUILD_DIR + ".tar.gz"
        # Extract only when the directory is absent, so an already unpacked
        # tree is reused as-is.
        if not os.path.isdir(_PREBUILD_DIR) and os.path.isfile(_TARBALL):
            import tarfile as _tarfile
            with _tarfile.open(_TARBALL) as _tf:
                _tf.extractall(os.path.dirname(_PREBUILD_DIR))
        if os.path.isdir(_PREBUILD_DIR):
            _vendors = os.path.join(_PREBUILD_DIR, "vendors")
            _fwd = os.path.join(_vendors, "multicore_moe_ffn_nn", "op_api", "lib")
            _bwd = os.path.join(_vendors, "multicore_moe_ffn_grad_nn", "op_api", "lib")
            if not _CANN_VENDOR_FWD_LIBDIR and os.path.isdir(_fwd):
                _CANN_VENDOR_FWD_LIBDIR = _fwd
            if not _CANN_VENDOR_BWD_LIBDIR and os.path.isdir(_bwd):
                _CANN_VENDOR_BWD_LIBDIR = _bwd

116 

# ---------------------------------------------------------------------------
# Step 1: set ASCEND_CUSTOM_OPP_PATH from the detected vendor lib dirs.
#
# GetOpApiFuncAddr searches each entry in ASCEND_CUSTOM_OPP_PATH for
# op_api/lib/libcust_opapi.so to resolve aclnnMulticoreMoeFfn* symbols.
# g_custom_lib_path (a C++ static-duration global in op_api_common.cpp) is
# populated from ASCEND_CUSTOM_OPP_PATH once — when the .so that contains
# op_api_common.cpp is first loaded into the process.  Depending on the
# build, that .so may be libopapi.so (loaded transitively by torch or
# torch_npu) or our own extension.
#
# To cover both cases, ASCEND_CUSTOM_OPP_PATH MUST be set before ANY
# C++ .so import — i.e., before `import torch` below.
# ---------------------------------------------------------------------------
for _libdir in (_CANN_VENDOR_FWD_LIBDIR, _CANN_VENDOR_BWD_LIBDIR):
    if not _libdir:
        continue
    # The vendor root is the lib dir with its trailing "op_api/lib" removed.
    _vendor_root = _re.sub(r'/op_api/lib/?$', '', _libdir)
    if not _vendor_root:
        continue
    _cur_opp = os.environ.get('ASCEND_CUSTOM_OPP_PATH', '')
    # Compare against whole path entries rather than the raw string: a
    # substring test would treat "/x/pkg" as present when only "/x/pkg_old"
    # is listed, and the vendor root would then never be registered.
    _opp_entries = {_e.rstrip('/') for _e in _cur_opp.split(':') if _e}
    if _vendor_root.rstrip('/') not in _opp_entries:
        # Prepend so the detected vendor package takes precedence.
        os.environ['ASCEND_CUSTOM_OPP_PATH'] = (
            f"{_vendor_root}:{_cur_opp}" if _cur_opp else _vendor_root)

138 

139import torch # pylint: disable=import-self,wrong-import-position # noqa: E402 

140 

# ---------------------------------------------------------------------------
# Step 2: pre-load CANN base libs with RTLD_GLOBAL.
#
# libascendcl.so: belt-and-suspenders — import torch_npu loads it, but
#   promoting to GLOBAL ensures any late-loading consumer sees its symbols.
# libopapi.so: provides aclopExecutor which libcust_opapi.so calls without
#   DT_NEEDED (CANN cmake allows undefined symbols).  Must be GLOBAL first.
# ---------------------------------------------------------------------------
_ascendcl_candidates = (
    os.path.join(_ASCEND_HOME, "lib64", "libascendcl.so"),
    os.path.join(_ASCEND_HOME, "fwkacllib", "lib64", "libascendcl.so"),
)
# Only the first candidate that exists on disk is loaded.
_ascendcl_path = next(
    (_candidate for _candidate in _ascendcl_candidates
     if os.path.exists(_candidate)),
    None,
)
if _ascendcl_path is not None:
    ctypes.CDLL(_ascendcl_path, mode=ctypes.RTLD_GLOBAL)

import torch_npu  # noqa: F401 — loads libtorch_npu.so  # pylint: disable=wrong-import-position

# libopapi.so is opened with RTLD_GLOBAL only after torch_npu has been imported.
_opapi_path = os.path.join(_ASCEND_HOME, "lib64", "libopapi.so")
_opapi_present = os.path.exists(_opapi_path)
if _opapi_present:
    ctypes.CDLL(_opapi_path, mode=ctypes.RTLD_GLOBAL)

162 

# ---------------------------------------------------------------------------
# Step 3: pre-load both libcust_opapi.so files with RTLD_GLOBAL.
#
# Forward  (multicore_moe_ffn_nn)      exports: aclnnMulticoreMoeFfn*
# Backward (multicore_moe_ffn_grad_nn) exports: aclnnMulticoreMoeFfnGrad*
# Pre-loading with RTLD_GLOBAL ensures symbols from both are globally visible
# so GetOpApiFuncAddr can resolve either operator at runtime.
# ---------------------------------------------------------------------------
# Candidate paths in load order (forward first, then backward); directions
# whose lib dir could not be resolved are left out.
_cust_opapi_candidates = [
    os.path.join(_resolved_dir, "libcust_opapi.so")
    for _resolved_dir in (_CANN_VENDOR_FWD_LIBDIR, _CANN_VENDOR_BWD_LIBDIR)
    if _resolved_dir
]
for _cust_opapi in _cust_opapi_candidates:
    # A missing library file is skipped silently, not treated as an error.
    if not os.path.exists(_cust_opapi):
        continue
    ctypes.CDLL(_cust_opapi, mode=ctypes.RTLD_GLOBAL)

176 

177# --------------------------------------------------------------------------- 

178# Step 4: import the compiled extension. 

179# 

180# ASCEND_CUSTOM_OPP_PATH is now set (Step 1) and libcust_opapi.so is 

181# pre-loaded (Step 3), so g_custom_lib_path will be initialized correctly 

182# when the .so's static initializers run. 

183# --------------------------------------------------------------------------- 

184from . import hyper_parallel_multicore_moe_ffn_pta # noqa: F401, E402 # pylint: disable=wrong-import-position,import-self 

185 

186# --------------------------------------------------------------------------- 

187# Python wrappers — thin pass-through to the registered C++ ops 

188# --------------------------------------------------------------------------- 

189 

def moe_ffn_fwd(
    dispatch_target, dispatch_target_off,
    dispatch_src, dispatch_src_off, dispatch_size,
    up_proj_weight, up_proj_glist,
    up_proj_y, swiglu_out,
    down_proj_weight, down_proj_glist, down_proj_y,
    combine_target, combine_target_off, combine_src_off, combine_size,
    gmm_workspace, up_proj_tiling, swiglu_tiling, down_proj_tiling,
    runtime_config, all_event_counters,
    rank_id: int, ep: int, expert_num: int,
    hidden_size: int, seq_size: int,
):
    """
    Run the MoE-FFN forward operator.

    Thin pass-through to ``torch.ops.hyper_parallel.moe_ffn_fwd``: every
    argument is forwarded positionally, in signature order, and the op's
    result is discarded (this wrapper returns ``None``).

    Results are written in-place into: dispatch_target, up_proj_y,
    swiglu_out, down_proj_y, combine_target.  All output tensors must be
    pre-allocated with correct shapes.

    Parameters
    ----------
    dispatch_target, dispatch_target_off, dispatch_src, dispatch_src_off,
    dispatch_size :
        AllToAll dispatch buffers — dispatch_target is written in-place.
    up_proj_weight, up_proj_glist :
        Expert weight and cumulative group sizes for GMM1 (up-projection).
    up_proj_y, swiglu_out :
        GMM1 output and SwiGLU output — both written in-place.
    down_proj_weight, down_proj_glist, down_proj_y :
        Expert weight, cumulative group sizes, and output for GMM2
        (down-projection).
    combine_target, combine_target_off, combine_src_off, combine_size :
        AllToAll combine buffers — combine_target is written in-place.
    gmm_workspace, up_proj_tiling, swiglu_tiling, down_proj_tiling :
        Pre-computed tiling tensors (from gen_runtime_data.py).
    runtime_config :
        Per-rank runtime config tensor (from gen_runtime_data.py).
    all_event_counters :
        Event synchronization counter tensor.
    rank_id, ep, expert_num, hidden_size, seq_size :
        Topology / shape attributes.
    """
    # Group the arguments by pipeline stage; concatenated below they
    # reproduce the signature order exactly.
    dispatch_args = (
        dispatch_target, dispatch_target_off,
        dispatch_src, dispatch_src_off, dispatch_size,
    )
    up_proj_args = (up_proj_weight, up_proj_glist, up_proj_y, swiglu_out)
    down_proj_args = (down_proj_weight, down_proj_glist, down_proj_y)
    combine_args = (
        combine_target, combine_target_off, combine_src_off, combine_size,
    )
    tiling_args = (
        gmm_workspace, up_proj_tiling, swiglu_tiling, down_proj_tiling,
    )
    runtime_args = (runtime_config, all_event_counters)
    attrs = (rank_id, ep, expert_num, hidden_size, seq_size)

    fwd_op = torch.ops.hyper_parallel.moe_ffn_fwd
    fwd_op(
        *dispatch_args,
        *up_proj_args,
        *down_proj_args,
        *combine_args,
        *tiling_args,
        *runtime_args,
        *attrs,
    )

242 

243 

def moe_ffn_bwd(
    dispatch_target, dispatch_target_off,
    dy, dispatch_src_off, dispatch_size,
    hidden, hidden_dw,
    w2, act_grad_y, gate, grad_gate, w1, gate_dx, grad_x,
    combine_target_off, combine_src_off, combine_size,
    permute_out, gate_dw, group_list,
    act_grad_tiling, gate_grad_tiling, w2_grad_tiling, w1_grad_tiling,
    swiglu_grad_tiling, gmm_workspace, swiglu_grad_workspace,
    runtime_config, all_event_counters,
    rank_id: int, ep: int, expert_num: int,
    hidden_size: int, seq_size: int,
):
    """
    Run the MoE-FFN backward operator.

    Thin pass-through to ``torch.ops.hyper_parallel.moe_ffn_bwd``: every
    argument is forwarded positionally, in signature order, and the op's
    result is discarded (this wrapper returns ``None``).

    Results are written in-place into: dispatch_target, hidden_dw,
    act_grad_y, grad_gate, gate_dx, grad_x, permute_out, gate_dw.
    All output tensors must be pre-allocated with correct shapes.

    Parameters
    ----------
    dispatch_target, dispatch_target_off, dy, dispatch_src_off, dispatch_size :
        AllToAll dispatch buffers — dispatch_target is written in-place with
        the dispatched gradient.  dy is the source gradient tensor.
    hidden :
        SwiGLU output saved from the forward pass (used by W2-grad, GMM4).
    hidden_dw :
        W2 weight gradient — written in-place.
    w2 :
        W2 weight (= down_proj_weight from forward).
    act_grad_y :
        Activation gradient output from GMM1 bwd (target @ W2.T) — written
        in-place.
    gate :
        up_proj_y saved from the forward pass (SwiGLU input).
    grad_gate :
        SwiGLU gradient output — written in-place.
    w1 :
        W1 weight (= up_proj_weight from forward).
    gate_dx :
        GMM2 bwd output (grad_gate @ W1.T), before AllToAll combine —
        written in-place.
    grad_x :
        AllToAll combine output (final activation gradient) — written
        in-place.
    combine_target_off, combine_src_off, combine_size :
        AllToAll combine buffer descriptors.
    permute_out :
        In-place intermediate buffer for W1-grad (GMM4).
    gate_dw :
        W1 weight gradient — written in-place.
    group_list :
        Cumulative expert token counts ([E] int64).
    act_grad_tiling, gate_grad_tiling, w2_grad_tiling, w1_grad_tiling,
    swiglu_grad_tiling :
        Pre-computed tiling tensors (from gen_runtime_data.py bwd).
    gmm_workspace, swiglu_grad_workspace :
        Workspace tensors.
    runtime_config :
        Per-rank runtime config tensor (from gen_runtime_data.py bwd).
    all_event_counters :
        Event synchronization counter tensor.
    rank_id, ep, expert_num, hidden_size, seq_size :
        Topology / shape attributes.
    """
    # Group the arguments by role; concatenated below they reproduce the
    # signature order exactly.
    dispatch_args = (
        dispatch_target, dispatch_target_off,
        dy, dispatch_src_off, dispatch_size,
    )
    w2_grad_args = (hidden, hidden_dw)
    grad_chain_args = (w2, act_grad_y, gate, grad_gate, w1, gate_dx, grad_x)
    combine_args = (combine_target_off, combine_src_off, combine_size)
    w1_grad_args = (permute_out, gate_dw, group_list)
    tiling_args = (
        act_grad_tiling, gate_grad_tiling, w2_grad_tiling, w1_grad_tiling,
        swiglu_grad_tiling,
    )
    workspace_args = (gmm_workspace, swiglu_grad_workspace)
    runtime_args = (runtime_config, all_event_counters)
    attrs = (rank_id, ep, expert_num, hidden_size, seq_size)

    bwd_op = torch.ops.hyper_parallel.moe_ffn_bwd
    bwd_op(
        *dispatch_args,
        *w2_grad_args,
        *grad_chain_args,
        *combine_args,
        *w1_grad_args,
        *tiling_args,
        *workspace_args,
        *runtime_args,
        *attrs,
    )

319 

320 

321__all__ = ["moe_ffn_fwd", "moe_ffn_bwd"]