Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/multicore/platform/torch/__init__.py

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""

16hyper_parallel.core.multicore.platform.torch

17=============================================

18Out-of-tree PyTorch operator registration for MoE-FFN operators.

20Registers into the ``hyper_parallel`` PyTorch namespace — does NOT modify

21op-plugin or any PyTorch source. The operators are accessible via:

23 torch.ops.hyper_parallel.moe_ffn_fwd(...)

24 torch.ops.hyper_parallel.moe_ffn_bwd(...)

26Or via the Python wrappers in this module:

28 from hyper_parallel.core.multicore.platform.torch import moe_ffn_fwd, moe_ffn_bwd

30Build prerequisites

31-------------------

32No manual env-var setup is required if ``prebuild/multicore_moe_ffn.tar.gz``

33exists alongside this package. The tarball is auto-extracted and the correct

34``CANN_VENDOR_*`` paths are derived automatically.

36Alternatively, set one of the following environment variables before importing:

38 # Preferred: per-op explicit paths

39 export CANN_VENDOR_FWD_LIBDIR=/path/to/multicore_moe_ffn_nn/op_api/lib

40 export CANN_VENDOR_BWD_LIBDIR=/path/to/multicore_moe_ffn_grad_nn/op_api/lib

42 # Or: legacy single-lib (used for both fwd and bwd)

43 export CANN_VENDOR_LIBDIR=/path/to/multicore_moe_ffn_nn/op_api/lib

45Symbol lookup (new op-plugin API)

46----------------------------------

47GetOpApiFuncAddr (in op_api_common.cpp) resolves aclnnMulticoreMoeFfn* at

48runtime via dlopen. It searches each entry in ``g_custom_lib_path`` (a C++

49file-scope ``const`` global in libopapi.so) for

50``op_api/lib/libcust_opapi.so``. ``g_custom_lib_path`` is initialised

51**once** from ``ASCEND_CUSTOM_OPP_PATH`` when ``libopapi.so`` is first loaded

52(triggered by ``import torch_npu``); after that it is frozen and cannot be

53updated from Python.

55Our extension links against ``libopapi.so`` and shares its

56``g_custom_lib_path`` — the extension does **not** carry its own copy.

58This module therefore sets ``ASCEND_CUSTOM_OPP_PATH`` **before** its own

59``import torch`` statement so that, when the subsequent ``import torch_npu``

60causes ``libopapi.so`` to load, the variable is already present.

62**Limitation — if** ``torch_npu`` **was imported before this module**:

63``libopapi.so`` will have been loaded (and ``g_custom_lib_path`` frozen)

64before we can set ``ASCEND_CUSTOM_OPP_PATH``. In that scenario the vendor

65libs will not be found via ``GetOpApiFuncAddr``. The only reliable fix is to

66set ``ASCEND_CUSTOM_OPP_PATH`` at the shell level (or at the very top of the

67Python entry-point script) **before** any ``import torch_npu``.

68"""

69import ctypes

70import os

71import re as _re

# ---------------------------------------------------------------------------
# Step 0: locate both CANN vendor lib dirs (forward + backward packages).
#
# Two vendor packages provide separate libcust_opapi.so files:
#   Forward : multicore_moe_ffn_nn      → aclnnMulticoreMoeFfn*
#   Backward: multicore_moe_ffn_grad_nn → aclnnMulticoreMoeFfnGrad*
#
# Resolution order (highest priority first):
#   1. CANN_VENDOR_FWD_LIBDIR / CANN_VENDOR_BWD_LIBDIR (explicit per-op)
#   2. CANN_VENDOR_LIBDIR (legacy single-lib; used for both fwd and bwd)
#   3. Auto-detect from prebuild tarball / directory (no env vars needed)
#
# Fallbacks 2 and 3 are consulted only when NEITHER per-op variable is set;
# if exactly one of them is set, the other one stays empty.
#
# This block uses only pure-Python (os.path / tarfile) so it is safe to run
# before any C++ .so is loaded.
# ---------------------------------------------------------------------------
_ASCEND_HOME = os.environ.get(
    "ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest")

_CANN_VENDOR_FWD_LIBDIR = os.environ.get("CANN_VENDOR_FWD_LIBDIR", "")
_CANN_VENDOR_BWD_LIBDIR = os.environ.get("CANN_VENDOR_BWD_LIBDIR", "")

if not (_CANN_VENDOR_FWD_LIBDIR or _CANN_VENDOR_BWD_LIBDIR):
    _legacy = os.environ.get("CANN_VENDOR_LIBDIR", "")
    if _legacy:
        _CANN_VENDOR_FWD_LIBDIR = _legacy
        _CANN_VENDOR_BWD_LIBDIR = _legacy
    else:
        # Auto-detect: the prebuild directory sits two levels above the
        # directory that contains this file.
        _PREBUILD_DIR = os.path.normpath(
            os.path.join(os.path.dirname(__file__),
                         "../../prebuild/multicore_moe_ffn"))
        _TARBALL = _PREBUILD_DIR + ".tar.gz"
        if not os.path.isdir(_PREBUILD_DIR) and os.path.isfile(_TARBALL):
            import tarfile as _tarfile
            with _tarfile.open(_TARBALL) as _tf:
                # Request the "data" extraction filter whenever the running
                # interpreter supports it: it refuses members that would
                # escape the destination directory and silences the
                # DeprecationWarning newer Pythons emit for filter-less
                # extraction. Older interpreters keep the legacy call.
                # NOTE(review): nothing here serialises concurrent importers
                # (e.g. several ranks starting at once) — confirm whether
                # that can happen on a shared filesystem.
                if hasattr(_tarfile, "data_filter"):
                    _tf.extractall(os.path.dirname(_PREBUILD_DIR),
                                   filter="data")
                else:
                    _tf.extractall(os.path.dirname(_PREBUILD_DIR))
        if os.path.isdir(_PREBUILD_DIR):
            _vendors = os.path.join(_PREBUILD_DIR, "vendors")
            _fwd = os.path.join(
                _vendors, "multicore_moe_ffn_nn", "op_api", "lib")
            _bwd = os.path.join(
                _vendors, "multicore_moe_ffn_grad_nn", "op_api", "lib")
            if os.path.isdir(_fwd):
                _CANN_VENDOR_FWD_LIBDIR = _fwd
            if os.path.isdir(_bwd):
                _CANN_VENDOR_BWD_LIBDIR = _bwd


# ---------------------------------------------------------------------------
# Step 1: set ASCEND_CUSTOM_OPP_PATH from the detected vendor lib dirs.
#
# GetOpApiFuncAddr searches each entry in ASCEND_CUSTOM_OPP_PATH for
# op_api/lib/libcust_opapi.so to resolve aclnnMulticoreMoeFfn* symbols.
# g_custom_lib_path (a C++ static-duration global in op_api_common.cpp) is
# populated from ASCEND_CUSTOM_OPP_PATH once — when the .so that contains
# op_api_common.cpp is first loaded into the process. Depending on the
# build, that .so may be libopapi.so (loaded transitively by torch or
# torch_npu) or our own extension.
#
# To cover both cases, ASCEND_CUSTOM_OPP_PATH MUST be set before ANY
# C++ .so import — i.e., before the `import torch` that follows this block.
# ---------------------------------------------------------------------------
def _prepend_opp_entry(current, entry):
    """Return the ``:``-separated list *current* with *entry* put in front.

    Parameters
    ----------
    current : str
        Existing value of ``ASCEND_CUSTOM_OPP_PATH`` (may be empty).
    entry : str
        Vendor root directory to register.

    Returns
    -------
    str
        *current* unchanged when *entry* is empty or already listed as an
        exact entry (trailing slashes ignored); otherwise *entry* followed
        by *current*.
    """
    if not entry:
        return current
    wanted = entry.rstrip("/") or entry
    for existing in current.split(":"):
        # Compare whole entries, not substrings: "/v/fwd" must not be
        # considered present merely because "/v/fwd_old" is listed.
        if existing and (existing.rstrip("/") or existing) == wanted:
            return current
    return f"{entry}:{current}" if current else entry


for _libdir in (_CANN_VENDOR_FWD_LIBDIR, _CANN_VENDOR_BWD_LIBDIR):
    if not _libdir:
        continue
    # The vendor root is the lib dir with its trailing "/op_api/lib" removed.
    _vendor_root = _re.sub(r'/op_api/lib/?$', '', _libdir)
    _cur_opp = os.environ.get('ASCEND_CUSTOM_OPP_PATH', '')
    _new_opp = _prepend_opp_entry(_cur_opp, _vendor_root)
    if _new_opp != _cur_opp:
        os.environ['ASCEND_CUSTOM_OPP_PATH'] = _new_opp

138

139import torch # pylint: disable=import-self,wrong-import-position # noqa: E402

140

# ---------------------------------------------------------------------------
# Step 2: pre-load CANN base libs with RTLD_GLOBAL.
#
# libascendcl.so: belt-and-suspenders — import torch_npu loads it, but
#   promoting it to GLOBAL ensures any late-loading consumer sees its symbols.
# libopapi.so: provides aclopExecutor, which libcust_opapi.so calls without
#   a DT_NEEDED entry (CANN cmake allows undefined symbols), so it has to be
#   GLOBAL before the vendor libraries are loaded.
# ---------------------------------------------------------------------------
_ASCENDCL_CANDIDATES = (
    os.path.join(_ASCEND_HOME, "lib64", "libascendcl.so"),
    os.path.join(_ASCEND_HOME, "fwkacllib", "lib64", "libascendcl.so"),
)
# Load the first candidate that exists on disk and ignore the rest.
for _ascendcl_path in _ASCENDCL_CANDIDATES:
    if not os.path.exists(_ascendcl_path):
        continue
    ctypes.CDLL(_ascendcl_path, mode=ctypes.RTLD_GLOBAL)
    break

156

157import torch_npu # noqa: F401 — loads libtorch_npu.so # pylint: disable=wrong-import-position

158

# Promote libopapi.so to RTLD_GLOBAL when it is present under the toolkit's
# lib64 directory; a missing file is skipped silently.
_opapi_path = os.path.join(_ASCEND_HOME, "lib64", "libopapi.so")
_opapi_present = os.path.exists(_opapi_path)
if _opapi_present:
    ctypes.CDLL(_opapi_path, mode=ctypes.RTLD_GLOBAL)

162

# ---------------------------------------------------------------------------
# Step 3: pre-load both libcust_opapi.so files with RTLD_GLOBAL.
#
#   Forward  (multicore_moe_ffn_nn)      exports: aclnnMulticoreMoeFfn*
#   Backward (multicore_moe_ffn_grad_nn) exports: aclnnMulticoreMoeFfnGrad*
#
# Loading them with RTLD_GLOBAL makes the symbols of both globally visible,
# so GetOpApiFuncAddr can resolve either operator at runtime.
# ---------------------------------------------------------------------------
for _vendor_libdir in (_CANN_VENDOR_FWD_LIBDIR, _CANN_VENDOR_BWD_LIBDIR):
    # An empty string means this direction's lib dir was never resolved.
    if not _vendor_libdir:
        continue
    _cust_opapi = os.path.join(_vendor_libdir, "libcust_opapi.so")
    if not os.path.exists(_cust_opapi):
        continue
    ctypes.CDLL(_cust_opapi, mode=ctypes.RTLD_GLOBAL)

176

177# ---------------------------------------------------------------------------

178# Step 4: import the compiled extension.

179#

180# ASCEND_CUSTOM_OPP_PATH is now set (Step 1) and libcust_opapi.so is

181# pre-loaded (Step 3), so g_custom_lib_path will be initialized correctly

182# when the .so's static initializers run.

183# ---------------------------------------------------------------------------

184from . import hyper_parallel_multicore_moe_ffn_pta # noqa: F401, E402 # pylint: disable=wrong-import-position,import-self

185

186# ---------------------------------------------------------------------------

187# Python wrappers — thin pass-through to the registered C++ ops

188# ---------------------------------------------------------------------------

189

def moe_ffn_fwd(
    dispatch_target, dispatch_target_off,
    dispatch_src, dispatch_src_off, dispatch_size,
    up_proj_weight, up_proj_glist,
    up_proj_y, swiglu_out,
    down_proj_weight, down_proj_glist, down_proj_y,
    combine_target, combine_target_off, combine_src_off, combine_size,
    gmm_workspace, up_proj_tiling, swiglu_tiling, down_proj_tiling,
    runtime_config, all_event_counters,
    rank_id: int, ep: int, expert_num: int,
    hidden_size: int, seq_size: int,
):
    """
    Invoke the MoE-FFN forward operator.

    Forwards every argument, positionally and in declaration order, to
    ``torch.ops.hyper_parallel.moe_ffn_fwd``. The operator's result is
    discarded and this wrapper returns ``None``; results are delivered
    through the in-place outputs listed below, which the caller must
    pre-allocate with the correct shapes.

    In-place outputs: dispatch_target, up_proj_y, swiglu_out, down_proj_y,
    combine_target.

    Parameters
    ----------
    dispatch_target, dispatch_target_off, dispatch_src, dispatch_src_off,
    dispatch_size :
        AllToAll dispatch buffers; ``dispatch_target`` is written in-place.
    up_proj_weight, up_proj_glist :
        Expert weight and cumulative group sizes for GMM1 (up-projection).
    up_proj_y, swiglu_out :
        GMM1 output and SwiGLU output; both written in-place.
    down_proj_weight, down_proj_glist, down_proj_y :
        Expert weight, cumulative group sizes, and output for GMM2
        (down-projection).
    combine_target, combine_target_off, combine_src_off, combine_size :
        AllToAll combine buffers; ``combine_target`` is written in-place.
    gmm_workspace, up_proj_tiling, swiglu_tiling, down_proj_tiling :
        Pre-computed tiling tensors (from gen_runtime_data.py).
    runtime_config :
        Per-rank runtime config tensor (from gen_runtime_data.py).
    all_event_counters :
        Event synchronization counter tensor.
    rank_id, ep, expert_num, hidden_size, seq_size :
        Topology / shape attributes.
    """
    fwd_op = torch.ops.hyper_parallel.moe_ffn_fwd
    # Buffer arguments, grouped by pipeline stage, in the operator's order.
    buffer_args = (
        dispatch_target, dispatch_target_off,
        dispatch_src, dispatch_src_off, dispatch_size,
        up_proj_weight, up_proj_glist,
        up_proj_y, swiglu_out,
        down_proj_weight, down_proj_glist, down_proj_y,
        combine_target, combine_target_off, combine_src_off, combine_size,
        gmm_workspace, up_proj_tiling, swiglu_tiling, down_proj_tiling,
        runtime_config, all_event_counters,
    )
    # Integer attributes follow the buffers.
    attr_args = (rank_id, ep, expert_num, hidden_size, seq_size)
    fwd_op(*buffer_args, *attr_args)

242

243

def moe_ffn_bwd(
    dispatch_target, dispatch_target_off,
    dy, dispatch_src_off, dispatch_size,
    hidden, hidden_dw,
    w2, act_grad_y, gate, grad_gate, w1, gate_dx, grad_x,
    combine_target_off, combine_src_off, combine_size,
    permute_out, gate_dw, group_list,
    act_grad_tiling, gate_grad_tiling, w2_grad_tiling, w1_grad_tiling,
    swiglu_grad_tiling, gmm_workspace, swiglu_grad_workspace,
    runtime_config, all_event_counters,
    rank_id: int, ep: int, expert_num: int,
    hidden_size: int, seq_size: int,
):
    """
    Invoke the MoE-FFN backward operator.

    Forwards every argument, positionally and in declaration order, to
    ``torch.ops.hyper_parallel.moe_ffn_bwd``. The operator's result is
    discarded and this wrapper returns ``None``; results are delivered
    through the in-place outputs listed below, which the caller must
    pre-allocate with the correct shapes.

    In-place outputs: dispatch_target, hidden_dw, act_grad_y, grad_gate,
    gate_dx, grad_x, permute_out, gate_dw.

    Parameters
    ----------
    dispatch_target, dispatch_target_off, dy, dispatch_src_off, dispatch_size :
        AllToAll dispatch buffers. ``dy`` is the source gradient tensor;
        ``dispatch_target`` receives the dispatched gradient in-place.
    hidden :
        SwiGLU output saved from the forward pass (used by W2-grad, GMM4).
    hidden_dw :
        W2 weight gradient; written in-place.
    w2 :
        W2 weight (= down_proj_weight from forward).
    act_grad_y :
        Activation gradient output from GMM1 bwd (target @ W2.T); written
        in-place.
    gate :
        up_proj_y saved from the forward pass (SwiGLU input).
    grad_gate :
        SwiGLU gradient output; written in-place.
    w1 :
        W1 weight (= up_proj_weight from forward).
    gate_dx :
        GMM2 bwd output (grad_gate @ W1.T), before AllToAll combine;
        written in-place.
    grad_x :
        AllToAll combine output (final activation gradient); written
        in-place.
    combine_target_off, combine_src_off, combine_size :
        AllToAll combine buffer descriptors.
    permute_out :
        In-place intermediate buffer for W1-grad (GMM4).
    gate_dw :
        W1 weight gradient; written in-place.
    group_list :
        Cumulative expert token counts ([E] int64).
    act_grad_tiling, gate_grad_tiling, w2_grad_tiling, w1_grad_tiling,
    swiglu_grad_tiling :
        Pre-computed tiling tensors (from gen_runtime_data.py bwd).
    gmm_workspace, swiglu_grad_workspace :
        Workspace tensors.
    runtime_config :
        Per-rank runtime config tensor (from gen_runtime_data.py bwd).
    all_event_counters :
        Event synchronization counter tensor.
    rank_id, ep, expert_num, hidden_size, seq_size :
        Topology / shape attributes.
    """
    bwd_op = torch.ops.hyper_parallel.moe_ffn_bwd
    # Buffer arguments, in exactly the order the operator expects them.
    buffer_args = (
        dispatch_target, dispatch_target_off,
        dy, dispatch_src_off, dispatch_size,
        hidden, hidden_dw,
        w2, act_grad_y, gate, grad_gate, w1, gate_dx, grad_x,
        combine_target_off, combine_src_off, combine_size,
        permute_out, gate_dw, group_list,
        act_grad_tiling, gate_grad_tiling, w2_grad_tiling, w1_grad_tiling,
        swiglu_grad_tiling, gmm_workspace, swiglu_grad_workspace,
        runtime_config, all_event_counters,
    )
    # Integer attributes follow the buffers.
    attr_args = (rank_id, ep, expert_num, hidden_size, seq_size)
    bwd_op(*buffer_args, *attr_args)

319

320

321__all__ = ["moe_ffn_fwd", "moe_ffn_bwd"]

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/core/multicore/platform/torch/__init__.py: 0%

51 statements