Coverage for / home / jenkins / .local / lib / python3.10 / site-packages / hyper_parallel / core / multicore / platform / torch / __init__.py: 0%
51 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-05-20 07:18 +0800
« prev ^ index » next coverage.py v7.13.1, created at 2026-05-20 07:18 +0800
1# Copyright 2026 Huawei Technologies Co., Ltd
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ============================================================================
15"""
16hyper_parallel.core.multicore.platform.torch
17=============================================
18Out-of-tree PyTorch operator registration for MoE-FFN operators.
20Registers into the ``hyper_parallel`` PyTorch namespace — does NOT modify
21op-plugin or any PyTorch source. The operators are accessible via:
23 torch.ops.hyper_parallel.moe_ffn_fwd(...)
24 torch.ops.hyper_parallel.moe_ffn_bwd(...)
26Or via the Python wrappers in this module:
28 from hyper_parallel.core.multicore.platform.torch import moe_ffn_fwd, moe_ffn_bwd
30Build prerequisites
31-------------------
32No manual env-var setup is required if ``prebuild/multicore_moe_ffn.tar.gz``
33exists alongside this package. The tarball is auto-extracted and the correct
34``CANN_VENDOR_*`` paths are derived automatically.
36Alternatively, set one of the following environment variables before importing:
38 # Preferred: per-op explicit paths
39 export CANN_VENDOR_FWD_LIBDIR=/path/to/multicore_moe_ffn_nn/op_api/lib
40 export CANN_VENDOR_BWD_LIBDIR=/path/to/multicore_moe_ffn_grad_nn/op_api/lib
42 # Or: legacy single-lib (used for both fwd and bwd)
43 export CANN_VENDOR_LIBDIR=/path/to/multicore_moe_ffn_nn/op_api/lib
45Symbol lookup (new op-plugin API)
46----------------------------------
47GetOpApiFuncAddr (in op_api_common.cpp) resolves aclnnMulticoreMoeFfn* at
48runtime via dlopen. It searches each entry in ``g_custom_lib_path`` (a C++
49file-scope ``const`` global in libopapi.so) for
50``op_api/lib/libcust_opapi.so``. ``g_custom_lib_path`` is initialised
51**once** from ``ASCEND_CUSTOM_OPP_PATH`` when ``libopapi.so`` is first loaded
52(triggered by ``import torch_npu``); after that it is frozen and cannot be
53updated from Python.
55Our extension links against ``libopapi.so`` and shares its
56``g_custom_lib_path`` — the extension does **not** carry its own copy.
58This module therefore sets ``ASCEND_CUSTOM_OPP_PATH`` **before** its own
59``import torch`` call so that, when the subsequent ``import torch_npu``
60causes ``libopapi.so`` to load, the variable is already present.
62**Limitation — if** ``torch_npu`` **was imported before this module**:
63``libopapi.so`` will have been loaded (and ``g_custom_lib_path`` frozen)
64before we can set ``ASCEND_CUSTOM_OPP_PATH``. In that scenario the vendor
65libs will not be found via ``GetOpApiFuncAddr``. The only reliable fix is to
66set ``ASCEND_CUSTOM_OPP_PATH`` at the shell level (or at the very top of the
67Python entry-point script) **before** any ``import torch_npu``.
68"""
69import ctypes
70import os
71import re as _re
73# ---------------------------------------------------------------------------
74# Step 0: locate both CANN vendor lib dirs (forward + backward packages).
75#
76# Two vendor packages provide separate libcust_opapi.so files:
77# Forward : multicore_moe_ffn_nn → aclnnMulticoreMoeFfn*
78# Backward: multicore_moe_ffn_grad_nn → aclnnMulticoreMoeFfnGrad*
79#
80# Resolution order (highest priority first):
81# 1. CANN_VENDOR_FWD_LIBDIR / CANN_VENDOR_BWD_LIBDIR (explicit per-op)
82# 2. CANN_VENDOR_LIBDIR (legacy single-lib; used for both fwd and bwd)
83# 3. Auto-detect from prebuild tarball / directory (no env vars needed)
84#
85# This block uses only pure-Python (os.path / tarfile) so it is safe to run
86# before any C++ .so is loaded.
87# ---------------------------------------------------------------------------
_ASCEND_HOME = os.environ.get("ASCEND_HOME_PATH",
                              "/usr/local/Ascend/ascend-toolkit/latest")

# Auto-detect location: the prebuild directory is 2 levels up from
# platform/torch/.
_PREBUILD_DIR = os.path.normpath(
    os.path.join(os.path.dirname(__file__), "../../prebuild/multicore_moe_ffn"))


def _extract_prebuild(tarball, dest_parent):
    """Unpack the prebuild *tarball* into the directory *dest_parent*.

    The ``data`` extraction filter is requested whenever the running
    interpreter provides it, so archive members cannot escape
    *dest_parent*.  Interpreters without extraction filters fall back to
    the plain ``extractall`` call.
    """
    # Imported lazily: only needed on the first import after installation.
    import tarfile as _tarfile
    with _tarfile.open(tarball) as _tf:
        if hasattr(_tarfile, "data_filter"):
            _tf.extractall(dest_parent, filter="data")
        else:
            _tf.extractall(dest_parent)


def _resolve_vendor_libdirs(environ, prebuild_dir):
    """Return ``(fwd_libdir, bwd_libdir)`` for the two vendor packages.

    Each directory is resolved independently, highest priority first:

    1. ``CANN_VENDOR_FWD_LIBDIR`` / ``CANN_VENDOR_BWD_LIBDIR``
    2. ``CANN_VENDOR_LIBDIR`` (legacy single-lib, used for whichever of the
       two is still unset)
    3. ``<prebuild_dir>/vendors/<package>/op_api/lib``; if *prebuild_dir*
       does not exist but ``<prebuild_dir>.tar.gz`` does, the tarball is
       extracted next to it first.

    Parameters
    ----------
    environ :
        Mapping to read the variables from (normally ``os.environ``).
    prebuild_dir :
        Path of the unpacked prebuild tree.

    Returns
    -------
    tuple of str
        The forward and backward lib dirs; an entry is ``""`` when no
        source provided it.
    """
    fwd = environ.get("CANN_VENDOR_FWD_LIBDIR", "")
    bwd = environ.get("CANN_VENDOR_BWD_LIBDIR", "")
    if fwd and bwd:
        return fwd, bwd

    legacy = environ.get("CANN_VENDOR_LIBDIR", "")
    if legacy:
        # Fill only the side that has no explicit per-op override.
        return fwd or legacy, bwd or legacy

    tarball = prebuild_dir + ".tar.gz"
    if not os.path.isdir(prebuild_dir) and os.path.isfile(tarball):
        _extract_prebuild(tarball, os.path.dirname(prebuild_dir))

    if os.path.isdir(prebuild_dir):
        vendors = os.path.join(prebuild_dir, "vendors")
        auto_fwd = os.path.join(vendors, "multicore_moe_ffn_nn", "op_api", "lib")
        auto_bwd = os.path.join(vendors, "multicore_moe_ffn_grad_nn", "op_api", "lib")
        if not fwd and os.path.isdir(auto_fwd):
            fwd = auto_fwd
        if not bwd and os.path.isdir(auto_bwd):
            bwd = auto_bwd
    return fwd, bwd


_CANN_VENDOR_FWD_LIBDIR, _CANN_VENDOR_BWD_LIBDIR = _resolve_vendor_libdirs(
    os.environ, _PREBUILD_DIR)
117# ---------------------------------------------------------------------------
118# Step 1: set ASCEND_CUSTOM_OPP_PATH from the detected vendor lib dirs.
119#
120# GetOpApiFuncAddr searches each entry in ASCEND_CUSTOM_OPP_PATH for
121# op_api/lib/libcust_opapi.so to resolve aclnnMulticoreMoeFfn* symbols.
122# g_custom_lib_path (a C++ static-duration global in op_api_common.cpp) is
123# populated from ASCEND_CUSTOM_OPP_PATH once — when the .so that contains
124# op_api_common.cpp is first loaded into the process. Depending on the
125# build, that .so may be libopapi.so (loaded transitively by torch or
126# torch_npu) or our own extension.
127#
128# To cover both cases, ASCEND_CUSTOM_OPP_PATH MUST be set before ANY
129# C++ .so import — i.e., before `import torch` below.
130# ---------------------------------------------------------------------------
# Prepend each detected vendor root to ASCEND_CUSTOM_OPP_PATH unless it is
# already listed there.
for _libdir in (_CANN_VENDOR_FWD_LIBDIR, _CANN_VENDOR_BWD_LIBDIR):
    if not _libdir:
        continue
    # The variable holds vendor roots; the lookup appends op_api/lib itself,
    # so strip that suffix from the lib dir.
    _vendor_root = _re.sub(r'/op_api/lib/?$', '', _libdir)
    if not _vendor_root:
        continue
    _cur_opp = os.environ.get('ASCEND_CUSTOM_OPP_PATH', '')
    # Compare whole ':'-separated entries (ignoring a trailing slash).  A
    # plain substring test would treat "/opt/vendor" as already present when
    # only "/opt/vendor_old" is listed, and the vendor root would be skipped.
    _known_roots = {
        _entry.rstrip('/') for _entry in _cur_opp.split(':') if _entry
    }
    if _vendor_root.rstrip('/') in _known_roots:
        continue
    os.environ['ASCEND_CUSTOM_OPP_PATH'] = (
        f"{_vendor_root}:{_cur_opp}" if _cur_opp else _vendor_root)
139import torch # pylint: disable=import-self,wrong-import-position # noqa: E402
141# ---------------------------------------------------------------------------
142# Step 2: pre-load CANN base libs with RTLD_GLOBAL.
143#
144# libascendcl.so: belt-and-suspenders — import torch_npu loads it, but
145# promoting to GLOBAL ensures any late-loading consumer sees its symbols.
146# libopapi.so: provides aclopExecutor which libcust_opapi.so calls without
147# DT_NEEDED (CANN cmake allows undefined symbols). Must be GLOBAL first.
148# ---------------------------------------------------------------------------
# libascendcl.so: try the known install locations in order and promote the
# first one found to RTLD_GLOBAL.
for _ascendcl_path in (
    os.path.join(_ASCEND_HOME, "lib64", "libascendcl.so"),
    os.path.join(_ASCEND_HOME, "fwkacllib", "lib64", "libascendcl.so"),
):
    if not os.path.exists(_ascendcl_path):
        continue
    ctypes.CDLL(_ascendcl_path, mode=ctypes.RTLD_GLOBAL)
    break

import torch_npu  # noqa: F401 — loads libtorch_npu.so  # pylint: disable=wrong-import-position

# libopapi.so: made globally visible before the vendor libcust_opapi.so
# files are loaded, because those reference its symbols without DT_NEEDED.
_opapi_path = os.path.join(_ASCEND_HOME, "lib64", "libopapi.so")
if os.path.exists(_opapi_path):
    ctypes.CDLL(_opapi_path, mode=ctypes.RTLD_GLOBAL)
163# ---------------------------------------------------------------------------
164# Step 3: pre-load both libcust_opapi.so files with RTLD_GLOBAL.
165#
166# Forward (multicore_moe_ffn_nn) exports: aclnnMulticoreMoeFfn*
167# Backward (multicore_moe_ffn_grad_nn) exports: aclnnMulticoreMoeFfnGrad*
168# Pre-loading with RTLD_GLOBAL ensures symbols from both are globally visible
169# so GetOpApiFuncAddr can resolve either operator at runtime.
170# ---------------------------------------------------------------------------
# Load libcust_opapi.so from the forward and then the backward vendor dir,
# skipping any dir that is unset or does not contain the library.
for _vendor_libdir in (_CANN_VENDOR_FWD_LIBDIR, _CANN_VENDOR_BWD_LIBDIR):
    if not _vendor_libdir:
        continue
    _cust_opapi = os.path.join(_vendor_libdir, "libcust_opapi.so")
    if not os.path.exists(_cust_opapi):
        continue
    ctypes.CDLL(_cust_opapi, mode=ctypes.RTLD_GLOBAL)
177# ---------------------------------------------------------------------------
178# Step 4: import the compiled extension.
179#
180# ASCEND_CUSTOM_OPP_PATH is now set (Step 1) and libcust_opapi.so is
181# pre-loaded (Step 3), so g_custom_lib_path will be initialized correctly
182# when the .so's static initializers run.
183# ---------------------------------------------------------------------------
184from . import hyper_parallel_multicore_moe_ffn_pta # noqa: F401, E402 # pylint: disable=wrong-import-position,import-self
186# ---------------------------------------------------------------------------
187# Python wrappers — thin pass-through to the registered C++ ops
188# ---------------------------------------------------------------------------
def moe_ffn_fwd(
    dispatch_target, dispatch_target_off,
    dispatch_src, dispatch_src_off, dispatch_size,
    up_proj_weight, up_proj_glist,
    up_proj_y, swiglu_out,
    down_proj_weight, down_proj_glist, down_proj_y,
    combine_target, combine_target_off, combine_src_off, combine_size,
    gmm_workspace, up_proj_tiling, swiglu_tiling, down_proj_tiling,
    runtime_config, all_event_counters,
    rank_id: int, ep: int, expert_num: int,
    hidden_size: int, seq_size: int,
):
    """
    Run the MoE-FFN forward operator.

    Thin pass-through to ``torch.ops.hyper_parallel.moe_ffn_fwd``: every
    argument is forwarded positionally, in declaration order, and nothing is
    returned.  Results are written in-place into ``dispatch_target``,
    ``up_proj_y``, ``swiglu_out``, ``down_proj_y`` and ``combine_target``,
    so all of these must be pre-allocated with the correct shapes.

    Parameters
    ----------
    dispatch_target, dispatch_target_off, dispatch_src, dispatch_src_off,
    dispatch_size :
        AllToAll dispatch buffers; ``dispatch_target`` is written in-place.
    up_proj_weight, up_proj_glist :
        Expert weight and cumulative group sizes for GMM1 (up-projection).
    up_proj_y, swiglu_out :
        GMM1 output and SwiGLU output; both written in-place.
    down_proj_weight, down_proj_glist, down_proj_y :
        Expert weight, cumulative group sizes and output for GMM2
        (down-projection).
    combine_target, combine_target_off, combine_src_off, combine_size :
        AllToAll combine buffers; ``combine_target`` is written in-place.
    gmm_workspace, up_proj_tiling, swiglu_tiling, down_proj_tiling :
        Pre-computed tiling tensors (from gen_runtime_data.py).
    runtime_config :
        Per-rank runtime config tensor (from gen_runtime_data.py).
    all_event_counters :
        Event synchronization counter tensor.
    rank_id, ep, expert_num, hidden_size, seq_size :
        Topology / shape attributes.
    """
    fwd_op = torch.ops.hyper_parallel.moe_ffn_fwd
    # Tensor operands, in the positional order of the op call.
    tensor_args = (
        dispatch_target, dispatch_target_off,
        dispatch_src, dispatch_src_off, dispatch_size,
        up_proj_weight, up_proj_glist,
        up_proj_y, swiglu_out,
        down_proj_weight, down_proj_glist, down_proj_y,
        combine_target, combine_target_off, combine_src_off, combine_size,
        gmm_workspace, up_proj_tiling, swiglu_tiling, down_proj_tiling,
        runtime_config, all_event_counters,
    )
    # Integer attributes follow the tensors.
    int_attrs = (rank_id, ep, expert_num, hidden_size, seq_size)
    fwd_op(*tensor_args, *int_attrs)
def moe_ffn_bwd(
    dispatch_target, dispatch_target_off,
    dy, dispatch_src_off, dispatch_size,
    hidden, hidden_dw,
    w2, act_grad_y, gate, grad_gate, w1, gate_dx, grad_x,
    combine_target_off, combine_src_off, combine_size,
    permute_out, gate_dw, group_list,
    act_grad_tiling, gate_grad_tiling, w2_grad_tiling, w1_grad_tiling,
    swiglu_grad_tiling, gmm_workspace, swiglu_grad_workspace,
    runtime_config, all_event_counters,
    rank_id: int, ep: int, expert_num: int,
    hidden_size: int, seq_size: int,
):
    """
    Run the MoE-FFN backward operator.

    Thin pass-through to ``torch.ops.hyper_parallel.moe_ffn_bwd``: every
    argument is forwarded positionally, in declaration order, and nothing is
    returned.  Results are written in-place into ``dispatch_target``,
    ``hidden_dw``, ``act_grad_y``, ``grad_gate``, ``gate_dx``, ``grad_x``,
    ``permute_out`` and ``gate_dw``, so all of these must be pre-allocated
    with the correct shapes.

    Parameters
    ----------
    dispatch_target, dispatch_target_off, dy, dispatch_src_off, dispatch_size :
        AllToAll dispatch buffers.  ``dy`` is the source gradient tensor;
        ``dispatch_target`` receives the dispatched gradient in-place.
    hidden :
        SwiGLU output saved from the forward pass (used by W2-grad, GMM4).
    hidden_dw :
        W2 weight gradient; written in-place.
    w2 :
        W2 weight (= down_proj_weight from forward).
    act_grad_y :
        Activation gradient output from GMM1 bwd (target @ W2.T); written
        in-place.
    gate :
        up_proj_y saved from the forward pass (SwiGLU input).
    grad_gate :
        SwiGLU gradient output; written in-place.
    w1 :
        W1 weight (= up_proj_weight from forward).
    gate_dx :
        GMM2 bwd output (grad_gate @ W1.T), before AllToAll combine; written
        in-place.
    grad_x :
        AllToAll combine output (final activation gradient); written
        in-place.
    combine_target_off, combine_src_off, combine_size :
        AllToAll combine buffer descriptors.
    permute_out :
        In-place intermediate buffer for W1-grad (GMM4).
    gate_dw :
        W1 weight gradient; written in-place.
    group_list :
        Cumulative expert token counts ([E] int64).
    act_grad_tiling, gate_grad_tiling, w2_grad_tiling, w1_grad_tiling,
    swiglu_grad_tiling :
        Pre-computed tiling tensors (from gen_runtime_data.py bwd).
    gmm_workspace, swiglu_grad_workspace :
        Workspace tensors.
    runtime_config :
        Per-rank runtime config tensor (from gen_runtime_data.py bwd).
    all_event_counters :
        Event synchronization counter tensor.
    rank_id, ep, expert_num, hidden_size, seq_size :
        Topology / shape attributes.
    """
    bwd_op = torch.ops.hyper_parallel.moe_ffn_bwd
    # Tensor operands, in the positional order of the op call.
    tensor_args = (
        dispatch_target, dispatch_target_off,
        dy, dispatch_src_off, dispatch_size,
        hidden, hidden_dw,
        w2, act_grad_y, gate, grad_gate, w1, gate_dx, grad_x,
        combine_target_off, combine_src_off, combine_size,
        permute_out, gate_dw, group_list,
        act_grad_tiling, gate_grad_tiling, w2_grad_tiling, w1_grad_tiling,
        swiglu_grad_tiling, gmm_workspace, swiglu_grad_workspace,
        runtime_config, all_event_counters,
    )
    # Integer attributes follow the tensors.
    int_attrs = (rank_id, ep, expert_num, hidden_size, seq_size)
    bwd_op(*tensor_args, *int_attrs)
321__all__ = ["moe_ffn_fwd", "moe_ffn_bwd"]