============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops, configfile: ../../../../../../sault/virtual_test/virtualenv_002/sault/config/pytest.ini
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collected 1 item

test_parallel.py
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
Start worker process with rank id:0, log file:parallel/matmul_reduce_scatter/test_binary_case_init2/worker_0.log. Environment variable [RANK_ID=0] is exported.
Start worker process with rank id:0, log file:parallel/matmul_reduce_scatter/test_binary_case_init0/worker_0.log. Environment variable [RANK_ID=0] is exported.
Start worker process with rank id:1, log file:parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log. Environment variable [RANK_ID=1] is exported.
Start worker process with rank id:1, log file:parallel/matmul_reduce_scatter/test_binary_case_init2/worker_1.log. Environment variable [RANK_ID=1] is exported.
Start worker process with rank id:0, log file:parallel/matmul_reduce_scatter/test_binary_case_init1/worker_0.log. Environment variable [RANK_ID=0] is exported.
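The numpy getlimits warnings above are emitted once per spawned interpreter and appear unrelated to the failure reported later in this log. If they need to be silenced in the worker logs, a standard warnings filter is enough; the sketch below is illustrative only and assumes an early import point such as conftest.py is an acceptable place for it:

    # Illustrative only: silence the benign numpy subnormal warnings seen above.
    import warnings

    warnings.filterwarnings(
        "ignore",
        message="The value of the smallest subnormal for type.*",
        category=UserWarning,
        module=r"numpy\.core\.getlimits",
    )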
[WARNING] ME(1433294:281473495330496,MainProcess):2025-07-15-13:36:37.578.701 [mindspore/parallel/cluster/process_entity/_api.py:267] Distributed job is spawned. Waiting all processes to exit...
[WARNING] ME(1433293:281472927264448,MainProcess):2025-07-15-13:36:37.589.303 [mindspore/parallel/cluster/process_entity/_api.py:267] Distributed job is spawned. Waiting all processes to exit...
Start worker process with rank id:1, log file:parallel/matmul_reduce_scatter/test_binary_case_init1/worker_1.log. Environment variable [RANK_ID=1] is exported.
[WARNING] ME(1433295:281473295314624,MainProcess):2025-07-15-13:36:37.754.396 [mindspore/parallel/cluster/process_entity/_api.py:267] Distributed job is spawned. Waiting all processes to exit...
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ...
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/matmul_reduce_scatter.py::test_binary_case[_init2]
collected 3 items / 2 deselected / 1 selected

parallel/matmul_reduce_scatter.py::test_binary_case[_init0]
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/matmul_reduce_scatter.py::test_binary_case[_init1]
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/matmul_reduce_scatter.py::test_binary_case[_init0]
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/matmul_reduce_scatter.py::test_binary_case[_init2]
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ...
collected 3 items / 2 deselected / 1 selected

parallel/matmul_reduce_scatter.py::test_binary_case[_init1] PASSED [100%]

================= 1 passed, 2 deselected, 22 warnings in 9.20s =================
PASSED [100%]

================= 1 passed, 2 deselected, 22 warnings in 9.45s =================
[INFO] PS(1433512,ffff39fbefa0,python):2025-07-15-13:36:54.961.473 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
[INFO] PS(1433512,ffff397aefa0,python):2025-07-15-13:36:54.961.507 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
[INFO] PS(1433526,ffff3e7cefa0,python):2025-07-15-13:36:55.186.667 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
[INFO] PS(1433526,ffff3efdefa0,python):2025-07-15-13:36:55.186.710 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
PASSED [100%]

================ 1 passed, 2 deselected, 22 warnings in 20.29s =================
PASSED [100%]

================ 1 passed, 2 deselected, 22 warnings in 20.57s =================
[INFO] PS(1433530,ffff1fffefa0,python):2025-07-15-13:37:07.281.590 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
[INFO] PS(1433530,ffff34aaefa0,python):2025-07-15-13:37:07.281.590 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
[INFO] PS(1433536,ffff0f7eefa0,python):2025-07-15-13:37:07.505.652 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
[INFO] PS(1433536,ffff0efdefa0,python):2025-07-15-13:37:07.505.652 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
ERROR [100%]

==================================== ERRORS ====================================
__________________ ERROR at setup of test_binary_case[_init0] __________________

request = >

    @pytest.fixture(params=[
        (ms.GRAPH_MODE, 'O0'),
        (ms.GRAPH_MODE, 'O2'),
        (ms.PYNATIVE_MODE, ''),
    ], autouse=True)
    def _init(request):
        mode, jit_level = request.param
>       ms.communication.init()

parallel/conftest.py:27:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

backend_name = 'hccl'

    def init(backend_name=None):
        """
        Initialize distributed backends required by communication services, e.g. ``"hccl"`` / ``"nccl"`` / ``"mccl"``.
        It is usually used in distributed parallel scenarios and set before using communication services.

        Note:
            - The full name of ``"hccl"`` is Huawei Collective Communication Library(HCCL).
            - The full name of ``"nccl"`` is NVIDIA Collective Communication Library(NCCL).
            - The full name of ``"mccl"`` is MindSpore Collective Communication Library(MCCL).
            - In Ascend hardware platforms, ``init()`` should be set before the definition of any Tensor and
              Parameter, and the instantiation and execution of any operation and net.

        Args:
            backend_name (str): Backend, using ``"hccl"`` / ``"nccl"`` / ``"mccl"``. ``"hccl"`` should be used for
                Ascend hardware platforms, ``"nccl"`` for GPU hardware platforms and ``"mccl"`` for CPU hardware
                platforms. If not set, inference is automatically made based on the hardware
                platform type (device_target). Default: ``None`` .

        Raises:
            TypeError: If `backend_name` is not a string.
            RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails,
                or the environment variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH
                have not been exported when backend is HCCL.
        Supported Platforms:
            ``Ascend`` ``GPU`` ``CPU``

        Examples:
            .. note::
                Before running the following examples, you need to configure the communication environment variables.
                For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
                without any third-party or configuration file dependencies.
                Please see the `msrun start up `_ for more details.

            >>> from mindspore.communication import init
            >>> init()
        """
        host_init = _host_distribute()
        device_target = context.get_context("device_target")

        if backend_name is None:
            if device_target == "Ascend":
                backend_name = "hccl"
            elif device_target == "GPU":
                backend_name = "nccl"
            elif device_target == "CPU":
                backend_name = "mccl"
            else:
                raise RuntimeError("For 'set_context', the argument 'device_target' {} is not supported in "
                                   "parallel initialization, please use Ascend, GPU or CPU.".format(device_target))
        if not isinstance(backend_name, str):
            raise TypeError("For 'init', the argument 'backend_name' must be a string, "
                            "but got the type : {}".format(type(backend_name)))
        if os.getenv("MS_ROLE") == "MS_SCHED":
            backend_name = "mccl"

        _set_elegant_exit_handle()
        if backend_name == "hccl":
            if _is_ps_mode():
                # Use MindSpore cluster to build network for Parameter Server training.
                init_cluster()
                if _is_role_sched() or _is_role_pserver():
                    raise RuntimeError("Parameter server and scheduler should use 'CPU' as backend instead of 'Ascend'")
                if _get_ps_context("worker_num") == 1:
                    GlobalComm.INITED = True
                    return
            if device_target != "Ascend":
                raise RuntimeError("For 'init', the argument 'backend_name' should be '{}' to init '{}', "
                                   "but got 'hccl'.".format(DEVICE_TO_BACKEND[device_target], device_target))
            if is_initialized(device_target):
                logger.warning(f"For 'init' in Ascend backend, the backend is already initialized, please set it before "
                               "the definition of any Tensor and Parameter, and the instantiation and execution of any "
                               "operation and net, otherwise the 'init' may not take effect.")
            if not host_init:
                _check_parallel_envs()
            GlobalComm.BACKEND = Backend("hccl")
            _check_hccl()
>           init_hccl()
E           RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[2] and device id[1], please check if device id is valid.
E
E           ----------------------------------------------------
E           - C++ Call Stack: (For framework developers)
E           ----------------------------------------------------
E           mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice

/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py:203: RuntimeError
---------------------------- Captured stderr setup -----------------------------
[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:43.380.530 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 27 source: 127.0.0.1:32910, destination: 127.0.0.1:50183
[WARNING] DISTRIBUTED(1433522,ffff377eefa0,python):2025-07-15-13:36:43.380.552 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:32910 to 127.0.0.1:50183 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:43.380.598 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:50183 to be connected...Retry number: 1
[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:43.880.839 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 28 source: 127.0.0.1:32928, destination: 127.0.0.1:50183
[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:43.880.870 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:50183 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(1433522,ffff3cb2efa0,python):2025-07-15-13:36:43.880.868 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:32928 to 127.0.0.1:50183 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:44.381.349 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200).
[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:44.881.456 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized.
[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:44.881.488 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1
[WARNING] PS(1433522,ffffa367eec0,python):2025-07-15-13:36:44.881.913 [mindspore/ccsrc/ps/core/file_configuration.cc:24] Initialize] The file: is not exist.
[WARNING] DEVICE(1433522,ffffa367eec0,python):2025-07-15-13:36:44.881.952 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_node.cc:33] Start] Failed to initialize the configuration for this mccl collective node.
[WARNING] PS(1433522,ffffa367eec0,python):2025-07-15-13:36:44.887.788 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:188] Init] The port 8375 is already in use. So increase port to: 8375
=========================== short test summary info ============================
ERROR parallel/matmul_reduce_scatter.py::test_binary_case[_init0] - RuntimeEr...
=========== 2 deselected, 22 warnings, 1 error in 161.79s (0:02:41) ============
[WARNING] DEVICE(1433522,ffffa367eec0,python):2025-07-15-13:39:25.435.825 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_res_manager.cc:350] SyncAllStreams] The ascend_res_manager_ is nullptr in scenarios where it is not actually executed
[INFO] PS(1433522,ffff357aefa0,python):2025-07-15-13:39:26.069.100 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
[INFO] PS(1433522,ffff35fbefa0,python):2025-07-15-13:39:26.069.436 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
[ERROR] ME(1433294:281473495330496,MainProcess):2025-07-15-13:39:27.620.7 [mindspore/parallel/cluster/process_entity/_api.py:363] Worker process 1433522 exit with exception. Error code: 1.
[WARNING] ME(1433294:281473495330496,MainProcess):2025-07-15-13:39:27.656.1 [mindspore/parallel/cluster/process_entity/_api.py:369] There's worker exits with exception, kill all other workers.
[ERROR] ME(1433294:281473495330496,MainProcess):2025-07-15-13:40:01.720.241 [mindspore/parallel/cluster/process_entity/_api.py:382] Scheduler process 1433510 exit with exception.
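The worker-side failure above comes from aclrtSetDevice rejecting device id 1 even though the runtime reports a device count of 2; the precise meaning of ret[507033] is not shown in this log. A minimal, purely illustrative pre-init check (not part of this test suite, variable names and placement are assumptions) that each worker could run before ms.communication.init() to record which logical device it is about to claim and whether that id falls inside ASCEND_RT_VISIBLE_DEVICES:

    # Illustrative pre-init diagnostic; not a diagnosis of this particular failure.
    import os

    rank_id = int(os.environ.get("RANK_ID", "-1"))
    visible = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "")
    visible_ids = [d.strip() for d in visible.split(",") if d.strip()]

    print(f"RANK_ID={rank_id}  ASCEND_RT_VISIBLE_DEVICES='{visible}'  "
          f"visible device count={len(visible_ids)}")
    if visible_ids and not 0 <= rank_id < len(visible_ids):
        raise RuntimeError(
            f"Rank {rank_id} maps outside the {len(visible_ids)} visible device(s); "
            "check ASCEND_RT_VISIBLE_DEVICES before calling ms.communication.init()."
        )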
[ERROR] ME(1433294:281473495330496,MainProcess):2025-07-15-13:40:01.721.300 [mindspore/parallel/cluster/process_entity/_api.py:603] Time out nodes are ['0']
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-3-cachedir: .pytest_cache
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-4-rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-5-plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-6-collecting ... collected 3 items / 2 deselected / 1 selected
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-7-
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:8:parallel/matmul_reduce_scatter.py::test_binary_case[_init0] ERROR [100%]
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-9-
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:10:==================================== ERRORS ====================================
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:11:__________________ ERROR at setup of test_binary_case[_init0] __________________
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-12-
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-13-request = >
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-14-
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-15- @pytest.fixture(params=[
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-16- (ms.GRAPH_MODE, 'O0'),
--
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-44- ``"nccl"`` for GPU hardware platforms and ``"mccl"`` for CPU hardware platforms.
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-45- If not set, inference is automatically made based on the hardware
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-46- platform type (device_target). Default: ``None`` .
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-47-
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-48- Raises:
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:49: TypeError: If `backend_name` is not a string.
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:50: RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails,
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-51- or the environment variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-52- have not been exported when backend is HCCL.
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-53-
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-54- Supported Platforms:
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-55- ``Ascend`` ``GPU`` ``CPU``
--
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-76- elif device_target == "GPU":
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-77- backend_name = "nccl"
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-78- elif device_target == "CPU":
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-79- backend_name = "mccl"
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-80- else:
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:81: raise RuntimeError("For 'set_context', the argument 'device_target' {} is not supported in "
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-82- "parallel initialization, please use Ascend, GPU or CPU.".format(device_target))
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-83- if not isinstance(backend_name, str):
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:84: raise TypeError("For 'init', the argument 'backend_name' must be a string, "
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-85- "but got the type : {}".format(type(backend_name)))
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-86- if os.getenv("MS_ROLE") == "MS_SCHED":
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-87- backend_name = "mccl"
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-88-
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-89- _set_elegant_exit_handle()
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-90- if backend_name == "hccl":
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-91- if _is_ps_mode():
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-92- # Use MindSpore cluster to build network for Parameter Server training.
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-93- init_cluster()
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-94- if _is_role_sched() or _is_role_pserver():
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:95: raise RuntimeError("Parameter server and scheduler should use 'CPU' as backend instead of 'Ascend'")
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-96- if _get_ps_context("worker_num") == 1:
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-97- GlobalComm.INITED = True
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-98- return
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-99- if device_target != "Ascend":
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:100: raise RuntimeError("For 'init', the argument 'backend_name' should be '{}' to init '{}', "
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-101- "but got 'hccl'.".format(DEVICE_TO_BACKEND[device_target], device_target))
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-102- if is_initialized(device_target):
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-103- logger.warning(f"For 'init' in Ascend backend, the backend is already initialized, please set it before "
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-104- "the definition of any Tensor and Parameter, and the instantiation and execution of any "
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-105- "operation and net, otherwise the 'init' may not take effect.")
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-106- if not host_init:
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-107- _check_parallel_envs()
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-108- GlobalComm.BACKEND = Backend("hccl")
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-109- _check_hccl()
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-110-> init_hccl()
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:111:E RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[2] and device id[1], please check if device id is valid.
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-112-E
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-113-E ----------------------------------------------------
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-114-E - C++ Call Stack: (For framework developers)
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-115-E ----------------------------------------------------
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-116-E mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-117-
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:118:/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py:203: RuntimeError
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-119----------------------------- Captured stderr setup -----------------------------
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-120-[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:43.380.530 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 27 source: 127.0.0.1:32910, destination: 127.0.0.1:50183
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-121-[WARNING] DISTRIBUTED(1433522,ffff377eefa0,python):2025-07-15-13:36:43.380.552 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:32910 to 127.0.0.1:50183 is successfully created. System errno: Success
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-122-[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:43.380.598 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:50183 to be connected...Retry number: 1
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-123-[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:43.880.839 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 28 source: 127.0.0.1:32928, destination: 127.0.0.1:50183
--
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-128-[WARNING] DISTRIBUTED(1433522,ffffa367eec0,python):2025-07-15-13:36:44.881.488 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-129-[WARNING] PS(1433522,ffffa367eec0,python):2025-07-15-13:36:44.881.913 [mindspore/ccsrc/ps/core/file_configuration.cc:24] Initialize] The file: is not exist.
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-130-[WARNING] DEVICE(1433522,ffffa367eec0,python):2025-07-15-13:36:44.881.952 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_node.cc:33] Start] Failed to initialize the configuration for this mccl collective node.
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-131-[WARNING] PS(1433522,ffffa367eec0,python):2025-07-15-13:36:44.887.788 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:188] Init] The port 8375 is already in use. So increase port to: 8375
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-132-=========================== short test summary info ============================
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log:133:ERROR parallel/matmul_reduce_scatter.py::test_binary_case[_init0] - RuntimeEr...
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-134-=========== 2 deselected, 22 warnings, 1 error in 161.79s (0:02:41) ============
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-135-[WARNING] DEVICE(1433522,ffffa367eec0,python):2025-07-15-13:39:25.435.825 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_res_manager.cc:350] SyncAllStreams] The ascend_res_manager_ is nullptr in scenarios where it is not actually executed
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-136-[INFO] PS(1433522,ffff357aefa0,python):2025-07-15-13:39:26.069.100 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
parallel/matmul_reduce_scatter/test_binary_case_init0/worker_1.log-137-[INFO] PS(1433522,ffff35fbefa0,python):2025-07-15-13:39:26.069.436 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
--
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-3-cachedir: .pytest_cache
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-4-rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-5-plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-6-collecting ... collected 3 items / 2 deselected / 1 selected
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-7-
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:8:parallel/matmul_reduce_scatter.py::test_binary_case[_init0] ERROR [100%]
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-9-
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:10:==================================== ERRORS ====================================
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:11:__________________ ERROR at setup of test_binary_case[_init0] __________________
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-12-
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-13-request = >
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-14-
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-15- @pytest.fixture(params=[
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-16- (ms.GRAPH_MODE, 'O0'),
--
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-44- ``"nccl"`` for GPU hardware platforms and ``"mccl"`` for CPU hardware platforms.
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-45- If not set, inference is automatically made based on the hardware
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-46- platform type (device_target). Default: ``None`` .
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-47-
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-48- Raises:
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:49: TypeError: If `backend_name` is not a string.
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:50: RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails,
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-51- or the environment variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-52- have not been exported when backend is HCCL.
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-53-
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-54- Supported Platforms:
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-55- ``Ascend`` ``GPU`` ``CPU``
--
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-76- elif device_target == "GPU":
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-77- backend_name = "nccl"
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-78- elif device_target == "CPU":
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-79- backend_name = "mccl"
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-80- else:
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:81: raise RuntimeError("For 'set_context', the argument 'device_target' {} is not supported in "
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-82- "parallel initialization, please use Ascend, GPU or CPU.".format(device_target))
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-83- if not isinstance(backend_name, str):
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:84: raise TypeError("For 'init', the argument 'backend_name' must be a string, "
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-85- "but got the type : {}".format(type(backend_name)))
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-86- if os.getenv("MS_ROLE") == "MS_SCHED":
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-87- backend_name = "mccl"
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-88-
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-89- _set_elegant_exit_handle()
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-90- if backend_name == "hccl":
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-91- if _is_ps_mode():
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-92- # Use MindSpore cluster to build network for Parameter Server training.
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-93- init_cluster()
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-94- if _is_role_sched() or _is_role_pserver():
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:95: raise RuntimeError("Parameter server and scheduler should use 'CPU' as backend instead of 'Ascend'")
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-96- if _get_ps_context("worker_num") == 1:
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-97- GlobalComm.INITED = True
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-98- return
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-99- if device_target != "Ascend":
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:100: raise RuntimeError("For 'init', the argument 'backend_name' should be '{}' to init '{}', "
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-101- "but got 'hccl'.".format(DEVICE_TO_BACKEND[device_target], device_target))
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-102- if is_initialized(device_target):
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-103- logger.warning(f"For 'init' in Ascend backend, the backend is already initialized, please set it before "
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-104- "the definition of any Tensor and Parameter, and the instantiation and execution of any "
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-105- "operation and net, otherwise the 'init' may not take effect.")
--
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-109- _check_hccl()
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-110- init_hccl()
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-111- GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-112- elif backend_name == "nccl":
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-113- if device_target != "GPU":
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:114: raise RuntimeError("For 'init', the argument 'backend_name' should be '{}' to init '{}', "
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-115- "but got 'nccl'.".format(DEVICE_TO_BACKEND[device_target], device_target))
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-116- init_cluster()
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-117- GlobalComm.BACKEND = Backend("nccl")
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-118- GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-119- elif backend_name == "mccl":
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-120-> init_cluster()
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:121:E RuntimeError: The total number of timed out node is 1. Timed out node list is: [const vector]{0}, worker 0 is the first one timed out, please check its log.
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-122-E
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-123-E ----------------------------------------------------
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-124-E - C++ Call Stack: (For framework developers)
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-125-E ----------------------------------------------------
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-126-E mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:517 UpdateTopoState
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-127-
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:128:/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py:213: RuntimeError
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-129----------------------------- Captured stderr setup -----------------------------
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-130-[WARNING] DISTRIBUTED(1433510,ffffa9ffeec0,python):2025-07-15-13:36:43.266.250 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200).
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-131-[WARNING] DISTRIBUTED(1433510,ffffa9ffeec0,python):2025-07-15-13:36:43.766.391 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200).
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-132-[WARNING] DISTRIBUTED(1433510,ffff4349efa0,python):2025-07-15-13:36:44.088.459 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 0(role: MS_WORKER), rank id: 0, device id: 0, hostname: ascend213, ip: 127.0.0.1 is registered successfully. Currently registered node number: 1, expected node number: 2
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-133-[WARNING] DISTRIBUTED(1433510,ffffa9ffeec0,python):2025-07-15-13:36:44.266.487 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/1200).
--
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-212-[WARNING] DISTRIBUTED(1433510,ffffa9ffeec0,python):2025-07-15-13:39:44.772.048 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster...
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-213-[WARNING] DISTRIBUTED(1433510,ffffa9ffeec0,python):2025-07-15-13:39:49.772.156 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 1 alive nodes.
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-214-[WARNING] DISTRIBUTED(1433510,ffffa9ffeec0,python):2025-07-15-13:39:49.772.215 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster...
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-215-[WARNING] DISTRIBUTED(1433510,ffffa9ffeec0,python):2025-07-15-13:39:54.772.308 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 1 alive nodes.
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-216-[WARNING] DISTRIBUTED(1433510,ffffa9ffeec0,python):2025-07-15-13:39:54.772.336 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster...
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:217:[ERROR] DISTRIBUTED(1433510,ffff4247efa0,python):2025-07-15-13:39:57.288.487 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:511] UpdateTopoState] The node: 0 is timed out. It may exit with exception, please check this node's log.
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:218:[ERROR] DISTRIBUTED(1433510,ffffa9ffeec0,python):2025-07-15-13:39:59.772.441 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:103] Finalize] There are 1 abnormal compute graph nodes.
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-219-=========================== short test summary info ============================
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log:220:ERROR parallel/matmul_reduce_scatter.py::test_binary_case[_init0] - RuntimeEr...
parallel/matmul_reduce_scatter/test_binary_case_init0/scheduler.log-221-=========== 2 deselected, 22 warnings, 1 error in 196.90s (0:03:16) ============
Traceback (most recent call last):
  File "/home/jenkins/anaconda3/envs/ci39/bin/msrun", line 8, in
    sys.exit(main())
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/run.py", line 191, in main
    run(args)
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/run.py", line 185, in run
    process_manager.run()
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/process_entity/_api.py", line 268, in run
    self.join_processes()
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/process_entity/_api.py", line 387, in join_processes
    raise RuntimeError("Distributed job exited with exception. Please check logs in "
RuntimeError: Distributed job exited with exception. Please check logs in directory: parallel/matmul_reduce_scatter/test_binary_case_init0.
F

=================================== FAILURES ===================================
____________________ test_matmul_reduce_scatter_binary_case ____________________

    @mark_utils.arg_mark(
        plat_marks=['platform_ascend910b'],
        level_mark='level1',
        card_mark='allcards',
        essential_mark='essential',
    )
    def test_matmul_reduce_scatter_binary_case() -> None:
        """
        Feature: mindspore.ops.matmul_reduce_scatter
        Description: Test the precision of forward calculation.
        Expectation: The result of mindspore.ops.matmul_reduce_scatter forward calculation is equal to the result
            of torch_npu.npu_mm_reduce_scatter_base forward calculation.
""" > run_parallel_tests( _MATMUL_REDUCE_SCATTER_TEST_SCRIPT, _TEST_BINARY_CASE, _PARALLEL_STRATEGY, run_matmul_reduce_scatter_test, ) test_parallel.py:202: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ test_parallel.py:123: in run_parallel_tests raise thread.exception test_parallel.py:52: in run super().run() /home/jenkins/anaconda3/envs/ci39/lib/python3.9/threading.py:917: in run self._target(*self._args, **self._kwargs) test_parallel.py:105: in run_matmul_reduce_scatter_test run_test(test_script, test_name, devices, filter_, hccl_port, {'HCCL_DETERMINISTIC': 'true'}) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ test_script = 'parallel/matmul_reduce_scatter.py' test_name = 'test_binary_case', devices = '0,1', filter_ = '_init0' hccl_port = '61000', env_vars = {'HCCL_DETERMINISTIC': 'true'} def run_test( test_script: str, test_name: str, devices: str, filter_: str, hccl_port: str, env_vars: dict_annotation[str, str] = None, ) -> None: os.environ['ASCEND_RT_VISIBLE_DEVICES'] = devices if env_vars: for key, value in env_vars.items(): os.environ[key] = value port = get_free_port() path = pathlib.Path(test_script) log_dir = path.parent / path.stem / (test_name + filter_) status = os.system(rf''' export ASCEND_RT_VISIBLE_DEVICES={devices} \ && export HCCL_IF_BASE_PORT={hccl_port} \ && msrun \ --worker_num {_WORKER_NUM} \ --local_worker_num {_WORKER_NUM} \ --join True \ --master_port {port} \ --log_dir {log_dir.as_posix()} \ pytest -vra --disable-warnings -k '{filter_}' {test_script}::{test_name} ''') if status != 0: > raise RuntimeError(f'Test failed with status {status}, please check {log_dir.as_posix()} for more details.') E RuntimeError: Test failed with status 256, please check parallel/matmul_reduce_scatter/test_binary_case_init0 for more details. test_parallel.py:91: RuntimeError =========================== short test summary info ============================ FAILED test_parallel.py::test_matmul_reduce_scatter_binary_case - RuntimeErro... ======================== 1 failed in 211.15s (0:03:31) =========================