============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops, configfile: ../../../../../../sault/virtual_test/virtualenv_002/sault/config/pytest.ini
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collected 1 item

test_parallel.py 
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
Start worker process with rank id:0, log file:parallel/all_gather_matmul/test_dynamic_shape_init0/worker_0.log. Environment variable [RANK_ID=0] is exported.
Start worker process with rank id:0, log file:parallel/all_gather_matmul/test_dynamic_shape_init2/worker_0.log. Environment variable [RANK_ID=0] is exported.
Start worker process with rank id:1, log file:parallel/all_gather_matmul/test_dynamic_shape_init0/worker_1.log. Environment variable [RANK_ID=1] is exported.
Start worker process with rank id:0, log file:parallel/all_gather_matmul/test_dynamic_shape_init1/worker_0.log. Environment variable [RANK_ID=0] is exported.
Start worker process with rank id:1, log file:parallel/all_gather_matmul/test_dynamic_shape_init2/worker_1.log. Environment variable [RANK_ID=1] is exported.
[WARNING] ME(1427704:281472968683200,MainProcess):2025-07-15-13:28:25.555.518 [mindspore/parallel/cluster/process_entity/_api.py:267] Distributed job is spawned. Waiting all processes to exit...
Start worker process with rank id:1, log file:parallel/all_gather_matmul/test_dynamic_shape_init1/worker_1.log. Environment variable [RANK_ID=1] is exported.
[WARNING] ME(1427702:281473797582528,MainProcess):2025-07-15-13:28:25.611.649 [mindspore/parallel/cluster/process_entity/_api.py:267] Distributed job is spawned. Waiting all processes to exit...
[WARNING] ME(1427703:281473588850368,MainProcess):2025-07-15-13:28:25.653.852 [mindspore/parallel/cluster/process_entity/_api.py:267] Distributed job is spawned. Waiting all processes to exit...
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/all_gather_matmul.py::test_dynamic_shape[_init0] 
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/all_gather_matmul.py::test_dynamic_shape[_init0] 
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/all_gather_matmul.py::test_dynamic_shape[_init2] 
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/all_gather_matmul.py::test_dynamic_shape[_init1] 
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/all_gather_matmul.py::test_dynamic_shape[_init2] 
============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1 -- /home/jenkins/anaconda3/envs/ci39/bin/python
cachedir: .pytest_cache
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collecting ... collected 3 items / 2 deselected / 1 selected

parallel/all_gather_matmul.py::test_dynamic_shape[_init1] 
PASSED                                                                   [100%]
PASSED                                                                   [100%]
================ 1 passed, 2 deselected, 22 warnings in 11.59s =================
================ 1 passed, 2 deselected, 22 warnings in 11.68s =================
PASSED                                                                   [100%]
PASSED                                                                   [100%]
================ 1 passed, 2 deselected, 22 warnings in 11.79s =================
================ 1 passed, 2 deselected, 22 warnings in 11.68s =================
[INFO] PS(1427933,ffff277eefa0,python):2025-07-15-13:28:45.585.780 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
[INFO] PS(1427933,ffff27ffefa0,python):2025-07-15-13:28:45.585.785 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
[INFO] PS(1427943,ffff27ffefa0,python):2025-07-15-13:28:46.610.679 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
[INFO] PS(1427943,ffff277eefa0,python):2025-07-15-13:28:46.610.679 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
[INFO] PS(1427937,ffff36fdefa0,python):2025-07-15-13:28:47.281.570 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
[INFO] PS(1427937,ffff367cefa0,python):2025-07-15-13:28:47.281.570 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
[INFO] PS(1427923,ffff28e9efa0,python):2025-07-15-13:28:47.346.026 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
[INFO] PS(1427923,ffff296aefa0,python):2025-07-15-13:28:47.346.035 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
ERROR                                                                    [100%]

==================================== ERRORS ====================================
_________________ ERROR at setup of test_dynamic_shape[_init0] _________________

request = >

    @pytest.fixture(params=[
        (ms.GRAPH_MODE, 'O0'),
        (ms.GRAPH_MODE, 'O2'),
        (ms.PYNATIVE_MODE, ''),
    ], autouse=True)
    def _init(request):
        mode, jit_level = request.param
>       ms.communication.init()

parallel/conftest.py:27: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

backend_name = 'hccl'

    def init(backend_name=None):
        """
        Initialize distributed backends required by communication services, e.g. ``"hccl"`` / ``"nccl"`` / ``"mccl"``.
        It is usually used in distributed parallel scenarios and set before using communication services.

        Note:
            - The full name of ``"hccl"`` is Huawei Collective Communication Library(HCCL).
            - The full name of ``"nccl"`` is NVIDIA Collective Communication Library(NCCL).
            - The full name of ``"mccl"`` is MindSpore Collective Communication Library(MCCL).
            - In Ascend hardware platforms, ``init()`` should be set before the definition of any Tensor and
              Parameter, and the instantiation and execution of any operation and net.

        Args:
            backend_name (str): Backend, using ``"hccl"`` / ``"nccl"`` / ``"mccl"``. ``"hccl"`` should be used for Ascend hardware platforms,
                ``"nccl"`` for GPU hardware platforms and ``"mccl"`` for CPU hardware platforms.
                If not set, inference is automatically made based on the hardware
                platform type (device_target). Default: ``None`` .

        Raises:
            TypeError: If `backend_name` is not a string.
            RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails,
                or the environment variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH
                have not been exported when backend is HCCL.
        Supported Platforms:
            ``Ascend`` ``GPU`` ``CPU``

        Examples:
            .. note::
                Before running the following examples, you need to configure the communication environment variables.

                For Ascend/GPU/CPU devices, it is recommended to use the msrun startup method
                without any third-party or configuration file dependencies.
                Please see the `msrun start up `_ for more details.

            >>> from mindspore.communication import init
            >>> init()
        """
        host_init = _host_distribute()
        device_target = context.get_context("device_target")
        if backend_name is None:
            if device_target == "Ascend":
                backend_name = "hccl"
            elif device_target == "GPU":
                backend_name = "nccl"
            elif device_target == "CPU":
                backend_name = "mccl"
            else:
                raise RuntimeError("For 'set_context', the argument 'device_target' {} is not supported in "
                                   "parallel initialization, please use Ascend, GPU or CPU.".format(device_target))
        if not isinstance(backend_name, str):
            raise TypeError("For 'init', the argument 'backend_name' must be a string, "
                            "but got the type : {}".format(type(backend_name)))
        if os.getenv("MS_ROLE") == "MS_SCHED":
            backend_name = "mccl"

        _set_elegant_exit_handle()
        if backend_name == "hccl":
            if _is_ps_mode():
                # Use MindSpore cluster to build network for Parameter Server training.
                init_cluster()
                if _is_role_sched() or _is_role_pserver():
                    raise RuntimeError("Parameter server and scheduler should use 'CPU' as backend instead of 'Ascend'")
                if _get_ps_context("worker_num") == 1:
                    GlobalComm.INITED = True
                    return
            if device_target != "Ascend":
                raise RuntimeError("For 'init', the argument 'backend_name' should be '{}' to init '{}', "
                                   "but got 'hccl'.".format(DEVICE_TO_BACKEND[device_target], device_target))
            if is_initialized(device_target):
                logger.warning(f"For 'init' in Ascend backend, the backend is already initialized, please set it before "
                               "the definition of any Tensor and Parameter, and the instantiation and execution of any "
                               "operation and net, otherwise the 'init' may not take effect.")
            if not host_init:
                _check_parallel_envs()
            GlobalComm.BACKEND = Backend("hccl")
            _check_hccl()
>           init_hccl()
E           RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[2] and device id[1], please check if device id is valid.
E
E           ----------------------------------------------------
E           - C++ Call Stack: (For framework developers)
E           ----------------------------------------------------
E           mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice

/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py:203: RuntimeError
---------------------------- Captured stderr setup -----------------------------
[WARNING] DISTRIBUTED(1427929,ffffaa63eec0,python):2025-07-15-13:28:31.286.644 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 27 source: 127.0.0.1:34136, destination: 127.0.0.1:60523
[WARNING] DISTRIBUTED(1427929,ffff42aaefa0,python):2025-07-15-13:28:31.286.653 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:34136 to 127.0.0.1:60523 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(1427929,ffffaa63eec0,python):2025-07-15-13:28:31.286.715 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:60523 to be connected...Retry number: 1
[WARNING] DISTRIBUTED(1427929,ffffaa63eec0,python):2025-07-15-13:28:31.786.967 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 28 source: 127.0.0.1:34154, destination: 127.0.0.1:60523
[WARNING] DISTRIBUTED(1427929,ffffaa63eec0,python):2025-07-15-13:28:31.787.004 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:60523 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(1427929,ffff43acefa0,python):2025-07-15-13:28:31.787.006 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:34154 to 127.0.0.1:60523 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(1427929,ffffaa63eec0,python):2025-07-15-13:28:32.287.663 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200).
[WARNING] DISTRIBUTED(1427929,ffffaa63eec0,python):2025-07-15-13:28:32.787.761 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200).
[WARNING] DISTRIBUTED(1427929,ffffaa63eec0,python):2025-07-15-13:28:33.287.874 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized.
[WARNING] DISTRIBUTED(1427929,ffffaa63eec0,python):2025-07-15-13:28:33.287.904 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1
[WARNING] PS(1427929,ffffaa63eec0,python):2025-07-15-13:28:33.288.416 [mindspore/ccsrc/ps/core/file_configuration.cc:24] Initialize] The file: is not exist.
[WARNING] DEVICE(1427929,ffffaa63eec0,python):2025-07-15-13:28:33.288.458 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_node.cc:33] Start] Failed to initialize the configuration for this mccl collective node.
[WARNING] PS(1427929,ffffaa63eec0,python):2025-07-15-13:28:33.294.402 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:188] Init] The port 8375 is already in use. So increase port to: 8375
[WARNING] PS(1427929,ffffaa63eec0,python):2025-07-15-13:28:33.294.458 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:188] Init] The port 8376 is already in use. So increase port to: 8376
=========================== short test summary info ============================
ERROR parallel/all_gather_matmul.py::test_dynamic_shape[_init0] - RuntimeErro...
=========== 2 deselected, 22 warnings, 1 error in 162.38s (0:02:42) ============
[WARNING] DEVICE(1427929,ffffaa63eec0,python):2025-07-15-13:31:13.846.776 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_res_manager.cc:350] SyncAllStreams] The ascend_res_manager_ is nullptr in scenarios where it is not actually executed
[INFO] PS(1427929,ffff40a6efa0,python):2025-07-15-13:31:14.439.902 [mindspore/ccsrc/ps/core/communicator/tcp_client.cc:318] Start] Event base dispatch success!
[INFO] PS(1427929,ffff4127efa0,python):2025-07-15-13:31:14.440.195 [mindspore/ccsrc/ps/core/communicator/tcp_server.cc:220] Start] Event base dispatch success!
[ERROR] ME(1427704:281472968683200,MainProcess):2025-07-15-13:31:15.361.723 [mindspore/parallel/cluster/process_entity/_api.py:363] Worker process 1427929 exit with exception. Error code: 1.
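The setup error above comes from the `_init` fixture calling ms.communication.init(), which resolves the backend from the configured device_target (hccl on Ascend) and then fails in aclrtSetDevice because the requested device id is not valid for the visible device count. A minimal sketch of that initialization path on a two-card Ascend node launched by msrun; the device-id mapping and the fallback values here are illustrative assumptions, not taken from the failing test:

    import os
    import mindspore as ms
    from mindspore.communication import init, get_rank, get_group_size

    # msrun exports RANK_ID for each worker (see the "Environment variable [RANK_ID=...] is exported" lines above).
    rank_id = int(os.getenv("RANK_ID", "0"))
    visible = os.getenv("ASCEND_RT_VISIBLE_DEVICES", "0,1").split(",")

    ms.set_context(device_target="Ascend")
    ms.set_context(device_id=rank_id % len(visible))  # keep device_id inside the visible-device range
    init()                                            # resolves to the "hccl" backend on Ascend

    print(f"rank {get_rank()} of {get_group_size()} initialized")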
[WARNING] ME(1427704:281472968683200,MainProcess):2025-07-15-13:31:15.362.019 [mindspore/parallel/cluster/process_entity/_api.py:369] There's worker exits with exception, kill all other workers.
[ERROR] ME(1427704:281472968683200,MainProcess):2025-07-15-13:31:49.635.209 [mindspore/parallel/cluster/process_entity/_api.py:382] Scheduler process 1427915 exit with exception.
[ERROR] ME(1427704:281472968683200,MainProcess):2025-07-15-13:31:49.636.211 [mindspore/parallel/cluster/process_entity/_api.py:603] Time out nodes are ['0']
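The launcher messages above and the final RuntimeError both point at the per-rank files that msrun writes under its --log_dir directory (worker_0.log, worker_1.log, scheduler.log). A small stdlib-only helper along these lines can pull the first error line out of each of them; the function name and the ERROR/Traceback patterns are an assumed convention for illustration, not part of the test harness:

    import pathlib
    import re

    def first_errors(log_dir: str, pattern: str = r"\b(ERROR|RuntimeError|Traceback)\b"):
        """Return {log file name: first matching line} for every .log file in an msrun log directory."""
        hits = {}
        for log in sorted(pathlib.Path(log_dir).glob("*.log")):
            for line in log.read_text(errors="replace").splitlines():
                if re.search(pattern, line):
                    hits[log.name] = line.strip()
                    break
        return hits

    # e.g. first_errors("parallel/all_gather_matmul/test_dynamic_shape_init0")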
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-3-cachedir: .pytest_cache
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-4-rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-5-plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-6-collecting ... collected 3 items / 2 deselected / 1 selected
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-7-
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log:8:parallel/all_gather_matmul.py::test_dynamic_shape[_init0] ERROR    [100%]
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-9-
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log:10:==================================== ERRORS ====================================
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log:11:_________________ ERROR at setup of test_dynamic_shape[_init0] _________________
--
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-109-        _check_hccl()
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-110-        init_hccl()
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-111-        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-112-    elif backend_name == "nccl":
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-113-        if device_target != "GPU":
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log:114:            raise RuntimeError("For 'init', the argument 'backend_name' should be '{}' to init '{}', "
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-115-                               "but got 'nccl'.".format(DEVICE_TO_BACKEND[device_target], device_target))
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-116-        init_cluster()
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-117-        GlobalComm.BACKEND = Backend("nccl")
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-118-        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-119-    elif backend_name == "mccl":
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-120->       init_cluster()
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log:121:E       RuntimeError: The total number of timed out node is 1. Timed out node list is: [const vector]{0}, worker 0 is the first one timed out, please check its log.
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-122-E
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-123-E       ----------------------------------------------------
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-124-E       - C++ Call Stack: (For framework developers)
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-125-E       ----------------------------------------------------
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-126-E       mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:517 UpdateTopoState
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-127-
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log:128:/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py:213: RuntimeError
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-129----------------------------- Captured stderr setup -----------------------------
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-130-[WARNING] DISTRIBUTED(1427915,ffffbccaeec0,python):2025-07-15-13:28:31.162.229 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200).
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-131-[WARNING] DISTRIBUTED(1427915,ffffbccaeec0,python):2025-07-15-13:28:31.662.398 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200).
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-132-[WARNING] DISTRIBUTED(1427915,ffffbccaeec0,python):2025-07-15-13:28:32.162.494 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/1200).
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-133-[WARNING] DISTRIBUTED(1427915,ffff5615efa0,python):2025-07-15-13:28:32.287.398 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 1(role: MS_WORKER), rank id: 1, device id: 1, hostname: ascend213, ip: 127.0.0.1 is registered successfully. Currently registered node number: 1, expected node number: 2
--
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-212-[WARNING] DISTRIBUTED(1427915,ffffbccaeec0,python):2025-07-15-13:31:32.667.861 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster...
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-213-[WARNING] DISTRIBUTED(1427915,ffffbccaeec0,python):2025-07-15-13:31:37.667.953 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 1 alive nodes.
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-214-[WARNING] DISTRIBUTED(1427915,ffffbccaeec0,python):2025-07-15-13:31:37.667.993 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster...
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-215-[WARNING] DISTRIBUTED(1427915,ffffbccaeec0,python):2025-07-15-13:31:42.668.092 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 1 alive nodes.
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-216-[WARNING] DISTRIBUTED(1427915,ffffbccaeec0,python):2025-07-15-13:31:42.668.121 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster...
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log:217:[ERROR] DISTRIBUTED(1427915,ffff5513efa0,python):2025-07-15-13:31:45.185.309 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:511] UpdateTopoState] The node: 0 is timed out. It may exit with exception, please check this node's log.
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log:218:[ERROR] DISTRIBUTED(1427915,ffffbccaeec0,python):2025-07-15-13:31:47.668.222 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:103] Finalize] There are 1 abnormal compute graph nodes.
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-219-=========================== short test summary info ============================
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log:220:ERROR parallel/all_gather_matmul.py::test_dynamic_shape[_init0] - RuntimeErro...
parallel/all_gather_matmul/test_dynamic_shape_init0/scheduler.log-221-=========== 2 deselected, 22 warnings, 1 error in 196.89s (0:03:16) ============
Traceback (most recent call last):
  File "/home/jenkins/anaconda3/envs/ci39/bin/msrun", line 8, in <module>
    sys.exit(main())
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/run.py", line 191, in main
    run(args)
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/run.py", line 185, in run
    process_manager.run()
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/process_entity/_api.py", line 268, in run
    self.join_processes()
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/process_entity/_api.py", line 387, in join_processes
    raise RuntimeError("Distributed job exited with exception. Please check logs in "
RuntimeError: Distributed job exited with exception. Please check logs in directory: parallel/all_gather_matmul/test_dynamic_shape_init0.
F

=================================== FAILURES ===================================
_____________________ test_all_gather_matmul_dynamic_shape _____________________

    @mark_utils.arg_mark(
        plat_marks=['platform_ascend910b'],
        level_mark='level1',
        card_mark='allcards',
        essential_mark='essential',
    )
    def test_all_gather_matmul_dynamic_shape() -> None:
        """
        Feature: mindspore.ops.all_gather_matmul
        Description: Test the dynamic shape function of forward calculation.
        Expectation: The result of forward calculation with inputs in dynamic shapes is equal to the result of
            forward calculation with inputs in static shapes.
""" > run_parallel_tests( _ALL_GATHER_MATMUL_TEST_SCRIPT, _TEST_DYNAMIC_SHAPE, _PARALLEL_STRATEGY, run_all_gather_matmul_test, ) test_parallel.py:160: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ test_parallel.py:123: in run_parallel_tests raise thread.exception test_parallel.py:52: in run super().run() /home/jenkins/anaconda3/envs/ci39/lib/python3.9/threading.py:917: in run self._target(*self._args, **self._kwargs) test_parallel.py:95: in run_all_gather_matmul_test run_test(test_script, test_name, devices, filter_, hccl_port) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ test_script = 'parallel/all_gather_matmul.py', test_name = 'test_dynamic_shape' devices = '0,1', filter_ = '_init0', hccl_port = '61000', env_vars = None def run_test( test_script: str, test_name: str, devices: str, filter_: str, hccl_port: str, env_vars: dict_annotation[str, str] = None, ) -> None: os.environ['ASCEND_RT_VISIBLE_DEVICES'] = devices if env_vars: for key, value in env_vars.items(): os.environ[key] = value port = get_free_port() path = pathlib.Path(test_script) log_dir = path.parent / path.stem / (test_name + filter_) status = os.system(rf''' export ASCEND_RT_VISIBLE_DEVICES={devices} \ && export HCCL_IF_BASE_PORT={hccl_port} \ && msrun \ --worker_num {_WORKER_NUM} \ --local_worker_num {_WORKER_NUM} \ --join True \ --master_port {port} \ --log_dir {log_dir.as_posix()} \ pytest -vra --disable-warnings -k '{filter_}' {test_script}::{test_name} ''') if status != 0: > raise RuntimeError(f'Test failed with status {status}, please check {log_dir.as_posix()} for more details.') E RuntimeError: Test failed with status 256, please check parallel/all_gather_matmul/test_dynamic_shape_init0 for more details. test_parallel.py:91: RuntimeError =========================== short test summary info ============================ FAILED test_parallel.py::test_all_gather_matmul_dynamic_shape - RuntimeError:... ======================== 1 failed in 210.97s (0:03:30) =========================