============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/llama, configfile: ../../../../../../../../sault/virtual_test/virtualenv_002/sault/config/pytest.ini
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collected 1 item

test_parallel_train.py
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:42.363.585 [mindspore/core/utils/ms_context.cc:105] SetDeviceId] Set MS_CTX_DEVICE_ID by env DEVICE_ID to: 0
[INFO] ME(935455,ffffb287eec0,python):2025-07-15-10:47:42.918.737 [mindspore/ccsrc/runtime/hardware/device_context_manager.cc:377] LoadPlugin] Load libmindspore_ascend plugin file /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/lib/plugin/libmindspore_ascend.so.2 success.
[INFO] ME(935455,ffffb287eec0,python):2025-07-15-10:47:42.919.145 [mindspore/ccsrc/runtime/hardware/device_context_manager.cc:377] LoadPlugin] Load libmindspore_ops_ascend plugin file /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/lib/plugin/libmindspore_ops_ascend.so success.
[INFO] ME(935455,ffffb287eec0,python):2025-07-15-10:47:42.963.354 [mindspore/ccsrc/runtime/hardware/device_context_manager.cc:377] LoadPlugin] Load libmindspore_ops_host plugin file /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/lib/plugin/libmindspore_ops_host.so success.
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:42.963.562 [mindspore/core/utils/ms_context.cc:105] SetDeviceId] Set MS_CTX_DEVICE_ID by env DEVICE_ID to: 0
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:42.963.617 [mindspore/core/utils/ms_context.cc:165] set_backend_policy] ms set context backend policy:ge
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.964.343 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/acl_base_symbol.cc:63] LoadAclBaseApiSymbol] Load acl base api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.966.273 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/acl_compiler_symbol.cc:39] LoadAclOpCompilerApiSymbol] Load acl op compiler api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.966.366 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/acl_mdl_symbol.cc:130] LoadAclMdlApiSymbol] Load acl mdl api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.966.420 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/acl_op_symbol.cc:55] LoadAclOpApiSymbol] Load ascend op api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.966.543 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/acl_prof_symbol.cc:51] LoadProfApiSymbol] Load acl prof api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.966.589 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/acl_rt_allocator_symbol.cc:48] LoadAclAllocatorApiSymbol] Load acl allocator api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.966.654 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/acl_rt_symbol.cc:159] LoadAclRtApiSymbol] Load acl rt api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.966.699 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/acl_symbol.cc:34] LoadAclApiSymbol] Load acl base api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.968.362 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/acl_tdt_symbol.cc:82] LoadAcltdtApiSymbol] Load acl tdt api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.968.416 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/symbol_utils.cc:79] LoadAscendApiSymbols] Load ascend api success!
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:42.976.379 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_context.cc:234] SetContextSocVersion] The soc version :Ascend910B3
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:43.016.215 [mindspore/ccsrc/pybind_api/utils/log_adapter_py.cc:27] PyExceptionInitializer] Set exception handler
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:43.060.367 [mindspore/ccsrc/pybind_api/init.cc:260] pybind11_init__c_expression] Start GraphExecutorPy...
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:43.061.078 [mindspore/ccsrc/pybind_api/init.cc:326] pybind11_init__c_expression] Start JitExecutorPy...
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:43.061.714 [mindspore/ccsrc/pybind_api/init.cc:391] pybind11_init__c_expression] Start ParallelContext...
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:43.062.925 [mindspore/ccsrc/pybind_api/init.cc:525] pybind11_init__c_expression] Start CostModelContext...
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:43.063.553 [mindspore/ccsrc/pybind_api/init.cc:627] pybind11_init__c_expression] Start OffloadContext...
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:43.066.112 [mindspore/ccsrc/plugin/res_manager/ascend/symbol_interface/symbol_utils.cc:65] LoadAscendApiSymbols] Ascend api is already loaded.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:43.066.201 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_context.cc:234] SetContextSocVersion] The soc version :Ascend910B3
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:45.507.666 [mindspore/core/utils/ms_context.cc:306] SetDeviceTargetFromInner] ms set context device target:Ascend
[INFO] PARALLEL(935455,ffffb287eec0,python):2025-07-15-10:47:45.507.799 [mindspore/ccsrc/frontend/parallel/costmodel_context.cc:30] GetInstance] Create costmodel_context
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:45.507.849 [mindspore/core/utils/ms_context.cc:310] SetDeviceTargetFromInner] Set memory_optimize_level to O0 as default on other device
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:45.603.183 [mindspore/ccsrc/runtime/pynative/lazy_fusion.cc:27] Init] Start init lazy fusion.
[INFO] KERNEL(935455,ffffb287eec0,python):2025-07-15-10:47:45.603.403 [mindspore/ccsrc/plugin/device/ascend/kernel/dvm/lazy_fusion_flags.cc:174] LazyFusionFlags] lazy_fusion_flags :{"disable_ops":[],"dump_as_text":false,"enable_ops_only":[],"flush_threshold":100,"online_tuning":false,"opt_level":0,"synchronize":false}
[INFO] KERNEL(935455,ffffb287eec0,python):2025-07-15-10:47:45.603.442 [mindspore/ccsrc/plugin/device/ascend/kernel/dvm/lazy_fusion_op.cc:1802] LazyFusionAscendInit] Skip init lazy fusion.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:45.603.471 [mindspore/ccsrc/runtime/pynative/lazy_fusion.cc:31] Init] End init lazy fusion.
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:47.206.024 [mindspore/profiler/common/registry.py:36] registered module: CpuProfiler with name: CPU
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:47.206.551 [mindspore/profiler/common/registry.py:36] registered module: GpuProfiler with name: GPU
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:47.214.313 [mindspore/profiler/common/registry.py:36] registered module: NpuProfiler with name: Ascend
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:51.887.01 [mindspore/communication/_comm_helper.py:336] You are invoking this interface without calling `init` method.Return 'RANK_SIZE' env value instead. If 'RANK_SIZE' is not set, return 1 as default value.
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:51.951.92 [mindspore/communication/_comm_helper.py:336] You are invoking this interface without calling `init` method.Return 'RANK_SIZE' env value instead. If 'RANK_SIZE' is not set, return 1 as default value.
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:51.953.90 [mindspore/communication/_comm_helper.py:278] You are invoking this interface without calling `init` method.Return 'RANK_ID' env value instead. If 'RANK_ID' is not set, return 0 as default value.
[INFO] ME(935455,ffffb287eec0,python):2025-07-15-10:47:51.134.879 [mindspore/ccsrc/runtime/hardware/device_context_manager.cc:466] GetDeviceContext] Device context of device Ascend is not created yet.
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:51.134.965 [mindspore/hal/device.py:156] Backend Ascend is not created yet.
[INFO] ME(935455,ffffb287eec0,python):2025-07-15-10:47:51.135.182 [mindspore/ccsrc/runtime/hardware/device_context_manager.cc:466] GetDeviceContext] Device context of device Ascend is not created yet.
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:51.135.226 [mindspore/hal/device.py:156] Backend Ascend is not created yet.
[INFO] ME(935455,ffffb287eec0,python):2025-07-15-10:47:51.135.447 [mindspore/ccsrc/runtime/hardware/device_context_manager.cc:466] GetDeviceContext] Device context of device Ascend is not created yet.
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:51.135.491 [mindspore/hal/device.py:156] Backend Ascend is not created yet.
[INFO] PS(935455,ffffb287eec0,python):2025-07-15-10:47:51.136.302 [mindspore/ccsrc/ps/ps_context.cc:256] set_ms_role] MS_ROLE of this node is MS_WORKER
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.136.370 [mindspore/ccsrc/include/backend/distributed/cluster/topology/node_base.h:46] NodeBase] Cluster topo timeout is 600 seconds.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.136.403 [mindspore/ccsrc/include/backend/distributed/cluster/topology/node_base.h:51] NodeBase] Node timeout after exception is 30 seconds.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.136.472 [mindspore/ccsrc/distributed/rpc/tcp/tcp_client.cc:29] TCPClient] Tcp client receiving message timeout is 15 seconds.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.136.819 [mindspore/ccsrc/distributed/rpc/tcp/event_loop.cc:189] Initialize] Set pthread name success name:RECV_EVENT_LOOP,loop_thread_:281471452573600
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.137.012 [mindspore/ccsrc/distributed/rpc/tcp/event_loop.cc:189] Initialize] Set pthread name success name:SEND_EVENT_LOOP,loop_thread_:281471444119456
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.137.054 [mindspore/ccsrc/distributed/rpc/tcp/tcp_client.cc:29] TCPClient] Tcp client receiving message timeout is 15 seconds.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.137.214 [mindspore/ccsrc/distributed/rpc/tcp/event_loop.cc:189] Initialize] Set pthread name success name:RECV_EVENT_LOOP,loop_thread_:281471435665312
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.137.460 [mindspore/ccsrc/distributed/rpc/tcp/event_loop.cc:189] Initialize] Set pthread name success name:SEND_EVENT_LOOP,loop_thread_:281471218610080
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.137.509 [mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc:170] Register] Start connecting heartbeat client.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.137.542 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:441] Connect] Can not found link destination: 127.0.0.1:8128
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.137.861 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 19 source: 127.0.0.1:48070, destination: 127.0.0.1:8128
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.137.897 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8128 to be connected...Retry number: 1
[WARNING] DISTRIBUTED(935455,ffff2cefefa0,python):2025-07-15-10:47:51.137.936 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:48070 to 127.0.0.1:8128 is successfully created. System errno: Success
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.638.038 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:505] Connect] Connected to destination: 127.0.0.1:8128
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.638.089 [mindspore/ccsrc/distributed/rpc/tcp/tcp_client.cc:73] Connect] Connected to the tcp server 127.0.0.1:8128 successfully.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.638.120 [mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc:178] Register] Start connecting business client.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.638.148 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:441] Connect] Can not found link destination: 127.0.0.1:8128
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.638.331 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 20 source: 127.0.0.1:48106, destination: 127.0.0.1:8128
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:51.638.366 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8128 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(935455,ffff2df1efa0,python):2025-07-15-10:47:51.638.365 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:48106 to 127.0.0.1:8128 is successfully created. System errno: Success
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:52.138.522 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:505] Connect] Connected to destination: 127.0.0.1:8128
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:52.138.588 [mindspore/ccsrc/distributed/rpc/tcp/tcp_client.cc:73] Connect] Connected to the tcp server 127.0.0.1:8128 successfully.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:52.139.407 [mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc:222] Register] The compute graph node: 0 has been registered successfully.
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:52.139.622 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200).
[INFO] DISTRIBUTED(935455,ffff1f7eefa0,python):2025-07-15-10:47:52.139.690 [mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc:270] Heartbeat] Interval of heartbeat lower and upper are 3 and 5
[INFO] DISTRIBUTED(935455,ffff1f7eefa0,python):2025-07-15-10:47:52.139.728 [mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc:274] Heartbeat] The heartbeat thread is started.
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:52.639.819 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200).
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.098 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:247] BuildCluster] [PROF]BuildCluster costs 2003.63 msec.
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.186 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.217 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:341] PostProcess] Start post processing for computing graph nodes.
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.250 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 0 rank id: 0
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.278 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:359] PostProcess] Client ip address in this cluster of this compute graph node is 127.0.0.1
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.324 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:370] PostProcess] Assigned for this worker port range is 8118 to 8373
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.393 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:253] BuildCluster] [PROF]PostBuildCluster costs 0.168 msec.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.494 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:184] node_num] Number of role MS_WORKER is 4
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.544 [mindspore/ccsrc/distributed/init.cc:52] Initialize] [PROF]distributed_cluster_init costs 2004.32 msec.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.585 [mindspore/ccsrc/distributed/collective/collective_manager.cc:185] Initialize] Start initializing collective communication for backend: Ascend...
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.731 [mindspore/ccsrc/runtime/device/res_manager/hal_res_manager.cc:106] LoadResManager] HalResManager dlopen ascend lib name: libmindspore_cpu_res_manager.so
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.140.815 [mindspore/ccsrc/runtime/device/res_manager/hal_res_manager.cc:128] LoadResManager] HalResManager dlopen current device lib name: /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/lib/plugin/cpu/libmindspore_cpu_res_manager.so
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.222 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_comm_lib.cc:50] MsCollectiveCommLib] Global group name of MindSpore collective communication library is mccl_world_group
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.259 [mindspore/ccsrc/distributed/collective/collective_manager.cc:589] InitHostCommlib] Start initializing communication library on host side...
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.315 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_comm_lib.cc:84] Initialize] Query retry count is 400
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.346 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_comm_lib.cc:92] Initialize] Interval of retry allgather hostname lower and upper are 1 and 2
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.390 [mindspore/ccsrc/distributed/collective/collective_manager.cc:609] InitHostCommlib] Communication library on host side is successfully initialized. Global rank id: 0, global rank size: 4
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.431 [mindspore/ccsrc/distributed/collective/collective_manager.cc:204] Initialize] [PROF]InitHostCommlib costs 0.806 msec.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.464 [mindspore/ccsrc/distributed/collective/collective_manager.cc:792] AssignLocalRank] Host name for rank 0 is ascend213
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.721 [mindspore/ccsrc/distributed/cluster/topology/compute_graph_node.cc:564] GetHostNames] Worker gets host names {"hostnames":["ascend213","ascend213","ascend213","ascend213"]}
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.855 [mindspore/ccsrc/distributed/collective/collective_manager.cc:813] AssignLocalRank] Successfully get all nodes' hostname.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.895 [mindspore/ccsrc/distributed/collective/collective_manager.cc:830] AssignLocalRank] The local rank id assigned for this process is 0
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.925 [mindspore/ccsrc/distributed/collective/collective_manager.cc:831] AssignLocalRank] The env 'DEVICE_ID' assigned for this process is: 0
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.141.970 [mindspore/core/include/utils/device_manager_conf.h:45] distributed_refresh_device_id] Refresh device id to 0 for distributed.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.142.007 [mindspore/ccsrc/distributed/collective/collective_manager.cc:850] AssignLocalRank] The device_id of ms_context is set to local rank id [0].
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:53.142.061 [mindspore/ccsrc/distributed/collective/collective_manager.cc:212] Initialize] [PROF]AssignLocalRank costs 0.59 msec.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.142.143 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_context.cc:105] Initialize] Start initializing device context.
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:53.142.188 [mindspore/core/utils/ms_context.cc:463] PrintJitLevelAndExecMode] The jit_level is: O1, and enable kernelbykernel executor.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.142.224 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_context.cc:119] Initialize] The current overflow detection mode is INFNAN.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.142.395 [mindspore/ccsrc/runtime/device/res_manager/hal_res_manager.cc:106] LoadResManager] HalResManager dlopen ascend lib name: libmindspore_ascend_res_manager.so
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.142.465 [mindspore/ccsrc/runtime/device/res_manager/hal_res_manager.cc:128] LoadResManager] HalResManager dlopen current device lib name: /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/lib/plugin/ascend/libmindspore_ascend_res_manager.so
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.142.686 [mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:58] InitDevice] Enter SetRtDevice, current initialize device number:0
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.359.912 [mindspore/ccsrc/plugin/res_manager/ascend/stream_manager/ascend_stream_manager.cc:105] RegCallback] Register callback thread, stream : 0x46137dd0.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.359.984 [mindspore/ccsrc/plugin/res_manager/ascend/stream_manager/ascend_stream_manager.cc:387] CreateDefaultStream] Create ascend default stream, stream id: 0
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.361.242 [mindspore/ccsrc/plugin/res_manager/ascend/stream_manager/ascend_stream_manager.cc:105] RegCallback] Register callback thread, stream : 0x461d9a70.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.361.572 [mindspore/ccsrc/plugin/res_manager/ascend/stream_manager/ascend_stream_manager.cc:125] RegCallback] Register callback thread success, stream : 0x46137dd0.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.361.727 [mindspore/ccsrc/plugin/res_manager/ascend/stream_manager/ascend_stream_manager.cc:125] RegCallback] Register callback thread success, stream : 0x461d9a70.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.361.771 [mindspore/ccsrc/plugin/res_manager/ascend/stream_manager/ascend_stream_manager.cc:396] CreateDefaultStream] Create ascend communication stream, stream id: 1
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.361.967 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_vmm_adapter.h:160] CheckVmmDriverVersion] Driver path is /usr/local/Ascend
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.362.040 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_vmm_adapter.h:189] CheckVmmDriverVersion] Driver version is 25.2.0, major version is 25
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.362.082 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_vmm_adapter.h:115] IsVmmEnabled] VMM is enabled.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.362.122 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:113] Initialize] Config huge_page_reserve_size : 0, device_hbm_huge_page_reserved_size_ : 0
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.362.739 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_vmm_adapter.h:60] AscendVmmAdapter] VMM align size is 2097152
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.362.802 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:183] Initialize] Device MOC Size:62420M, Device free MOC Size:62091M, Reserved MOC size for Other Components(HCCL/rts/etc.):3891M, Recommend Reserved MOC size for Other Components:3880M, User define MindSpore MOC Size:0G, MindSpore Used MOC Size:58200M.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.362.852 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_dynamic_mem_adapter.cc:81] Initialize] Ascend Memory Adapter initialize success, Memory Statistics: Device MOC memory size: 62420M MindSpore Used memory size: 58200M Used peak memory usage (without fragments): 0M Actual peak memory usage (with fragments): 0M
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:53.362.910 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_pool.cc:185] DefaultEnhancedAscendMemoryPool] DefaultEnhancedAscendMemoryPool constructed.
[INFO] PRE_ACT(935455,ffffb287eec0,python):2025-07-15-10:47:53.362.961 [mindspore/ccsrc/memory/mem_pool/abstract_dynamic_mem_pool.cc:429] Initialize] Skip initialization of memory pool since init size is not configured.
[INFO] PROFILER(935455,ffffb287eec0,python):2025-07-15-10:47:53.363.559 [mindspore/ccsrc/debug/profiler/mstx/mstx_symbol.cc:65] LoadMstxApiSymbol] Load mstx api success!
[INFO] UTILS(935455,ffffb287eec0,python):2025-07-15-10:47:53.363.709 [mindspore/ccsrc/utils/offload_context.cc:86] enable_aio] On ascend devices, enable aio and enable pinned mem cannot be turned on at the same time.
[INFO] UTILS(935455,ffffb287eec0,python):2025-07-15-10:47:53.363.772 [mindspore/ccsrc/utils/utils.cc:400] GetSystemFreeDiskSize] Failed to get disk directory ./offload/ size, check whether the directory is created.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.883.733 [mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:208] InitializeAcl] Call aclInit successfully, json is {"dump":{"dump_scene":"lite_exception"},"err_msg_mode":"1"}
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.883.936 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_deprecated_interface.cc:82] OpenTsd] Device id = 0, rank size = 4.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.884.058 [mindspore/ccsrc/plugin/res_manager/ascend/mbuf_manager/mbuf_receive_manager.cc:308] MbufDataHandler] Channel _npu_log begins the construction process witch capacity 128
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.888.236 [mindspore/ccsrc/plugin/res_manager/ascend/mbuf_manager/mbuf_receive_manager.cc:308] MbufDataHandler] Channel ms_tensor_dump begins the construction process witch capacity 128
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.888.704 [mindspore/ccsrc/plugin/res_manager/ascend/mbuf_manager/mbuf_receive_manager.cc:308] MbufDataHandler] Channel ms_tensor_summary begins the construction process witch capacity 128
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.889.114 [mindspore/ccsrc/plugin/res_manager/ascend/mbuf_manager/mbuf_receive_manager.cc:308] MbufDataHandler] Channel ms_image_summary begins the construction process witch capacity 128
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.889.588 [mindspore/ccsrc/plugin/res_manager/ascend/mbuf_manager/mbuf_receive_manager.cc:308] MbufDataHandler] Channel ms_scalar_summary begins the construction process witch capacity 128
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.889.963 [mindspore/ccsrc/plugin/res_manager/ascend/mbuf_manager/mbuf_receive_manager.cc:308] MbufDataHandler] Channel ms_histogram_summary begins the construction process witch capacity 128
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.890.333 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_context.cc:146] Initialize] End initializing device context.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.890.436 [mindspore/ccsrc/plugin/res_manager/ascend/ascend_res_manager.cc:671] LoadCollectiveCommLib] Loading MACCL collective library successfully.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.890.493 [mindspore/ccsrc/distributed/collective/collective_manager.cc:633] InitDeviceCommLib] Start initializing communication library on device side...
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:54.890.541 [mindspore/core/utils/ms_context.cc:771] IsEnableInferBoost] MSContext enable ms infer boost
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:54.890.628 [mindspore/core/utils/ms_context.cc:751] SetMsInternalEnableCustomKernelList] Enable internal kernel list: Add, AddLayerNorm, AddRmsNorm, AddRmsNormDynamicQuant, AddRmsNormQuantV2, FlashAttentionScore, InferenceMatmulSplit, InferenceSwiGLU, MatMul, MatMulAllReduce, MatMulElemwise, MatMulSigmoidCastAdd, PagedAttention, PagedAttentionMask, QbmmAdd, QbmmAllReduceAdd, RmsNorm, RmsNormQuant, Sub, SwiGLUDynamicQuant,
[INFO] HCCL_ADPT(935455,ffffb287eec0,python):2025-07-15-10:47:54.890.789 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:259] InitHccl] Start init hccl adapter.
[INFO] HCCL_ADPT(935455,ffffb287eec0,python):2025-07-15-10:47:54.890.977 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:552] InitKernelInfoStore] Start init hccl kernel info store.
[INFO] HCCL_ADPT(935455,ffffb287eec0,python):2025-07-15-10:47:54.891.055 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:569] InitKernelInfoStore] Get builder ops_kernel_info_hccl
[INFO] HCCL_ADPT(935455,ffffb287eec0,python):2025-07-15-10:47:54.891.431 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:591] InitKernelInfoStore] Init hccl kernel info store success.
[INFO] HCCL_ADPT(935455,ffffb287eec0,python):2025-07-15-10:47:54.891.473 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:787] InitHcclExec] Start init hccl exec.
[INFO] HCCL_ADPT(935455,ffffb287eec0,python):2025-07-15-10:47:54.894.574 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:793] InitHcclExec] Hcom DynamicKernel Initialize success
[INFO] HCCL_ADPT(935455,ffffb287eec0,python):2025-07-15-10:47:54.894.634 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:799] InitHcclExec] InitHcclExec success
[INFO] HCCL_ADPT(935455,ffffb287eec0,python):2025-07-15-10:47:54.894.665 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:277] InitHccl] Init hccl adapter success.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.894.707 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:100] Initialize] Successfully initialize HCCL.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.894.741 [mindspore/ccsrc/distributed/collective/collective_manager.cc:640] InitDeviceCommLib] Communication library on device side is successfully initialized.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.894.796 [mindspore/ccsrc/distributed/collective/collective_manager.cc:217] Initialize] [PROF]InitDeviceBackend costs 1752.68 msec.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.894.842 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1071] IsAsyncInitGlobalComm] Async initialize global comm: 1.
async_conf: 1, is_graph: 1, use_rank_table: 0, simulation: 0, use_mpi: 0, is_ascend: 1
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.894.877 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3}, async: 1, submit_now: 1
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.894.941 [mindspore/ccsrc/distributed/collective/collective_manager.cc:373] CreateCommunicationGroup] [PROF]CreateCommunicationGroupOnHostSide costs 0.013 msec.
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.011 [mindspore/core/utils/ms_context.cc:771] IsEnableInferBoost] MSContext enable ms infer boost
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.096 [mindspore/core/utils/ms_context.cc:751] SetMsInternalEnableCustomKernelList] Enable internal kernel list: Add, AddLayerNorm, AddRmsNorm, AddRmsNormDynamicQuant, AddRmsNormQuantV2, FlashAttentionScore, InferenceMatmulSplit, InferenceSwiGLU, MatMul, MatMulAllReduce, MatMulElemwise, MatMulSigmoidCastAdd, PagedAttention, PagedAttentionMask, QbmmAdd, QbmmAllReduceAdd, RmsNorm, RmsNormQuant, Sub, SwiGLUDynamicQuant,
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.190 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:183] CreateCommunicationGroup] Successfully create HCCL communication group hccl_world_group
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.234 [mindspore/ccsrc/distributed/collective/collective_manager.cc:380] CreateCommunicationGroup] [PROF]CreateCommunicationGroupOnDeviceSide costs 0.254 msec.
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.295 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.417 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1130] SubmitCreateDeviceCommTask] Launch init comm thread.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.462 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1136] SubmitCreateDeviceCommTask] Submit init communicator task for hccl_world_group. Call 'WaitCommInitDone' later to wait initialization to be done.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.516 [mindspore/ccsrc/distributed/collective/collective_manager.cc:231] Initialize] [PROF]CreateGlobalCommunicationGroup costs 0.634 msec.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.547 [mindspore/ccsrc/distributed/collective/collective_manager.cc:234] Initialize] End initializing collective communication for backend: Ascend
[INFO] DISTRIBUTED(935455,fffed587efa0,python):2025-07-15-10:47:54.895.583 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1184] RunInitCommTasks] Create device communicator in thread for group: hccl_world_group
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.590 [mindspore/ccsrc/distributed/init.cc:62] Initialize] [PROF]distributed_collective_init costs 1755.01 msec.
[INFO] DISTRIBUTED(935455,fffed587efa0,python):2025-07-15-10:47:54.895.651 [mindspore/ccsrc/distributed/collective/collective_manager.cc:965] CreateDeviceCommunicator] Create device communicator for hccl_world_group
[INFO] PIPELINE(935455,ffffb287eec0,python):2025-07-15-10:47:54.895.684 [mindspore/ccsrc/pipeline/jit/ps/pipeline.cc:177] RecordInitStatus] Status record: system init.
[INFO] DISTRIBUTED(935455,fffed587efa0,python):2025-07-15-10:47:54.895.704 [mindspore/ccsrc/distributed/collective/collective_manager.cc:927] SetGlobalCommInfo] Begin set global communication info: hccl_world_group
[WARNING] DEVICE(935455,fffed587efa0,python):2025-07-15-10:47:54.895.762 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:8128, node_rank:2130706433, total_rank_size:4, local_rank_size4
[WARNING] HCCL_ADPT(935455,fffed587efa0,python):2025-07-15-10:47:54.895.831 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo
[WARNING] HCCL_ADPT(935455,fffed587efa0,python):2025-07-15-10:47:54.895.861 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package.
[INFO] DEVICE(935455,fffed587efa0,python):2025-07-15-10:47:54.895.888 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:260] SetGlobalCommInfo] HcclSetGlobalCommInfo is not supported.
[WARNING] DEVICE(935455,fffed587efa0,python):2025-07-15-10:47:54.895.925 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group
[INFO] DEVICE(935455,fffed587efa0,python):2025-07-15-10:47:54.895.953 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_communication_group.cc:113] SetGlobalCommInfo] Successfully SetGlobalCommInfo for HCCL group hccl_world_group
[INFO] DISTRIBUTED(935455,fffed587efa0,python):2025-07-15-10:47:54.895.979 [mindspore/ccsrc/distributed/collective/collective_manager.cc:960] SetGlobalCommInfo] End set global communication info: hccl_world_group
2025-07-15 10:47:54,900 - mindformers./output/log[mindformers/version_control.py:125] - INFO - The Lazy Inline compilation acceleration feature only works in pipeline parallel mode (pipeline_stage > 1). Current pipeline stage=1, the feature is disabled by default. You can also enable lazy inline without pipeline parallel, by setting environment variable `export ENABLE_LAZY_INLINE_NO_PIPELINE=1`.
[INFO] DISTRIBUTED(935455,fffed587efa0,python):2025-07-15-10:47:54.903.465 [mindspore/ccsrc/distributed/collective/collective_manager.cc:991] CreateDeviceCommunicator] [PROF]GenerateRootInfo costs 7.4 msec.
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:54.903.526 [mindspore/ccsrc/pybind_api/ir/tensor_api/auto_generate/tensor_api_0.cc:1484] TensorMethodReshape] Call TensorReshape
[INFO] DISTRIBUTED(935455,fffed587efa0,python):2025-07-15-10:47:54.903.709 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1010] CreateDeviceCommunicator] Successfully send/fetch unqiueid for communication group hccl_world_group
[INFO] DISTRIBUTED(935455,fffed587efa0,python):2025-07-15-10:47:54.903.764 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1012] CreateDeviceCommunicator] [PROF]BroadcastUniqueID costs 0.224 msec.
[WARNING] DISTRIBUTED(935455,fffed587efa0,python):2025-07-15-10:47:54.903.793 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.903.807 [mindspore/ccsrc/plugin/res_manager/ascend/ascend_res_manager.cc:700] SetDeterministic] Set kernel deterministic value: 0
[INFO] DISTRIBUTED(935455,fffed587efa0,python):2025-07-15-10:47:54.903.827 [mindspore/ccsrc/distributed/collective/collective_manager.cc:901] GetCommunicatorInitTimeout] HCCL_CONNECT_TIMEOUT is 600 seconds.
[INFO] CORE(935455,fffe7f7eefa0,python):2025-07-15-10:47:54.904.033 [mindspore/core/utils/ms_context.cc:771] IsEnableInferBoost] MSContext enable ms infer boost
[INFO] CORE(935455,fffe7f7eefa0,python):2025-07-15-10:47:54.904.145 [mindspore/core/utils/ms_context.cc:751] SetMsInternalEnableCustomKernelList] Enable internal kernel list: Add, AddLayerNorm, AddRmsNorm, AddRmsNormDynamicQuant, AddRmsNormQuantV2, FlashAttentionScore, InferenceMatmulSplit, InferenceSwiGLU, MatMul, MatMulAllReduce, MatMulElemwise, MatMulSigmoidCastAdd, PagedAttention, PagedAttentionMask, QbmmAdd, QbmmAllReduceAdd, RmsNorm, RmsNormQuant, Sub, SwiGLUDynamicQuant,
[WARNING] DEVICE(935455,fffe7f7eefa0,python):2025-07-15-10:47:54.904.254 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:54.904.900 [mindspore/ccsrc/pynative/pynative_utils.cc:1833] DispatchOp] PyBoost sync run frontend task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:54.905.309 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] PRE_ACT(935455,ffffb287eec0,python):2025-07-15-10:47:54.905.392 [mindspore/ccsrc/memory/mem_pool/abstract_dynamic_mem_pool.cc:743] GenerateAllocator] Generate allocator, is persistent : 0, stream id : 0.
[INFO] PRE_ACT(935455,ffffb287eec0,python):2025-07-15-10:47:54.905.472 [mindspore/ccsrc/memory/mem_pool/abstract_dynamic_mem_pool.cc:755] operator()] Malloc mem block, is enable eager free : 0, is enable vmm : 1, size : 512, block size : 1073741824.
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.905.509 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_vmm_adapter.cc:205] AllocDeviceMem] VMM AllocDeviceMem size:61027123200, align_size:61027123200
[INFO] PRE_ACT(935455,ffffb287eec0,python):2025-07-15-10:47:54.912.579 [mindspore/ccsrc/memory/mem_pool/abstract_dynamic_mem_pool.cc:775] operator()] Malloc mem block : {"addr_":0x12c180000000,"size_":61027123200,"stream_id_":0,"min_addr_":0,"max_addr_":0}.
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:54.913.444 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.913.613 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_kernel_executor.cc:1513] operator()] Api aclnnInplaceCopy miss cache, with hash id:186559549515354546
[INFO] GE_ADPT(935455,ffffb287eec0,python):2025-07-15-10:47:54.913.916 [mindspore/ops/kernel/ascend/acl_ir/op_api_exec.cc:124] ParseCustomPriority] Could not open the file /usr/local/Ascend/ascend-toolkit/latest/opp/vendors/config.ini
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.977.926 [mindspore/ops/kernel/ascend/acl_ir/op_api_cache.h:60] SetExecutorRepeatable] aclnnInplaceCopyGetWorkspaceSize don't support cache, repeat_ret is 561000
2025-07-15 10:47:54,981 - mindformers./output/log[mindformers/models/llama/llama.py:507] - INFO - Predict run mode: False
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:54.983.093 [mindspore/ccsrc/pybind_api/ir/tensor_api/auto_generate/tensor_api_0.cc:1484] TensorMethodReshape] Call TensorReshape
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:54.983.209 [mindspore/ccsrc/pynative/pynative_utils.cc:1833] DispatchOp] PyBoost sync run frontend task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:54.983.328 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:54.983.573 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.983.658 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_kernel_executor.cc:1513] operator()] Api aclnnInplaceCopy miss cache, with hash id:186559549515354546
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:54.983.752 [mindspore/ops/kernel/ascend/acl_ir/op_api_cache.h:60] SetExecutorRepeatable] aclnnInplaceCopyGetWorkspaceSize don't support cache, repeat_ret is 561000
2025-07-15 10:47:54,988 - mindformers./output/log[mindformers/models/llama/llama.py:107] - INFO - MoE config is None, use normal FFN
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:55.002.303 [mindspore/ccsrc/pybind_api/ir/tensor_api/auto_generate/tensor_api_0.cc:1484] TensorMethodReshape] Call TensorReshape
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.002.417 [mindspore/ccsrc/pynative/pynative_utils.cc:1833] DispatchOp] PyBoost sync run frontend task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.002.524 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.002.756 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:55.002.834 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_kernel_executor.cc:1513] operator()] Api aclnnInplaceCopy miss cache, with hash id:13328202571349160149
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:55.002.935 [mindspore/ops/kernel/ascend/acl_ir/op_api_cache.h:60] SetExecutorRepeatable] aclnnInplaceCopyGetWorkspaceSize don't support cache, repeat_ret is 561000
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:55.483.1 [mindspore/testcases/testcases/tests/st/networks/mindformers/mindformers/models/llama/llama_layer.py:125] Embedding use init method: sigma=0.01, mean=0.0
2025-07-15 10:47:55,006 - mindformers./output/log[mindformers/models/utils.py:205] - INFO - num_layers per stage: [[2]]
2025-07-15 10:47:55,006 - mindformers./output/log[mindformers/models/utils.py:206] - INFO - Accumulated num_layers per stage: [[2]]
2025-07-15 10:47:55,007 - mindformers./output/log[mindformers/models/utils.py:208] - INFO - Pipeline id list with start_stage: [0, 0]
2025-07-15 10:47:55,007 - mindformers./output/log[mindformers/models/utils.py:209] - INFO - Interleave id list: [0, 0]
2025-07-15 10:47:55,007 - mindformers./output/log[mindformers/models/utils.py:227] - INFO - Formative layer_recompute: [[0]]
2025-07-15 10:47:55,007 - mindformers./output/log[mindformers/models/utils.py:229] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority.
2025-07-15 10:47:55,007 - mindformers./output/log[mindformers/models/utils.py:235] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0]], 'feed_forward\\.w1\\.activation\\.silu': [[0]]}
2025-07-15 10:47:55,008 - mindformers./output/log[mindformers/models/utils.py:236] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0]]}
2025-07-15 10:47:55,008 - mindformers./output/log[mindformers/models/utils.py:237] - INFO - Formative select_recompute_exclude: {}
2025-07-15 10:47:55,008 - mindformers./output/log[mindformers/models/utils.py:238] - INFO - Formative select_comm_recompute_exclude: {}
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:55.284.472 [mindspore/testcases/testcases/tests/st/networks/mindformers/mindformers/models/llama/llama_layer.py:148] Using 1 data parallel for the embedding lookup.
2025-07-15 10:47:55,290 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None.
2025-07-15 10:47:55,292 - mindformers./output/log[mindformers/trainer/trainer.py:222] - INFO - The model instance has been entered, and the model will not be created from model_config
2025-07-15 10:47:55,292 - mindformers./output/log[mindformers/trainer/trainer.py:244] - WARNING - Recognizing that a model instance is sent and model_name is None,
2025-07-15 10:47:55,292 - mindformers./output/log[mindformers/trainer/trainer.py:246] - WARNING - it is recommended to select a model configuration that corresponds to the support of MindFormers based on the instance model and set model_name.
2025-07-15 10:47:55,293 - mindformers./output/log[mindformers/trainer/trainer.py:249] - WARNING - Otherwise, they will default to a general configuration.You are advised to pass instances such as optimizers, metric, tokenizer, and processor
2025-07-15 10:47:55,334 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:47:55,334 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:47:55,335 - mindformers./output/log[mindformers/tools/register/template.py:685] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor']
2025-07-15 10:47:55,335 - mindformers./output/log[mindformers/trainer/trainer.py:1025] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/mindformers/configs/gpt2/run_gpt2.yaml to build trainer.
2025-07-15 10:47:55,335 - mindformers./output/log[mindformers/trainer/trainer.py:1061] - INFO - ..........Init Config..........
2025-07-15 10:47:55,335 - mindformers./output/log[mindformers/trainer/trainer.py:1083] - WARNING - When using the TrainingArguments class, its arguments will override the default config configuration.
2025-07-15 10:47:55,336 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.05, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': None, 'n_group': None, 'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True}
2025-07-15 10:47:55,336 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1}
2025-07-15 10:47:55,337 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': False, 'select_recompute': False, 'parallel_optimizer_comm_recompute': False, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': False, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False}
2025-07-15 10:47:55,337 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 1, 'model_parallel': 1, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1}
2025-07-15 10:47:55,338 - mindformers./output/log[mindformers/tools/utils.py:170] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/llama/output'
2025-07-15 10:47:55,338 - mindformers./output/log[mindformers/tools/utils.py:185] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt'
2025-07-15 10:47:55,482 - mindformers./output/log[mindformers/trainer/base_trainer.py:108] - INFO - host_name: ascend213, host_ip: 121.37.54.128
2025-07-15 10:47:55,483 - mindformers./output/log[mindformers/trainer/base_trainer.py:114] - INFO - Now Running Task is: text_generation, Model is: gpt2
2025-07-15 10:47:55,483 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - Input model name is not in the supported list or unspecified.
2025-07-15 10:47:55,483 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - See the list of supported task and model name: ['common', 'glm4_9b']
2025-07-15 10:47:55,484 - mindformers./output/log[mindformers/trainer/base_trainer.py:146] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task
2025-07-15 10:47:55,484 - mindformers./output/log[mindformers/trainer/trainer.py:1134] - INFO - ..........Init Model..........
2025-07-15 10:47:55,485 - mindformers./output/log[mindformers/trainer/trainer.py:1175] - INFO - ..........Init Callbacks..........
2025-07-15 10:47:55,485 - mindformers./output/log[mindformers/trainer/trainer.py:324] - INFO - ==========Trainer Init Success!==========
2025-07-15 10:47:55,486 - mindformers./output/log[mindformers/trainer/trainer.py:906] - INFO - The incoming model will be reinit when parallel config is reconfigured.
2025-07-15 10:47:55,486 - mindformers./output/log[mindformers/trainer/trainer.py:1134] - INFO - ..........Init Model..........
2025-07-15 10:47:55,486 - mindformers./output/log[mindformers/trainer/trainer.py:1006] - INFO - ..........Reinit Model..........
2025-07-15 10:47:55,487 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on.
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:55.488.096 [mindspore/common/lazy_inline.py:200] The function construct's parameters: labels , input_position , position_ids , attention_mask , input_embeds , init_reset , batch_valid_length , batch_index , zactivate_len , block_tables , slot_mapping , prefix_keys_values , llm_boost_inputs , q_seq_lens , loss_mask , gather_index , seq_range , actual_seq_len must be key word or positional arguments and can't have default values.
line: 645 in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/mindformers/mindformers/models/llama/llama.py
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:55.492.885 [mindspore/ccsrc/pybind_api/ir/tensor_api/auto_generate/tensor_api_0.cc:1484] TensorMethodReshape] Call TensorReshape
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.493.048 [mindspore/ccsrc/pynative/pynative_utils.cc:1833] DispatchOp] PyBoost sync run frontend task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.493.205 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.493.564 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:55.493.644 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_kernel_executor.cc:1513] operator()] Api aclnnInplaceCopy miss cache, with hash id:186559549515354546
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:55.493.822 [mindspore/ops/kernel/ascend/acl_ir/op_api_cache.h:60] SetExecutorRepeatable] aclnnInplaceCopyGetWorkspaceSize don't support cache, repeat_ret is 561000
2025-07-15 10:47:55,497 - mindformers./output/log[mindformers/models/llama/llama.py:507] - INFO - Predict run mode: False
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:55.497.956 [mindspore/ccsrc/pybind_api/ir/tensor_api/auto_generate/tensor_api_0.cc:1484] TensorMethodReshape] Call TensorReshape
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.498.037 [mindspore/ccsrc/pynative/pynative_utils.cc:1833] DispatchOp] PyBoost sync run frontend task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.498.110 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.498.287 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:55.498.348 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_kernel_executor.cc:1513] operator()] Api aclnnInplaceCopy miss cache, with hash id:186559549515354546
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:55.498.439 [mindspore/ops/kernel/ascend/acl_ir/op_api_cache.h:60] SetExecutorRepeatable] aclnnInplaceCopyGetWorkspaceSize don't support cache, repeat_ret is 561000
2025-07-15 10:47:55,499 - mindformers./output/log[mindformers/models/llama/llama.py:107] - INFO - MoE config is None, use normal FFN
[INFO] COMMON(935455,ffffb287eec0,python):2025-07-15-10:47:55.515.284 [mindspore/ccsrc/pybind_api/ir/tensor_api/auto_generate/tensor_api_0.cc:1484] TensorMethodReshape] Call TensorReshape
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.515.377 [mindspore/ccsrc/pynative/pynative_utils.cc:1833] DispatchOp] PyBoost sync run frontend task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.515.486 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] PYNATIVE(935455,ffffb287eec0,python):2025-07-15-10:47:55.515.705 [mindspore/ccsrc/pyboost/pyboost_utils.cc:304] DispatchRun] PyBoost sync run device task
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:55.515.768 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_kernel_executor.cc:1513] operator()] Api aclnnInplaceCopy miss cache, with hash id:13328202571349160149
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:55.515.864 [mindspore/ops/kernel/ascend/acl_ir/op_api_cache.h:60] SetExecutorRepeatable] aclnnInplaceCopyGetWorkspaceSize don't support cache, repeat_ret is 561000
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:55.517.682 [mindspore/testcases/testcases/tests/st/networks/mindformers/mindformers/models/llama/llama_layer.py:125] Embedding use init method: sigma=0.01, mean=0.0
2025-07-15 10:47:55,519 - mindformers./output/log[mindformers/models/utils.py:205] - INFO - num_layers per stage: [[1, 1]]
2025-07-15 10:47:55,519 - mindformers./output/log[mindformers/models/utils.py:206] - INFO - Accumulated num_layers per stage: [[1, 2]]
2025-07-15 10:47:55,519 - mindformers./output/log[mindformers/models/utils.py:208] - INFO - Pipeline id list with start_stage: [0, 1]
2025-07-15 10:47:55,519 - mindformers./output/log[mindformers/models/utils.py:209] - INFO - Interleave id list: [0, 0]
2025-07-15 10:47:55,520 - mindformers./output/log[mindformers/models/utils.py:227] - INFO - Formative layer_recompute: [[0, 0]]
2025-07-15 10:47:55,520 - mindformers./output/log[mindformers/models/utils.py:229] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority.
2025-07-15 10:47:55,520 - mindformers./output/log[mindformers/models/utils.py:235] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0]]}
2025-07-15 10:47:55,520 - mindformers./output/log[mindformers/models/utils.py:236] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0]]}
2025-07-15 10:47:55,520 - mindformers./output/log[mindformers/models/utils.py:237] - INFO - Formative select_recompute_exclude: {}
2025-07-15 10:47:55,520 - mindformers./output/log[mindformers/models/utils.py:238] - INFO - Formative select_comm_recompute_exclude: {}
[INFO] ME(935455:281473676996288,MainProcess):2025-07-15-10:47:55.560.681 [mindspore/testcases/testcases/tests/st/networks/mindformers/mindformers/models/llama/llama_layer.py:165] Using 1 data parallel, 1 context parallel and 2 model parallel for the embedding lookup.
2025-07-15 10:47:55,566 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.570.408 [mindspore/ccsrc/minddata/dataset/util/task_manager.cc:165] DoServiceStart] Starting Task Manager.
[INFO] MD(935455,fffe750bdfa0,python):2025-07-15-10:47:55.570.915 [mindspore/ccsrc/minddata/utils.h:37] BindThreadCoreForMindDataOp] [dataset::Watchdog]: Core binding is not enabled.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.571.688 [mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc:376] Compile] Input plan:
+-GeneratorDataset(:,columns:[input_ids],)
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.571.740 [mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc:389] Compile] Environment MS_INDEPENDENT_DATASET is false, dataset will be ran in main process.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.571.948 [mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc:409] Compile] Plan before optimization:
+-Top
| +-GeneratorDataset(:,columns:[input_ids],)
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.571.982 [mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc:99] PrePass] Running pre pass loops.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.112 [mindspore/ccsrc/minddata/dataset/engine/opt/pre/node_offload_pass.cc:136] RunOnTree] Pre pass: node offload pass started.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.166 [mindspore/ccsrc/minddata/dataset/engine/opt/pre/node_offload_pass.cc:155] RunOnTree] Pre pass: offload node removal pass complete.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.215 [mindspore/ccsrc/minddata/dataset/engine/opt/pre/node_removal_pass.cc:59] RunOnTree] Pre pass: node removal pass started.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.252 [mindspore/ccsrc/minddata/dataset/engine/opt/pre/node_removal_pass.cc:73] RunOnTree] Pre pass: node removal pass complete.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.286 [mindspore/ccsrc/minddata/dataset/engine/opt/pre/epoch_ctrl_pass.cc:76] RunOnTree] Pre pass: Injection pass started.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.319 [mindspore/ccsrc/minddata/dataset/engine/opt/pre/epoch_ctrl_pass.cc:91] RunOnTree] Pre pass: Injection pass complete.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.354 [mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.cc:176] RunOnTree] Pre pass: Cache transform pass started.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.391 [mindspore/ccsrc/minddata/dataset/engine/opt/pre/cache_transform_pass.cc:193] RunOnTree] Pre pass: Cache transform pass complete.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.422 [mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc:129] PrePass] Pre pass offload complete.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.457 [mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc:152] PostPass] Running post pass loops.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.562 [mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc:169] PostPass] Post passes complete.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.572.603 [mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc:423] Compile] Plan after optimization:
+-Top
|
+-GeneratorDataset(:,columns:[input_ids],)
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.574.736 [mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc:192] Launch] Printing the tree before launch tasks:
Execution tree summary:
-----------------------
+- ( 0) : [workers: 1]
Execution tree operator details:
--------------------------------
( 0) :
    Number of children : 0
    Number of parents : 0
    Connector queue size : 16
    Total repeats : 1
    Number repeats per epoch : 1
    Num workers: 1
    Column names: input_ids
[INFO] MD(935455,fffe748adfa0,python):2025-07-15-10:47:55.575.101 [mindspore/ccsrc/minddata/utils.h:37] BindThreadCoreForMindDataOp] [dataset::GeneratorOp(ID:0)]: Core binding is not enabled.
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.579.138 [mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc:476] GetNext] End of data iteration. cur_batch_num_: 0
[INFO] MD(935455,ffffb287eec0,python):2025-07-15-10:47:55.579.216 [mindspore/ccsrc/minddata/dataset/engine/python_runtime_context.cc:22] Terminate] Terminating a Dataset PythonRuntime.
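The minddata plan above is what a single-column GeneratorDataset compiles to: one generator source, one worker, and one output column named input_ids. A minimal sketch of an equivalent pipeline follows; the random token source, vocabulary size, and sample count are illustrative assumptions, and only the column name and worker count come from the log.

import numpy as np
import mindspore.dataset as ds

def token_source(num_samples=10, seq_length=8):
    # Yield one-element tuples so each yielded array becomes the "input_ids" column.
    for _ in range(num_samples):
        yield (np.random.randint(0, 32000, size=(seq_length,), dtype=np.int32),)

# Mirrors the logged plan: GeneratorDataset(:,columns:[input_ids],) with a single worker.
dataset = ds.GeneratorDataset(token_source, column_names=["input_ids"], num_parallel_workers=1)
for _row in dataset.create_dict_iterator(output_numpy=True):
    pass  # one full pass, as the trainer does when it probes the dataset size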
2025-07-15 10:47:55,579 - mindformers./output/log[mindformers/trainer/base_trainer.py:209] - INFO - Pipeline parallel is enabled: pipeline_stages = 2, full_batch is True; gradient_accumulation_steps will not take effect in pipeline parallel, and the global batch size is changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 8 * 1 * 2 * 2 = 32.
2025-07-15 10:47:55,579 - mindformers./output/log[mindformers/trainer/base_trainer.py:343] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default.
2025-07-15 10:47:55,580 - mindformers./output/log[mindformers/trainer/base_trainer.py:351] - INFO - PipelineWrapper does not take effect under evaluate or predict mode.
2025-07-15 10:47:55,580 - mindformers./output/log[mindformers/trainer/base_trainer.py:944] - INFO - .........Build Dataset For Train..........
2025-07-15 10:47:55,580 - mindformers./output/log[mindformers/trainer/base_trainer.py:469] - INFO - .........Build Dataset From Config..........
2025-07-15 10:47:55,580 - mindformers./output/log[mindformers/trainer/base_trainer.py:948] - INFO - Create train dataset finished, dataset size: 10
2025-07-15 10:47:55,581 - mindformers./output/log[mindformers/trainer/utils.py:172] - WARNING - Sink mode is False, so per-epoch size is invalid; it will be reset to -1.
2025-07-15 10:47:55,581 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will train epochs: 1, sink_size: -1
2025-07-15 10:47:55,581 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finished, dataset size: 10
2025-07-15 10:47:55,581 - mindformers./output/log[mindformers/trainer/base_trainer.py:995] - INFO - .........Build Net For Train..........
2025-07-15 10:47:55,582 - mindformers./output/log[mindformers/trainer/base_trainer.py:545] - INFO - micro_batch_interleave_num > 1, so the double-copy parallel feature is turned on.
2025-07-15 10:47:55,587 - mindformers./output/log[mindformers/trainer/base_trainer.py:739] - INFO - Network Parameters: 154 M.
2025-07-15 10:47:55,587 - mindformers./output/log[mindformers/trainer/base_trainer.py:1034] - INFO - .........Build Optimizer For Train..........
2025-07-15 10:47:55,587 - mindformers./output/log[mindformers/trainer/base_trainer.py:596] - INFO - .........Build Optimizer From Config..........
2025-07-15 10:47:55,588 - mindformers./output/log[mindformers/trainer/base_trainer.py:643] - INFO - .........Build LR Schedule From Config..........
2025-07-15 10:47:55,590 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:77] - WARNING - dynamic_lr_schedule will be reset and is invalid when layer_scale is False.
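The global batch size reported above follows directly from the parallel settings. A minimal sketch of that arithmetic; the helper name compute_global_batch_size is illustrative, not a mindformers API.

def compute_global_batch_size(batch_size, data_parallel, micro_batch_num, micro_batch_interleave_num):
    # Reproduces the formula printed by base_trainer.py when full_batch is True.
    return batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num

# Values taken from the log above: 8 * 1 * 2 * 2 == 32.
assert compute_global_batch_size(8, 1, 2, 2) == 32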
2025-07-15 10:47:55,590 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:116] - INFO - Param groups = {
  "decay": {
    "weight_decay": 0.0,
    "params": [
      "model.tok_embeddings.embedding_weight",
      "model.layers.0.attention.wq.weight",
      "model.layers.0.attention.wk.weight",
      "model.layers.0.attention.wv.weight",
      "model.layers.0.attention.wo.weight",
      "model.layers.0.feed_forward.w1.weight",
      "model.layers.0.feed_forward.w2.weight",
      "model.layers.0.feed_forward.w3.weight",
      "model.layers.1.attention.wq.weight",
      "model.layers.1.attention.wk.weight",
      "model.layers.1.attention.wv.weight",
      "model.layers.1.attention.wo.weight",
      "model.layers.1.feed_forward.w1.weight",
      "model.layers.1.feed_forward.w2.weight",
      "model.layers.1.feed_forward.w3.weight",
      "lm_head.weight"
    ]
  },
  "no_decay": {
    "weight_decay": 0.0,
    "params": [
      "model.layers.0.attention_norm.weight",
      "model.layers.0.ffn_norm.weight",
      "model.layers.1.attention_norm.weight",
      "model.layers.1.ffn_norm.weight",
      "model.norm_out.weight"
    ]
  }
}
2025-07-15 10:47:55,610 - mindformers./output/log[mindformers/trainer/base_trainer.py:1042] - INFO - .........Build Running Wrapper From Config For Train..........
2025-07-15 10:47:55,611 - mindformers./output/log[mindformers/trainer/base_trainer.py:680] - INFO - .........Build Model Wrapper for Train From Config..........
[WARNING] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:55.634.983 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: dfbffab08ce62ec457a6c2cf7f55e76c [const vector]{0, 2}, async: 0, submit_now: 1
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:55.635.141 [mindspore/ccsrc/distributed/collective/collective_manager.cc:373] CreateCommunicationGroup] [PROF]CreateCommunicationGroupOnHostSide costs 0.022 msec.
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:55.635.211 [mindspore/core/utils/ms_context.cc:771] IsEnableInferBoost] MSContext enable ms infer boost
[INFO] CORE(935455,ffffb287eec0,python):2025-07-15-10:47:55.635.342 [mindspore/core/utils/ms_context.cc:751] SetMsInternalEnableCustomKernelList] Enable internal kernel list: Add, AddLayerNorm, AddRmsNorm, AddRmsNormDynamicQuant, AddRmsNormQuantV2, FlashAttentionScore, InferenceMatmulSplit, InferenceSwiGLU, MatMul, MatMulAllReduce, MatMulElemwise, MatMulSigmoidCastAdd, PagedAttention, PagedAttentionMask, QbmmAdd, QbmmAllReduceAdd, RmsNorm, RmsNormQuant, Sub, SwiGLUDynamicQuant,
[INFO] DEVICE(935455,ffffb287eec0,python):2025-07-15-10:47:55.635.451 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:183] CreateCommunicationGroup] Successfully create HCCL communication group dfbffab08ce62ec457a6c2cf7f55e76c
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:55.635.499 [mindspore/ccsrc/distributed/collective/collective_manager.cc:380] CreateCommunicationGroup] [PROF]CreateCommunicationGroupOnDeviceSide costs 0.308 msec.
[INFO] DISTRIBUTED(935455,ffffb287eec0,python):2025-07-15-10:47:55.635.551 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1136] SubmitCreateDeviceCommTask] Submit init communicator task for dfbffab08ce62ec457a6c2cf7f55e76c. Call 'WaitCommInitDone' later to wait initialization to be done.
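The param-group dump above splits parameters by name: matrix and embedding weights land in the "decay" group, while the norm weights go to "no_decay" (both groups happen to use weight_decay 0.0 in this run). A minimal sketch of how such a split can be derived from parameter names; build_param_groups and its keyword list are illustrative assumptions, not mindformers code.

def build_param_groups(named_params, weight_decay=0.0, no_decay_keywords=("norm",)):
    # named_params: iterable of (name, parameter) pairs, e.g. network.parameters_and_names() in MindSpore.
    decay, no_decay = [], []
    for name, _param in named_params:
        target = no_decay if any(key in name for key in no_decay_keywords) else decay
        target.append(name)
    return {
        "decay": {"weight_decay": weight_decay, "params": decay},
        "no_decay": {"weight_decay": 0.0, "params": no_decay},
    }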
testcase ('export MS_DEV_P2P_HCCL_BUFFSIZE=24 && export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 && bash /home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/llama/msrun_launch_llama.sh 4 test_train 8128', '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/llama/test_train/worker_0.log') failed. please check log /home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/llama/test_train/worker_0.log.
F
=================================== FAILURES ===================================
__________________________________ test_train __________________________________

    @arg_mark(plat_marks=['platform_ascend910b'], level_mark='level0', card_mark='allcards', essential_mark='essential')
    def test_train():
        """
        Feature: Trainer.train()
        Description: Test context parallel trainer for train.
        Expectation: AssertionError
        """
        ascend_home_path = os.getenv('ASCEND_HOME_PATH')
        if not ascend_home_path:
            os.environ['ASCEND_HOME_PATH'] = "/usr/local/Ascend/latest"
        sh_path = os.path.split(os.path.realpath(__file__))[0]
        commands = [(f"export MS_DEV_P2P_HCCL_BUFFSIZE=24 && export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 && "
                     f"bash {sh_path}/msrun_launch_llama.sh 4 test_train 8128",
                     f"{sh_path}/test_train/worker_0.log"),
                    (f"export ASCEND_RT_VISIBLE_DEVICES=4,5 && "
                     f"bash {sh_path}/msrun_launch_llama.sh 2 test_train_cp 8129",
                     f"{sh_path}/test_train_cp/worker_0.log"),
                    (f"export ASCEND_RT_VISIBLE_DEVICES=6,7 && "
                     f"bash {sh_path}/msrun_launch_llama.sh 2 test_train_dp 8131",
                     f"{sh_path}/test_train_dp/worker_0.log")]
        with Pool(len(commands)) as pool:
            results = list(pool.imap(run_command, commands))
>       check_results(commands, results)

test_parallel_train.py:67:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

commands = [('export MS_DEV_P2P_HCCL_BUFFSIZE=24 && export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 && bash /home/jenkins/mindspore/test... 8131', '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/llama/test_train_dp/worker_0.log')]
results = [(256, '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/llama/test_train/worker_0.log'), (0...'), (0, '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/llama/test_train_dp/worker_0.log')]

    def check_results(commands, results):
        error_idx = [_ for _ in range(len(results)) if results[_][0] != 0]
        for idx in error_idx:
            print(f"testcase {commands[idx]} failed. please check log {results[idx][1]}.")
            os.system(f"grep -E 'ERROR|error|Error' {results[idx][1]} -C 5")
            os.system(f"cat {results[idx][1]}")
>       assert error_idx == []
E       assert [0] == []
E       Left contains one more item: 0
E       Use -v to get the full diff

test_parallel_train.py:38: AssertionError
=========================== short test summary info ============================
FAILED test_parallel_train.py::test_train - assert [0] == []
======================== 1 failed in 205.69s (0:03:25) =========================
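The traceback shows test_train fanning the three launch commands out to run_command through a multiprocessing Pool, then asserting in check_results that every returned status is zero. run_command itself is not shown in this log; a minimal sketch of what such a helper could look like, assuming it shells out with os.system, which would also explain the raw wait status 256 (exit code 1) recorded for the failing test_train command.

import os

def run_command(command_info):
    # command_info is a (shell command, worker log path) pair, as built in test_train above.
    command, log_path = command_info
    # os.system returns the raw wait status; 256 corresponds to an exit code of 1.
    return os.system(command), log_path

With a helper like this, check_results only has to compare each returned status against zero and dump the matching worker log when the status is nonzero, which is exactly what the failure output above records for test_train.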