============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3, configfile: ../../../../../../../../sault/virtual_test/virtualenv_002/sault/config/pytest.ini
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collected 1 item
test_deepseekv3_pretrain.py enable lazy inline in pp
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
Start worker process with rank id:0, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_0.log. Environment variable [RANK_ID=0] is exported.
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:25:53.579.782 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 144-167 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:1, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log. Environment variable [RANK_ID=1] is exported.
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:25:53.626.014 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 24-47 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:2, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_2.log. Environment variable [RANK_ID=2] is exported.
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:25:53.673.028 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 96-119 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:3, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_3.log. Environment variable [RANK_ID=3] is exported.
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:25:53.720.649 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 72-95 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:4, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_4.log. Environment variable [RANK_ID=4] is exported.
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:25:53.770.589 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 0-23 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:5, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_5.log. Environment variable [RANK_ID=5] is exported.
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:25:53.825.442 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 120-143 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:6, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_6.log. Environment variable [RANK_ID=6] is exported.
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:25:53.880.906 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 48-71 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:7, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_7.log. Environment variable [RANK_ID=7] is exported.
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:25:53.936.318 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 168-191 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:25:53.988.017 [mindspore/parallel/cluster/process_entity/_api.py:267] Distributed job is spawned. Waiting all processes to exit...
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
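The scheduler records above show the launch pattern for the eight workers: each rank is pinned to a 24-core slice with taskset, has RANK_ID exported into its environment, is handed the same pretrain_deepseek3.yaml and register_path, and writes to its own worker_N.log while the parent waits for all processes to exit. A minimal sketch of that pattern in Python is shown below; the core ranges and command-line arguments are copied from the log, but the harness itself is illustrative and is not MindSpore's actual process_entity code (paths are abbreviated):

    import os
    import subprocess

    # Arguments as they appear in the logged commands (paths abbreviated here).
    CONFIG = "deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/pretrain_deepseek3.yaml"
    REGISTER_PATH = "../mindformers/research/deepseek3/"
    # taskset CPU slices per rank, in the order the scheduler assigned them above.
    CORE_SLICES = ["144-167", "24-47", "96-119", "72-95", "0-23", "120-143", "48-71", "168-191"]

    workers = []
    for rank, cores in enumerate(CORE_SLICES):
        env = dict(os.environ, RANK_ID=str(rank))   # "Environment variable [RANK_ID=N] is exported."
        cmd = ["taskset", "-c", cores, "python", "run_mindformer.py",
               "--config", CONFIG, "--register_path", REGISTER_PATH]
        log = open(f"worker_{rank}.log", "w")        # one log file per rank, as in the test case directory
        workers.append(subprocess.Popen(cmd, env=env, stdout=log, stderr=subprocess.STDOUT))

    # "Distributed job is spawned. Waiting all processes to exit..."
    for proc in workers:
        proc.wait()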
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
return self._float_to_str(self.smallest_subnormal)
2025-07-15 10:26:02,202 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:26:02,203 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:26:02,203 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:26:02,204 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.221.823 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.222.591 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.222.997 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.223.117 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.223.441 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.223.572 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.223.673 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.223.788 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.223.984 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(904625:281473770581696,MainProcess):2025-07-15-10:26:02.224.197 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:02.226.103 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:42074, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904625,ffff3282efa0,python):2025-07-15-10:26:02.226.106 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42074 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:02.226.175 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 1
2025-07-15 10:26:02,229 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:26:02,230 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:26:02,230 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:26:02,231 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.250.315 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.251.063 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.251.466 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.251.579 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.251.891 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.252.018 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.252.114 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.252.224 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.252.400 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(904631:281473081732800,MainProcess):2025-07-15-10:26:02.252.616 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
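Every worker prints the same set of context.set_context deprecation warnings before training starts. A minimal sketch of the replacement calls those messages point to is shown below, assuming current MindSpore 2.x signatures; the argument values are illustrative placeholders, not the values used by pretrain_deepseek3.yaml:

    import mindspore as ms

    # Replacements named in the warnings above; values here are placeholder assumptions.
    ms.set_device("Ascend")                   # instead of context.set_context(device_target=...)
    ms.set_recursion_limit(10000)             # instead of max_call_depth
    ms.set_deterministic(False)               # instead of deterministic
    ms.runtime.set_memory(max_size="58GB")    # instead of max_device_memory / memory_optimize_level / mempool_block_size
    ms.device_context.ascend.op_precision.precision_mode("must_keep_origin_dtype")  # instead of ascend_config precision fields

    # save_graphs / save_graphs_path move to environment variables:
    #   export MS_DEV_SAVE_GRAPHS=1
    #   export MS_DEV_SAVE_GRAPHS_PATH=./graphs
    # and 'matmul_grad_comm_overlap' is renamed to 'grad_matmul_communication_overlap'.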
[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:02.254.560 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:42078, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904631,ffff0970efa0,python):2025-07-15-10:26:02.254.560 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42078 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:02.254.630 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 1
2025-07-15 10:26:02,317 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:26:02,317 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:26:02,317 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:26:02,318 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
2025-07-15 10:26:02,325 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:26:02,326 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:26:02,326 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:26:02,326 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.341.182 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.341.910 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.342.307 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.342.416 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.342.738 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.342.867 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.342.962 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.343.071 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.343.242 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(904639:281472824635072,MainProcess):2025-07-15-10:26:02.343.430 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:02.345.187 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:42094, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904639,fffefa1eefa0,python):2025-07-15-10:26:02.345.204 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42094 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:02.345.259 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 1
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.345.662 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.346.406 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.346.830 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.346.943 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.347.254 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.347.384 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.347.479 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.347.589 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.347.770 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(904635:281473102507712,MainProcess):2025-07-15-10:26:02.347.977 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
2025-07-15 10:26:02,349 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
[WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:02.349.900 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:42104, destination: 127.0.0.1:7124
2025-07-15 10:26:02,349 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
[WARNING] DISTRIBUTED(904635,ffff0aaeefa0,python):2025-07-15-10:26:02.349.918 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42104 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:02.349.980 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 1
2025-07-15 10:26:02,350 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:26:02,350 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.369.121 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.369.843 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.370.243 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.370.357 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.370.677 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.370.802 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.370.898 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.371.010 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.371.189 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(904643:281473447489216,MainProcess):2025-07-15-10:26:02.371.399 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:02.373.384 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:42118, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904643,ffff1f3eefa0,python):2025-07-15-10:26:02.373.403 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42118 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:02.373.456 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 1
2025-07-15 10:26:02,439 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:26:02,440 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:26:02,440 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:26:02,441 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.459.924 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.460.662 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.461.060 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.461.174 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.461.484 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.461.610 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.461.707 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.461.818 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.462.002 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(904652:281473622798016,MainProcess):2025-07-15-10:26:02.462.198 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(904652,ffff29b3efa0,python):2025-07-15-10:26:02.463.940 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42130 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:02.463.940 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:42130, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:02.464.117 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:42140, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904652,ffff2ab5efa0,python):2025-07-15-10:26:02.464.146 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42140 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:02.464.156 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 1
2025-07-15 10:26:02,519 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:26:02,520 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:26:02,520 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:26:02,521 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.539.981 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.540.732 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.541.132 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.541.249 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.541.565 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.541.693 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.541.795 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.541.906 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.542.105 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(904647:281473519840960,MainProcess):2025-07-15-10:26:02.542.312 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:02.544.363 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:42150, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904647,ffff1f7eefa0,python):2025-07-15-10:26:02.544.363 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42150 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:02.544.432 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 1
2025-07-15 10:26:02,655 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:26:02,655 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:26:02,655 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:26:02,656 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.675.218 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.675.970 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.676.377 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.676.499 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.676.817 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.676.949 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.677.047 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.677.159 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.677.351 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(904656:281473291775680,MainProcess):2025-07-15-10:26:02.677.580 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:02.679.544 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:42156, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904656,ffff15f6efa0,python):2025-07-15-10:26:02.679.544 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42156 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:02.679.618 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 1
[WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:02.726.413 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:42160, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904625,ffff3384efa0,python):2025-07-15-10:26:02.726.458 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42160 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:02.726.475 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:02.754.864 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:42164, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904631,ffff0a72efa0,python):2025-07-15-10:26:02.754.894 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42164 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:02.754.909 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:02.845.462 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:42172, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904639,fffefb20efa0,python):2025-07-15-10:26:02.845.497 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42172 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:02.845.504 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:02.850.216 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:42176, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904635,ffff0bb0efa0,python):2025-07-15-10:26:02.850.244 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42176 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:02.850.261 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:02.873.698 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:42184, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904643,ffff2040efa0,python):2025-07-15-10:26:02.873.733 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42184 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:02.873.740 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:02.964.685 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:03.044.684 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:42192, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904647,ffff2491efa0,python):2025-07-15-10:26:03.044.705 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42192 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:03.044.732 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:03.179.856 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:42206, destination: 127.0.0.1:7124
[WARNING] DISTRIBUTED(904656,ffff16f8efa0,python):2025-07-15-10:26:03.179.887 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:42206 to 127.0.0.1:7124 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:03.179.898 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:03.226.996 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:03.255.452 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:03.345.914 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:03.350.736 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:03.374.240 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:03.464.792 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400).
[WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:03.545.204 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:03.680.426 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:03.727.110 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400).
[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:03.755.561 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400).
[WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:03.846.018 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400).
[WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:03.850.851 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400).
[WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:03.874.354 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400).
[WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:03.964.893 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/14400).
[WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:04.045.316 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400).
[WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:04.180.566 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized.
[WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:04.180.612 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 7 rank id: 7
[WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:04.227.248 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized.
[WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:04.227.292 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 0 rank id: 0 [WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:04.255.695 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:04.255.742 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1 [WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:04.346.132 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:04.346.171 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 3 rank id: 3 [WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:04.350.974 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:04.351.015 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 2 rank id: 2 [WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:04.374.492 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:04.374.539 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 4 rank id: 4 [WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:04.465.019 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:04.465.063 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 6 rank id: 6 [WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:04.545.449 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. 
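Ranks 0 through 7 are now bound to the eight workers. Given the parallel_config dumped later in this run (data_parallel 2, model_parallel 2, pipeline_stage 2), one common way to picture the grid is stage-major: ranks 0-3 as pipeline stage 0 and ranks 4-7 as stage 1, each stage split 2x2 into data- and model-parallel groups. The sketch below only illustrates how an 8-rank world can be factored; it is not a claim about the arrangement MindSpore's auto-parallel actually chooses:

# Illustrative factorisation only (assumed stage-major, dp-outer / mp-inner).
def grid_coords(rank, dp=2, mp=2, pp=2):
    stage = rank // (dp * mp)
    dp_idx = (rank % (dp * mp)) // mp
    mp_idx = rank % mp
    return {"pipeline_stage": stage, "dp": dp_idx, "mp": mp_idx}

for r in range(8):
    print(r, grid_coords(r))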
[WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:04.545.495 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 5 rank id: 5 [WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:05.931.116 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:05.931.367 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(904656,fffebe95efa0,python):2025-07-15-10:26:05.931.592 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7124, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(904656,fffebe95efa0,python):2025-07-15-10:26:05.931.687 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(904656,fffebe95efa0,python):2025-07-15-10:26:05.931.725 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(904656,fffebe95efa0,python):2025-07-15-10:26:05.931.769 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DEVICE(904656,fffebe95efa0,python):2025-07-15-10:26:05.932.220 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_comm_lib.cc:251] QueryUniqueID] Retry to lookup the unique id for group hccl_world_group from the meta server node...Retry time: 399/400, sleep 2 2025-07-15 10:26:05,932 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_7.ckpt' [WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:05.943.498 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:05.943.768 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(904625,fffedb04efa0,python):2025-07-15-10:26:05.943.988 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7124, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(904625,fffedb04efa0,python):2025-07-15-10:26:05.944.071 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] 
HCCL_ADPT(904625,fffedb04efa0,python):2025-07-15-10:26:05.944.131 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(904625,fffedb04efa0,python):2025-07-15-10:26:05.944.162 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group 2025-07-15 10:26:05,945 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' [WARNING] DISTRIBUTED(904625,fffedb04efa0,python):2025-07-15-10:26:05.951.674 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(904625,fffed900efa0,python):2025-07-15-10:26:05.951.997 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:26:05,959 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:26:05,960 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:26:05,960 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:26:05,960 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:26:05,961 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:26:05,961 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 
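The HcclSetGlobalCommInfo warnings repeated for every rank are benign: the adapter dlopen()s libhccl_plugin.so, finds that this CANN package does not export the symbol, logs the fallback and carries on. A hypothetical diagnostic that reproduces the same lookup with ctypes; the path is the one printed in the warning above (with the communication/.. segment resolved), and the check only runs on a machine where the plugin itself loads:

import ctypes

def has_symbol(so_path, symbol):
    lib = ctypes.CDLL(so_path)      # same dlopen the HCCL adapter performs
    try:
        getattr(lib, symbol)        # AttributeError means the symbol is undefined
        return True
    except AttributeError:
        return False

plugin = ("/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/"
          "mindspore/lib/plugin/ascend/libhccl_plugin.so")
print(has_symbol(plugin, "HcclSetGlobalCommInfo"))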
2025-07-15 10:26:05,961 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': False, 'npu_nums_per_device': 2, 'use_gmm': True, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:26:05,961 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:26:05,962 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:26:05,962 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:26:05,962 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 2025-07-15 10:26:05,963 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:26:05,963 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_7.ckpt' 2025-07-15 10:26:05,972 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:26:05,972 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:26:05,972 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 
2025-07-15 10:26:05,973 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:26:05,973 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:26:05,973 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:26:05,973 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': False, 'npu_nums_per_device': 2, 'use_gmm': True, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:26:05,974 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:26:05,974 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:26:05,974 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:26:05,974 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 
2025-07-15 10:26:05,975 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:26:05,976 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' 2025-07-15 10:26:06,098 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:26:06,099 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:26:06,099 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:26:06,099 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:26:06,100 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:26:06,100 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,100 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:26:06,101 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:26:06,101 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,101 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:26:06,102 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:26:06,102 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:26:06,102 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:26:06,102 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:26:06,103 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 
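The parallel_config dumped above and the base_trainer.py batch-size line pin down the layout for this run: dp2 x mp2 x cp1 x pp2 accounts for all eight cards, and with batch_size 1, micro_batch_num 2 and no micro-batch interleaving the trainer arrives at a global batch size of 4. A minimal sketch that re-does that arithmetic with exactly the numbers logged here (plain Python, no MindFormers API):

# Values copied from the parallel_config / trainer lines above.
dp, mp, cp, pp = 2, 2, 1, 2
device_num = 8
assert dp * mp * cp * pp == device_num          # 2 * 2 * 1 * 2 = 8 cards

batch_size = 1
micro_batch_num = 2
micro_batch_interleave_num = 1
global_batch_size = batch_size * dp * micro_batch_num * micro_batch_interleave_num
print(global_batch_size)                        # 4, matching the trainer log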
2025-07-15 10:26:06,104 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None [WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:06.107.203 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:06.107.424 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(904639,fffea304efa0,python):2025-07-15-10:26:06.107.621 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7124, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(904639,fffea304efa0,python):2025-07-15-10:26:06.107.701 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(904639,fffea304efa0,python):2025-07-15-10:26:06.107.746 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(904639,fffea304efa0,python):2025-07-15-10:26:06.107.772 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(904639,fffea304efa0,python):2025-07-15-10:26:06.108.303 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(904639,fffea283efa0,python):2025-07-15-10:26:06.108.590 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:26:06,109 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_3.ckpt' 2025-07-15 10:26:06,110 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:26:06,110 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:26:06,111 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 
2025-07-15 10:26:06,111 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:26:06,111 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:26:06,111 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:26:06,111 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:26:06,111 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:26:06,112 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,112 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:26:06,112 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:26:06,112 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... 2025-07-15 10:26:06,112 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:26:06,112 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,113 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:26:06,113 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:26:06,113 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:26:06,113 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:26:06,114 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:26:06,114 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 
2025-07-15 10:26:06,115 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:26:06,122 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:26:06,122 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:26:06,122 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:26:06,123 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:26:06,123 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... [WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:06.127.602 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:06.127.860 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(904635,fffe777eefa0,python):2025-07-15-10:26:06.128.074 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7124, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(904635,fffe777eefa0,python):2025-07-15-10:26:06.128.154 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(904635,fffe777eefa0,python):2025-07-15-10:26:06.128.210 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(904635,fffe777eefa0,python):2025-07-15-10:26:06.128.242 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(904635,fffe777eefa0,python):2025-07-15-10:26:06.128.682 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(904635,fffe76fdefa0,python):2025-07-15-10:26:06.128.973 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:26:06,129 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_2.ckpt' 2025-07-15 10:26:06,135 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:26:06,136 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 
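With a train dataset of size 6, one epoch and sink_size 1, as reported above, the run amounts to six data-sink steps in total, assuming the trainer walks the dataset exactly once. A back-of-the-envelope check; the variable names are illustrative, not MindFormers attributes:

dataset_size, epochs, sink_size = 6, 1, 1
steps_per_epoch = dataset_size // sink_size     # one batch per sink call here
total_sink_steps = steps_per_epoch * epochs
print(total_sink_steps)                         # 6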
2025-07-15 10:26:06,136 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:26:06,136 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:26:06,136 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:26:06,136 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:26:06,137 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': False, 'npu_nums_per_device': 2, 'use_gmm': True, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:26:06,137 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:26:06,137 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:26:06,138 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:26:06,138 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 
2025-07-15 10:26:06,139 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:26:06,139 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_3.ckpt' [WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:06.141.244 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:06.141.477 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(904643,fffe83ffefa0,python):2025-07-15-10:26:06.141.791 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7124, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(904643,fffe83ffefa0,python):2025-07-15-10:26:06.141.888 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(904643,fffe83ffefa0,python):2025-07-15-10:26:06.141.947 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(904643,fffe83ffefa0,python):2025-07-15-10:26:06.141.978 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(904643,fffe83ffefa0,python):2025-07-15-10:26:06.142.395 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(904643,fffe837eefa0,python):2025-07-15-10:26:06.142.711 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:26:06,143 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_4.ckpt' 2025-07-15 10:26:06,156 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:26:06,156 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:26:06,156 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 
2025-07-15 10:26:06,157 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:26:06,157 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:26:06,157 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:26:06,157 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': False, 'npu_nums_per_device': 2, 'use_gmm': True, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:26:06,158 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:26:06,158 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:26:06,158 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:26:06,158 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 
2025-07-15 10:26:06,159 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:26:06,160 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_2.ckpt' 2025-07-15 10:26:06,169 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:26:06,170 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:26:06,170 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:26:06,170 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:26:06,170 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:26:06,171 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:26:06,171 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': False, 'npu_nums_per_device': 2, 'use_gmm': True, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:26:06,171 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:26:06,171 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:26:06,172 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 
'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:26:06,172 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 2025-07-15 10:26:06,173 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:26:06,173 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_4.ckpt' [WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:06.215.067 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:06.215.302 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(904652,fffed295efa0,python):2025-07-15-10:26:06.215.563 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7124, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(904652,fffed295efa0,python):2025-07-15-10:26:06.215.668 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(904652,fffed295efa0,python):2025-07-15-10:26:06.215.724 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(904652,fffed295efa0,python):2025-07-15-10:26:06.215.755 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(904652,fffed295efa0,python):2025-07-15-10:26:06.216.235 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(904652,fffed214efa0,python):2025-07-15-10:26:06.216.564 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:26:06,216 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_6.ckpt' 2025-07-15 10:26:06,243 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:26:06,244 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 
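Each rank registers its own sharding-strategy file under ./output/strategy/ (the lines above show ckpt_strategy_rank_0.ckpt through ckpt_strategy_rank_7.ckpt across the workers). A tiny post-run sketch, assuming only that naming pattern, to confirm all eight files were actually written; the check itself is ours, not part of the test:

import os

missing = [r for r in range(8)
           if not os.path.exists(f"./output/strategy/ckpt_strategy_rank_{r}.ckpt")]
print("missing strategy files:", missing)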
2025-07-15 10:26:06,244 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:26:06,244 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:26:06,244 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:26:06,245 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:26:06,245 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': False, 'npu_nums_per_device': 2, 'use_gmm': True, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:26:06,245 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:26:06,245 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:26:06,246 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:26:06,246 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 
2025-07-15 10:26:06,247 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:26:06,247 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_6.ckpt' 2025-07-15 10:26:06,249 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:26:06,249 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:26:06,249 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:26:06,250 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:26:06,250 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:26:06,250 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,251 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:26:06,251 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:26:06,251 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,252 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:26:06,252 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:26:06,252 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:26:06,252 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:26:06,252 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:26:06,253 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 
2025-07-15 10:26:06,254 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:26:06,260 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:26:06,260 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:26:06,261 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:26:06,261 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:26:06,261 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... 2025-07-15 10:26:06,310 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:26:06,310 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:26:06,310 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:26:06,311 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:26:06,311 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:26:06,311 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:26:06,311 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:26:06,311 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:26:06,311 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:26:06,312 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:26:06,312 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,312 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 
2025-07-15 10:26:06,312 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:26:06,312 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:26:06,312 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:26:06,313 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,313 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:26:06,313 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,313 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:26:06,313 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:26:06,313 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:26:06,313 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:26:06,313 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:26:06,314 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:26:06,314 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:26:06,314 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:26:06,314 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:26:06,314 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 2025-07-15 10:26:06,314 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:26:06,314 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 
2025-07-15 10:26:06,315 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:26:06,315 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:26:06,322 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:26:06,322 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:26:06,323 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:26:06,323 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:26:06,323 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:26:06,323 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:26:06,323 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... 2025-07-15 10:26:06,323 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:26:06,324 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:26:06,324 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... [WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:06.331.458 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:06.331.708 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(904647,fffeccbaefa0,python):2025-07-15-10:26:06.331.947 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7124, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(904647,fffeccbaefa0,python):2025-07-15-10:26:06.332.051 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(904647,fffeccbaefa0,python):2025-07-15-10:26:06.332.110 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. 
[WARNING] DEVICE(904647,fffeccbaefa0,python):2025-07-15-10:26:06.332.142 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(904647,fffeccbaefa0,python):2025-07-15-10:26:06.332.804 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(904647,fffe7fffefa0,python):2025-07-15-10:26:06.333.215 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:26:06,333 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_5.ckpt' 2025-07-15 10:26:06,360 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:26:06,360 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:26:06,360 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:26:06,361 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:26:06,361 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:26:06,361 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 
2025-07-15 10:26:06,361 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': False, 'npu_nums_per_device': 2, 'use_gmm': True, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False}
2025-07-15 10:26:06,362 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1}
2025-07-15 10:26:06,362 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False}
2025-07-15 10:26:06,362 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1}
2025-07-15 10:26:06,362 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False.
2025-07-15 10:26:06,363 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
2025-07-15 10:26:06,364 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_5.ckpt'
2025-07-15 10:26:06,378 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128
2025-07-15 10:26:06,379 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3
2025-07-15 10:26:06,379 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified.
2025-07-15 10:26:06,379 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:26:06,380 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:26:06,380 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,380 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:26:06,381 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:26:06,381 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,381 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:26:06,381 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:26:06,382 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:26:06,382 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:26:06,382 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:26:06,382 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 2025-07-15 10:26:06,383 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:26:06,390 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:26:06,390 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:26:06,390 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:26:06,391 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:26:06,391 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... 
[WARNING] DISTRIBUTED(904656,fffebe95efa0,python):2025-07-15-10:26:06.432.935 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(904656,fffebc93dfa0,python):2025-07-15-10:26:06.433.467 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:26:06,451 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:26:06,456 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:26:06,465 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:26:06,470 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:26:06,507 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:26:06,508 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:26:06,508 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:26:06,508 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:26:06,509 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:26:06,509 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,509 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:26:06,510 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:26:06,510 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:26:06,511 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 
2025-07-15 10:26:06,511 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:26:06,511 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:26:06,511 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:26:06,511 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:26:06,512 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 2025-07-15 10:26:06,512 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:26:06,520 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:26:06,520 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:26:06,520 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:26:06,520 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:26:06,521 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... 2025-07-15 10:26:06,570 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:26:06,575 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:26:06,626 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:26:06,628 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[2, 2]] 2025-07-15 10:26:06,629 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[2, 4]] 2025-07-15 10:26:06,629 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 0, 1, 1] 2025-07-15 10:26:06,629 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 0, 0] 2025-07-15 10:26:06,629 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[2, 2]] 2025-07-15 10:26:06,629 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 
2025-07-15 10:26:06,629 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0]]} 2025-07-15 10:26:06,630 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0]]} 2025-07-15 10:26:06,630 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:26:06,630 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:26:06,631 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:26:06,645 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:26:06,646 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[2, 2]] 2025-07-15 10:26:06,647 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[2, 4]] 2025-07-15 10:26:06,647 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 0, 1, 1] 2025-07-15 10:26:06,647 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 0, 0] 2025-07-15 10:26:06,647 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[2, 2]] 2025-07-15 10:26:06,648 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-15 10:26:06,648 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0]]} 2025-07-15 10:26:06,648 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0]]} 2025-07-15 10:26:06,648 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:26:06,648 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:26:06,649 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:26:06,650 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:26:06,657 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:26:06,668 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 
2025-07-15 10:26:06,675 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:26:06,676 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn [WARNING] DISTRIBUTED(904656,ffff9b91eec0,python):2025-07-15-10:26:06.682.800 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: c0a3056327cfa3486f48201b4f88a5fa [const vector]{6, 7}, async: 0, submit_now: 1 2025-07-15 10:26:06,694 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn [WARNING] DISTRIBUTED(904625,ffffb81beec0,python):2025-07-15-10:26:06.700.847 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: c2eb282156233b5d827219971c8b04c2 [const vector]{0, 1}, async: 0, submit_now: 1 2025-07-15 10:26:06,707 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:26:06,712 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:26:06,750 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[2, 2]] 2025-07-15 10:26:06,751 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[2, 4]] 2025-07-15 10:26:06,751 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 0, 1, 1] 2025-07-15 10:26:06,751 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 0, 0] 2025-07-15 10:26:06,751 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[2, 2]] 2025-07-15 10:26:06,751 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-15 10:26:06,751 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0]]} 2025-07-15 10:26:06,752 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0]]} 2025-07-15 10:26:06,752 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:26:06,752 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:26:06,771 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 
2025-07-15 10:26:06,778 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:26:06,797 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn [WARNING] DISTRIBUTED(904639,ffff7fb9eec0,python):2025-07-15-10:26:06.802.951 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: c22c60349630d688cef20a3fd708ad87 [const vector]{2, 3}, async: 0, submit_now: 1 2025-07-15 10:26:06,809 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[2, 2]] 2025-07-15 10:26:06,810 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[2, 4]] 2025-07-15 10:26:06,810 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 0, 1, 1] 2025-07-15 10:26:06,810 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 0, 0] 2025-07-15 10:26:06,810 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[2, 2]] 2025-07-15 10:26:06,811 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-15 10:26:06,811 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0]]} 2025-07-15 10:26:06,811 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0]]} 2025-07-15 10:26:06,811 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:26:06,811 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:26:06,828 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[2, 2]] 2025-07-15 10:26:06,828 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[2, 4]] 2025-07-15 10:26:06,828 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 0, 1, 1] 2025-07-15 10:26:06,829 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 0, 0] 2025-07-15 10:26:06,829 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[2, 2]] 2025-07-15 10:26:06,829 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 
2025-07-15 10:26:06,829 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0]]} 2025-07-15 10:26:06,829 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0]]} 2025-07-15 10:26:06,829 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:26:06,830 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:26:06,830 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:26:06,837 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:26:06,842 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:26:06,847 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:26:06,849 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:26:06,855 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:26:06,857 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 [WARNING] DISTRIBUTED(904643,ffffa4d9eec0,python):2025-07-15-10:26:06.861.370 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: 5270c475a57ca8f687fafaee7ef53c7f [const vector]{4, 5}, async: 0, submit_now: 1 2025-07-15 10:26:06,876 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn [WARNING] DISTRIBUTED(904635,ffff9049eec0,python):2025-07-15-10:26:06.882.690 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: c22c60349630d688cef20a3fd708ad87 [const vector]{2, 3}, async: 0, submit_now: 1 2025-07-15 10:26:06,888 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[2, 2]] 2025-07-15 10:26:06,888 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[2, 4]] 2025-07-15 10:26:06,888 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 0, 1, 1] 2025-07-15 10:26:06,888 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 0, 0] 2025-07-15 10:26:06,889 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[2, 2]] 2025-07-15 10:26:06,889 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 
2025-07-15 10:26:06,889 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0]]} 2025-07-15 10:26:06,889 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0]]} 2025-07-15 10:26:06,889 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:26:06,889 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:26:06,908 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:26:06,916 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:26:06,935 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn [WARNING] DISTRIBUTED(904652,ffffaf4ceec0,python):2025-07-15-10:26:06.941.070 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: c0a3056327cfa3486f48201b4f88a5fa [const vector]{6, 7}, async: 0, submit_now: 1 2025-07-15 10:26:07,022 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[2, 2]] 2025-07-15 10:26:07,023 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[2, 4]] 2025-07-15 10:26:07,023 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 0, 1, 1] 2025-07-15 10:26:07,023 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 0, 0] 2025-07-15 10:26:07,023 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[2, 2]] 2025-07-15 10:26:07,023 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-15 10:26:07,024 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0]]} 2025-07-15 10:26:07,024 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0]]} 2025-07-15 10:26:07,024 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:26:07,024 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:26:07,044 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 
2025-07-15 10:26:07,052 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0
2025-07-15 10:26:07,070 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn
[WARNING] DISTRIBUTED(904647,ffffa929eec0,python):2025-07-15-10:26:07.076.943 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: 5270c475a57ca8f687fafaee7ef53c7f [const vector]{4, 5}, async: 0, submit_now: 1
2025-07-15 10:28:48,754 - mindformers./output/log[mindformers/core/context/parallel.py:88] - ERROR - Notice: if you are trying to run with a single device, please set use_parallel=False. If not, please check the error message above.
2025-07-15 10:28:48,756 - mindformers./output/log[mindformers/tools/cloud_adapter/cloud_monitor.py:43] - ERROR - Traceback (most recent call last):
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper
    result = run_func(*args, **kwargs)
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 68, in main
    build_context(config)
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 464, in build_context
    ctx = Context(mf_config)
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 71, in __init__
    self.parallel_opr.init_communication()
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication
    init()
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 203, in init
    init_hccl()
RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[8] and device id[1], please check if device id is valid.

----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice

Traceback (most recent call last):
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 336, in <module>
    main(config_)
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 44, in wrapper
    raise exc
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper
    result = run_func(*args, **kwargs)
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 68, in main
    build_context(config)
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 464, in build_context
    ctx = Context(mf_config)
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 71, in __init__
    self.parallel_opr.init_communication()
  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication
    init()
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 203, in init
    init_hccl()
RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[8] and device id[1], please check if device id is valid.

----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice

[WARNING] DEVICE(904631,ffff8f0ceec0,python):2025-07-15-10:28:48.807.886 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_res_manager.cc:350] SyncAllStreams] The ascend_res_manager_ is nullptr in scenarios where it is not actually executed
[ERROR] ME(904305:281473038347968,MainProcess):2025-07-15-10:28:50.643.981 [mindspore/parallel/cluster/process_entity/_api.py:363] Worker process 904631 exit with exception. Error code: 1.
[WARNING] ME(904305:281473038347968,MainProcess):2025-07-15-10:28:50.644.294 [mindspore/parallel/cluster/process_entity/_api.py:369] There's worker exits with exception, kill all other workers.
[ERROR] ME(904305:281473038347968,MainProcess):2025-07-15-10:29:25.803.623 [mindspore/parallel/cluster/process_entity/_api.py:382] Scheduler process 904623 exit with exception.
[ERROR] ME(904305:281473038347968,MainProcess):2025-07-15-10:29:25.805.039 [mindspore/parallel/cluster/process_entity/_api.py:603] Time out nodes are ['1'] /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-32-[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:02.754.909 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7124 to be connected...Retry number: 2 /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-33-[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:03.255.452 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400). /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-34-[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:03.755.561 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400). /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-35-[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:04.255.695 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-36-[WARNING] DISTRIBUTED(904631,ffff8f0ceec0,python):2025-07-15-10:26:04.255.742 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1 /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log:37:2025-07-15 10:28:48,754 - mindformers./output/log[mindformers/core/context/parallel.py:88] - ERROR - Notice: if you are trying to run with a single device, please set use_parallel=False. If not, please check the error message above. 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log:38:2025-07-15 10:28:48,756 - mindformers./output/log[mindformers/tools/cloud_adapter/cloud_monitor.py:43] - ERROR - Traceback (most recent call last): /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-39- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-40- result = run_func(*args, **kwargs) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-41- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 68, in main /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-42- build_context(config) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-43- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 464, in build_context -- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-46- self.parallel_opr.init_communication() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-47- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-48- init() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-49- File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 203, in init /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-50- init_hccl() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log:51:RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[8] and device id[1], please check if device id is valid. 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-52- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-53----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-54-- C++ Call Stack: (For framework developers) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-55----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-56-mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-57- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-58- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log:59:Traceback (most recent call last): /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-60- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 336, in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-61- main(config_) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-62- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 44, in wrapper /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-63- raise exc /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-64- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper -- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-71- self.parallel_opr.init_communication() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-72- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-73- init() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-74- File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 203, in init /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-75- init_hccl() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log:76:RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[8] and device id[1], please check if device id is valid. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-77- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-78----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-79-- C++ Call Stack: (For framework developers) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-80----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_1.log-81-mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice -- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-118-[WARNING] DISTRIBUTED(904623,ffff99adeec0,python):2025-07-15-10:29:08.688.061 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-119-[WARNING] DISTRIBUTED(904623,ffff99adeec0,python):2025-07-15-10:29:13.688.153 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 8 alive nodes. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-120-[WARNING] DISTRIBUTED(904623,ffff99adeec0,python):2025-07-15-10:29:13.688.188 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-121-[WARNING] DISTRIBUTED(904623,ffff99adeec0,python):2025-07-15-10:29:18.688.308 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 8 alive nodes. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-122-[WARNING] DISTRIBUTED(904623,ffff99adeec0,python):2025-07-15-10:29:18.688.344 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log:123:[ERROR] DISTRIBUTED(904623,ffff0fffefa0,python):2025-07-15-10:29:19.206.942 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:511] UpdateTopoState] The node: 1 is timed out. It may exit with exception, please check this node's log. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log:124:[ERROR] DISTRIBUTED(904623,ffff99adeec0,python):2025-07-15-10:29:23.688.450 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:103] Finalize] There are 1 abnormal compute graph nodes. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log:125:2025-07-15 10:29:23,688 - mindformers./output/log[mindformers/core/context/parallel.py:88] - ERROR - Notice: if you are trying to run with a single device, please set use_parallel=False. If not, please check the error message above. 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log:126:2025-07-15 10:29:23,690 - mindformers./output/log[mindformers/tools/cloud_adapter/cloud_monitor.py:43] - ERROR - Traceback (most recent call last): /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-127- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-128- result = run_func(*args, **kwargs) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-129- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 68, in main /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-130- build_context(config) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-131- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 464, in build_context -- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-134- self.parallel_opr.init_communication() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-135- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-136- init() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-137- File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 213, in init /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-138- init_cluster() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log:139:RuntimeError: The total number of timed out node is 1. Timed out node list is: [const vector]{1}, worker 1 is the first one timed out, please check its log. 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-140-
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-141-----------------------------------------------------
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-142-- C++ Call Stack: (For framework developers)
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-143-----------------------------------------------------
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-144-mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:517 UpdateTopoState
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-145-
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-146-
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log:147:Traceback (most recent call last):
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-148-  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 336, in <module>
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-149-    main(config_)
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-150-  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 44, in wrapper
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-151-    raise exc
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-152-  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper
--
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-159-    self.parallel_opr.init_communication()
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-160-  File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-161-    init()
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-162-  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 213, in init
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-163-    init_cluster()
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log:164:RuntimeError: The total number of timed out node is 1. Timed out node list is: [const vector]{1}, worker 1 is the first one timed out, please check its log.
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-165-
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-166-----------------------------------------------------
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-167-- C++ Call Stack: (For framework developers)
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-168-----------------------------------------------------
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/scheduler.log-169-mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:517 UpdateTopoState
Traceback (most recent call last):
  File "/home/jenkins/anaconda3/envs/ci39/bin/msrun", line 8, in <module>
    sys.exit(main())
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/run.py", line 191, in main
    run(args)
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/run.py", line 185, in run
    process_manager.run()
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/process_entity/_api.py", line 268, in run
    self.join_processes()
  File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/process_entity/_api.py", line 387, in join_processes
    raise RuntimeError("Distributed job exited with exception. Please check logs in "
RuntimeError: Distributed job exited with exception. Please check logs in directory: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/.
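
Both the scheduler traceback and the msrun launcher point at the same root cause: during build_context() each rank calls mindspore.communication.init(), which blocks until every rank has registered with the scheduler, and worker 1 never registered before the topology timeout (UpdateTopoState). The sketch below is only an illustration of that rendezvous, assuming the dynamic-cluster environment variables (MS_WORKER_NUM, MS_SCHED_HOST, MS_SCHED_PORT, MS_ROLE) that the launcher normally exports for its child processes; the concrete values are placeholders, not taken from this job.

# Minimal sketch of the rendezvous that timed out; NOT the test's own code.
# Assumes the launcher exported the dynamic-cluster variables below; the
# values are placeholders for illustration only.
import os
import mindspore as ms
from mindspore.communication import init, get_rank, get_group_size

os.environ.setdefault("MS_WORKER_NUM", "8")          # ranks the scheduler waits for
os.environ.setdefault("MS_SCHED_HOST", "127.0.0.1")  # scheduler address each worker must reach
os.environ.setdefault("MS_SCHED_PORT", "8118")       # scheduler port (placeholder)
os.environ.setdefault("MS_ROLE", "MS_WORKER")        # this process registers as a worker

ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")  # Ascend assumed, as in this job
init()  # blocks until all MS_WORKER_NUM ranks register; raises RuntimeError on topology timeout
print(f"rank {get_rank()} of {get_group_size()} joined the cluster")

Because the scheduler aborts the whole topology as soon as one rank misses the registration deadline, every other process exits with the same UpdateTopoState error; worker_1.log, named in the RuntimeError above, is the log to read first.
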
F
=================================== FAILURES ===================================
________ test_deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance ________

    @arg_mark(plat_marks=['platform_ascend910b'], level_mark='level0', card_mark='allcards', essential_mark='essential')
    def test_deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance():
        """
        Feature: test deepseekv3 cell dp2mp2ep2pp2mb4gas1bs1 8p gmm performance
        Description: test deepseekv3 cell dp2mp2ep2pp2mb4gas1bs1 8p gmm performance
        Expectation: st pass
        """
        case_name = "deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance"
        sh_path = os.path.split(os.path.realpath(__file__))[0]
        # set the speed up json
        parallel_speed_up_json = {'matmul_grad_comm_overlap': True}
        # set the config
        deepseek_config = DeepseekConfig(num_samples=24,
                                         hidden_size=4096,
                                         intermediate_size=8192,
                                         moe_intermediate_size=2048,
                                         parallel_speed_up_json=parallel_speed_up_json,
                                         use_gmm=True,
                                         enable_deredundency=False,
                                         npu_nums_per_device=2,
                                         use_fused_ops_permute=True,
                                         use_fused_swiglu=True,
                                         enable_fa_var_len=True,
                                         use_fused_rope=True,
                                         pp_interleave_num=1,
                                         deterministic="OFF")
        file_path = prepare_deepseekv3_testcase_env(case_name, deepseek_config)
        # set the communication parameters
        device_num = 8
        master_port = 7124
        hccl_if_base_port = 63395
        # set env for training
        graph_kernel_flags = "--enable_pass=grouped_matmul_assignadd_fusion " \
                             "--enable_cluster_ops=MatMul,BatchMatMul,Reshape --online_tuning=1"
        os.system(f"bash {sh_path}/run_llm.sh {device_num} \
                  {file_path} {case_name} {master_port} {hccl_if_base_port} pp mindrecord \"{graph_kernel_flags}\"")
        # check train over
        check_pair = {"Training Over": 1}
        real_log_path = log_path_preprocess(case_name, device_num)
        for log_path in real_log_path:
>           check_log(log_path, check_pair)

test_deepseekv3_pretrain.py:300:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

file_path = './deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_0.log'
check_pairs = {'Training Over': 1}

    def check_log(file_path, check_pairs=None):
        # check the number of key in check_pairs in log file is equal to the value
        log_error_count = subprocess.check_output(
            ["grep -rE '%s' %s | wc -l" % ("ERROR|Traceback", file_path)], shell=True)
        log_cnt = str(log_error_count, 'utf-8').strip()
        if log_cnt != "0":
            os.system(f"cat {file_path}")
        assert log_cnt == "0", f"Error found in {file_path}"
        if check_pairs is not None:
            for key_word, value in check_pairs.items():
                log_output = subprocess.check_output(
                    ["grep -r '%s' %s | wc -l" % (key_word, file_path)], shell=True)
                log_cnt = str(log_output, 'utf-8').strip()
>               assert log_cnt == str(value), (f"Failed to find {key_word} in {file_path} or content is not correct."
                                               f"Expected occurrences: {value}, but got {log_cnt}")
E               AssertionError: Failed to find Training Over in ./deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance/worker_0.log or content is not correct.Expected occurrences: 1, but got 0

../utils.py:160: AssertionError
=========================== short test summary info ============================
FAILED test_deepseekv3_pretrain.py::test_deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance
======================== 1 failed in 229.84s (0:03:49) =========================
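
The assertion fails because worker_0.log never contains the "Training Over" marker: the job died during cluster initialization, so no rank reached the end of training, and check_log's first grep (for ERROR|Traceback) found nothing in worker_0.log either. When triaging this kind of failure it helps to pull the first error line out of every per-rank log rather than stopping at worker_0. The helper below is a hypothetical sketch, not part of the test suite; it mirrors check_log's ERROR|Traceback pattern in pure Python, and the directory name is taken from the case above.

# Hypothetical triage helper (not part of the test suite): print the first
# ERROR/Traceback line from each scheduler/worker log in the case directory.
import glob
import os
import re

def first_errors(case_dir, pattern=r"ERROR|Traceback"):
    """Return {log_path: first matching line} for every *.log under case_dir."""
    regex = re.compile(pattern)
    hits = {}
    for log_path in sorted(glob.glob(os.path.join(case_dir, "*.log"))):
        with open(log_path, errors="replace") as log_file:
            for line in log_file:
                if regex.search(line):
                    hits[log_path] = line.rstrip()
                    break
    return hits

if __name__ == "__main__":
    case_dir = "./deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_gmm_performance"
    for path, line in first_errors(case_dir).items():
        print(f"{path}: {line}")

In this run such a scan should lead back to worker_1.log, which the scheduler already identified as the first rank to time out, assuming the hung worker logged anything before it was killed.
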