============================= test session starts ==============================
platform linux -- Python 3.9.21, pytest-6.2.5, py-1.11.0, pluggy-0.13.1
rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3, configfile: ../../../../../../../../sault/virtual_test/virtualenv_002/sault/config/pytest.ini
plugins: forked-1.6.0, hydra-core-1.3.2, xdist-1.32.0, anyio-4.9.0
collected 1 item

test_deepseekv3_pretrain.py enable lazy inline in pp
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/.local/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
Start worker process with rank id:0, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_0.log. Environment variable [RANK_ID=0] is exported.
[WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:34:37.276.519 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 144-167 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:1, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log. Environment variable [RANK_ID=1] is exported.
[WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:34:37.334.517 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 24-47 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:2, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_2.log. Environment variable [RANK_ID=2] is exported.
[WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:34:37.425.127 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 96-119 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:3, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_3.log. Environment variable [RANK_ID=3] is exported.
[WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:34:37.505.112 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 72-95 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:4, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_4.log. Environment variable [RANK_ID=4] is exported.
[WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:34:37.581.776 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 0-23 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:5, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_5.log. Environment variable [RANK_ID=5] is exported.
[WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:34:37.642.167 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 120-143 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:6, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_6.log. Environment variable [RANK_ID=6] is exported.
[WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:34:37.704.449 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 48-71 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
Start worker process with rank id:7, log file:/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_7.log. Environment variable [RANK_ID=7] is exported.
[WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:34:37.765.838 [mindspore/parallel/cluster/process_entity/_utils.py:62] Launch process with command: taskset -c 168-191 python /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py --config /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/pretrain_deepseek3.yaml --register_path /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/research/deepseek3/
[WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:34:37.826.884 [mindspore/parallel/cluster/process_entity/_api.py:267] Distributed job is spawned. Waiting all processes to exit...
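The launcher records above show the pattern used for every rank: pin the worker to a CPU range with taskset, export RANK_ID, and redirect its output to a per-rank worker_N.log. A minimal, hypothetical Python sketch of that pattern is below; the core ranges, relative paths, and world size are illustrative assumptions, not the values used by this job.

```python
# Hypothetical re-creation of the per-rank launch pattern seen in the log above.
# Paths, core ranges, and world size are assumptions for illustration only.
import os
import subprocess

WORLD_SIZE = 8
CORES_PER_RANK = 24
CMD = ["python", "run_mindformer.py",
       "--config", "pretrain_deepseek3.yaml",
       "--register_path", "research/deepseek3/"]

procs = []
for rank in range(WORLD_SIZE):
    env = dict(os.environ, RANK_ID=str(rank))            # "Environment variable [RANK_ID=N] is exported."
    first_core = rank * CORES_PER_RANK                    # assumed mapping; the real job uses scheduler-chosen ranges
    pin = ["taskset", "-c", f"{first_core}-{first_core + CORES_PER_RANK - 1}"]
    log = open(f"worker_{rank}.log", "w")                 # per-rank log file, as in the records above
    procs.append(subprocess.Popen(pin + CMD, env=env, stdout=log, stderr=subprocess.STDOUT))

for p in procs:                                            # "Waiting all processes to exit..."
    p.wait()
```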
2025-07-15 10:34:45,974 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:34:45,974 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:34:45,974 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:34:45,975 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.994.579 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.995.351 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.995.770 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.995.889 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.996.221 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.996.359 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.996.459 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.996.578 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.996.774 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(909457:281473659236032,MainProcess):2025-07-15-10:34:45.996.986 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:45.999.222 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53794, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909457,ffff277eefa0,python):2025-07-15-10:34:45.999.224 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53794 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:45.999.298 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 1
2025-07-15 10:34:46,041 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:34:46,042 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
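The context.set_context deprecation warnings above name their intended replacements. The sketch below is a hedged illustration of that migration: only the API names come from the warning text, while every concrete value is an assumption rather than something read from pretrain_deepseek3.yaml.

```python
# Sketch of the migration suggested by the deprecation warnings above.
# All values are illustrative assumptions; only the API names come from the warnings.
import os
import mindspore as ms

ms.set_device("Ascend")                    # replaces context.set_context(device_target=...)
ms.set_recursion_limit(10000)              # replaces max_call_depth
ms.set_deterministic(False)                # replaces deterministic
ms.runtime.set_memory(max_size="58GB")     # replaces max_device_memory / memory_optimize_level / mempool_block_size

# ascend_config is split across mindspore.device_context.ascend.op_precision / op_tuning;
# the mode string here is an assumption, not taken from this job's config.
ms.device_context.ascend.op_precision.precision_mode("must_keep_origin_dtype")

# save_graphs / save_graphs_path move to environment variables:
os.environ["MS_DEV_SAVE_GRAPHS"] = "0"
os.environ["MS_DEV_SAVE_GRAPHS_PATH"] = "./graphs"
```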
2025-07-15 10:34:46,042 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:34:46,043 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
2025-07-15 10:34:46,056 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:34:46,056 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:34:46,056 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:34:46,057 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.621.26 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.628.87 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.632.82 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.633.92 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.637.04 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.638.38 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.639.32 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.640.40 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.642.21 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(909461:281473413279424,MainProcess):2025-07-15-10:34:46.644.26 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(909461,ffff1d35efa0,python):2025-07-15-10:34:46.066.336 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53796 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:46.066.336 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53796, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:46.066.544 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:53810, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909461,ffff1e37efa0,python):2025-07-15-10:34:46.066.576 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53810 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:46.066.586 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 1
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.758.01 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.765.33 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.769.29 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.770.38 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.773.47 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.774.78 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.775.70 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.776.79 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.778.49 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(909465:281473887694528,MainProcess):2025-07-15-10:34:46.780.54 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(909465,ffff397cefa0,python):2025-07-15-10:34:46.079.980 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53822 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:46.079.962 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53822, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:46.080.181 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:53832, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909465,ffff3a7eefa0,python):2025-07-15-10:34:46.080.211 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53832 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:46.080.221 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 1
2025-07-15 10:34:46,151 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:34:46,151 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:34:46,152 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:34:46,152 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.169.239 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.169.981 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.170.375 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.170.503 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.170.813 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.170.951 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.171.047 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.171.158 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.171.341 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(909469:281473422847680,MainProcess):2025-07-15-10:34:46.171.535 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:46.173.202 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53836, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909469,ffff1dc7efa0,python):2025-07-15-10:34:46.173.202 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53836 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:46.173.267 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 1
2025-07-15 10:34:46,177 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:34:46,178 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:34:46,178 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:34:46,179 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.196.038 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.196.769 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.197.158 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.197.268 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.197.567 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.197.701 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.197.795 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.197.905 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.198.075 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(909473:281473684991680,MainProcess):2025-07-15-10:34:46.198.267 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:46.200.111 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53838, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909473,ffff2d68efa0,python):2025-07-15-10:34:46.200.119 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53838 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:46.200.180 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 1
2025-07-15 10:34:46,288 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:34:46,288 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:34:46,288 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:34:46,289 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.307.530 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.308.269 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.308.668 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.308.780 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.309.081 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.309.216 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.309.311 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.309.422 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.309.589 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(909481:281473862397632,MainProcess):2025-07-15-10:34:46.309.793 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:46.311.637 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53844, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909481,ffff337eefa0,python):2025-07-15-10:34:46.311.645 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53844 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:46.311.698 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 1
2025-07-15 10:34:46,358 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:34:46,359 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:34:46,359 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:34:46,359 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.378.407 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.379.182 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.379.583 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.379.698 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.380.011 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.380.150 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.380.247 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.380.359 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.380.545 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(909477:281473880354496,MainProcess):2025-07-15-10:34:46.380.756 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:46.382.714 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53854, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909477,ffff390aefa0,python):2025-07-15-10:34:46.382.723 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53854 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:46.382.785 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 1
[WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:46.499.663 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:53866, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909457,ffff2ce1efa0,python):2025-07-15-10:34:46.499.678 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53866 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:46.499.728 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 2
2025-07-15 10:34:46,501 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:34:46,502 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty.
2025-07-15 10:34:46,502 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'filepath_prefix', 'processor', 'remove_redundancy', 'resume_by_last_timestamp_ckpt']
2025-07-15 10:34:46,503 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output'
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.521.535 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.522.285 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.522.707 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.522.830 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.523.145 [mindspore/context.py:921] For 'context.set_context', 'matmul_grad_comm_overlap' parameter is deprecated, and will be removed in the next version, Please use 'grad_matmul_communication_overlap' instead.
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.523.287 [mindspore/context.py:1412] For 'context.set_context', the parameter 'memory_optimize_level' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.523.387 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.523.501 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.523.697 [mindspore/context.py:1412] For 'context.set_context', the parameter 'deterministic' will be deprecated and removed in a future version. Please use the api mindspore.set_deterministic() instead.
[WARNING] ME(909485:281473190260416,MainProcess):2025-07-15-10:34:46.523.925 [mindspore/context.py:1412] For 'context.set_context', the parameter 'mempool_block_size' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] DISTRIBUTED(909485,ffff0b7eefa0,python):2025-07-15-10:34:46.525.966 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53870 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:46.525.960 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53870, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:46.526.180 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:53872, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909485,ffff10ecefa0,python):2025-07-15-10:34:46.526.209 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53872 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:46.526.227 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 1
[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:46.567.281 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:46.580.799 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:46.673.474 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:53880, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909469,ffff1ec9efa0,python):2025-07-15-10:34:46.673.501 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53880 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:46.673.516 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:46.700.419 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:53890, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909473,ffff2e6aefa0,python):2025-07-15-10:34:46.700.451 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53890 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:46.700.460 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:46.811.908 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:53894, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909481,ffff38fcefa0,python):2025-07-15-10:34:46.811.944 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53894 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:46.811.950 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:46.883.039 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 22 source: 127.0.0.1:53896, destination: 127.0.0.1:7126
[WARNING] DISTRIBUTED(909477,ffff3a0cefa0,python):2025-07-15-10:34:46.883.066 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53896 to 127.0.0.1:7126 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:46.883.084 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:7126 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:47.000.268 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:47.026.728 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400).
[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:47.067.392 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400).
[WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:47.080.912 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400).
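The repeated "Topology build timed out., retry(n/14400)" records are each worker polling the scheduler at 127.0.0.1:7126 until the full topology is registered, with an upper bound on the number of attempts. A generic sketch of that bounded-retry pattern follows; the poll callback and the sleep interval are assumptions, and only the 14400 limit comes from the log.

```python
# Generic bounded-retry loop, shaped like the BuildCluster retries in the records above.
# `topology_ready` and the sleep interval are assumptions; 14400 is the limit shown in the log.
import time

MAX_RETRIES = 14400

def wait_for_cluster(topology_ready, interval_s=0.5):
    for attempt in range(1, MAX_RETRIES + 1):
        if topology_ready():                     # e.g. the scheduler reports all 8 workers registered
            print("Cluster is successfully initialized.")
            return True
        print(f"Topology build timed out., retry({attempt}/{MAX_RETRIES})")
        time.sleep(interval_s)
    return False
```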
[WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:47.173.986 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400). [WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:47.201.011 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400). [WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:47.312.427 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400). [WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:47.383.711 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400). [WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:47.500.383 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400). [WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:47.526.848 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400). [WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:47.567.494 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/14400). [WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:47.581.012 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/14400). [WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:47.674.091 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400). [WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:47.701.124 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400). [WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:47.812.544 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400). [WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:47.883.845 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:47.883.891 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 5 rank id: 5 [WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:48.000.520 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:48.000.570 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 0 rank id: 0 [WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:48.027.063 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:48.027.146 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 7 rank id: 7 [WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:48.067.622 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. 
[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:48.067.662 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1 [WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:48.081.158 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:48.081.218 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 2 rank id: 2 [WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:48.174.224 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:48.174.268 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 3 rank id: 3 [WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:48.201.250 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:48.201.299 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 4 rank id: 4 [WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:48.312.695 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:48.312.747 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 6 rank id: 6 [WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:49.661.607 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:49.661.848 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(909477,fffee1b1efa0,python):2025-07-15-10:34:49.662.081 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7126, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(909477,fffee1b1efa0,python):2025-07-15-10:34:49.662.187 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(909477,fffee1b1efa0,python):2025-07-15-10:34:49.662.224 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. 
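The repeated HCCL_ADPT warnings above ("Dynamically load symbol HcclSetGlobalCommInfo failed ... not supported in CANN package") come from an optional-symbol lookup that degrades gracefully when the installed CANN/HCCL predates the feature. A generic Python/ctypes illustration of that pattern, not MindSpore's actual C++ code; only the library and symbol names are taken from the log:

```python
# Generic illustration of optional-symbol detection, mirroring the HCCL warning above.
# Not MindSpore's implementation; only the library/symbol names come from the log.
import ctypes

def hccl_supports_set_global_comm_info(plugin_path="libhccl_plugin.so"):
    try:
        lib = ctypes.CDLL(plugin_path)
    except OSError:
        return False  # plugin library not present at all
    # dlsym-style lookup: a missing symbol raises AttributeError, so hasattr() returns
    # False and the caller can skip the feature -- which is why the run above logs the
    # warning and then simply continues.
    return hasattr(lib, "HcclSetGlobalCommInfo")
```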
[WARNING] DEVICE(909477,fffee1b1efa0,python):2025-07-15-10:34:49.662.277 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DEVICE(909477,fffee1b1efa0,python):2025-07-15-10:34:49.662.713 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_comm_lib.cc:251] QueryUniqueID] Retry to lookup the unique id for group hccl_world_group from the meta server node...Retry time: 399/400, sleep 2 2025-07-15 10:34:49,663 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_5.ckpt' 2025-07-15 10:34:49,690 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:34:49,690 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:34:49,690 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:34:49,691 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:34:49,691 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:34:49,691 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:34:49,691 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': True, 'npu_nums_per_device': 2, 'use_gmm': False, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:34:49,692 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:34:49,692 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 
'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:34:49,692 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:34:49,692 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 2025-07-15 10:34:49,693 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:34:49,694 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_5.ckpt' [WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:49.724.070 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:49.724.339 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(909457,fffed4baefa0,python):2025-07-15-10:34:49.724.577 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7126, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(909457,fffed4baefa0,python):2025-07-15-10:34:49.724.669 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(909457,fffed4baefa0,python):2025-07-15-10:34:49.724.731 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. 
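As a quick sanity check, the parallel_config dump above factors exactly into the 8 worker ranks launched for this case; a minimal sketch of that arithmetic, using only values from the log:

```python
# Consistency check for the parallel_config dump above (values copied from the log).
data_parallel = 2
model_parallel = 2
context_parallel = 1
pipeline_stage = 2

# Standard model-parallel accounting: the product of the parallel dimensions
# must equal the number of devices (8 ranks were launched for this test).
assert data_parallel * model_parallel * context_parallel * pipeline_stage == 8
```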
[WARNING] DEVICE(909457,fffed4baefa0,python):2025-07-15-10:34:49.724.763 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group 2025-07-15 10:34:49,725 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' [WARNING] DISTRIBUTED(909457,fffed4baefa0,python):2025-07-15-10:34:49.732.255 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(909457,fffe867cefa0,python):2025-07-15-10:34:49.732.572 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:34:49,753 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:34:49,753 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:34:49,753 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:34:49,753 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:34:49,754 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:34:49,754 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 
2025-07-15 10:34:49,754 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': True, 'npu_nums_per_device': 2, 'use_gmm': False, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:34:49,754 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:34:49,755 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:34:49,755 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:34:49,755 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 
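The moe_config dump above (expert_num 4, num_experts_chosen 2, sigmoid gating, routed_scaling_factor 2.5, no top-k renormalization) describes the router settings. A generic top-k routing sketch under those settings, offered as an illustration only and not as the MindFormers TopkRouterV2 implementation:

```python
# Generic illustration of the routing settings in the moe_config dump above:
# 4 routed experts, top-2 selection per token, sigmoid gating, scores scaled by
# routed_scaling_factor. Shapes and normalization details are assumptions.
import numpy as np

expert_num = 4
num_experts_chosen = 2
routed_scaling_factor = 2.5

def route(token_logits):
    """token_logits: [num_tokens, expert_num] raw router outputs."""
    scores = 1.0 / (1.0 + np.exp(-token_logits))              # sigmoid gating (use_gating_sigmoid)
    topk_idx = np.argsort(-scores, axis=-1)[:, :num_experts_chosen]
    topk_scores = np.take_along_axis(scores, topk_idx, axis=-1)
    return topk_idx, topk_scores * routed_scaling_factor       # norm_topk_prob is False -> no renorm

idx, weights = route(np.random.randn(8, expert_num))
```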
2025-07-15 10:34:49,756 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:34:49,756 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' [WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:49.774.335 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:49.774.613 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(909485,fffeb8baefa0,python):2025-07-15-10:34:49.774.833 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7126, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(909485,fffeb8baefa0,python):2025-07-15-10:34:49.774.917 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(909485,fffeb8baefa0,python):2025-07-15-10:34:49.774.954 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(909485,fffeb8baefa0,python):2025-07-15-10:34:49.775.008 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(909485,fffeb8baefa0,python):2025-07-15-10:34:49.775.452 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(909485,fffe6bffefa0,python):2025-07-15-10:34:49.775.817 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:34:49,776 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_7.ckpt' 2025-07-15 10:34:49,803 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:34:49,803 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:34:49,804 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 
2025-07-15 10:34:49,804 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:34:49,804 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:34:49,804 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:34:49,805 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': True, 'npu_nums_per_device': 2, 'use_gmm': False, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:34:49,805 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:34:49,805 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:34:49,805 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:34:49,806 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 
2025-07-15 10:34:49,807 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:34:49,807 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_7.ckpt' 2025-07-15 10:34:49,837 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:34:49,838 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:34:49,838 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:34:49,838 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:34:49,839 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:34:49,839 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:49,839 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:34:49,840 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:34:49,840 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:49,840 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:34:49,841 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:34:49,841 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:34:49,841 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:34:49,841 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:34:49,841 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 
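The base_trainer.py:204 message above reports the effective global batch size under pipeline parallel; written out with the values from the log:

```python
# Worked form of the global-batch-size message logged above (all values from the log).
batch_size = 1
data_parallel = 2
micro_batch_num = 2
micro_batch_interleave_num = 1

global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num
assert global_batch_size == 4  # 1 * 2 * 2 * 1, matching the value reported by base_trainer.py
```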
2025-07-15 10:34:49,842 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None [WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:49.849.570 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:49.849.823 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(909465,fffee1c6efa0,python):2025-07-15-10:34:49.850.070 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7126, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(909465,fffee1c6efa0,python):2025-07-15-10:34:49.850.171 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(909465,fffee1c6efa0,python):2025-07-15-10:34:49.850.226 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(909465,fffee1c6efa0,python):2025-07-15-10:34:49.850.257 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group 2025-07-15 10:34:49,850 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 [WARNING] DISTRIBUTED(909465,fffee1c6efa0,python):2025-07-15-10:34:49.850.735 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group 2025-07-15 10:34:49,850 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:34:49,850 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 [WARNING] DEVICE(909465,fffee145efa0,python):2025-07-15-10:34:49.851.073 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:34:49,851 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:34:49,851 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_2.ckpt' 2025-07-15 10:34:49,851 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... 2025-07-15 10:34:49,878 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:34:49,878 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 
2025-07-15 10:34:49,878 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:34:49,878 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:34:49,879 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:34:49,879 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:34:49,879 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': True, 'npu_nums_per_device': 2, 'use_gmm': False, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:34:49,880 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:34:49,880 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:34:49,880 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:34:49,880 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 
2025-07-15 10:34:49,881 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:34:49,881 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_2.ckpt' 2025-07-15 10:34:49,894 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:34:49,895 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:34:49,895 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:34:49,895 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:34:49,896 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:34:49,896 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:49,896 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:34:49,897 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:34:49,897 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:49,897 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:34:49,898 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:34:49,898 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:34:49,898 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:34:49,898 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:34:49,898 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 
2025-07-15 10:34:49,899 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:34:49,907 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:34:49,907 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:34:49,907 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:34:49,908 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:34:49,908 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... 2025-07-15 10:34:49,944 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:34:49,945 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:34:49,945 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:34:49,945 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:34:49,946 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:34:49,946 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:49,946 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:34:49,947 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:34:49,947 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:49,948 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:34:49,948 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:34:49,948 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 
2025-07-15 10:34:49,948 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:34:49,948 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:34:49,949 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 2025-07-15 10:34:49,950 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None [WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:49.952.397 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:49.952.626 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(909469,fffec695efa0,python):2025-07-15-10:34:49.952.854 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7126, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(909469,fffec695efa0,python):2025-07-15-10:34:49.952.955 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(909469,fffec695efa0,python):2025-07-15-10:34:49.953.012 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. 
[WARNING] DEVICE(909469,fffec695efa0,python):2025-07-15-10:34:49.953.042 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(909469,fffec695efa0,python):2025-07-15-10:34:49.953.410 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(909469,fffec614efa0,python):2025-07-15-10:34:49.953.729 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:34:49,954 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_3.ckpt' 2025-07-15 10:34:49,957 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:34:49,958 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:34:49,958 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:34:49,958 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:34:49,959 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... [WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:49.963.157 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:49.963.384 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(909473,fffee080efa0,python):2025-07-15-10:34:49.963.614 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7126, node_rank:2130706433, total_rank_size:8, local_rank_size8 [WARNING] HCCL_ADPT(909473,fffee080efa0,python):2025-07-15-10:34:49.963.703 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(909473,fffee080efa0,python):2025-07-15-10:34:49.963.749 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. 
[WARNING] DEVICE(909473,fffee080efa0,python):2025-07-15-10:34:49.963.778 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(909473,fffee080efa0,python):2025-07-15-10:34:49.964.181 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(909473,fffed5b1efa0,python):2025-07-15-10:34:49.964.475 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:34:49,964 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_4.ckpt' 2025-07-15 10:34:49,981 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:34:49,981 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:34:49,981 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:34:49,982 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:34:49,982 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:34:49,982 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 
2025-07-15 10:34:49,982 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': True, 'npu_nums_per_device': 2, 'use_gmm': False, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:34:49,983 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:34:49,983 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:34:49,983 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:34:49,983 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 2025-07-15 10:34:49,984 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:34:49,985 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_3.ckpt' 2025-07-15 10:34:49,991 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty. 2025-07-15 10:34:49,991 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty. 2025-07-15 10:34:49,992 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 
2025-07-15 10:34:49,992 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:34:49,992 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:34:49,992 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:34:49,993 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': True, 'npu_nums_per_device': 2, 'use_gmm': False, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:34:49,993 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:34:49,993 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:34:49,993 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:34:49,994 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 
2025-07-15 10:34:49,994 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:34:49,995 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_4.ckpt' 2025-07-15 10:34:50,026 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:34:50,027 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:34:50,027 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:34:50,027 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:34:50,028 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:34:50,028 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:50,028 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:34:50,029 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:34:50,029 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:50,029 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:34:50,029 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:34:50,030 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:34:50,030 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:34:50,030 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:34:50,030 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 
2025-07-15 10:34:50,031 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None
2025-07-15 10:34:50,038 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6
2025-07-15 10:34:50,039 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1
2025-07-15 10:34:50,039 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6
2025-07-15 10:34:50,039 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train..........
2025-07-15 10:34:50,039 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config..........
[WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:50.055.742 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3, 4, 5, 6, 7}, async: 1, submit_now: 1
[WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:50.055.977 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group
[WARNING] DEVICE(909481,fffee0baefa0,python):2025-07-15-10:34:50.056.214 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:7126, node_rank:2130706433, total_rank_size:8, local_rank_size:8
[WARNING] HCCL_ADPT(909481,fffee0baefa0,python):2025-07-15-10:34:50.056.320 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo
[WARNING] HCCL_ADPT(909481,fffee0baefa0,python):2025-07-15-10:34:50.056.373 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:635] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package.
[WARNING] DEVICE(909481,fffee0baefa0,python):2025-07-15-10:34:50.056.403 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group
[WARNING] DISTRIBUTED(909481,fffee0baefa0,python):2025-07-15-10:34:50.056.774 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group
[WARNING] DEVICE(909481,fffe93ffefa0,python):2025-07-15-10:34:50.057.165 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0
2025-07-15 10:34:50,057 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_6.ckpt'
2025-07-15 10:34:50,084 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config swap_config is empty.
2025-07-15 10:34:50,084 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config metric is empty.
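The DISTRIBUTED/HCCL warnings above come from each worker bringing up the 8-rank hccl_world_group. As a rough sketch of what run_mindformer.py ultimately relies on (illustrative only; the actual setup in this run is driven by the msrun launcher and MindFormers, not by this snippet):

# Sketch, assuming an Ascend environment: initialize the HCCL world group on one rank.
import mindspore as ms
from mindspore.communication import init, get_rank, get_group_size

ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
init("hccl")  # creates hccl_world_group, as logged by collective_manager.cc above
print(get_rank(), get_group_size())  # e.g. this worker's rank out of 8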
2025-07-15 10:34:50,085 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config monitor_config is empty. 2025-07-15 10:34:50,085 - mindformers./output/log[mindformers/tools/register/template.py:683] - WARNING - Some configs in yaml are useless for train: ['auto_tune', 'autotune_per_step', 'eval_callbacks', 'eval_dataset', 'eval_dataset_task', 'filepath_prefix', 'processor'] 2025-07-15 10:34:50,085 - mindformers./output/log[mindformers/trainer/trainer.py:1008] - INFO - Load configs in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/general/run_general_task.yaml to build trainer. 2025-07-15 10:34:50,085 - mindformers./output/log[mindformers/trainer/trainer.py:1044] - INFO - ..........Init Config.......... 2025-07-15 10:34:50,086 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 4, 'capacity_factor': 1.5, 'aux_loss_factor': 0.05, 'num_experts_chosen': 2, 'expert_group_size': 2, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': False, 'enable_sdrop': False, 'use_fused_ops_topkrouter': True, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 3, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': ['expert'], 'aux_loss_factors': [0.0001], 'z_loss_factor': 0.0, 'balance_via_topk_bias': True, 'topk_bias_update_rate': 0.0001, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': 1, 'use_gating_sigmoid': True, 'enable_deredundency': True, 'npu_nums_per_device': 2, 'use_gmm': False, 'enable_gmm_safe_tokens': True, 'use_fused_ops_permute': True, 'callback_moe_droprate': False} 2025-07-15 10:34:50,086 - mindformers./output/log[mindformers/core/parallel_config.py:48] - INFO - initial swap_config from dict: {'swap': False, 'layer_swap': None, 'op_swap': None, 'default_prefetch': 1} 2025-07-15 10:34:50,086 - mindformers./output/log[mindformers/core/parallel_config.py:55] - INFO - initial recompute_config from dict: {'recompute': True, 'select_recompute': False, 'parallel_optimizer_comm_recompute': True, 'select_comm_recompute': False, 'mp_comm_recompute': True, 'recompute_slice_activation': True, 'select_recompute_exclude': False, 'select_comm_recompute_exclude': False} 2025-07-15 10:34:50,086 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 2, 'pipeline_stage': 2, 'micro_batch_num': 2, 'seq_split_num': 1, 'use_seq_parallel': True, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': True, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} 2025-07-15 10:34:50,087 - mindformers./output/log[mindformers/core/parallel_config.py:63] - INFO - pipeline_stage = 2 > 1, vocab_emd_dp will be reset to False. 
2025-07-15 10:34:50,088 - mindformers./output/log[mindformers/tools/utils.py:166] - INFO - set output path to '/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/output' 2025-07-15 10:34:50,088 - mindformers./output/log[mindformers/tools/utils.py:181] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_6.ckpt' 2025-07-15 10:34:50,103 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:34:50,103 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:34:50,104 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:34:50,104 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:34:50,104 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:34:50,105 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:50,105 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:34:50,105 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:34:50,105 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:50,106 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:34:50,106 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:34:50,106 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:34:50,106 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:34:50,107 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:34:50,107 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 
2025-07-15 10:34:50,108 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:34:50,108 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:34:50,108 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:34:50,109 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 2025-07-15 10:34:50,109 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:34:50,109 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:34:50,109 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:50,110 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:34:50,110 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:34:50,110 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:50,111 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:34:50,111 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:34:50,111 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:34:50,111 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:34:50,111 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:34:50,112 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 
2025-07-15 10:34:50,113 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:34:50,114 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:34:50,115 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:34:50,115 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:34:50,115 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:34:50,115 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... 2025-07-15 10:34:50,119 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:34:50,120 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:34:50,120 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:34:50,120 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:34:50,120 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... [WARNING] DISTRIBUTED(909477,fffee1b1efa0,python):2025-07-15-10:34:50.163.303 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(909477,fffe937eefa0,python):2025-07-15-10:34:50.163.809 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-15 10:34:50,193 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:34:50,198 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:34:50,212 - mindformers./output/log[mindformers/trainer/base_trainer.py:107] - INFO - host_name: ascend213, host_ip: 121.37.54.128 2025-07-15 10:34:50,213 - mindformers./output/log[mindformers/trainer/base_trainer.py:113] - INFO - Now Running Task is: text_generation, Model is: deepseekV3 2025-07-15 10:34:50,213 - mindformers./output/log[mindformers/trainer/base_trainer.py:143] - WARNING - Input model name is not in the supported list or unspecified. 
2025-07-15 10:34:50,214 - mindformers./output/log[mindformers/trainer/base_trainer.py:144] - WARNING - See the list of supported task and model name: ['codellama_34b', 'common', 'deepseek1_5_7b', 'deepseek_33b', 'glm3_6b', 'glm4_9b', 'gpt2', 'gpt2_13b', 'gpt2_52b', 'gpt2_lora', 'gpt2_xl', 'gpt2_xl_lora', 'internlm_7b', 'internlm_7b_lora', 'llama2_13b', 'llama2_70b', 'llama2_7b', 'llama2_7b_lora', 'llama_7b_slora', 'yi_34b', 'yi_6b'] 2025-07-15 10:34:50,214 - mindformers./output/log[mindformers/trainer/base_trainer.py:145] - WARNING - The default model config: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/configs/gpt2/run_gpt2.yaml will now be used for the text_generation task 2025-07-15 10:34:50,214 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:50,215 - mindformers./output/log[mindformers/trainer/trainer.py:323] - INFO - ==========Trainer Init Success!========== 2025-07-15 10:34:50,215 - mindformers./output/log[mindformers/trainer/trainer.py:406] - WARNING - sink_size will not be able to set in a future release. Modifying sink_size may cause functional issues when resuming training from a checkpoint. 2025-07-15 10:34:50,215 - mindformers./output/log[mindformers/trainer/trainer.py:1117] - INFO - ..........Init Model.......... 2025-07-15 10:34:50,216 - mindformers./output/log[mindformers/trainer/base_trainer.py:204] - INFO - Pipeline parallel was opened: pipeline_stages = 2, full batch is True, gradient_accumulation_steps will not take effect in pipeline parallel, global batch size will be changed: global_batch_size = batch_size * data_parallel * micro_batch_num * micro_batch_interleave_num = 4 = 1 * 2 * 2 * 1). 2025-07-15 10:34:50,216 - mindformers./output/log[mindformers/trainer/base_trainer.py:338] - WARNING - When using the pipeline parallel mode, the MFPipelineWithLossScaleCell class is used by default. 2025-07-15 10:34:50,216 - mindformers./output/log[mindformers/trainer/base_trainer.py:346] - INFO - PipelineWrapper under evaluate or predict mode will not take effect. 2025-07-15 10:34:50,216 - mindformers./output/log[mindformers/trainer/base_trainer.py:920] - INFO - .........Build Dataset For Train.......... 2025-07-15 10:34:50,216 - mindformers./output/log[mindformers/trainer/base_trainer.py:464] - INFO - .........Build Dataset From Config.......... 2025-07-15 10:34:50,217 - mindformers./output/log[mindformers/dataset/causal_language_model_dataset.py:302] - INFO - Now Create Causal Language Model Dataset. 2025-07-15 10:34:50,218 - mindformers./output/log[mindformers/dataset/base_dataset.py:83] - INFO - Now dataset_strategy is full_batch, shard_id: None, num_shards: None 2025-07-15 10:34:50,224 - mindformers./output/log[mindformers/trainer/base_trainer.py:924] - INFO - Create train dataset finish, dataset size:6 2025-07-15 10:34:50,224 - mindformers./output/log[mindformers/trainer/utils.py:176] - INFO - Will be Training epochs:1, sink_size:1 2025-07-15 10:34:50,225 - mindformers./output/log[mindformers/trainer/utils.py:178] - INFO - Create training dataset finish, dataset size:6 2025-07-15 10:34:50,225 - mindformers./output/log[mindformers/trainer/base_trainer.py:971] - INFO - .........Build Net For Train.......... 2025-07-15 10:34:50,225 - mindformers./output/log[mindformers/trainer/base_trainer.py:498] - INFO - .........Build Network From Config.......... 
2025-07-15 10:34:50,235 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on.
2025-07-15 10:34:50,240 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention.
2025-07-15 10:34:50,285 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on.
2025-07-15 10:34:50,290 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention.
2025-07-15 10:34:50,367 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on.
2025-07-15 10:34:50,372 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention.
2025-07-15 10:34:50,373 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[1, 1], [1, 1]]
2025-07-15 10:34:50,373 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[1, 2], [3, 4]]
2025-07-15 10:34:50,373 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 1, 0, 1]
2025-07-15 10:34:50,373 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 1, 1]
2025-07-15 10:34:50,374 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[1, 1], [1, 1]]
2025-07-15 10:34:50,374 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority.
2025-07-15 10:34:50,374 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0], [0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0], [0, 0]]}
2025-07-15 10:34:50,374 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0], [0, 0]]}
2025-07-15 10:34:50,374 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {}
2025-07-15 10:34:50,374 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {}
2025-07-15 10:34:50,394 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block.
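The stage/interleave bookkeeping printed by models/utils.py above (4 layers, 2 pipeline stages, 2 interleaves, one layer per bucket) is plain cumulative counting. A purely illustrative reconstruction that reproduces the logged lists (not MindFormers code):

# Illustrative: rebuild the logged layer-to-stage lists for num_layers=4, pp=2, pp_interleave=2.
num_layers_per_stage = [[1, 1], [1, 1]]  # [interleave][stage] -> layer count, as logged

accumulated, total = [], 0
for per_stage in num_layers_per_stage:
    acc = []
    for n in per_stage:
        total += n
        acc.append(total)
    accumulated.append(acc)
assert accumulated == [[1, 2], [3, 4]]  # "Accumulated num_layers per stage"

# Expanding buckets to per-layer ids works here because each bucket holds exactly one layer.
stage_ids = [stage for _ in num_layers_per_stage for stage in range(2)]
interleave_ids = [i for i, per_stage in enumerate(num_layers_per_stage) for _ in per_stage]
assert stage_ids == [0, 1, 0, 1]       # "Pipeline id list with start_stage"
assert interleave_ids == [0, 0, 1, 1]  # "Interleave id list"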
2025-07-15 10:34:50,402 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:34:50,418 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[1, 1], [1, 1]] 2025-07-15 10:34:50,418 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[1, 2], [3, 4]] 2025-07-15 10:34:50,418 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 1, 0, 1] 2025-07-15 10:34:50,418 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 1, 1] 2025-07-15 10:34:50,419 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[1, 1], [1, 1]] 2025-07-15 10:34:50,419 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-15 10:34:50,419 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0], [0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,419 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,419 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:34:50,419 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:34:50,420 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,427 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:34:50,432 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:34:50,439 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:34:50,441 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 2025-07-15 10:34:50,446 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 
2025-07-15 10:34:50,447 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:34:50,456 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 1 2025-07-15 10:34:50,463 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[1, 1], [1, 1]] 2025-07-15 10:34:50,464 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[1, 2], [3, 4]] 2025-07-15 10:34:50,464 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 1, 0, 1] 2025-07-15 10:34:50,464 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 1, 1] 2025-07-15 10:34:50,464 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[1, 1], [1, 1]] 2025-07-15 10:34:50,464 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-15 10:34:50,465 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0], [0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,465 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,465 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:34:50,465 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:34:50,465 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,476 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,484 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:34:50,492 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:34:50,501 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 1 2025-07-15 10:34:50,510 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 2 2025-07-15 10:34:50,511 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,520 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,530 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,546 - mindformers./output/log[mindformers/version_control.py:140] - INFO - The Lazy Inline compilation acceleration feature is turned on. 
2025-07-15 10:34:50,547 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[1, 1], [1, 1]] 2025-07-15 10:34:50,547 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[1, 2], [3, 4]] 2025-07-15 10:34:50,547 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 1, 0, 1] 2025-07-15 10:34:50,547 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 1, 1] 2025-07-15 10:34:50,547 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 1 2025-07-15 10:34:50,547 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[1, 1], [1, 1]] 2025-07-15 10:34:50,548 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-15 10:34:50,548 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0], [0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,548 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,548 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:34:50,548 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:34:50,551 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1216] - INFO - Enable flash attention. 2025-07-15 10:34:50,554 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 2 2025-07-15 10:34:50,563 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,567 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,567 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:34:50,569 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,572 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:134] - INFO - Using 2 data parallel, 1 context parallel and 2 model parallel for the embedding lookup. 2025-07-15 10:34:50,573 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,575 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:34:50,579 - mindformers./output/log[mindformers/models/modeling_utils.py:1494] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-15 10:34:50,580 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1643] - INFO - Predict run mode:False 2025-07-15 10:34:50,588 - mindformers./output/log[mindformers/trainer/base_trainer.py:715] - INFO - Network Parameters: 817 M. 
2025-07-15 10:34:50,588 - mindformers./output/log[mindformers/trainer/base_trainer.py:1010] - INFO - .........Build Optimizer For Train.......... 2025-07-15 10:34:50,589 - mindformers./output/log[mindformers/trainer/base_trainer.py:581] - INFO - .........Build Optimizer From Config.......... 2025-07-15 10:34:50,589 - mindformers./output/log[mindformers/trainer/base_trainer.py:628] - INFO - .........Build LR Schedule From Config.......... 2025-07-15 10:34:50,591 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:77] - WARNING - dynamic_lr_schedule will be reset and invalid when layer_scale is False. 2025-07-15 10:34:50,592 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:116] - INFO - Param groups = { "decay": { "weight_decay": 0.1, "params": [ "model.tok_embeddings.embedding_weight", "model.layers.0.attention.q2l_proj.weight", "model.layers.0.attention.l2q_nope_proj.weight", "model.layers.0.attention.l2q_pe_proj.weight", "model.layers.0.attention.kv2l_k_pe.weight", "model.layers.0.attention.kv2l_latent_kv.weight", "model.layers.0.attention.lkv2kv_k_nope.weight", "model.layers.0.attention.lkv2kv_v.weight", "model.layers.0.attention.wo.weight", "model.layers.0.feed_forward.w1.weight", "model.layers.0.feed_forward.w2.weight", "model.layers.0.feed_forward.w3.weight", "model.layers.1.attention.q2l_proj.weight", "model.layers.1.attention.l2q_nope_proj.weight", "model.layers.1.attention.l2q_pe_proj.weight", "model.layers.1.attention.kv2l_k_pe.weight", "model.layers.1.attention.kv2l_latent_kv.weight", "model.layers.1.attention.lkv2kv_k_nope.weight", "model.layers.1.attention.lkv2kv_v.weight", "model.layers.1.attention.wo.weight", "model.layers.1.feed_forward.routed_experts.ffn.w1.weight", "model.layers.1.feed_forward.routed_experts.ffn.w2.weight", "model.layers.1.feed_forward.routed_experts.ffn.w3.weight", "model.layers.1.feed_forward.routed_experts.router.dense.weight", "model.layers.1.feed_forward.shared_experts.w1.weight", "model.layers.1.feed_forward.shared_experts.w2.weight", "model.layers.1.feed_forward.shared_experts.w3.weight", "model.layers.2.attention.q2l_proj.weight", "model.layers.2.attention.l2q_nope_proj.weight", "model.layers.2.attention.l2q_pe_proj.weight", "model.layers.2.attention.kv2l_k_pe.weight", "model.layers.2.attention.kv2l_latent_kv.weight", "model.layers.2.attention.lkv2kv_k_nope.weight", "model.layers.2.attention.lkv2kv_v.weight", "model.layers.2.attention.wo.weight", "model.layers.2.feed_forward.routed_experts.ffn.w1.weight", "model.layers.2.feed_forward.routed_experts.ffn.w2.weight", "model.layers.2.feed_forward.routed_experts.ffn.w3.weight", "model.layers.2.feed_forward.routed_experts.router.dense.weight", "model.layers.2.feed_forward.shared_experts.w1.weight", "model.layers.2.feed_forward.shared_experts.w2.weight", "model.layers.2.feed_forward.shared_experts.w3.weight", "model.layers.3.attention.q2l_proj.weight", "model.layers.3.attention.l2q_nope_proj.weight", "model.layers.3.attention.l2q_pe_proj.weight", "model.layers.3.attention.kv2l_k_pe.weight", "model.layers.3.attention.kv2l_latent_kv.weight", "model.layers.3.attention.lkv2kv_k_nope.weight", "model.layers.3.attention.lkv2kv_v.weight", "model.layers.3.attention.wo.weight", "model.layers.3.feed_forward.routed_experts.ffn.w1.weight", "model.layers.3.feed_forward.routed_experts.ffn.w2.weight", "model.layers.3.feed_forward.routed_experts.ffn.w3.weight", "model.layers.3.feed_forward.routed_experts.router.dense.weight", 
"model.layers.3.feed_forward.shared_experts.w1.weight", "model.layers.3.feed_forward.shared_experts.w2.weight", "model.layers.3.feed_forward.shared_experts.w3.weight", "model.mtp_hidden_fusers.0.dense.weight", "lm_head.weight" ] }, "no_decay": { "weight_decay": 0.0, "params": [ "model.layers.0.ffn_norm.weight", "model.layers.0.attention_norm.weight", "model.layers.0.attention.lq_norm.weight", "model.layers.0.attention.lkv_norm.weight", "model.layers.1.ffn_norm.weight", "model.layers.1.attention_norm.weight", "model.layers.1.attention.lq_norm.weight", "model.layers.1.attention.lkv_norm.weight", "model.layers.2.ffn_norm.weight", "model.layers.2.attention_norm.weight", "model.layers.2.attention.lq_norm.weight", "model.layers.2.attention.lkv_norm.weight", "model.layers.3.ffn_norm.weight", "model.layers.3.attention_norm.weight", "model.layers.3.attention.lq_norm.weight", "model.layers.3.attention.lkv_norm.weight", "model.mtp_hidden_fusers.0.norm.weight", "model.mtp_hidden_fusers.0.norm_emb.weight", "model.mtp_norms.0.weight", "model.norm_out.weight" ] } } 2025-07-15 10:34:50,594 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,601 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 2 2025-07-15 10:34:50,607 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,611 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[1, 1], [1, 1]] 2025-07-15 10:34:50,611 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[1, 2], [3, 4]] 2025-07-15 10:34:50,612 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 1, 0, 1] 2025-07-15 10:34:50,612 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 1, 1] 2025-07-15 10:34:50,612 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[1, 1], [1, 1]] 2025-07-15 10:34:50,612 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-15 10:34:50,612 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,612 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0], [0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,612 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,613 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:34:50,613 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:34:50,615 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:134] - INFO - Using 2 data parallel, 1 context parallel and 2 model parallel for the embedding lookup. 
2025-07-15 10:34:50,621 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,623 - mindformers./output/log[mindformers/models/modeling_utils.py:1494] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-15 10:34:50,623 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1643] - INFO - Predict run mode:False 2025-07-15 10:34:50,626 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[1, 1], [1, 1]] 2025-07-15 10:34:50,626 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[1, 2], [3, 4]] 2025-07-15 10:34:50,626 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 1, 0, 1] 2025-07-15 10:34:50,626 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 1, 1] 2025-07-15 10:34:50,627 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[1, 1], [1, 1]] 2025-07-15 10:34:50,627 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-15 10:34:50,627 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0], [0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,627 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,627 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:34:50,627 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:34:50,628 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 1 2025-07-15 10:34:50,631 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:34:50,631 - mindformers./output/log[mindformers/trainer/base_trainer.py:715] - INFO - Network Parameters: 817 M. 2025-07-15 10:34:50,632 - mindformers./output/log[mindformers/trainer/base_trainer.py:1010] - INFO - .........Build Optimizer For Train.......... 2025-07-15 10:34:50,632 - mindformers./output/log[mindformers/trainer/base_trainer.py:581] - INFO - .........Build Optimizer From Config.......... 2025-07-15 10:34:50,632 - mindformers./output/log[mindformers/trainer/base_trainer.py:628] - INFO - .........Build LR Schedule From Config.......... 2025-07-15 10:34:50,634 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:77] - WARNING - dynamic_lr_schedule will be reset and invalid when layer_scale is False. 
2025-07-15 10:34:50,636 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:116] - INFO - Param groups = { "decay": { "weight_decay": 0.1, "params": [ "model.tok_embeddings.embedding_weight", "model.layers.0.attention.q2l_proj.weight", "model.layers.0.attention.l2q_nope_proj.weight", "model.layers.0.attention.l2q_pe_proj.weight", "model.layers.0.attention.kv2l_k_pe.weight", "model.layers.0.attention.kv2l_latent_kv.weight", "model.layers.0.attention.lkv2kv_k_nope.weight", "model.layers.0.attention.lkv2kv_v.weight", "model.layers.0.attention.wo.weight", "model.layers.0.feed_forward.w1.weight", "model.layers.0.feed_forward.w2.weight", "model.layers.0.feed_forward.w3.weight", "model.layers.1.attention.q2l_proj.weight", "model.layers.1.attention.l2q_nope_proj.weight", "model.layers.1.attention.l2q_pe_proj.weight", "model.layers.1.attention.kv2l_k_pe.weight", "model.layers.1.attention.kv2l_latent_kv.weight", "model.layers.1.attention.lkv2kv_k_nope.weight", "model.layers.1.attention.lkv2kv_v.weight", "model.layers.1.attention.wo.weight", "model.layers.1.feed_forward.routed_experts.ffn.w1.weight", "model.layers.1.feed_forward.routed_experts.ffn.w2.weight", "model.layers.1.feed_forward.routed_experts.ffn.w3.weight", "model.layers.1.feed_forward.routed_experts.router.dense.weight", "model.layers.1.feed_forward.shared_experts.w1.weight", "model.layers.1.feed_forward.shared_experts.w2.weight", "model.layers.1.feed_forward.shared_experts.w3.weight", "model.layers.2.attention.q2l_proj.weight", "model.layers.2.attention.l2q_nope_proj.weight", "model.layers.2.attention.l2q_pe_proj.weight", "model.layers.2.attention.kv2l_k_pe.weight", "model.layers.2.attention.kv2l_latent_kv.weight", "model.layers.2.attention.lkv2kv_k_nope.weight", "model.layers.2.attention.lkv2kv_v.weight", "model.layers.2.attention.wo.weight", "model.layers.2.feed_forward.routed_experts.ffn.w1.weight", "model.layers.2.feed_forward.routed_experts.ffn.w2.weight", "model.layers.2.feed_forward.routed_experts.ffn.w3.weight", "model.layers.2.feed_forward.routed_experts.router.dense.weight", "model.layers.2.feed_forward.shared_experts.w1.weight", "model.layers.2.feed_forward.shared_experts.w2.weight", "model.layers.2.feed_forward.shared_experts.w3.weight", "model.layers.3.attention.q2l_proj.weight", "model.layers.3.attention.l2q_nope_proj.weight", "model.layers.3.attention.l2q_pe_proj.weight", "model.layers.3.attention.kv2l_k_pe.weight", "model.layers.3.attention.kv2l_latent_kv.weight", "model.layers.3.attention.lkv2kv_k_nope.weight", "model.layers.3.attention.lkv2kv_v.weight", "model.layers.3.attention.wo.weight", "model.layers.3.feed_forward.routed_experts.ffn.w1.weight", "model.layers.3.feed_forward.routed_experts.ffn.w2.weight", "model.layers.3.feed_forward.routed_experts.ffn.w3.weight", "model.layers.3.feed_forward.routed_experts.router.dense.weight", "model.layers.3.feed_forward.shared_experts.w1.weight", "model.layers.3.feed_forward.shared_experts.w2.weight", "model.layers.3.feed_forward.shared_experts.w3.weight", "model.mtp_hidden_fusers.0.dense.weight", "lm_head.weight" ] }, "no_decay": { "weight_decay": 0.0, "params": [ "model.layers.0.ffn_norm.weight", "model.layers.0.attention_norm.weight", "model.layers.0.attention.lq_norm.weight", "model.layers.0.attention.lkv_norm.weight", "model.layers.1.ffn_norm.weight", "model.layers.1.attention_norm.weight", "model.layers.1.attention.lq_norm.weight", "model.layers.1.attention.lkv_norm.weight", "model.layers.2.ffn_norm.weight", "model.layers.2.attention_norm.weight", 
"model.layers.2.attention.lq_norm.weight", "model.layers.2.attention.lkv_norm.weight", "model.layers.3.ffn_norm.weight", "model.layers.3.attention_norm.weight", "model.layers.3.attention.lq_norm.weight", "model.layers.3.attention.lkv_norm.weight", "model.mtp_hidden_fusers.0.norm.weight", "model.mtp_hidden_fusers.0.norm_emb.weight", "model.mtp_norms.0.weight", "model.norm_out.weight" ] } } 2025-07-15 10:34:50,639 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:34:50,646 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:34:50,647 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,654 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:34:50,655 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,657 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,658 - mindformers./output/log[mindformers/trainer/base_trainer.py:1019] - INFO - .........Build Running Wrapper From Config For Train.......... 2025-07-15 10:34:50,658 - mindformers./output/log[mindformers/trainer/base_trainer.py:665] - INFO - .........Build Model Wrapper for Train From Config.......... 2025-07-15 10:34:50,661 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,664 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:134] - INFO - Using 2 data parallel, 1 context parallel and 2 model parallel for the embedding lookup. 2025-07-15 10:34:50,672 - mindformers./output/log[mindformers/models/modeling_utils.py:1494] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-15 10:34:50,672 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1643] - INFO - Predict run mode:False 2025-07-15 10:34:50,672 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,680 - mindformers./output/log[mindformers/trainer/base_trainer.py:715] - INFO - Network Parameters: 817 M. 2025-07-15 10:34:50,681 - mindformers./output/log[mindformers/trainer/base_trainer.py:1010] - INFO - .........Build Optimizer For Train.......... 2025-07-15 10:34:50,681 - mindformers./output/log[mindformers/trainer/base_trainer.py:581] - INFO - .........Build Optimizer From Config.......... 2025-07-15 10:34:50,681 - mindformers./output/log[mindformers/trainer/base_trainer.py:628] - INFO - .........Build LR Schedule From Config.......... 2025-07-15 10:34:50,681 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 2 2025-07-15 10:34:50,683 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:77] - WARNING - dynamic_lr_schedule will be reset and invalid when layer_scale is False. 
2025-07-15 10:34:50,685 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:116] - INFO - Param groups = { "decay": { "weight_decay": 0.1, "params": [ "model.tok_embeddings.embedding_weight", "model.layers.0.attention.q2l_proj.weight", "model.layers.0.attention.l2q_nope_proj.weight", "model.layers.0.attention.l2q_pe_proj.weight", "model.layers.0.attention.kv2l_k_pe.weight", "model.layers.0.attention.kv2l_latent_kv.weight", "model.layers.0.attention.lkv2kv_k_nope.weight", "model.layers.0.attention.lkv2kv_v.weight", "model.layers.0.attention.wo.weight", "model.layers.0.feed_forward.w1.weight", "model.layers.0.feed_forward.w2.weight", "model.layers.0.feed_forward.w3.weight", "model.layers.1.attention.q2l_proj.weight", "model.layers.1.attention.l2q_nope_proj.weight", "model.layers.1.attention.l2q_pe_proj.weight", "model.layers.1.attention.kv2l_k_pe.weight", "model.layers.1.attention.kv2l_latent_kv.weight", "model.layers.1.attention.lkv2kv_k_nope.weight", "model.layers.1.attention.lkv2kv_v.weight", "model.layers.1.attention.wo.weight", "model.layers.1.feed_forward.routed_experts.ffn.w1.weight", "model.layers.1.feed_forward.routed_experts.ffn.w2.weight", "model.layers.1.feed_forward.routed_experts.ffn.w3.weight", "model.layers.1.feed_forward.routed_experts.router.dense.weight", "model.layers.1.feed_forward.shared_experts.w1.weight", "model.layers.1.feed_forward.shared_experts.w2.weight", "model.layers.1.feed_forward.shared_experts.w3.weight", "model.layers.2.attention.q2l_proj.weight", "model.layers.2.attention.l2q_nope_proj.weight", "model.layers.2.attention.l2q_pe_proj.weight", "model.layers.2.attention.kv2l_k_pe.weight", "model.layers.2.attention.kv2l_latent_kv.weight", "model.layers.2.attention.lkv2kv_k_nope.weight", "model.layers.2.attention.lkv2kv_v.weight", "model.layers.2.attention.wo.weight", "model.layers.2.feed_forward.routed_experts.ffn.w1.weight", "model.layers.2.feed_forward.routed_experts.ffn.w2.weight", "model.layers.2.feed_forward.routed_experts.ffn.w3.weight", "model.layers.2.feed_forward.routed_experts.router.dense.weight", "model.layers.2.feed_forward.shared_experts.w1.weight", "model.layers.2.feed_forward.shared_experts.w2.weight", "model.layers.2.feed_forward.shared_experts.w3.weight", "model.layers.3.attention.q2l_proj.weight", "model.layers.3.attention.l2q_nope_proj.weight", "model.layers.3.attention.l2q_pe_proj.weight", "model.layers.3.attention.kv2l_k_pe.weight", "model.layers.3.attention.kv2l_latent_kv.weight", "model.layers.3.attention.lkv2kv_k_nope.weight", "model.layers.3.attention.lkv2kv_v.weight", "model.layers.3.attention.wo.weight", "model.layers.3.feed_forward.routed_experts.ffn.w1.weight", "model.layers.3.feed_forward.routed_experts.ffn.w2.weight", "model.layers.3.feed_forward.routed_experts.ffn.w3.weight", "model.layers.3.feed_forward.routed_experts.router.dense.weight", "model.layers.3.feed_forward.shared_experts.w1.weight", "model.layers.3.feed_forward.shared_experts.w2.weight", "model.layers.3.feed_forward.shared_experts.w3.weight", "model.mtp_hidden_fusers.0.dense.weight", "lm_head.weight" ] }, "no_decay": { "weight_decay": 0.0, "params": [ "model.layers.0.ffn_norm.weight", "model.layers.0.attention_norm.weight", "model.layers.0.attention.lq_norm.weight", "model.layers.0.attention.lkv_norm.weight", "model.layers.1.ffn_norm.weight", "model.layers.1.attention_norm.weight", "model.layers.1.attention.lq_norm.weight", "model.layers.1.attention.lkv_norm.weight", "model.layers.2.ffn_norm.weight", "model.layers.2.attention_norm.weight", 
"model.layers.2.attention.lq_norm.weight", "model.layers.2.attention.lkv_norm.weight", "model.layers.3.ffn_norm.weight", "model.layers.3.attention_norm.weight", "model.layers.3.attention.lq_norm.weight", "model.layers.3.attention.lkv_norm.weight", "model.mtp_hidden_fusers.0.norm.weight", "model.mtp_hidden_fusers.0.norm_emb.weight", "model.mtp_norms.0.weight", "model.norm_out.weight" ] } } 2025-07-15 10:34:50,691 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 1 [WARNING] DISTRIBUTED(909477,ffffbea6eec0,python):2025-07-15-10:34:50.694.428 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: 12426c956d1bc5017082b12a969b0b7c [const vector]{1, 5}, async: 0, submit_now: 1 2025-07-15 10:34:50,701 - mindformers./output/log[mindformers/trainer/base_trainer.py:1019] - INFO - .........Build Running Wrapper From Config For Train.......... 2025-07-15 10:34:50,701 - mindformers./output/log[mindformers/trainer/base_trainer.py:665] - INFO - .........Build Model Wrapper for Train From Config.......... 2025-07-15 10:34:50,701 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,707 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 1 2025-07-15 10:34:50,709 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,726 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,727 - mindformers./output/log[mindformers/models/utils.py:190] - INFO - num_layers per stage: [[1, 1], [1, 1]] 2025-07-15 10:34:50,727 - mindformers./output/log[mindformers/models/utils.py:191] - INFO - Accumulated num_layers per stage: [[1, 2], [3, 4]] 2025-07-15 10:34:50,727 - mindformers./output/log[mindformers/models/utils.py:193] - INFO - Pipeline id list with start_stage: [0, 1, 0, 1] 2025-07-15 10:34:50,727 - mindformers./output/log[mindformers/models/utils.py:194] - INFO - Interleave id list: [0, 0, 1, 1] 2025-07-15 10:34:50,728 - mindformers./output/log[mindformers/models/utils.py:212] - INFO - Formative layer_recompute: [[1, 1], [1, 1]] 2025-07-15 10:34:50,728 - mindformers./output/log[mindformers/models/utils.py:214] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 
2025-07-15 10:34:50,728 - mindformers./output/log[mindformers/models/utils.py:220] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0, 0], [0, 0]], 'feed_forward\\.w1\\.activation\\.silu': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,728 - mindformers./output/log[mindformers/models/utils.py:221] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0, 0], [0, 0]]} 2025-07-15 10:34:50,728 - mindformers./output/log[mindformers/models/utils.py:222] - INFO - Formative select_recompute_exclude: {} 2025-07-15 10:34:50,728 - mindformers./output/log[mindformers/models/utils.py:223] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-15 10:34:50,735 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 [WARNING] DISTRIBUTED(909457,ffffb178eec0,python):2025-07-15-10:34:50.737.318 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: cb4ececddcb4517ca0bcddafd23813b9 [const vector]{0, 4}, async: 0, submit_now: 1 2025-07-15 10:34:50,741 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,742 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 2 2025-07-15 10:34:50,743 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:134] - INFO - Using 2 data parallel, 1 context parallel and 2 model parallel for the embedding lookup. 2025-07-15 10:34:50,747 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1042] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-15 10:34:50,750 - mindformers./output/log[mindformers/trainer/base_trainer.py:1019] - INFO - .........Build Running Wrapper From Config For Train.......... 2025-07-15 10:34:50,751 - mindformers./output/log[mindformers/trainer/base_trainer.py:665] - INFO - .........Build Model Wrapper for Train From Config.......... 2025-07-15 10:34:50,751 - mindformers./output/log[mindformers/models/modeling_utils.py:1494] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-15 10:34:50,751 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1643] - INFO - Predict run mode:False 2025-07-15 10:34:50,755 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 0 2025-07-15 10:34:50,759 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 2 2025-07-15 10:34:50,760 - mindformers./output/log[mindformers/trainer/base_trainer.py:715] - INFO - Network Parameters: 817 M. 2025-07-15 10:34:50,760 - mindformers./output/log[mindformers/trainer/base_trainer.py:1010] - INFO - .........Build Optimizer For Train.......... 2025-07-15 10:34:50,760 - mindformers./output/log[mindformers/trainer/base_trainer.py:581] - INFO - .........Build Optimizer From Config.......... 2025-07-15 10:34:50,760 - mindformers./output/log[mindformers/trainer/base_trainer.py:628] - INFO - .........Build LR Schedule From Config.......... 
2025-07-15 10:34:50,761 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,763 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:77] - WARNING - dynamic_lr_schedule will be reset and invalid when layer_scale is False. 2025-07-15 10:34:50,764 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:116] - INFO - Param groups = { "decay": { "weight_decay": 0.1, "params": [ "model.tok_embeddings.embedding_weight", "model.layers.0.attention.q2l_proj.weight", "model.layers.0.attention.l2q_nope_proj.weight", "model.layers.0.attention.l2q_pe_proj.weight", "model.layers.0.attention.kv2l_k_pe.weight", "model.layers.0.attention.kv2l_latent_kv.weight", "model.layers.0.attention.lkv2kv_k_nope.weight", "model.layers.0.attention.lkv2kv_v.weight", "model.layers.0.attention.wo.weight", "model.layers.0.feed_forward.w1.weight", "model.layers.0.feed_forward.w2.weight", "model.layers.0.feed_forward.w3.weight", "model.layers.1.attention.q2l_proj.weight", "model.layers.1.attention.l2q_nope_proj.weight", "model.layers.1.attention.l2q_pe_proj.weight", "model.layers.1.attention.kv2l_k_pe.weight", "model.layers.1.attention.kv2l_latent_kv.weight", "model.layers.1.attention.lkv2kv_k_nope.weight", "model.layers.1.attention.lkv2kv_v.weight", "model.layers.1.attention.wo.weight", "model.layers.1.feed_forward.routed_experts.ffn.w1.weight", "model.layers.1.feed_forward.routed_experts.ffn.w2.weight", "model.layers.1.feed_forward.routed_experts.ffn.w3.weight", "model.layers.1.feed_forward.routed_experts.router.dense.weight", "model.layers.1.feed_forward.shared_experts.w1.weight", "model.layers.1.feed_forward.shared_experts.w2.weight", "model.layers.1.feed_forward.shared_experts.w3.weight", "model.layers.2.attention.q2l_proj.weight", "model.layers.2.attention.l2q_nope_proj.weight", "model.layers.2.attention.l2q_pe_proj.weight", "model.layers.2.attention.kv2l_k_pe.weight", "model.layers.2.attention.kv2l_latent_kv.weight", "model.layers.2.attention.lkv2kv_k_nope.weight", "model.layers.2.attention.lkv2kv_v.weight", "model.layers.2.attention.wo.weight", "model.layers.2.feed_forward.routed_experts.ffn.w1.weight", "model.layers.2.feed_forward.routed_experts.ffn.w2.weight", "model.layers.2.feed_forward.routed_experts.ffn.w3.weight", "model.layers.2.feed_forward.routed_experts.router.dense.weight", "model.layers.2.feed_forward.shared_experts.w1.weight", "model.layers.2.feed_forward.shared_experts.w2.weight", "model.layers.2.feed_forward.shared_experts.w3.weight", "model.layers.3.attention.q2l_proj.weight", "model.layers.3.attention.l2q_nope_proj.weight", "model.layers.3.attention.l2q_pe_proj.weight", "model.layers.3.attention.kv2l_k_pe.weight", "model.layers.3.attention.kv2l_latent_kv.weight", "model.layers.3.attention.lkv2kv_k_nope.weight", "model.layers.3.attention.lkv2kv_v.weight", "model.layers.3.attention.wo.weight", "model.layers.3.feed_forward.routed_experts.ffn.w1.weight", "model.layers.3.feed_forward.routed_experts.ffn.w2.weight", "model.layers.3.feed_forward.routed_experts.ffn.w3.weight", "model.layers.3.feed_forward.routed_experts.router.dense.weight", "model.layers.3.feed_forward.shared_experts.w1.weight", "model.layers.3.feed_forward.shared_experts.w2.weight", "model.layers.3.feed_forward.shared_experts.w3.weight", "model.mtp_hidden_fusers.0.dense.weight", "lm_head.weight" ] }, "no_decay": { "weight_decay": 0.0, "params": [ "model.layers.0.ffn_norm.weight", 
"model.layers.0.attention_norm.weight", "model.layers.0.attention.lq_norm.weight", "model.layers.0.attention.lkv_norm.weight", "model.layers.1.ffn_norm.weight", "model.layers.1.attention_norm.weight", "model.layers.1.attention.lq_norm.weight", "model.layers.1.attention.lkv_norm.weight", "model.layers.2.ffn_norm.weight", "model.layers.2.attention_norm.weight", "model.layers.2.attention.lq_norm.weight", "model.layers.2.attention.lkv_norm.weight", "model.layers.3.ffn_norm.weight", "model.layers.3.attention_norm.weight", "model.layers.3.attention.lq_norm.weight", "model.layers.3.attention.lkv_norm.weight", "model.mtp_hidden_fusers.0.norm.weight", "model.mtp_hidden_fusers.0.norm_emb.weight", "model.mtp_norms.0.weight", "model.norm_out.weight" ] } } 2025-07-15 10:34:50,773 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,779 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn [WARNING] DISTRIBUTED(909485,ffff9584eec0,python):2025-07-15-10:34:50.786.891 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: e30609fbce6a1a756f50a31ec86eae83 [const vector]{3, 7}, async: 0, submit_now: 1 2025-07-15 10:34:50,794 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,799 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,802 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:134] - INFO - Using 2 data parallel, 1 context parallel and 2 model parallel for the embedding lookup. 2025-07-15 10:34:50,808 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 1 2025-07-15 10:34:50,809 - mindformers./output/log[mindformers/models/modeling_utils.py:1494] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-15 10:34:50,809 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1643] - INFO - Predict run mode:False 2025-07-15 10:34:50,811 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,816 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,818 - mindformers./output/log[mindformers/trainer/base_trainer.py:715] - INFO - Network Parameters: 817 M. 2025-07-15 10:34:50,818 - mindformers./output/log[mindformers/trainer/base_trainer.py:1010] - INFO - .........Build Optimizer For Train.......... 2025-07-15 10:34:50,818 - mindformers./output/log[mindformers/trainer/base_trainer.py:581] - INFO - .........Build Optimizer From Config.......... 2025-07-15 10:34:50,818 - mindformers./output/log[mindformers/trainer/base_trainer.py:628] - INFO - .........Build LR Schedule From Config.......... 2025-07-15 10:34:50,819 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:134] - INFO - Using 2 data parallel, 1 context parallel and 2 model parallel for the embedding lookup. 2025-07-15 10:34:50,821 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:77] - WARNING - dynamic_lr_schedule will be reset and invalid when layer_scale is False. 
2025-07-15 10:34:50,822 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:116] - INFO - Param groups = { "decay": { "weight_decay": 0.1, "params": [ "model.tok_embeddings.embedding_weight", "model.layers.0.attention.q2l_proj.weight", "model.layers.0.attention.l2q_nope_proj.weight", "model.layers.0.attention.l2q_pe_proj.weight", "model.layers.0.attention.kv2l_k_pe.weight", "model.layers.0.attention.kv2l_latent_kv.weight", "model.layers.0.attention.lkv2kv_k_nope.weight", "model.layers.0.attention.lkv2kv_v.weight", "model.layers.0.attention.wo.weight", "model.layers.0.feed_forward.w1.weight", "model.layers.0.feed_forward.w2.weight", "model.layers.0.feed_forward.w3.weight", "model.layers.1.attention.q2l_proj.weight", "model.layers.1.attention.l2q_nope_proj.weight", "model.layers.1.attention.l2q_pe_proj.weight", "model.layers.1.attention.kv2l_k_pe.weight", "model.layers.1.attention.kv2l_latent_kv.weight", "model.layers.1.attention.lkv2kv_k_nope.weight", "model.layers.1.attention.lkv2kv_v.weight", "model.layers.1.attention.wo.weight", "model.layers.1.feed_forward.routed_experts.ffn.w1.weight", "model.layers.1.feed_forward.routed_experts.ffn.w2.weight", "model.layers.1.feed_forward.routed_experts.ffn.w3.weight", "model.layers.1.feed_forward.routed_experts.router.dense.weight", "model.layers.1.feed_forward.shared_experts.w1.weight", "model.layers.1.feed_forward.shared_experts.w2.weight", "model.layers.1.feed_forward.shared_experts.w3.weight", "model.layers.2.attention.q2l_proj.weight", "model.layers.2.attention.l2q_nope_proj.weight", "model.layers.2.attention.l2q_pe_proj.weight", "model.layers.2.attention.kv2l_k_pe.weight", "model.layers.2.attention.kv2l_latent_kv.weight", "model.layers.2.attention.lkv2kv_k_nope.weight", "model.layers.2.attention.lkv2kv_v.weight", "model.layers.2.attention.wo.weight", "model.layers.2.feed_forward.routed_experts.ffn.w1.weight", "model.layers.2.feed_forward.routed_experts.ffn.w2.weight", "model.layers.2.feed_forward.routed_experts.ffn.w3.weight", "model.layers.2.feed_forward.routed_experts.router.dense.weight", "model.layers.2.feed_forward.shared_experts.w1.weight", "model.layers.2.feed_forward.shared_experts.w2.weight", "model.layers.2.feed_forward.shared_experts.w3.weight", "model.layers.3.attention.q2l_proj.weight", "model.layers.3.attention.l2q_nope_proj.weight", "model.layers.3.attention.l2q_pe_proj.weight", "model.layers.3.attention.kv2l_k_pe.weight", "model.layers.3.attention.kv2l_latent_kv.weight", "model.layers.3.attention.lkv2kv_k_nope.weight", "model.layers.3.attention.lkv2kv_v.weight", "model.layers.3.attention.wo.weight", "model.layers.3.feed_forward.routed_experts.ffn.w1.weight", "model.layers.3.feed_forward.routed_experts.ffn.w2.weight", "model.layers.3.feed_forward.routed_experts.ffn.w3.weight", "model.layers.3.feed_forward.routed_experts.router.dense.weight", "model.layers.3.feed_forward.shared_experts.w1.weight", "model.layers.3.feed_forward.shared_experts.w2.weight", "model.layers.3.feed_forward.shared_experts.w3.weight", "model.mtp_hidden_fusers.0.dense.weight", "lm_head.weight" ] }, "no_decay": { "weight_decay": 0.0, "params": [ "model.layers.0.ffn_norm.weight", "model.layers.0.attention_norm.weight", "model.layers.0.attention.lq_norm.weight", "model.layers.0.attention.lkv_norm.weight", "model.layers.1.ffn_norm.weight", "model.layers.1.attention_norm.weight", "model.layers.1.attention.lq_norm.weight", "model.layers.1.attention.lkv_norm.weight", "model.layers.2.ffn_norm.weight", "model.layers.2.attention_norm.weight", 
"model.layers.2.attention.lq_norm.weight", "model.layers.2.attention.lkv_norm.weight", "model.layers.3.ffn_norm.weight", "model.layers.3.attention_norm.weight", "model.layers.3.attention.lq_norm.weight", "model.layers.3.attention.lkv_norm.weight", "model.mtp_hidden_fusers.0.norm.weight", "model.mtp_hidden_fusers.0.norm_emb.weight", "model.mtp_norms.0.weight", "model.norm_out.weight" ] } } 2025-07-15 10:34:50,827 - mindformers./output/log[mindformers/models/modeling_utils.py:1494] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-15 10:34:50,827 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1643] - INFO - Predict run mode:False 2025-07-15 10:34:50,827 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,828 - mindformers./output/log[mindformers/trainer/base_trainer.py:1019] - INFO - .........Build Running Wrapper From Config For Train.......... 2025-07-15 10:34:50,829 - mindformers./output/log[mindformers/trainer/base_trainer.py:665] - INFO - .........Build Model Wrapper for Train From Config.......... 2025-07-15 10:34:50,835 - mindformers./output/log[mindformers/trainer/base_trainer.py:715] - INFO - Network Parameters: 817 M. 2025-07-15 10:34:50,835 - mindformers./output/log[mindformers/trainer/base_trainer.py:1010] - INFO - .........Build Optimizer For Train.......... 2025-07-15 10:34:50,836 - mindformers./output/log[mindformers/trainer/base_trainer.py:581] - INFO - .........Build Optimizer From Config.......... 2025-07-15 10:34:50,836 - mindformers./output/log[mindformers/trainer/base_trainer.py:628] - INFO - .........Build LR Schedule From Config.......... 2025-07-15 10:34:50,838 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:77] - WARNING - dynamic_lr_schedule will be reset and invalid when layer_scale is False. 
2025-07-15 10:34:50,839 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:116] - INFO - Param groups = { "decay": { "weight_decay": 0.1, "params": [ "model.tok_embeddings.embedding_weight", "model.layers.0.attention.q2l_proj.weight", "model.layers.0.attention.l2q_nope_proj.weight", "model.layers.0.attention.l2q_pe_proj.weight", "model.layers.0.attention.kv2l_k_pe.weight", "model.layers.0.attention.kv2l_latent_kv.weight", "model.layers.0.attention.lkv2kv_k_nope.weight", "model.layers.0.attention.lkv2kv_v.weight", "model.layers.0.attention.wo.weight", "model.layers.0.feed_forward.w1.weight", "model.layers.0.feed_forward.w2.weight", "model.layers.0.feed_forward.w3.weight", "model.layers.1.attention.q2l_proj.weight", "model.layers.1.attention.l2q_nope_proj.weight", "model.layers.1.attention.l2q_pe_proj.weight", "model.layers.1.attention.kv2l_k_pe.weight", "model.layers.1.attention.kv2l_latent_kv.weight", "model.layers.1.attention.lkv2kv_k_nope.weight", "model.layers.1.attention.lkv2kv_v.weight", "model.layers.1.attention.wo.weight", "model.layers.1.feed_forward.routed_experts.ffn.w1.weight", "model.layers.1.feed_forward.routed_experts.ffn.w2.weight", "model.layers.1.feed_forward.routed_experts.ffn.w3.weight", "model.layers.1.feed_forward.routed_experts.router.dense.weight", "model.layers.1.feed_forward.shared_experts.w1.weight", "model.layers.1.feed_forward.shared_experts.w2.weight", "model.layers.1.feed_forward.shared_experts.w3.weight", "model.layers.2.attention.q2l_proj.weight", "model.layers.2.attention.l2q_nope_proj.weight", "model.layers.2.attention.l2q_pe_proj.weight", "model.layers.2.attention.kv2l_k_pe.weight", "model.layers.2.attention.kv2l_latent_kv.weight", "model.layers.2.attention.lkv2kv_k_nope.weight", "model.layers.2.attention.lkv2kv_v.weight", "model.layers.2.attention.wo.weight", "model.layers.2.feed_forward.routed_experts.ffn.w1.weight", "model.layers.2.feed_forward.routed_experts.ffn.w2.weight", "model.layers.2.feed_forward.routed_experts.ffn.w3.weight", "model.layers.2.feed_forward.routed_experts.router.dense.weight", "model.layers.2.feed_forward.shared_experts.w1.weight", "model.layers.2.feed_forward.shared_experts.w2.weight", "model.layers.2.feed_forward.shared_experts.w3.weight", "model.layers.3.attention.q2l_proj.weight", "model.layers.3.attention.l2q_nope_proj.weight", "model.layers.3.attention.l2q_pe_proj.weight", "model.layers.3.attention.kv2l_k_pe.weight", "model.layers.3.attention.kv2l_latent_kv.weight", "model.layers.3.attention.lkv2kv_k_nope.weight", "model.layers.3.attention.lkv2kv_v.weight", "model.layers.3.attention.wo.weight", "model.layers.3.feed_forward.routed_experts.ffn.w1.weight", "model.layers.3.feed_forward.routed_experts.ffn.w2.weight", "model.layers.3.feed_forward.routed_experts.ffn.w3.weight", "model.layers.3.feed_forward.routed_experts.router.dense.weight", "model.layers.3.feed_forward.shared_experts.w1.weight", "model.layers.3.feed_forward.shared_experts.w2.weight", "model.layers.3.feed_forward.shared_experts.w3.weight", "model.mtp_hidden_fusers.0.dense.weight", "lm_head.weight" ] }, "no_decay": { "weight_decay": 0.0, "params": [ "model.layers.0.ffn_norm.weight", "model.layers.0.attention_norm.weight", "model.layers.0.attention.lq_norm.weight", "model.layers.0.attention.lkv_norm.weight", "model.layers.1.ffn_norm.weight", "model.layers.1.attention_norm.weight", "model.layers.1.attention.lq_norm.weight", "model.layers.1.attention.lkv_norm.weight", "model.layers.2.ffn_norm.weight", "model.layers.2.attention_norm.weight", 
"model.layers.2.attention.lq_norm.weight", "model.layers.2.attention.lkv_norm.weight", "model.layers.3.ffn_norm.weight", "model.layers.3.attention_norm.weight", "model.layers.3.attention.lq_norm.weight", "model.layers.3.attention.lkv_norm.weight", "model.mtp_hidden_fusers.0.norm.weight", "model.mtp_hidden_fusers.0.norm_emb.weight", "model.mtp_norms.0.weight", "model.norm_out.weight" ] } } 2025-07-15 10:34:50,861 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 2 [WARNING] DISTRIBUTED(909465,ffffbf16eec0,python):2025-07-15-10:34:50.864.100 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: d9639340c2f0051c1a7a09da5ef07ed4 [const vector]{2, 6}, async: 0, submit_now: 1 2025-07-15 10:34:50,880 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1072] - INFO - MoE config is provided, use MoE FFN with shared ffn 2025-07-15 10:34:50,884 - mindformers./output/log[mindformers/trainer/base_trainer.py:1019] - INFO - .........Build Running Wrapper From Config For Train.......... 2025-07-15 10:34:50,884 - mindformers./output/log[mindformers/trainer/base_trainer.py:665] - INFO - .........Build Model Wrapper for Train From Config.......... 2025-07-15 10:34:50,902 - mindformers./output/log[mindformers/trainer/base_trainer.py:1019] - INFO - .........Build Running Wrapper From Config For Train.......... 2025-07-15 10:34:50,902 - mindformers./output/log[mindformers/trainer/base_trainer.py:665] - INFO - .........Build Model Wrapper for Train From Config.......... 2025-07-15 10:34:50,913 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 [WARNING] DISTRIBUTED(909473,ffffb301eec0,python):2025-07-15-10:34:50.918.806 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: cb4ececddcb4517ca0bcddafd23813b9 [const vector]{0, 4}, async: 0, submit_now: 1 2025-07-15 10:34:50,918 - mindformers./output/log[mindformers/models/utils.py:423] - INFO - Set full recompute at layer 3 2025-07-15 10:34:50,921 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:134] - INFO - Using 2 data parallel, 1 context parallel and 2 model parallel for the embedding lookup. 2025-07-15 10:34:50,928 - mindformers./output/log[mindformers/models/modeling_utils.py:1494] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-15 10:34:50,929 - mindformers./output/log[mindformers/research/deepseek3/deepseek2_model.py:1643] - INFO - Predict run mode:False [WARNING] DISTRIBUTED(909469,ffffa361eec0,python):2025-07-15-10:34:50.936.736 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: e30609fbce6a1a756f50a31ec86eae83 [const vector]{3, 7}, async: 0, submit_now: 1 2025-07-15 10:34:50,937 - mindformers./output/log[mindformers/trainer/base_trainer.py:715] - INFO - Network Parameters: 817 M. 2025-07-15 10:34:50,937 - mindformers./output/log[mindformers/trainer/base_trainer.py:1010] - INFO - .........Build Optimizer For Train.......... 2025-07-15 10:34:50,938 - mindformers./output/log[mindformers/trainer/base_trainer.py:581] - INFO - .........Build Optimizer From Config.......... 
2025-07-15 10:34:50,938 - mindformers./output/log[mindformers/trainer/base_trainer.py:628] - INFO - .........Build LR Schedule From Config.......... 2025-07-15 10:34:50,940 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:77] - WARNING - dynamic_lr_schedule will be reset and invalid when layer_scale is False. 2025-07-15 10:34:50,941 - mindformers./output/log[mindformers/trainer/optimizer_grouped_parameters.py:116] - INFO - Param groups = { "decay": { "weight_decay": 0.1, "params": [ "model.tok_embeddings.embedding_weight", "model.layers.0.attention.q2l_proj.weight", "model.layers.0.attention.l2q_nope_proj.weight", "model.layers.0.attention.l2q_pe_proj.weight", "model.layers.0.attention.kv2l_k_pe.weight", "model.layers.0.attention.kv2l_latent_kv.weight", "model.layers.0.attention.lkv2kv_k_nope.weight", "model.layers.0.attention.lkv2kv_v.weight", "model.layers.0.attention.wo.weight", "model.layers.0.feed_forward.w1.weight", "model.layers.0.feed_forward.w2.weight", "model.layers.0.feed_forward.w3.weight", "model.layers.1.attention.q2l_proj.weight", "model.layers.1.attention.l2q_nope_proj.weight", "model.layers.1.attention.l2q_pe_proj.weight", "model.layers.1.attention.kv2l_k_pe.weight", "model.layers.1.attention.kv2l_latent_kv.weight", "model.layers.1.attention.lkv2kv_k_nope.weight", "model.layers.1.attention.lkv2kv_v.weight", "model.layers.1.attention.wo.weight", "model.layers.1.feed_forward.routed_experts.ffn.w1.weight", "model.layers.1.feed_forward.routed_experts.ffn.w2.weight", "model.layers.1.feed_forward.routed_experts.ffn.w3.weight", "model.layers.1.feed_forward.routed_experts.router.dense.weight", "model.layers.1.feed_forward.shared_experts.w1.weight", "model.layers.1.feed_forward.shared_experts.w2.weight", "model.layers.1.feed_forward.shared_experts.w3.weight", "model.layers.2.attention.q2l_proj.weight", "model.layers.2.attention.l2q_nope_proj.weight", "model.layers.2.attention.l2q_pe_proj.weight", "model.layers.2.attention.kv2l_k_pe.weight", "model.layers.2.attention.kv2l_latent_kv.weight", "model.layers.2.attention.lkv2kv_k_nope.weight", "model.layers.2.attention.lkv2kv_v.weight", "model.layers.2.attention.wo.weight", "model.layers.2.feed_forward.routed_experts.ffn.w1.weight", "model.layers.2.feed_forward.routed_experts.ffn.w2.weight", "model.layers.2.feed_forward.routed_experts.ffn.w3.weight", "model.layers.2.feed_forward.routed_experts.router.dense.weight", "model.layers.2.feed_forward.shared_experts.w1.weight", "model.layers.2.feed_forward.shared_experts.w2.weight", "model.layers.2.feed_forward.shared_experts.w3.weight", "model.layers.3.attention.q2l_proj.weight", "model.layers.3.attention.l2q_nope_proj.weight", "model.layers.3.attention.l2q_pe_proj.weight", "model.layers.3.attention.kv2l_k_pe.weight", "model.layers.3.attention.kv2l_latent_kv.weight", "model.layers.3.attention.lkv2kv_k_nope.weight", "model.layers.3.attention.lkv2kv_v.weight", "model.layers.3.attention.wo.weight", "model.layers.3.feed_forward.routed_experts.ffn.w1.weight", "model.layers.3.feed_forward.routed_experts.ffn.w2.weight", "model.layers.3.feed_forward.routed_experts.ffn.w3.weight", "model.layers.3.feed_forward.routed_experts.router.dense.weight", "model.layers.3.feed_forward.shared_experts.w1.weight", "model.layers.3.feed_forward.shared_experts.w2.weight", "model.layers.3.feed_forward.shared_experts.w3.weight", "model.mtp_hidden_fusers.0.dense.weight", "lm_head.weight" ] }, "no_decay": { "weight_decay": 0.0, "params": [ "model.layers.0.ffn_norm.weight", 
"model.layers.0.attention_norm.weight", "model.layers.0.attention.lq_norm.weight", "model.layers.0.attention.lkv_norm.weight", "model.layers.1.ffn_norm.weight", "model.layers.1.attention_norm.weight", "model.layers.1.attention.lq_norm.weight", "model.layers.1.attention.lkv_norm.weight", "model.layers.2.ffn_norm.weight", "model.layers.2.attention_norm.weight", "model.layers.2.attention.lq_norm.weight", "model.layers.2.attention.lkv_norm.weight", "model.layers.3.ffn_norm.weight", "model.layers.3.attention_norm.weight", "model.layers.3.attention.lq_norm.weight", "model.layers.3.attention.lkv_norm.weight", "model.mtp_hidden_fusers.0.norm.weight", "model.mtp_hidden_fusers.0.norm_emb.weight", "model.mtp_norms.0.weight", "model.norm_out.weight" ] } } 2025-07-15 10:34:51,004 - mindformers./output/log[mindformers/trainer/base_trainer.py:1019] - INFO - .........Build Running Wrapper From Config For Train.......... 2025-07-15 10:34:51,004 - mindformers./output/log[mindformers/trainer/base_trainer.py:665] - INFO - .........Build Model Wrapper for Train From Config.......... [WARNING] DISTRIBUTED(909481,ffffbd94eec0,python):2025-07-15-10:34:51.038.749 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: d9639340c2f0051c1a7a09da5ef07ed4 [const vector]{2, 6}, async: 0, submit_now: 1 2025-07-15 10:37:33,044 - mindformers./output/log[mindformers/core/context/parallel.py:88] - ERROR - Notice: if you are trying to run with a single device, please set use_parallel=False. If not, please check the error message above. 2025-07-15 10:37:33,045 - mindformers./output/log[mindformers/tools/cloud_adapter/cloud_monitor.py:43] - ERROR - Traceback (most recent call last): File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper result = run_func(*args, **kwargs) File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 68, in main build_context(config) File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 464, in build_context ctx = Context(mf_config) File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 71, in __init__ self.parallel_opr.init_communication() File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication init() File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 203, in init init_hccl() RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[8] and device id[1], please check if device id is valid. 
---------------------------------------------------- - C++ Call Stack: (For framework developers) ---------------------------------------------------- mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice Traceback (most recent call last): File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 336, in main(config_) File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 44, in wrapper raise exc File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper result = run_func(*args, **kwargs) File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 68, in main build_context(config) File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 464, in build_context ctx = Context(mf_config) File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 71, in __init__ self.parallel_opr.init_communication() File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication init() File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 203, in init init_hccl() RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[8] and device id[1], please check if device id is valid. ---------------------------------------------------- - C++ Call Stack: (For framework developers) ---------------------------------------------------- mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice [WARNING] DEVICE(909461,ffffa2cfeec0,python):2025-07-15-10:37:33.122.245 [mindspore/ccsrc/plugin/device/ascend/hal/hardware/ascend_device_res_manager.cc:350] SyncAllStreams] The ascend_res_manager_ is nullptr in scenarios where it is not actually executed [ERROR] ME(909097:281473613819584,MainProcess):2025-07-15-10:37:35.350.22 [mindspore/parallel/cluster/process_entity/_api.py:363] Worker process 909461 exit with exception. Error code: 1. [WARNING] ME(909097:281473613819584,MainProcess):2025-07-15-10:37:35.353.31 [mindspore/parallel/cluster/process_entity/_api.py:369] There's worker exits with exception, kill all other workers. [ERROR] ME(909097:281473613819584,MainProcess):2025-07-15-10:38:09.597.683 [mindspore/parallel/cluster/process_entity/_api.py:382] Scheduler process 909455 exit with exception. [ERROR] ME(909097:281473613819584,MainProcess):2025-07-15-10:38:09.599.061 [mindspore/parallel/cluster/process_entity/_api.py:603] Time out nodes are ['1'] /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-32-[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:46.567.281 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/14400). 
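On the failing worker the reported device id (1) is within the reported device count (8), so the aclrtSetDevice return code 507033 looks less like an out-of-range id and more like the NPU being unavailable at init time (for example still held by a previous job); that is an inference from this log, not a confirmed root cause. A small, hypothetical pre-launch check of the visible-device environment is sketched below; the environment variable names (ASCEND_RT_VISIBLE_DEVICES, DEVICE_ID) follow common Ascend/MindSpore conventions and are assumptions to adapt to the actual launch script.

    # Sketch: sanity-check that the device id a worker will claim is among the visible devices.
    # ASCEND_RT_VISIBLE_DEVICES and DEVICE_ID are assumed names; adjust to the real environment.
    import os

    def visible_devices(default_count=8):
        raw = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "").strip()
        if raw:
            return [int(d) for d in raw.split(",") if d.strip()]
        return list(range(default_count))

    def check_device(device_id):
        devices = visible_devices()
        if device_id not in devices:
            raise RuntimeError(f"device id {device_id} is not in visible devices {devices}")
        return device_id

    if __name__ == "__main__":
        check_device(int(os.environ.get("DEVICE_ID", "0")))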
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-33-[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:47.067.392 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/14400). /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-34-[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:47.567.494 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/14400). /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-35-[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:48.067.622 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-36-[WARNING] DISTRIBUTED(909461,ffffa2cfeec0,python):2025-07-15-10:34:48.067.662 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1 /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log:37:2025-07-15 10:37:33,044 - mindformers./output/log[mindformers/core/context/parallel.py:88] - ERROR - Notice: if you are trying to run with a single device, please set use_parallel=False. If not, please check the error message above. 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log:38:2025-07-15 10:37:33,045 - mindformers./output/log[mindformers/tools/cloud_adapter/cloud_monitor.py:43] - ERROR - Traceback (most recent call last): /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-39- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-40- result = run_func(*args, **kwargs) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-41- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 68, in main /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-42- build_context(config) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-43- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 464, in build_context -- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-46- self.parallel_opr.init_communication() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-47- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-48- init() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-49- File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 203, in init /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-50- init_hccl() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log:51:RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[8] and device id[1], please check if device id is valid. 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-52- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-53----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-54-- C++ Call Stack: (For framework developers) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-55----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-56-mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-57- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-58- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log:59:Traceback (most recent call last): /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-60- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 336, in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-61- main(config_) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-62- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 44, in wrapper /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-63- raise exc /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-64- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper -- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-71- self.parallel_opr.init_communication() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-72- File 
"/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-73- init() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-74- File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 203, in init /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-75- init_hccl() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log:76:RuntimeError: Call aclrtSetDevice failed, ret[507033]. Got device count[8] and device id[1], please check if device id is valid. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-77- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-78----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-79-- C++ Call Stack: (For framework developers) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-80----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_1.log-81-mindspore/ccsrc/plugin/res_manager/ascend/hal_manager/ascend_hal_manager.cc:67 InitDevice -- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-118-[WARNING] DISTRIBUTED(909455,ffffb789eec0,python):2025-07-15-10:37:52.496.212 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-119-[WARNING] DISTRIBUTED(909455,ffffb789eec0,python):2025-07-15-10:37:57.496.320 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 8 alive nodes. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-120-[WARNING] DISTRIBUTED(909455,ffffb789eec0,python):2025-07-15-10:37:57.496.370 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. 
Retry to finalize the node and exit cluster... /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-121-[WARNING] DISTRIBUTED(909455,ffffb789eec0,python):2025-07-15-10:38:02.496.491 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 8 alive nodes. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-122-[WARNING] DISTRIBUTED(909455,ffffb789eec0,python):2025-07-15-10:38:02.496.537 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log:123:[ERROR] DISTRIBUTED(909455,ffff31eeefa0,python):2025-07-15-10:38:03.013.323 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:511] UpdateTopoState] The node: 1 is timed out. It may exit with exception, please check this node's log. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log:124:[ERROR] DISTRIBUTED(909455,ffffb789eec0,python):2025-07-15-10:38:07.496.649 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:103] Finalize] There are 1 abnormal compute graph nodes. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log:125:2025-07-15 10:38:07,497 - mindformers./output/log[mindformers/core/context/parallel.py:88] - ERROR - Notice: if you are trying to run with a single device, please set use_parallel=False. If not, please check the error message above. 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log:126:2025-07-15 10:38:07,498 - mindformers./output/log[mindformers/tools/cloud_adapter/cloud_monitor.py:43] - ERROR - Traceback (most recent call last): /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-127- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-128- result = run_func(*args, **kwargs) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-129- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 68, in main /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-130- build_context(config) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-131- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/build_context.py", line 464, in build_context -- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-134- self.parallel_opr.init_communication() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-135- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-136- init() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-137- File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 213, in init /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-138- init_cluster() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log:139:RuntimeError: The total number of timed out node is 1. Timed out node list is: [const vector]{1}, worker 1 is the first one timed out, please check its log. 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-140- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-141----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-142-- C++ Call Stack: (For framework developers) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-143----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-144-mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:517 UpdateTopoState /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-145- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-146- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log:147:Traceback (most recent call last): /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-148- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/../mindformers/run_mindformer.py", line 336, in /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-149- main(config_) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-150- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 44, in wrapper /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-151- raise exc /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-152- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/tools/cloud_adapter/cloud_monitor.py", line 34, in wrapper -- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-159- self.parallel_opr.init_communication() 
/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-160- File "/home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/mindformers/mindformers/core/context/parallel.py", line 86, in init_communication /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-161- init() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-162- File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/management.py", line 213, in init /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-163- init_cluster() /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log:164:RuntimeError: The total number of timed out node is 1. Timed out node list is: [const vector]{1}, worker 1 is the first one timed out, please check its log. /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-165- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-166----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-167-- C++ Call Stack: (For framework developers) /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-168----------------------------------------------------- /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/scheduler.log-169-mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:517 UpdateTopoState Traceback (most recent call last): File "/home/jenkins/anaconda3/envs/ci39/bin/msrun", line 8, in sys.exit(main()) File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/run.py", line 191, in main run(args) File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/run.py", line 185, in run process_manager.run() File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/process_entity/_api.py", line 268, in run self.join_processes() File "/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/parallel/cluster/process_entity/_api.py", line 387, in join_processes raise RuntimeError("Distributed job exited with exception. Please check logs in " RuntimeError: Distributed job exited with exception. 
Please check logs in directory: /home/jenkins/mindspore/testcases/testcases/tests/st/networks/llm_parallel_feature/deepseekv3/deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/. F =================================== FAILURES =================================== ________ test_deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_bmm_performance ________ @arg_mark(plat_marks=['platform_ascend910b'], level_mark='level0', card_mark='allcards', essential_mark='essential') def test_deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_bmm_performance(): """ Feature: test deepseekv3 cell dp2mp2ep4pp2mb4gas1bs1 8p bmm performance Description: test deepseekv3 cell dp2mp2ep4pp2mb4gas1bs1 8p bmm performance Expectation: st pass """ case_name = "deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance" sh_path = os.path.split(os.path.realpath(__file__))[0] # set the speed up json parallel_speed_up_json = {'matmul_grad_comm_overlap': True, 'pp_1f1b_overlap': 'AlltoAllV,AlltoAll'} # set the config deepseek_config = DeepseekConfig(num_samples=24, hidden_size=4096, intermediate_size=8192, moe_intermediate_size=2048, parallel_speed_up_json=parallel_speed_up_json, use_gmm=False, use_fused_swiglu=True, enable_fa_var_len=True, use_fused_rope=True, pp_interleave_num=2, deterministic="OFF" ) file_path = prepare_deepseekv3_testcase_env(case_name, deepseek_config) # set the communication parameters device_num = 8 master_port = 7126 hccl_if_base_port = 63435 # set env for training graph_kernel_flags = "--enable_cluster_ops=MatMul,BatchMatMul,Reshape --online_tuning=1" os.system(f"bash {sh_path}/run_llm.sh {device_num} \ {file_path} {case_name} {master_port} {hccl_if_base_port} pp mindrecord \"{graph_kernel_flags}\"") # check train over check_pair = {"Training Over": 1} real_log_path = log_path_preprocess(case_name, device_num) for log_path in real_log_path: > check_log(log_path, check_pair) test_deepseekv3_pretrain.py:435: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ file_path = './deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_0.log' check_pairs = {'Training Over': 1} def check_log(file_path, check_pairs=None): # check the number of key in check_pairs in log file is equal to the value log_error_count = subprocess.check_output( ["grep -rE '%s' %s | wc -l" % ("ERROR|Traceback", file_path)], shell=True) log_cnt = str(log_error_count, 'utf-8').strip() if log_cnt != "0": os.system(f"cat {file_path}") assert log_cnt == "0", f"Error found in {file_path}" if check_pairs is not None: for key_word, value in check_pairs.items(): log_output = subprocess.check_output( ["grep -r '%s' %s | wc -l" % (key_word, file_path)], shell=True) log_cnt = str(log_output, 'utf-8').strip() > assert log_cnt == str(value), (f"Failed to find {key_word} in {file_path} or content is not correct." f"Expected occurrences: {value}, but got {log_cnt}") E AssertionError: Failed to find Training Over in ./deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance/worker_0.log or content is not correct.Expected occurrences: 1, but got 0 ../utils.py:160: AssertionError =========================== short test summary info ============================ FAILED test_deepseekv3_pretrain.py::test_deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_8p_bmm_performance ======================== 1 failed in 230.01s (0:03:50) =========================
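For triage, the assertion above only says that worker_0.log never printed "Training Over"; the underlying failure is the aclrtSetDevice error on worker 1 shown earlier. A small helper in the spirit of the grep-based check_log above (a sketch only, assuming the per-case directory layout printed in the failure message) can summarize all worker logs at once:

    # Sketch: summarize every worker log in the case directory, mirroring the grep checks
    # in check_log above. The directory name is taken from the failure message; adjust as needed.
    import glob
    import re

    def triage(case_dir):
        for log_file in sorted(glob.glob(f"{case_dir}/worker_*.log")):
            with open(log_file, errors="ignore") as handle:
                text = handle.read()
            finished = text.count("Training Over")
            errors = len(re.findall(r"ERROR|Traceback", text))
            print(f"{log_file}: 'Training Over' x{finished}, ERROR/Traceback x{errors}")

    if __name__ == "__main__":
        triage("./deepseekv3_cell_dp2mp2ep2pp2mb4gas1bs1_performance_8p_bmm_performance")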