# ===== File: large_models/resume/resume_train_utils.py =====
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Resume training utils."""
import os
import re


def extract_loss_values(log_file_path):
    """Extract all loss values from a training log file."""
    loss_values = []
    loss_pattern = re.compile(r'loss: (\d+\.\d+)')
    with open(log_file_path, 'r') as file:
        for line in file:
            match = loss_pattern.search(line)
            if match:
                loss_value = float(match.group(1))
                loss_values.append(loss_value)
    return loss_values


def get_file_mtime(file_path):
    """Return the last modification time of the given file."""
    return os.path.getmtime(file_path)
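A minimal usage sketch for extract_loss_values. The log text below is illustrative only (real msrun worker logs interleave many other lines around the `loss:` fields):

import tempfile

from resume_train_utils import extract_loss_values

# Illustrative log content, not a real worker log.
fake_log = "step 1, loss: 10.125, lr 5e-5\nstep 2, loss: 9.875, lr 5e-5\n"
with tempfile.NamedTemporaryFile("w", suffix=".log", delete=False) as tmp:
    tmp.write(fake_log)
    log_path = tmp.name

assert extract_loss_values(log_path) == [10.125, 9.875]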
# ===== File: large_models/resume/test_parallel_resume.py =====
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Test module for testing resume training from a specified checkpoint.
How to run this:
pytest tests/st/networks/large_models/resume/test_parallel_resume.py
"""
import os
import shutil

from tests.mark_utils import arg_mark

from resume_train_utils import extract_loss_values


def remove_folder(folder_path):
    """
    If the folder exists, delete it and all its contents.

    Args:
        folder_path: The path to the folder to delete.
    """
    if os.path.exists(folder_path):
        try:
            shutil.rmtree(folder_path)
            print(f"Directory '{folder_path}' has been removed.")
        except OSError as e:
            print(f"Remove directory '{folder_path}' failed: {e}")
    else:
        print(f"Directory '{folder_path}' does not exist.")


class TestResumeTraining:
    """A test class for resume training."""

    @arg_mark(plat_marks=['platform_ascend910b'], level_mark='level1', card_mark='allcards',
              essential_mark='essential')
    def test_train(self):
        """
        Feature: Trainer.train()
        Description: Test resume training with the parallel trainer.
        Expectation: AssertionError if the resumed losses diverge from the first run.
        """
        ascend_home_path = os.getenv('ASCEND_HOME_PATH')
        if not ascend_home_path:
            os.environ['ASCEND_HOME_PATH'] = "/usr/local/Ascend/latest"
        sh_path = os.path.split(os.path.realpath(__file__))[0]
        ret = os.system(f"export MS_DEV_P2P_HCCL_BUFFSIZE=24 && "
                        f"export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 && "
                        f"bash {sh_path}/msrun_launch.sh 4")
        # Print error context from the worker log for debugging.
        os.system(f"grep -E 'ERROR|error' {sh_path}/msrun_log/worker_3.log -C 3")
        assert ret == 0

        loss = extract_loss_values("msrun_log/worker_3.log")
        resume_start = 256
        train_middle = 128
        for i in range(10):
            assert abs(loss[resume_start + i] - loss[train_middle + i]) < 0.005
        remove_folder("./output/test_resume_parallel/checkpoint")
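The constants 256 and 128 in test_train follow from the dataset and checkpoint settings defined in resume_parallel.py and base_model.py. A sketch of that arithmetic (values copied from those files):

DATA_SIZE = 2048      # samples yielded by generator_1 (resume_parallel.py)
BATCH_SIZE = 8        # dataset.batch(batch_size=8)
SAVE_EVERY = 32       # CheckpointMonitor save_checkpoint_steps (base_model.py)

steps_first_run = DATA_SIZE // BATCH_SIZE                               # 256 loss lines from run 1
saved_steps = list(range(SAVE_EVERY, steps_first_run + 1, SAVE_EVERY))  # [32, 64, ..., 256]
surviving = saved_steps[:len(saved_steps) // 2]                         # resume_parallel.py deletes the newest half
resume_step = surviving[-1]                                             # 128

assert steps_first_run == 256  # resume_start: log index of the resumed run's first loss line
assert resume_step == 128      # train_middle: step the resumed run restarts from

So loss[256 + i] (the resumed run) is compared against loss[128 + i] (the tail of the first run): with identical data and restored state, the two curves should coincide.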
# ===== File: large_models/resume/__init__.py =====
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test resume."""

# ===== File: large_models/resume/base_model.py =====
# Copyright 2025 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Llama2 Base Model."""
import os
import sys

workspace = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
sys.path.insert(0, os.path.join(workspace, "mindformers"))
from mindformers.models.llama import LlamaForCausalLM, LlamaConfig
from mindformers.modules.transformer import TransformerOpParallelConfig, TransformerRecomputeConfig
from mindformers import CosineWithWarmUpLR

NUM_LAYERS = 16
NUM_HEADS = 4
HIDDEN_SIZE = 512
SEQ_LENGTH = 4096

RECOMPUTE_CONFIG = TransformerRecomputeConfig(
    recompute=False,
    select_recompute=False,
    parallel_optimizer_comm_recompute=False,
    mp_comm_recompute=True,
    recompute_slice_activation=False
)

PARALLEL_CONFIG = TransformerOpParallelConfig(
    data_parallel=2,
    model_parallel=2,
    expert_parallel=1,
    pipeline_stage=1,
    micro_batch_num=2,
    recompute=RECOMPUTE_CONFIG,
    use_seq_parallel=False,
    gradient_aggregation_group=4,
    vocab_emb_dp=True,
)

BASE_CONFIG = {
    'trainer': {
        'type': 'CausalLanguageModelingTrainer',
        'model_name': 'llama2'
    },
    'train_dataset': {
        'batch_size': 2
    },
    'train_dataset_task': {},
    'micro_batch_interleave_num': 1,
    'use_parallel': True,
    'parallel': {
        'parallel_mode': 1,  # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel
        'gradients_mean': False,
        'enable_alltoall': False,
        'full_batch': True,
        'search_mode': "sharding_propagation",
        'enable_parallel_optimizer': True,
        'strategy_ckpt_save_file': r"./ckpt_strategy.ckpt",
        'parallel_optimizer_config': {
            'gradient_accumulation_shard': False,
            'parallel_optimizer_threshold': 64,
        }
    },
    'parallel_config': {
        'data_parallel': 2,
        'model_parallel': 2,
        'pipeline_stage': 1,
        'use_seq_parallel': False,
        'micro_batch_num': 2,
        'vocab_emb_dp': True,
        'gradient_aggregation_group': 4,
    },
    'runner_config': {
        'epochs': 1,
        'batch_size': 2,
        'sink_mode': True,
        'sink_size': 1
    },
    # optimizer
    'optimizer': {
        'type': "AdamW",
        'betas': [0.9, 0.999],
        'eps': 1.e-8,
        'learning_rate': 5.e-5,
    },
    'context': {
        'mode': 0,  # 0--Graph Mode; 1--Pynative Mode
        'device_target': "Ascend"
    },
    # lr schedule
    'lr_schedule': {
        'type': CosineWithWarmUpLR,
        'learning_rate': 5.e-5,
        'lr_end': 1.e-6,
        'total_steps': 64
    },
    'runner_wrapper': {
        'type': 'MFTrainOneStepCell',
        'scale_sense': {
            'type': 'DynamicLossScaleUpdateCell',
            'loss_scale_value': 65536,
            'scale_factor': 1,
            'scale_window': 1000
        },
        'use_clip_grad': True,
    },
    'model': {
        'model_config': {
            'type': LlamaConfig,
            'offset': 0,
            'batch_size': 1,
            'seq_length': SEQ_LENGTH,
            'hidden_size': HIDDEN_SIZE,
            'num_layers': NUM_LAYERS,
            'num_heads': NUM_HEADS,
            'use_past': False,
            'compute_dtype': "float16",
            'param_init_type': "float16",
            'softmax_compute_type': "float16"
        }
    },
    'callbacks': [
        {
            'type': 'MFLossMonitor'
        },
        {
            'type': 'CheckpointMonitor',
            'prefix': "llama2_7b",
            'save_checkpoint_steps': 32,
            'integrated_save': False,
            'async_save': False,
            'checkpoint_format': 'safetensors',
            'remove_redundancy': True,
            'keep_checkpoint_max': 8,
            'directory': './output/test_resume_parallel'
        }
    ]
}


def get_config_dict():
    """Get config dict."""
    return BASE_CONFIG


def get_model_config():
    """Get an instanced model config."""
    return LlamaConfig(num_layers=NUM_LAYERS, seq_length=SEQ_LENGTH, num_heads=NUM_HEADS,
                       hidden_size=HIDDEN_SIZE, parallel_config=PARALLEL_CONFIG)


def get_model(config):
    """Get an instanced model."""
    return LlamaForCausalLM(config)
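A quick sanity sketch for the parallel layout above, assuming (as in MindFormers) that TransformerOpParallelConfig exposes its constructor arguments as attributes: the product of data, model, and pipeline parallelism must match the 4 msrun workers launched by test_parallel_resume.py.

from base_model import PARALLEL_CONFIG

# dp * mp * pp must equal the msrun worker count (msrun_launch.sh is run with 4).
world_size = (PARALLEL_CONFIG.data_parallel
              * PARALLEL_CONFIG.model_parallel
              * PARALLEL_CONFIG.pipeline_stage)
assert world_size == 4, f"parallel layout needs {world_size} devices, test launches 4"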
(the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ set -e BASE_PATH=$(cd "$(dirname $0)"; pwd) USE_DEVICE_NUM=$1 msrun --worker_num=${USE_DEVICE_NUM} \ --local_worker_num=${USE_DEVICE_NUM} \ --master_port=8118 \ --log_dir=msrun_log \ --join=True \ --cluster_time_out=300 \ ${BASE_PATH}/resume_parallel.py > parallel_resume_train.log 2>&1 large_models/resume/resume_parallel.py0000644000175100017500000001315615040315702020740 0ustar jenkinsHwHiAiUser# Copyright 2025 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """ Test module for testing resume training from specified checkpoint. How to run this: pytest tests/st/networks/large_models/resume/test_parallel_resume.py """ import os import sys import json import time import random from glob import glob import numpy as np workspace = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) sys.path.insert(0, os.path.join(workspace, "mindformers")) from mindspore.dataset import GeneratorDataset from mindformers import build_context from mindformers.tools.utils import ( LOCAL_DEFAULT_PATH, get_real_rank, get_real_group_size, get_epoch_and_step_from_ckpt_name ) from mindformers.trainer import Trainer from mindformers.tools import MindFormerConfig from mindformers.tools.logger import logger from base_model import get_config_dict, get_model, get_model_config SEED = 42 DATA_SIZE = 2048 SEQ_LENGTH = 4096 def generator_1(): """dataset generator""" for i in range(DATA_SIZE): np.random.seed(SEED + i) input_ids = np.random.randint(low=0, high=DATA_SIZE, size=(SEQ_LENGTH + 1,)).astype(np.int32) yield input_ids def generator_2(): """dataset generator""" for i in range(DATA_SIZE // 2): np.random.seed(SEED + DATA_SIZE // 2 + i) input_ids = np.random.randint(low=0, high=DATA_SIZE, size=(SEQ_LENGTH + 1,)).astype(np.int32) yield input_ids def get_checkpoints_path(checkpoint_dir): """get checkpoints path""" checkpoints_path = glob(os.path.join(checkpoint_dir, "*.safetensors")) checkpoints_path.sort( key=lambda x: get_epoch_and_step_from_ckpt_name(x, ckpt_fmt='safetensors') ) return checkpoints_path def wait_training_over(): """wait current training task saving checkpoint over""" meta_json = os.path.join(LOCAL_DEFAULT_PATH, "test_resume_parallel", "checkpoint", "rank_{}".format(get_real_rank()), "meta.json") with open(meta_json, "r") as json_file: meta_data = json.load(json_file) last_epoch = meta_data["last_epoch"] last_step = 
meta_data["last_step"] logger.info(f"Rank_{get_real_rank()} get last_epoch={last_epoch}, last_step={last_step}") start_time = time.time() while True: save_over = True for rank_id in range(get_real_group_size()): meta_json = os.path.join(LOCAL_DEFAULT_PATH, "test_resume_parallel", "checkpoint", "rank_{}".format(rank_id), "meta.json") with open(meta_json, "r") as json_file: meta_data = json.load(json_file) compare_epoch = meta_data["last_epoch"] compare_step = meta_data["last_step"] if last_epoch != compare_epoch or last_step != compare_step: logger.info(f"Rank_{rank_id}'s last_epoch or last_step is not equal to Rank_{get_real_rank()}" f"expect last_epoch={last_epoch}, last_step={last_step}," f"but get last_epoch={compare_epoch}, last_step={compare_step}") save_over = False time.sleep(0.1 + random.uniform(0, 0.1)) if save_over: break if time.time() - start_time > 60: raise TimeoutError("Wait current training task saving checkpoint over timeout!") def llama_trainer_train_from_instance(): """ Feature: Create Trainer From Instance Description: Test Trainer API to train from self-define instance API. Expectation: TypeError """ # Config definition config = get_config_dict() mf_config = MindFormerConfig(**config) build_context(mf_config) # Model model_config = get_model_config() model = get_model(model_config) # Training using first dataset. dataset = GeneratorDataset(generator_1, column_names=["input_ids"]) dataset = dataset.batch(batch_size=8) trainer = Trainer(model=model, args=mf_config, train_dataset=dataset) trainer.train(train_checkpoint=False) wait_training_over() checkpoint_dir = os.path.join(LOCAL_DEFAULT_PATH, "test_resume_parallel", "checkpoint", "rank_{}".format(get_real_rank())) checkpoints_path = get_checkpoints_path(checkpoint_dir) for _ in range(len(checkpoints_path) // 2): os.remove(checkpoints_path.pop()) # Resume training using the new second dataset. 
def llama_trainer_train_from_instance():
    """
    Feature: Create Trainer From Instance
    Description: Test the Trainer API training from a self-defined model instance,
        then resuming from a mid-run checkpoint.
    Expectation: Runs to completion; loss consistency is checked by test_parallel_resume.py.
    """
    # Config definition
    config = get_config_dict()
    mf_config = MindFormerConfig(**config)
    build_context(mf_config)

    # Model
    model_config = get_model_config()
    model = get_model(model_config)

    # First run: train on the full dataset and save checkpoints.
    dataset = GeneratorDataset(generator_1, column_names=["input_ids"])
    dataset = dataset.batch(batch_size=8)
    trainer = Trainer(model=model, args=mf_config, train_dataset=dataset)
    trainer.train(train_checkpoint=False)
    wait_training_over()

    # Delete the newest half of the checkpoints so the resume starts mid-run.
    checkpoint_dir = os.path.join(LOCAL_DEFAULT_PATH, "test_resume_parallel", "checkpoint",
                                  "rank_{}".format(get_real_rank()))
    checkpoints_path = get_checkpoints_path(checkpoint_dir)
    for _ in range(len(checkpoints_path) // 2):
        os.remove(checkpoints_path.pop())

    # Resume training using the second dataset, which replays the second half
    # of the first run's data.
    mf_config = MindFormerConfig(**config)
    build_context(mf_config)
    mf_config.runner_config.epochs = 2
    mf_config.load_ckpt_format = 'safetensors'
    mf_config.remove_redundancy = True
    model = get_model(model_config)
    dataset = GeneratorDataset(generator_2, column_names=["input_ids"])
    dataset = dataset.batch(batch_size=8)
    trainer = Trainer(model=model, args=mf_config, train_dataset=dataset)
    trainer.train(resume_from_checkpoint=os.path.join(LOCAL_DEFAULT_PATH, "test_resume_parallel",
                                                      "checkpoint"),
                  resume_training=True, ignore_data_skip=True)
    wait_training_over()


llama_trainer_train_from_instance()
# ===== Machine-generated artifacts (contents omitted) =====
# The remainder of the archive is build and runtime residue, not source:
#   large_models/graph/                    - empty compile-graph cache directory
#   large_models/kernel_meta/              - kernel compile cache (buildPidInfo.json plus empty *_bt.log files)
#   large_models/output/strategy/          - empty parallel-strategy directory
#   large_models/output/log/rank_{0,1,2}/  - per-rank msrun logs (info.log / error.log) from Qwen2-0.5B and
#                                            DeepSeek-R1 parallel predict runs (run_parallel.py), unrelated
#                                            to the resume test
[INFO] 2025-07-24 11:02:48,729 [mindformers/core/context/build_context.py:168] _set_predict_jit_config: Predict context config, jit_level: O0, infer_boost: on [INFO] 2025-07-24 11:02:48,736 [mindformers/core/context/build_context.py:168] _set_predict_jit_config: Predict context config, jit_level: O0, infer_boost: on [INFO] 2025-07-24 11:02:48,740 [mindformers/core/context/parallel.py:73] _get_parallel_ctx_config: full_batch is set to False for non-parallel modes [INFO] 2025-07-24 11:02:51,488 [mindformers/tools/utils.py:185] set_strategy_save_path: set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' [INFO] 2025-07-24 11:02:51,490 [mindformers/core/context/build_context.py:383] set_cpu_affinity: cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191] [WARNING] 2025-07-24 11:02:51,490 [mindformers/core/context/build_context.py:387] set_cpu_affinity: CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores [INFO] 2025-07-24 11:02:51,491 [mindformers/core/context/build_context.py:395] set_cpu_affinity: cpu_affinity, rank_id: 0, device_num: 4 [INFO] 2025-07-24 11:02:51,492 [mindformers/core/parallel_config.py:41] build_parallel_config: initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': None, 'n_group': None, 
'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} [INFO] 2025-07-24 11:02:51,492 [mindformers/core/parallel_config.py:61] build_parallel_config: initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} [INFO] 2025-07-24 11:02:51,756 [mindformers/tools/utils.py:185] set_strategy_save_path: set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' [INFO] 2025-07-24 11:02:51,758 [mindformers/core/context/build_context.py:383] set_cpu_affinity: cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191] [WARNING] 2025-07-24 11:02:51,759 [mindformers/core/context/build_context.py:387] set_cpu_affinity: CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores [INFO] 2025-07-24 11:02:51,759 [mindformers/core/context/build_context.py:395] set_cpu_affinity: cpu_affinity, rank_id: 0, device_num: 2 [WARNING] 2025-07-24 11:02:51,759 [mindformers/core/context/build_context.py:366] set_ms_affinity: custom bind policy affinity_cpu_list must be dict, but got None. 
[INFO] 2025-07-24 11:02:51,759 [mindformers/core/parallel_config.py:41] build_parallel_config: initial moe_config from dict: {'expert_num': 256, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 8, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 4, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} [INFO] 2025-07-24 11:02:51,760 [mindformers/core/parallel_config.py:61] build_parallel_config: initial parallel_config from dict: {'data_parallel': 1, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} [INFO] 2025-07-24 11:02:51,774 [mindformers/tools/utils.py:185] set_strategy_save_path: set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' [INFO] 2025-07-24 11:02:51,777 [mindformers/core/context/build_context.py:383] set_cpu_affinity: cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191] [WARNING] 2025-07-24 11:02:51,777 [mindformers/core/context/build_context.py:387] set_cpu_affinity: CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 
115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores [INFO] 2025-07-24 11:02:51,777 [mindformers/core/context/build_context.py:395] set_cpu_affinity: cpu_affinity, rank_id: 0, device_num: 2 [INFO] 2025-07-24 11:02:51,778 [mindformers/core/parallel_config.py:41] build_parallel_config: initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': None, 'n_group': None, 'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} [INFO] 2025-07-24 11:02:51,778 [mindformers/core/parallel_config.py:61] build_parallel_config: initial parallel_config from dict: {'data_parallel': 1, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} [INFO] 2025-07-24 11:02:52,070 [mindformers/parallel_core/inference/parallel_state.py:358] initialize_model_parallel: expert_model_parallel_size(1) is not equal to world_size(4), so we will use 4 as the MOE_tensor_parallel_size. [INFO] 2025-07-24 11:02:52,286 [mindformers/parallel_core/inference/parallel_state.py:358] initialize_model_parallel: expert_model_parallel_size(1) is not equal to world_size(2), so we will use 2 as the MOE_tensor_parallel_size. [INFO] 2025-07-24 11:02:52,397 [mindformers/version_control.py:119] decorator: The Lazy Inline compilation acceleration feature does not support single-card mode.This feature is disabled by default. ENABLE_LAZY_INLINE=1 does not take effect. [INFO] 2025-07-24 11:02:52,448 [mindformers/parallel_core/inference/parallel_state.py:358] initialize_model_parallel: expert_model_parallel_size(1) is not equal to world_size(2), so we will use 2 as the MOE_tensor_parallel_size. 
[INFO] 2025-07-24 11:02:53,207 [mindformers/models/utils.py:205] __init__: num_layers per stage: [[4]] [INFO] 2025-07-24 11:02:53,207 [mindformers/models/utils.py:206] __init__: Accumulated num_layers per stage: [[4]] [INFO] 2025-07-24 11:02:53,208 [mindformers/models/utils.py:208] __init__: Pipeline id list with start_stage: [0, 0, 0, 0] [INFO] 2025-07-24 11:02:53,208 [mindformers/models/utils.py:209] __init__: Interleave id list: [0, 0, 0, 0] [INFO] 2025-07-24 11:02:53,208 [mindformers/models/utils.py:227] __init__: Formative layer_recompute: [[0]] [INFO] 2025-07-24 11:02:53,208 [mindformers/models/utils.py:229] __init__: The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. [INFO] 2025-07-24 11:02:53,208 [mindformers/models/utils.py:235] __init__: Formative select_recompute: {'feed_forward\\.mul': [[0]], 'feed_forward\\.w1\\.activation\\.silu': [[0]]} [INFO] 2025-07-24 11:02:53,209 [mindformers/models/utils.py:236] __init__: Formative select_comm_recompute: {'.*\\.norm': [[0]]} [INFO] 2025-07-24 11:02:53,209 [mindformers/models/utils.py:237] __init__: Formative select_recompute_exclude: {} [INFO] 2025-07-24 11:02:53,209 [mindformers/models/utils.py:238] __init__: Formative select_comm_recompute_exclude: {} [WARNING] 2025-07-24 11:02:53,228 [mindformers/research/deepseek3/deepseek3_model_infer.py:927] __init__: first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. [INFO] 2025-07-24 11:02:53,238 [mindformers/version_control.py:76] decorator: Predict enable lazy inline. [INFO] 2025-07-24 11:02:53,421 [mindformers/models/modeling_utils.py:1517] load_checkpoint: model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. [INFO] 2025-07-24 11:02:53,423 [/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:178] parallel_qwen2_0_5b_predict_dp2_mp2: ----------------Transform and load checkpoint---------------- [INFO] 2025-07-24 11:02:53,561 [mindformers/version_control.py:76] decorator: Predict enable lazy inline. [INFO] 2025-07-24 11:02:53,576 [mindformers/models/modeling_utils.py:1517] load_checkpoint: model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. [INFO] 2025-07-24 11:02:53,576 [/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:85] parallel_qwen2_0_5b_predict_mp2: ----------------Transform and load checkpoint---------------- [INFO] 2025-07-24 11:02:53,611 [mindformers/version_control.py:76] decorator: Predict enable lazy inline. [INFO] 2025-07-24 11:02:53,665 [mindformers/version_control.py:76] decorator: Predict enable lazy inline. [INFO] 2025-07-24 11:02:53,671 [mindformers/models/modeling_utils.py:1517] load_checkpoint: model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 
[INFO] 2025-07-24 11:02:53,671 [mindformers/research/deepseek3/deepseek3_model_infer.py:1247] __init__: Predict run mode:True [INFO] 2025-07-24 11:02:53,671 [/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:274] parallel_deepseek_r1_bf16_predict_mp2: ----------------Transform and load checkpoint---------------- [INFO] 2025-07-24 11:02:54,877 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:02:54,878 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:02:54,879 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:02:54,880 [mindformers/research/qwen2_5/infer/qwen2_5.py:188] set_dynamic_inputs: Set dynamic input for llama. [WARNING] 2025-07-24 11:02:55,040 [mindformers/generation/text_generator.py:726] split_input_ids: batch size {batch} can not be divisible by data_parallel {data_parallel}, and would not split. [INFO] 2025-07-24 11:02:55,042 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:02:55,043 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:02:55,044 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:02:55,045 [mindformers/research/qwen2_5/infer/qwen2_5.py:188] set_dynamic_inputs: Set dynamic input for llama. [INFO] 2025-07-24 11:03:23,852 [mindformers/generation/text_generator.py:1159] generate: total time: 28.971864938735962 s; generated tokens: 8 tokens; generate speed: 0.27612996322179595 tokens/s [INFO] 2025-07-24 11:03:23,854 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.0019178390502929688 s; prefill predict time: 15.667938709259033 s; prefill post time: 0.05588793754577637 s; decode prepare time: 0.0012546948024204799 s; decode predict time: 0.007117470105489095 s; decode post time: 0.005690302167619977 s [INFO] 2025-07-24 11:03:23,859 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. 
[INFO] 2025-07-24 11:03:25,146 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:03:25,147 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:03:25,147 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:03:25,148 [mindformers/research/qwen2_5/infer/qwen2_5.py:188] set_dynamic_inputs: Set dynamic input for llama. [INFO] 2025-07-24 11:03:26,348 [mindformers/generation/text_generator.py:1159] generate: total time: 1.1993143558502197 s; generated tokens: 416 tokens; generate speed: 346.86485488209524 tokens/s [INFO] 2025-07-24 11:03:26,349 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.0017671585083007812 s; prefill predict time: 0.010278463363647461 s; prefill post time: 0.006461620330810547 s; decode prepare time: 0.0009049674839649386 s; decode predict time: 0.0057804561128803325 s; decode post time: 0.004656923627390445 s [INFO] 2025-07-24 11:03:26,353 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. [INFO] 2025-07-24 11:03:26,381 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:03:26,382 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:03:26,382 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:03:26,383 [mindformers/research/qwen2_5/infer/qwen2_5.py:188] set_dynamic_inputs: Set dynamic input for llama. [INFO] 2025-07-24 11:03:27,026 [mindformers/generation/text_generator.py:1159] generate: total time: 31.981154680252075 s; generated tokens: 8 tokens; generate speed: 0.2501473158172081 tokens/s [INFO] 2025-07-24 11:03:27,027 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.0020170211791992188 s; prefill predict time: 17.373072147369385 s; prefill post time: 0.05473589897155762 s; decode prepare time: 0.001155580793108259 s; decode predict time: 0.00666348139444987 s; decode post time: 0.011922836303710938 s [INFO] 2025-07-24 11:03:27,032 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. 
[INFO] 2025-07-24 11:03:27,956 [mindformers/generation/text_generator.py:1159] generate: total time: 1.5726771354675293 s; generated tokens: 816 tokens; generate speed: 518.8604714834985 tokens/s [INFO] 2025-07-24 11:03:27,956 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.0013928413391113281 s; prefill predict time: 0.010842323303222656 s; prefill post time: 0.008968353271484375 s; decode prepare time: 0.0010875522500217551 s; decode predict time: 0.005750880241394043 s; decode post time: 0.008397107077117013 s [INFO] 2025-07-24 11:03:27,961 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. [INFO] 2025-07-24 11:03:28,300 [mindformers/generation/text_generator.py:735] split_input_ids: The batch is: 4, and the split_size is: 2, and the global_rank_id is: 0, and the dp_rank_id is: 0 and start is: 0, and stop is: 2 [INFO] 2025-07-24 11:03:28,301 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:03:28,302 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:03:28,302 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:03:28,303 [mindformers/research/qwen2_5/infer/qwen2_5.py:188] set_dynamic_inputs: Set dynamic input for llama. [INFO] 2025-07-24 11:03:29,678 [mindformers/generation/text_generator.py:1159] generate: total time: 1.3750135898590088 s; generated tokens: 208 tokens; generate speed: 151.27123217838735 tokens/s [INFO] 2025-07-24 11:03:29,679 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.0015180110931396484 s; prefill predict time: 0.037731170654296875 s; prefill post time: 0.005625009536743164 s; decode prepare time: 0.0008118013733799018 s; decode predict time: 0.00613552215052586 s; decode post time: 0.005840921864926236 s [INFO] 2025-07-24 11:03:29,683 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. 
[INFO] 2025-07-24 11:03:29,700 [mindformers/generation/text_generator.py:735] split_input_ids: The batch is: 8, and the split_size is: 4, and the global_rank_id is: 0, and the dp_rank_id is: 0 and start is: 0, and stop is: 4 [INFO] 2025-07-24 11:03:29,701 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:03:29,702 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:03:29,702 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:03:29,703 [mindformers/research/qwen2_5/infer/qwen2_5.py:188] set_dynamic_inputs: Set dynamic input for llama. [INFO] 2025-07-24 11:03:30,809 [mindformers/generation/text_generator.py:1159] generate: total time: 1.1057765483856201 s; generated tokens: 408 tokens; generate speed: 368.9714713118669 tokens/s [INFO] 2025-07-24 11:03:30,810 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.0012700557708740234 s; prefill predict time: 0.010054349899291992 s; prefill post time: 0.008684635162353516 s; decode prepare time: 0.0008821086128159325 s; decode predict time: 0.00499650239944458 s; decode post time: 0.004758544487528282 s [INFO] 2025-07-24 11:03:30,814 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. [WARNING] 2025-07-24 11:03:34,583 [mindformers/models/tokenization_utils_base.py:1807] default_chat_template: No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. [INFO] 2025-07-24 11:03:34,664 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 1.0, 'top_k': 1, 'top_p': 1, 'repetition_penalty': 1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 1, 'bos_token_id': 0, 'eos_token_id': [1], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:03:34,665 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:03:34,666 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:03:34,667 [mindformers/research/deepseek3/deepseek3_model_infer.py:1419] set_dynamic_inputs: Set dynamic input for DeepseekV3. 
[INFO] 2025-07-24 11:04:11,682 [mindformers/generation/text_generator.py:1159] generate: total time: 37.01454830169678 s; generated tokens: 220 tokens; generate speed: 5.943608934704061 tokens/s [INFO] 2025-07-24 11:04:11,683 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.0035898685455322266 s; prefill predict time: 31.498730421066284 s; prefill post time: 0.01184535026550293 s; decode prepare time: 0.0008928908242119683 s; decode predict time: 0.005141928510845832 s; decode post time: 0.0005322959687974718 s [INFO] 2025-07-24 11:04:11,685 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. large_models/output/log/rank_3/0000755000175100017500000000000015040321130017172 5ustar jenkinsHwHiAiUserlarge_models/output/log/rank_3/error.log0000644000175100017500000000000015040321130021014 0ustar jenkinsHwHiAiUserlarge_models/output/log/rank_3/info.log0000644000175100017500000002537415040321202020643 0ustar jenkinsHwHiAiUser[WARNING] 2025-07-24 11:02:48,490 [mindformers/tools/register/template.py:84] _none_process: The input config moe_config is empty. [INFO] 2025-07-24 11:02:48,524 [mindformers/core/context/build_context.py:168] _set_predict_jit_config: Predict context config, jit_level: O0, infer_boost: on [INFO] 2025-07-24 11:02:48,527 [mindformers/core/context/parallel.py:73] _get_parallel_ctx_config: full_batch is set to False for non-parallel modes [INFO] 2025-07-24 11:02:51,988 [mindformers/tools/utils.py:185] set_strategy_save_path: set strategy path to './output/strategy/ckpt_strategy_rank_3.ckpt' [INFO] 2025-07-24 11:02:51,990 [mindformers/core/context/build_context.py:383] set_cpu_affinity: cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191] [WARNING] 2025-07-24 11:02:51,990 [mindformers/core/context/build_context.py:387] set_cpu_affinity: CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, 
disable binding cores [INFO] 2025-07-24 11:02:51,990 [mindformers/core/context/build_context.py:395] set_cpu_affinity: cpu_affinity, rank_id: 3, device_num: 4 [INFO] 2025-07-24 11:02:51,991 [mindformers/core/parallel_config.py:41] build_parallel_config: initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': None, 'n_group': None, 'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} [INFO] 2025-07-24 11:02:51,991 [mindformers/core/parallel_config.py:61] build_parallel_config: initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} [INFO] 2025-07-24 11:02:52,506 [mindformers/parallel_core/inference/parallel_state.py:358] initialize_model_parallel: expert_model_parallel_size(1) is not equal to world_size(4), so we will use 4 as the MOE_tensor_parallel_size. [INFO] 2025-07-24 11:02:53,727 [mindformers/models/modeling_utils.py:1517] load_checkpoint: model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. [INFO] 2025-07-24 11:02:53,728 [/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:178] parallel_qwen2_0_5b_predict_dp2_mp2: ----------------Transform and load checkpoint---------------- [WARNING] 2025-07-24 11:02:55,257 [mindformers/generation/text_generator.py:726] split_input_ids: batch size {batch} can not be divisible by data_parallel {data_parallel}, and would not split. 
[INFO] 2025-07-24 11:02:55,259 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:02:55,260 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:02:55,260 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:02:55,261 [mindformers/research/qwen2_5/infer/qwen2_5.py:188] set_dynamic_inputs: Set dynamic input for llama. [INFO] 2025-07-24 11:03:26,509 [mindformers/generation/text_generator.py:1159] generate: total time: 31.247695446014404 s; generated tokens: 8 tokens; generate speed: 0.2560188802985914 tokens/s [INFO] 2025-07-24 11:03:26,510 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.002022266387939453 s; prefill predict time: 16.738914489746094 s; prefill post time: 0.05533266067504883 s; decode prepare time: 0.0010406289781842912 s; decode predict time: 0.0066778262456258135 s; decode post time: 0.005154609680175781 s [INFO] 2025-07-24 11:03:26,515 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. [INFO] 2025-07-24 11:03:27,790 [mindformers/generation/text_generator.py:735] split_input_ids: The batch is: 4, and the split_size is: 2, and the global_rank_id is: 3, and the dp_rank_id is: 1 and start is: 2, and stop is: 4 [INFO] 2025-07-24 11:03:27,792 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:03:27,793 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:03:27,793 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:03:27,794 [mindformers/research/qwen2_5/infer/qwen2_5.py:188] set_dynamic_inputs: Set dynamic input for llama. 
[INFO] 2025-07-24 11:03:28,764 [mindformers/generation/text_generator.py:1159] generate: total time: 0.9698691368103027 s; generated tokens: 208 tokens; generate speed: 214.4619228570038 tokens/s [INFO] 2025-07-24 11:03:28,765 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.0013117790222167969 s; prefill predict time: 0.02124929428100586 s; prefill post time: 0.005386829376220703 s; decode prepare time: 0.000852976030516393 s; decode predict time: 0.004929540204066856 s; decode post time: 0.003230787017970409 s [INFO] 2025-07-24 11:03:28,769 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. [INFO] 2025-07-24 11:03:28,785 [mindformers/generation/text_generator.py:735] split_input_ids: The batch is: 8, and the split_size is: 4, and the global_rank_id is: 3, and the dp_rank_id is: 1 and start is: 4, and stop is: 8 [INFO] 2025-07-24 11:03:28,786 [mindformers/generation/text_generator.py:892] generate: Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} [INFO] 2025-07-24 11:03:28,787 [mindformers/generation/text_generator.py:950] generate: The generation mode will be **GREEDY_SEARCH**. [INFO] 2025-07-24 11:03:28,787 [mindformers/modules/block_tables.py:63] init_cache_engine: init cache engine success. [INFO] 2025-07-24 11:03:28,788 [mindformers/research/qwen2_5/infer/qwen2_5.py:188] set_dynamic_inputs: Set dynamic input for llama. [INFO] 2025-07-24 11:03:30,032 [mindformers/generation/text_generator.py:1159] generate: total time: 1.243579626083374 s; generated tokens: 408 tokens; generate speed: 328.0851434378889 tokens/s [INFO] 2025-07-24 11:03:30,032 [mindformers/tools/debug_info.py:93] print_info: prefill prepare time: 0.0013360977172851562 s; prefill predict time: 0.011849641799926758 s; prefill post time: 0.00650477409362793 s; decode prepare time: 0.0009570310611536007 s; decode predict time: 0.006123967170715332 s; decode post time: 0.004934752341544274 s [INFO] 2025-07-24 11:03:30,036 [mindformers/modules/block_tables.py:126] clear_cache: Clear block table cache engines. large_models/offload/0000700000175100017500000000000015040321132015276 5ustar jenkinsHwHiAiUserlarge_models/similarity.py0000644000175100017500000000363515040315702016453 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================
import math

import jieba
import numpy as np


def _get_all_words(standard_cut_infer_ret_list, test_cut_infer_ret_list):
    all_words = []
    for s_cut in standard_cut_infer_ret_list:
        if s_cut not in all_words:
            all_words.append(s_cut)
    for t_cut in test_cut_infer_ret_list:
        if t_cut not in all_words:
            all_words.append(t_cut)
    return all_words


def _get_word_vector(standard_cut_infer_ret_list, test_cut_infer_ret_list, all_words):
    la_standard = []
    lb_test = []
    for word in all_words:
        la_standard.append(standard_cut_infer_ret_list.count(word))
        lb_test.append(test_cut_infer_ret_list.count(word))
    return la_standard, lb_test


def _get_calculate_cos(la_standard, lb_test):
    laa = np.array(la_standard)
    lbb = np.array(lb_test)
    cos = (np.dot(laa, lbb.T)) / ((math.sqrt(np.dot(laa, laa.T))) * (math.sqrt(np.dot(lbb, lbb.T))))
    return np.round(cos, 2)


def compare_distance(x1, x2, bench_sim=0.95):
    """compare distance"""
    y1 = list(jieba.cut(x1))
    y2 = list(jieba.cut(x2))
    all_words = _get_all_words(y1, y2)
    laa, lbb = _get_word_vector(y1, y2, all_words)
    sim = _get_calculate_cos(laa, lbb)
    print("calculate sim is:{}".format(str(sim)))
    assert sim >= bench_sim
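A minimal usage sketch for compare_distance, for orientation only: the two strings below are made up, the import assumes the script runs from the directory containing similarity.py, and bench_sim is loosened from the 0.95 default the tests use.

from similarity import compare_distance

# Hypothetical strings; in the test suite the first would be a golden
# (standard) answer and the second a freshly generated one.
standard_output = "I love Beijing, because it is a city with both modern vitality and history."
test_output = "I love Beijing, because it is a city with both modern vitality and historical depth."

# Raises AssertionError if the cosine similarity of the jieba word-count
# vectors falls below bench_sim; otherwise just prints the score.
compare_distance(standard_output, test_output, bench_sim=0.8)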
""" def __init__(self, config, network, is_quant): self.config = config self.network = network self.is_quant = is_quant self.global_rank_id = get_rank() self.tp_group_size = get_tp_world_size() self.tp_rank_id = self.global_rank_id % self.tp_group_size self.parameter_dict = {} self.file_handles = {} def get_file_handles(self, filename): if filename not in self.file_handles: fp = safe_open(filename, framework="np") self.file_handles[filename] = fp return self.file_handles[filename] def release_file_handles(self): del self.file_handles def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map): safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) sf_file = self.get_file_handles(filename) qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True np_data = sf_file.get_tensor(hf_param_name) return np_data, qint4 def get_safetensor_from_file_split_tp_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) sf_file = self.get_file_handles(filename) qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True np_data = sf_file.get_slice(hf_param_name) shape = np_data.get_shape() if split_axis == 0: split_size = shape[0] // self.tp_group_size start = self.tp_rank_id * split_size stop = (self.tp_rank_id + 1) * split_size split_data = np_data[start:stop] elif split_axis == 1: split_size = shape[1] // self.tp_group_size start = self.tp_rank_id * split_size stop = (self.tp_rank_id + 1) * split_size split_data = np_data[:, start:stop] elif split_axis == 2: split_size = shape[2] // self.tp_group_size start = self.tp_rank_id * split_size stop = (self.tp_rank_id + 1) * split_size split_data = np_data[:, :, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) return split_data, qint4 def split_weight_by_rank(self, weight, split_axis=0): if self.tp_group_size == 1: return weight shape = weight.shape if split_axis == 0: split_size = shape[0] // self.tp_group_size start = self.tp_rank_id * split_size stop = (self.tp_rank_id + 1) * split_size split_data = weight[start:stop] elif split_axis == 1: split_size = shape[1] // self.tp_group_size start = self.tp_rank_id * split_size stop = (self.tp_rank_id + 1) * split_size split_data = weight[:, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) return split_data def load_safetensors_shard(self, src_hf_dir): """ load safetensors and shards """ raise NotImplementedError("load_safetensors_shard method is not implemented.") large_models/parallel_qwen2_0_5b_predict_mp2/0000700000175100017500000000000015040321111021666 5ustar jenkinsHwHiAiUserlarge_models/parallel_qwen2_0_5b_predict_mp2/worker_1.log0000644000175100017500000042071315040321200024142 0ustar jenkinsHwHiAiUser/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. 
large_models/parallel_qwen2_0_5b_predict_mp2/worker_1.log
/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
2025-07-24 11:02:48,650 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config moe_config is empty.
2025-07-24 11:02:48,683 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on
[WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:48.684.944 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:48.685.766 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:48.686.256 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:48.686.380 [mindspore/context.py:1655] For 'context.set_context', 'enable_graph_kernel' parameter is deprecated, and will be removed in the next version. Please use jit_config={'jit_level': 'O1'} instead.
[WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:48.686.520 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:48.686.672 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:48.686.799 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
2025-07-24 11:02:48,687 - mindformers./output/log[mindformers/core/context/parallel.py:73] - INFO - full_batch is set to False for non-parallel modes
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:48.688.997 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 20 source: 127.0.0.1:38714, destination: 127.0.0.1:8222
[WARNING] DISTRIBUTED(3806944,ffff0ada9060,python):2025-07-24-11:02:48.689.001 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:38714 to 127.0.0.1:8222 is successfully created.
System errno: Success
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:48.689.063 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8222 to be connected...Retry number: 1
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:49.189.250 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:38720, destination: 127.0.0.1:8222
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:49.189.276 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8222 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(3806944,ffff0bdab060,python):2025-07-24-11:02:49.189.288 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:38720 to 127.0.0.1:8222 is successfully created.
System errno: Success
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:49.689.692 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200).
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:50.189.803 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized.
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:50.189.829 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1
[MS_ALLOC_CONF]Runtime config: enable_vmm:False
[WARNING] DEVICE(3806944,ffff9a73ff30,python):2025-07-24-11:02:50.411.392 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:155] Initialize] Reserved memory size for other components(2101346304) is less than recommend size(4068418816), It may lead to Out Of Memory in HCCL or other components, Please double check context key 'variable_memory_max_size'/'max_device_memory'
[WARNING] DEVICE(3806944,ffff9a73ff30,python):2025-07-24-11:02:51.709.717 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:84] Initialize] Loading LCCL because env MS_ENABLE_LCCL is set to on. Pay attention that LCCL only supports communication group within single node in KernelByKernel for now.
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:51.713.426 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1}, async: 1, submit_now: 1
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:51.713.628 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group
[WARNING] DEVICE(3806944,fffe7e7fc060,python):2025-07-24-11:02:51.713.775 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:8222, node_rank:2130706433, total_rank_size:2, local_rank_size2
[WARNING] HCCL_ADPT(3806944,fffe7e7fc060,python):2025-07-24-11:02:51.713.844 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo
[WARNING] HCCL_ADPT(3806944,fffe7e7fc060,python):2025-07-24-11:02:51.713.866 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:632] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package.
[WARNING] DEVICE(3806944,fffe7e7fc060,python):2025-07-24-11:02:51.713.883 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group
[WARNING] DEVICE(3806944,fffe7e7fc060,python):2025-07-24-11:02:51.714.170 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_comm_lib.cc:251] QueryUniqueID] Retry to lookup the unique id for group hccl_world_group from the meta server node...Retry time: 399/400, sleep 2
2025-07-24 11:02:51,715 - mindformers./output/log[mindformers/tools/utils.py:185] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_1.ckpt'
2025-07-24 11:02:51,717 - mindformers./output/log[mindformers/core/context/build_context.py:383] - INFO - cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191]
2025-07-24 11:02:51,717 - mindformers./output/log[mindformers/core/context/build_context.py:387] - WARNING - CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores 2025-07-24 11:02:51,718 - mindformers./output/log[mindformers/core/context/build_context.py:395] - INFO - cpu_affinity, rank_id: 1, device_num: 2 2025-07-24 11:02:51,718 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': None, 'n_group': None, 'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} 2025-07-24 11:02:51,719 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 1, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} tp_group is:True dp_group is:True 2025-07-24 11:02:52,210 - mindformers./output/log[mindformers/parallel_core/inference/parallel_state.py:358] - INFO - expert_model_parallel_size(1) is not equal to world_size(2), so we will use 2 as the MOE_tensor_parallel_size. 
[WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:52.211.967 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: tp-0-1 [const vector]{0, 1}, async: 0, submit_now: 1 [WARNING] DISTRIBUTED(3806944,fffe7e7fc060,python):2025-07-24-11:02:52.214.441 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group [WARNING] DEVICE(3806944,fffe5eabf060,python):2025-07-24-11:02:52.228.831 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806944,fffe5eabf060,python):2025-07-24-11:02:52.789.518 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group [WARNING] DISTRIBUTED(3806944,fffe5eabf060,python):2025-07-24-11:02:52.789.635 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: hccl_world_group [WARNING] DISTRIBUTED(3806944,fffe5eabf060,python):2025-07-24-11:02:52.789.662 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: hccl_world_group end. [WARNING] DISTRIBUTED(3806944,fffe7e7fc060,python):2025-07-24-11:02:52.789.756 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: hccl_world_group [WARNING] DISTRIBUTED(3806944,fffe7e7fc060,python):2025-07-24-11:02:52.790.015 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: tp-0-1 [WARNING] DEVICE(3806944,fffe5eabf060,python):2025-07-24-11:02:52.801.841 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806944,fffe5eabf060,python):2025-07-24-11:02:53.035.499 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1 [WARNING] DISTRIBUTED(3806944,fffe5eabf060,python):2025-07-24-11:02:53.035.627 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: tp-0-1 [WARNING] DISTRIBUTED(3806944,fffe5eabf060,python):2025-07-24-11:02:53.035.655 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: tp-0-1 end. 
[WARNING] DISTRIBUTED(3806944,fffe7e7fc060,python):2025-07-24-11:02:53.035.707 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: tp-0-1 [WARNING] DISTRIBUTED(3806944,ffff9a73ff30,python):2025-07-24-11:02:53.035.889 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: dp-1 [const vector]{1}, async: 0, submit_now: 1 [WARNING] DISTRIBUTED(3806944,fffe7e7fc060,python):2025-07-24-11:02:53.038.399 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: dp-1 [WARNING] DEVICE(3806944,fffe3ffff060,python):2025-07-24-11:02:53.049.760 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for dp-1, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806944,fffe3ffff060,python):2025-07-24-11:02:53.101.938 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for dp-1 [WARNING] DISTRIBUTED(3806944,fffe3ffff060,python):2025-07-24-11:02:53.102.040 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: dp-1 [WARNING] DISTRIBUTED(3806944,fffe3ffff060,python):2025-07-24-11:02:53.102.066 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: dp-1 end. [WARNING] DISTRIBUTED(3806944,fffe7e7fc060,python):2025-07-24-11:02:53.102.125 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: dp-1 [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:53.116.947 [mindspore/ops/primitive.py:220] The in_strategy/in_layout of the operator in your network will not take effect in stand_alone mode. This means the the shard function called in the network is ignored. If you want to enable it, please use semi auto or auto parallel mode by context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL or context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL) data_parallel_group:dp-1 tensor_model_parallel_group:tp-0-1 2025-07-24 11:02:53,749 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-24 11:02:53,749 - mindformers./output/log[/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:85] - INFO - ----------------Transform and load checkpoint---------------- [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.676.830 [mindspore/train/serialization.py:333] The type of model.layers.0.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.677.597 [mindspore/train/serialization.py:333] The type of model.layers.0.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.678.080 [mindspore/train/serialization.py:333] The type of model.layers.1.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.678.698 [mindspore/train/serialization.py:333] The type of model.layers.1.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.679.148 [mindspore/train/serialization.py:333] The type of model.layers.2.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.679.776 [mindspore/train/serialization.py:333] The type of model.layers.2.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.680.229 [mindspore/train/serialization.py:333] The type of model.layers.3.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.680.832 [mindspore/train/serialization.py:333] The type of model.layers.3.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.681.278 [mindspore/train/serialization.py:333] The type of model.layers.4.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.681.904 [mindspore/train/serialization.py:333] The type of model.layers.4.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.682.358 [mindspore/train/serialization.py:333] The type of model.layers.5.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.682.974 [mindspore/train/serialization.py:333] The type of model.layers.5.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.683.418 [mindspore/train/serialization.py:333] The type of model.layers.6.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.684.034 [mindspore/train/serialization.py:333] The type of model.layers.6.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.684.481 [mindspore/train/serialization.py:333] The type of model.layers.7.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.685.095 [mindspore/train/serialization.py:333] The type of model.layers.7.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.685.541 [mindspore/train/serialization.py:333] The type of model.layers.8.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.686.175 [mindspore/train/serialization.py:333] The type of model.layers.8.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.686.625 [mindspore/train/serialization.py:333] The type of model.layers.9.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.687.224 [mindspore/train/serialization.py:333] The type of model.layers.9.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.687.669 [mindspore/train/serialization.py:333] The type of model.layers.10.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.688.269 [mindspore/train/serialization.py:333] The type of model.layers.10.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.688.724 [mindspore/train/serialization.py:333] The type of model.layers.11.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.689.322 [mindspore/train/serialization.py:333] The type of model.layers.11.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.689.780 [mindspore/train/serialization.py:333] The type of model.layers.12.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.690.399 [mindspore/train/serialization.py:333] The type of model.layers.12.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.690.845 [mindspore/train/serialization.py:333] The type of model.layers.13.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.691.445 [mindspore/train/serialization.py:333] The type of model.layers.13.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.691.890 [mindspore/train/serialization.py:333] The type of model.layers.14.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.692.512 [mindspore/train/serialization.py:333] The type of model.layers.14.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.692.968 [mindspore/train/serialization.py:333] The type of model.layers.15.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.693.563 [mindspore/train/serialization.py:333] The type of model.layers.15.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.694.026 [mindspore/train/serialization.py:333] The type of model.layers.16.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.694.641 [mindspore/train/serialization.py:333] The type of model.layers.16.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.695.088 [mindspore/train/serialization.py:333] The type of model.layers.17.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.695.688 [mindspore/train/serialization.py:333] The type of model.layers.17.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.696.131 [mindspore/train/serialization.py:333] The type of model.layers.18.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.696.731 [mindspore/train/serialization.py:333] The type of model.layers.18.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.697.181 [mindspore/train/serialization.py:333] The type of model.layers.19.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.697.802 [mindspore/train/serialization.py:333] The type of model.layers.19.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.698.253 [mindspore/train/serialization.py:333] The type of model.layers.20.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.698.875 [mindspore/train/serialization.py:333] The type of model.layers.20.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.699.324 [mindspore/train/serialization.py:333] The type of model.layers.21.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.699.929 [mindspore/train/serialization.py:333] The type of model.layers.21.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.700.390 [mindspore/train/serialization.py:333] The type of model.layers.22.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.701.007 [mindspore/train/serialization.py:333] The type of model.layers.22.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.701.452 [mindspore/train/serialization.py:333] The type of model.layers.23.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.702.071 [mindspore/train/serialization.py:333] The type of model.layers.23.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.702.523 [mindspore/train/serialization.py:333] The type of model.norm_out.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.705.595 [mindspore/train/serialization.py:1789] For 'load_param_into_net', 48 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint. 
[WARNING] ME(3806944:281473273036592,MainProcess):2025-07-24-11:02:54.705.742 [mindspore/train/serialization.py:1793] ['model.layers.0.attention.paged_attention_mgr.key_cache', 'model.layers.0.attention.paged_attention_mgr.value_cache', 'model.layers.1.attention.paged_attention_mgr.key_cache', 'model.layers.1.attention.paged_attention_mgr.value_cache', 'model.layers.2.attention.paged_attention_mgr.key_cache', 'model.layers.2.attention.paged_attention_mgr.value_cache', 'model.layers.3.attention.paged_attention_mgr.key_cache', 'model.layers.3.attention.paged_attention_mgr.value_cache', 'model.layers.4.attention.paged_attention_mgr.key_cache', 'model.layers.4.attention.paged_attention_mgr.value_cache', 'model.layers.5.attention.paged_attention_mgr.key_cache', 'model.layers.5.attention.paged_attention_mgr.value_cache', 'model.layers.6.attention.paged_attention_mgr.key_cache', 'model.layers.6.attention.paged_attention_mgr.value_cache', 'model.layers.7.attention.paged_attention_mgr.key_cache', 'model.layers.7.attention.paged_attention_mgr.value_cache', 'model.layers.8.attention.paged_attention_mgr.key_cache', 'model.layers.8.attention.paged_attention_mgr.value_cache', 'model.layers.9.attention.paged_attention_mgr.key_cache', 'model.layers.9.attention.paged_attention_mgr.value_cache', 'model.layers.10.attention.paged_attention_mgr.key_cache', 'model.layers.10.attention.paged_attention_mgr.value_cache', 'model.layers.11.attention.paged_attention_mgr.key_cache', 'model.layers.11.attention.paged_attention_mgr.value_cache', 'model.layers.12.attention.paged_attention_mgr.key_cache', 'model.layers.12.attention.paged_attention_mgr.value_cache', 'model.layers.13.attention.paged_attention_mgr.key_cache', 'model.layers.13.attention.paged_attention_mgr.value_cache', 'model.layers.14.attention.paged_attention_mgr.key_cache', 'model.layers.14.attention.paged_attention_mgr.value_cache', 'model.layers.15.attention.paged_attention_mgr.key_cache', 'model.layers.15.attention.paged_attention_mgr.value_cache', 'model.layers.16.attention.paged_attention_mgr.key_cache', 'model.layers.16.attention.paged_attention_mgr.value_cache', 'model.layers.17.attention.paged_attention_mgr.key_cache', 'model.layers.17.attention.paged_attention_mgr.value_cache', 'model.layers.18.attention.paged_attention_mgr.key_cache', 'model.layers.18.attention.paged_attention_mgr.value_cache', 'model.layers.19.attention.paged_attention_mgr.key_cache', 'model.layers.19.attention.paged_attention_mgr.value_cache', 'model.layers.20.attention.paged_attention_mgr.key_cache', 'model.layers.20.attention.paged_attention_mgr.value_cache', 'model.layers.21.attention.paged_attention_mgr.key_cache', 'model.layers.21.attention.paged_attention_mgr.value_cache', 'model.layers.22.attention.paged_attention_mgr.key_cache', 'model.layers.22.attention.paged_attention_mgr.value_cache', 'model.layers.23.attention.paged_attention_mgr.key_cache', 'model.layers.23.attention.paged_attention_mgr.value_cache'] are not loaded. 
param_not_load: ['model.layers.0.attention.paged_attention_mgr.key_cache', 'model.layers.0.attention.paged_attention_mgr.value_cache', 'model.layers.1.attention.paged_attention_mgr.key_cache', 'model.layers.1.attention.paged_attention_mgr.value_cache', 'model.layers.2.attention.paged_attention_mgr.key_cache', 'model.layers.2.attention.paged_attention_mgr.value_cache', 'model.layers.3.attention.paged_attention_mgr.key_cache', 'model.layers.3.attention.paged_attention_mgr.value_cache', 'model.layers.4.attention.paged_attention_mgr.key_cache', 'model.layers.4.attention.paged_attention_mgr.value_cache', 'model.layers.5.attention.paged_attention_mgr.key_cache', 'model.layers.5.attention.paged_attention_mgr.value_cache', 'model.layers.6.attention.paged_attention_mgr.key_cache', 'model.layers.6.attention.paged_attention_mgr.value_cache', 'model.layers.7.attention.paged_attention_mgr.key_cache', 'model.layers.7.attention.paged_attention_mgr.value_cache', 'model.layers.8.attention.paged_attention_mgr.key_cache', 'model.layers.8.attention.paged_attention_mgr.value_cache', 'model.layers.9.attention.paged_attention_mgr.key_cache', 'model.layers.9.attention.paged_attention_mgr.value_cache', 'model.layers.10.attention.paged_attention_mgr.key_cache', 'model.layers.10.attention.paged_attention_mgr.value_cache', 'model.layers.11.attention.paged_attention_mgr.key_cache', 'model.layers.11.attention.paged_attention_mgr.value_cache', 'model.layers.12.attention.paged_attention_mgr.key_cache', 'model.layers.12.attention.paged_attention_mgr.value_cache', 'model.layers.13.attention.paged_attention_mgr.key_cache', 'model.layers.13.attention.paged_attention_mgr.value_cache', 'model.layers.14.attention.paged_attention_mgr.key_cache', 'model.layers.14.attention.paged_attention_mgr.value_cache', 'model.layers.15.attention.paged_attention_mgr.key_cache', 'model.layers.15.attention.paged_attention_mgr.value_cache', 'model.layers.16.attention.paged_attention_mgr.key_cache', 'model.layers.16.attention.paged_attention_mgr.value_cache', 'model.layers.17.attention.paged_attention_mgr.key_cache', 'model.layers.17.attention.paged_attention_mgr.value_cache', 'model.layers.18.attention.paged_attention_mgr.key_cache', 'model.layers.18.attention.paged_attention_mgr.value_cache', 'model.layers.19.attention.paged_attention_mgr.key_cache', 'model.layers.19.attention.paged_attention_mgr.value_cache', 'model.layers.20.attention.paged_attention_mgr.key_cache', 'model.layers.20.attention.paged_attention_mgr.value_cache', 'model.layers.21.attention.paged_attention_mgr.key_cache', 'model.layers.21.attention.paged_attention_mgr.value_cache', 'model.layers.22.attention.paged_attention_mgr.key_cache', 'model.layers.22.attention.paged_attention_mgr.value_cache', 'model.layers.23.attention.paged_attention_mgr.key_cache', 'model.layers.23.attention.paged_attention_mgr.value_cache'], ckpt_not_load: [] 2025-07-24 11:02:55,043 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, 
'_from_model_config': True} 2025-07-24 11:02:55,044 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:02:55,044 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:02:55,045 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. TotalTime = 13.4871, [24] [bootstrap]: 0.0118184 [type_inference]: 8.28798 [auto_monad]: 0.123224 [graph_reusing]: 0.00503581 [inline]: 4.35009e-06 [add_attr]: 0.103459, [1] [add_attr_with_inline]: 0.103404, [1] [Cycle 1]: 0.0391927, [2] [tag_attr]: 0.025754 [meta_addattr_fg_expand]: 0.0132499 [parallel-infer-symbol]: 5.08991e-06 [pre_auto_parallel]: 0.0324508 [insert-virtual-dataset]: 9.34e-06 [parallel-infer-symbol-second]: 3.13995e-06 [dataset_repeat_opt]: 3.36999e-06 [pipeline_split]: 2.26998e-06 [optimize]: 3.57688, [53] [py_interpret_to_execute]: 0.0355279 [rewriter_before_opt_a]: 0.144316 [opt_a]: 3.11178, [3] [Cycle 1]: 2.46837, [45] [expand_dump_flag]: 0.0020939 [switch_simplify]: 0.0492118 [loop_unroll]: 0.0350788 [a_1]: 1.1919 [invalid_dout_check]: 0.010235 [recompute_prepare]: 0.00869014 [updatestate_depend_eliminate]: 0.0333403 [updatestate_assign_eliminate]: 0.00427447 [updatestate_loads_eliminate]: 0.0144465 [parameter_eliminate]: 1.022e-05 [a_2]: 0.104931 [accelerated_algorithm]: 0.00659016 [shard]: 4.99003e-06 [meta_shard_fg_expand]: 0.00464915 [shard_inline]: 0.0033198 [merge_send_recv]: 0.00299908 [auto_parallel]: 0.00294665 [parallel]: 1.40901e-05 [flash_sp]: 0.00170664 [merge_comm]: 0.00296214 [allreduce_fusion]: 0.00300066 [matmul_add_comm_reduction]: 0.00426971 [allreduce_slice_to_reducescatter]: 1.12003e-06 [virtual_shard_identity]: 0.00330141 [virtual_dataset]: 0.00324354 [get_grad_eliminate_]: 0.00333189 [virtual_output]: 0.00319039 [merge_forward]: 0.0029066 [offload_activation]: 0.00432183 [cell_reuse_recompute_pass]: 4.14008e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00542946 [merge_recompute_call_nodes]: 2.31003e-06 [before_grad]: 0.00521327 [set_forward_comm_id_for_comm_node_pass]: 0.00320367 [meta_fg_expand]: 0.00648664 [flash_sp_send_recv_attached]: 5.09003e-06 [receive_attached]: 1.5201e-05 [after_resolve]: 0.00350466 [a_after_grad]: 0.00493843 [renormalize]: 0.86418 [add_forward_monad_depend]: 2.406e-05 [auto_monad_grad]: 3.85998e-06 [auto_monad_eliminator]: 0.0251528 [cse]: 0.0209479 [a_3]: 0.0213047 [Cycle 2]: 0.40985, [45] [expand_dump_flag]: 4.80993e-06 [switch_simplify]: 0.00286075 [loop_unroll]: 0.00286448 [a_1]: 0.0734645 [invalid_dout_check]: 0.00354367 [recompute_prepare]: 0.00271742 [updatestate_depend_eliminate]: 0.00264695 [updatestate_assign_eliminate]: 0.0026351 [updatestate_loads_eliminate]: 0.00268785 [parameter_eliminate]: 8.34989e-06 [a_2]: 0.0467559 [accelerated_algorithm]: 0.00360263 [shard]: 3.82995e-06 [meta_shard_fg_expand]: 0.00193385 [shard_inline]: 0.00293388 [merge_send_recv]: 0.00281401 [auto_parallel]: 0.00283388 [parallel]: 1.199e-05 [flash_sp]: 6.32997e-06 [merge_comm]: 0.00280685 [allreduce_fusion]: 0.00278674 [matmul_add_comm_reduction]: 0.00356424 [allreduce_slice_to_reducescatter]: 1.13004e-06 [virtual_shard_identity]: 0.00293839 [virtual_dataset]: 0.00288229 [get_grad_eliminate_]: 0.00292756 [virtual_output]: 0.00286814 [merge_forward]: 0.00275982 [offload_activation]: 0.00362616 [cell_reuse_recompute_pass]: 3.43996e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00480277 
[merge_recompute_call_nodes]: 2.20002e-06 [before_grad]: 0.00469719 [set_forward_comm_id_for_comm_node_pass]: 0.00297451 [meta_fg_expand]: 0.00364581 [flash_sp_send_recv_attached]: 3.62005e-06 [receive_attached]: 3.66999e-06 [after_resolve]: 0.00338831 [a_after_grad]: 0.00441756 [renormalize]: 0.164442 [add_forward_monad_depend]: 1.76501e-05 [auto_monad_grad]: 4.12995e-06 [auto_monad_eliminator]: 0.00505985 [cse]: 0.0127166 [a_3]: 0.0214202 [Cycle 3]: 0.233515, [45] [expand_dump_flag]: 4.89003e-06 [switch_simplify]: 0.00287915 [loop_unroll]: 0.00286873 [a_1]: 0.0696707 [invalid_dout_check]: 0.0022429 [recompute_prepare]: 0.00268607 [updatestate_depend_eliminate]: 0.00258556 [updatestate_assign_eliminate]: 0.00256838 [updatestate_loads_eliminate]: 0.00260091 [parameter_eliminate]: 5.60004e-06 [a_2]: 0.0467848 [accelerated_algorithm]: 0.00359905 [shard]: 2.95008e-06 [meta_shard_fg_expand]: 0.00125599 [shard_inline]: 0.00290726 [merge_send_recv]: 0.00270372 [auto_parallel]: 0.00271557 [parallel]: 1.087e-05 [flash_sp]: 2.16998e-06 [merge_comm]: 0.00273999 [allreduce_fusion]: 0.00270607 [matmul_add_comm_reduction]: 0.0034313 [allreduce_slice_to_reducescatter]: 1.2801e-06 [virtual_shard_identity]: 0.00287636 [virtual_dataset]: 0.00288303 [get_grad_eliminate_]: 0.00292373 [virtual_output]: 0.00285943 [merge_forward]: 0.00270782 [offload_activation]: 0.00344767 [cell_reuse_recompute_pass]: 3.75998e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00495474 [merge_recompute_call_nodes]: 2.44996e-06 [before_grad]: 0.00466504 [set_forward_comm_id_for_comm_node_pass]: 0.00296205 [meta_fg_expand]: 0.00357932 [flash_sp_send_recv_attached]: 2.52004e-06 [receive_attached]: 4.10993e-06 [after_resolve]: 0.00322359 [a_after_grad]: 0.0044446 [renormalize]: 2.09897e-07 [add_forward_monad_depend]: 4.38001e-06 [auto_monad_grad]: 2.34006e-06 [auto_monad_eliminator]: 0.0044937 [cse]: 0.00851221 [a_3]: 0.0213732 [py_interpret_to_execute_after_opt_a]: 0.00403177 [slice_cell_reuse_recomputed_activation]: 3.15998e-06 [rewriter_after_opt_a]: 0.0370823 [convert_after_rewriter]: 0.00270593 [order_py_execute_after_rewriter]: 0.00231722 [opt_b]: 0.0948498, [1] [Cycle 1]: 0.0948392, [7] [b_1]: 0.0751214 [b_2]: 0.00303137 [updatestate_depend_eliminate]: 0.00262753 [updatestate_assign_eliminate]: 0.00264429 [updatestate_loads_eliminate]: 0.00264572 [renormalize]: 8.10018e-07 [cse]: 0.0086457 [optimize_parallel_all_gather_comm]: 0.00483786 [overlap_param_gather]: 9.65991e-06 [cconv]: 0.00141131 [loop_unroll]: 0.00376588 [opt_after_cconv]: 0.0299833, [1] [Cycle 1]: 0.0299749, [7] [c_1]: 0.0131869 [parameter_eliminate]: 3.50003e-06 [updatestate_depend_eliminate]: 0.00296427 [updatestate_assign_eliminate]: 0.00260649 [updatestate_loads_eliminate]: 0.0026277 [cse]: 0.00848113 [renormalize]: 4.50062e-07 [remove_dup_value]: 0.0145852 [tuple_transform]: 0.0181345, [1] [Cycle 1]: 0.018124, [2] [d_1]: 0.0180949 [renormalize]: 2.69967e-07 [partial_unused_args_eliminate]: 4.65999e-06 [add_cache_embedding]: 0.00284897 [add_recomputation]: 0.0165963 [cse_after_recomputation]: 0.00495532, [1] [Cycle 1]: 0.00494451, [1] [cse]: 0.00492261 [environ_conv]: 0.00173839 [swap_dp_allreduce_reducescatter]: 0.0027316 [bias_add_comm_swap]: 3.46999e-06 [label_micro_interleaved_index]: 7.32997e-06 [label_fine_grained_interleaved_index]: 3.03006e-06 [merge_cast_opt]: 2.00002e-06 [slice_recompute_activation]: 2.20002e-06 [micro_interleaved_order_control]: 2.92994e-06 [assign_add_opt]: 1.638e-05 [ForceFp32Comm]: 1.07009e-06 [remove_cast_before_assign_add]: 
1.27009e-06 [full_micro_interleaved_order_control]: 2.60002e-06 [reorder_send_recv_between_fp_bp]: 2.66999e-06 [comm_op_add_attrs]: 1.50001e-06 [add_comm_op_reuse_tag]: 1.04995e-06 [interleave_split_concat_branches]: 1.40991e-06 [interleave_parallel_branches]: 1.21002e-06 [overlap_opt_shard_in_pipeline]: 3.5931e-05 [overlap_opt_shard_grad_in_pipeline]: 2.73006e-06 [control_data_broadcast_order]: 0.00468588 [grouped_pairwise_exchange_alltoall]: 1.54006e-06 [offloading_packed_experts]: 0.00104585 [overlap_recompute_and_grad_model_parallel]: 0.00104845 [overlap_grad_matmul_and_grad_allreduce]: 3.64007e-06 [overlap_recompute_allgather_and_fa_grad]: 1.71992e-06 [overlap_recompute_comm]: 2.68e-06 [overlap_grad_ring_attention]: 0.00103036 [overlap_grad_flash_sp]: 0.00521518 [begin_end_overlap_inline]: 1.17009e-06 [split_matmul_comm_elemetwise]: 2.80002e-06 [split_layernorm_comm]: 2.22004e-06 [handle_group_info]: 1.47009e-06 [symbol_engine_optimizer]: 0.0288651, [1] [Cycle 1]: 0.0288569, [6] [build]: 0.0138519 [elim_shapecalc]: 0.00260058 [elim_not_effective]: 0.00635708 [opt_reshape]: 0.00191464 [fold_const_symbol]: 0.00403067 [renormalize]: 4.7998e-07 [detach_backward]: 3.29001e-06 [pipeline_parallel_scheduler]: 2.48e-06 [auto_monad_reorder]: 0.00398925 [get_jit_bprop_graph]: 2.33995e-06 [rewriter_after_jit_bprop_graph]: 7.29994e-06 [opt_after_jit_grad]: 0.00657245 [distribtued_split]: 0.00471831 [validate]: 0.00333971 [backend_pass]: 2.0701e-06 [task_emit]: 1.32349 [execute]: 1.0261e-05 Sums bootstrap : 0.011818s : 0.09% type_inference : 8.287978s : 61.78% auto_monad : 0.123224s : 0.92% graph_reusing : 0.005036s : 0.04% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.025754s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.013250s : 0.10% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.032451s : 0.24% insert-virtual-dataset : 0.000009s : 0.00% parallel-infer-symbol-second : 0.000003s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.035528s : 0.26% optimize.rewriter_before_opt_a : 0.144316s : 1.08% optimize.opt_a.expand_dump_flag : 0.002104s : 0.02% optimize.opt_a.switch_simplify : 0.054952s : 0.41% optimize.opt_a.loop_unroll : 0.040812s : 0.30% optimize.opt_a.a_1 : 1.335038s : 9.95% optimize.opt_a.invalid_dout_check : 0.016022s : 0.12% optimize.opt_a.recompute_prepare : 0.014094s : 0.11% optimize.opt_a.updatestate_depend_eliminate : 0.038573s : 0.29% optimize.opt_a.updatestate_assign_eliminate : 0.009478s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.019735s : 0.15% optimize.opt_a.parameter_eliminate : 0.000024s : 0.00% optimize.opt_a.a_2 : 0.198472s : 1.48% optimize.opt_a.accelerated_algorithm : 0.013792s : 0.10% optimize.opt_a.shard : 0.000012s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.007839s : 0.06% optimize.opt_a.shard_inline : 0.009161s : 0.07% optimize.opt_a.merge_send_recv : 0.008517s : 0.06% optimize.opt_a.auto_parallel : 0.008496s : 0.06% optimize.opt_a.parallel : 0.000037s : 0.00% optimize.opt_a.flash_sp : 0.001715s : 0.01% optimize.opt_a.merge_comm : 0.008509s : 0.06% optimize.opt_a.allreduce_fusion : 0.008493s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.011265s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.009116s : 0.07% optimize.opt_a.virtual_dataset : 0.009009s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.009183s : 0.07% optimize.opt_a.virtual_output : 
0.008918s : 0.07% optimize.opt_a.merge_forward : 0.008374s : 0.06% optimize.opt_a.offload_activation : 0.011396s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000011s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.015187s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000007s : 0.00% optimize.opt_a.before_grad : 0.014576s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.009140s : 0.07% optimize.opt_a.meta_fg_expand : 0.013712s : 0.10% optimize.opt_a.flash_sp_send_recv_attached : 0.000011s : 0.00% optimize.opt_a.receive_attached : 0.000023s : 0.00% optimize.opt_a.after_resolve : 0.010117s : 0.08% optimize.opt_a.a_after_grad : 0.013801s : 0.10% optimize.opt_a.renormalize : 1.028622s : 7.67% optimize.opt_a.add_forward_monad_depend : 0.000046s : 0.00% optimize.opt_a.auto_monad_grad : 0.000010s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.034706s : 0.26% optimize.opt_a.cse : 0.042177s : 0.31% optimize.opt_a.a_3 : 0.064098s : 0.48% optimize.py_interpret_to_execute_after_opt_a : 0.004032s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.037082s : 0.28% optimize.convert_after_rewriter : 0.002706s : 0.02% optimize.order_py_execute_after_rewriter : 0.002317s : 0.02% optimize.opt_b.b_1 : 0.075121s : 0.56% optimize.opt_b.b_2 : 0.003031s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.002628s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.002644s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.002646s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.008646s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.004838s : 0.04% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.001411s : 0.01% optimize.loop_unroll : 0.003766s : 0.03% optimize.opt_after_cconv.c_1 : 0.013187s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.002964s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.002606s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.002628s : 0.02% optimize.opt_after_cconv.cse : 0.008481s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.014585s : 0.11% optimize.tuple_transform.d_1 : 0.018095s : 0.13% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_cache_embedding : 0.002849s : 0.02% optimize.add_recomputation : 0.016596s : 0.12% optimize.cse_after_recomputation.cse : 0.004923s : 0.04% optimize.environ_conv : 0.001738s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.002732s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000016s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000002s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 
0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000036s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.004686s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.001046s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.001048s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.001030s : 0.01% optimize.overlap_grad_flash_sp : 0.005215s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.013852s : 0.10% optimize.symbol_engine_optimizer.elim_shapecalc : 0.002601s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.006357s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.001915s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.004031s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.003989s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.006572s : 0.05% distribtued_split : 0.004718s : 0.04% validate : 0.003340s : 0.02% backend_pass : 0.000002s : 0.00% task_emit : 1.323487s : 9.87% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.331789 67729 1.14% : 0.003777s : 1231: substitution.arithmetic_simplify 2.09% : 0.006922s : 1800: substitution.cast_eliminate 0.06% : 0.000185s : 124: substitution.depend_value_elim 0.32% : 0.001046s : 1449: substitution.elim_not_effective 0.36% : 0.001179s : 1234: substitution.float_tuple_getitem_switch 0.42% : 0.001404s : 1135: substitution.fold_const_symbol 0.40% : 0.001323s : 1963: substitution.graph_param_transform 76.77% : 0.254710s : 5675: substitution.inline 0.81% : 0.002701s : 5144: substitution.j_node_and_user_rematch 1.00% : 0.003331s : 1354: substitution.less_batch_normalization 0.45% : 0.001501s : 2708: substitution.load_eliminater 0.49% : 0.001617s : 1473: substitution.minmaximum_grad 0.05% : 0.000151s : 196: substitution.reduce_all_const_elim 0.83% : 0.002750s : 5144: substitution.remove_not_recompute_node 0.15% : 0.000510s : 1029: substitution.replace_old_param 1.16% : 0.003844s : 1449: substitution.reshape_eliminate 0.40% : 0.001337s : 596: substitution.switch_simplify 0.01% : 0.000031s : 6: substitution.transpose_eliminate 1.35% : 0.004468s : 1909: substitution.tuple_list_convert_item_index_to_positive 0.73% : 0.002411s : 2029: substitution.tuple_list_get_item_const_eliminator 1.24% : 0.004121s : 2029: substitution.tuple_list_get_item_depend_reorder 2.56% : 0.008496s : 3684: substitution.tuple_list_get_item_eliminator 0.95% : 0.003168s : 2029: substitution.tuple_list_get_set_item_eliminator 2.79% : 0.009263s : 11069: substitution.updatestate_pure_node_eliminater 3.47% : 0.011512s : 11269: substitution.updatestate_useless_node_eliminater 0.01% : 0.000032s : 1: substitution.value_based_eliminate ------[type_inference.] 8.255571 2 86.86% : 7.170517s : 1: type_inference.infer 13.14% : 1.085054s : 1: type_inference.specialize ------[replace.] 
0.086494 8070 7.52% : 0.006501s : 676: replace.cast_eliminate 0.30% : 0.000259s : 24: replace.depend_value_elim 2.04% : 0.001763s : 169: replace.elim_not_effective 69.10% : 0.059763s : 5675: replace.inline 1.97% : 0.001703s : 170: replace.reshape_eliminate 8.62% : 0.007452s : 596: replace.switch_simplify 1.55% : 0.001340s : 120: replace.tuple_list_get_item_depend_reorder 8.90% : 0.007695s : 639: replace.tuple_list_get_item_eliminator 0.02% : 0.000018s : 1: replace.updatestate_pure_node_eliminater ------[match.] 0.258326 8070 1.17% : 0.003013s : 676: match.cast_eliminate 0.01% : 0.000014s : 24: match.depend_value_elim 0.09% : 0.000233s : 169: match.elim_not_effective 97.11% : 0.250867s : 5675: match.inline 0.21% : 0.000534s : 170: match.reshape_eliminate 0.40% : 0.001033s : 596: match.switch_simplify 0.30% : 0.000787s : 120: match.tuple_list_get_item_depend_reorder 0.71% : 0.001841s : 639: match.tuple_list_get_item_eliminator 0.00% : 0.000004s : 1: match.updatestate_pure_node_eliminater ------[predicate.] 0.3231251635139 1.05% : 0.003393s : 24391: predicate.accumulaten_eliminater 0.11% : 0.000366s : 1479: predicate.ad_related_special_op_eliminate 0.86% : 0.002774s : 8454: predicate.addn_check_dump 1.09% : 0.003529s : 24391: predicate.addn_zero_filter 1.08% : 0.003475s : 24391: predicate.adjust_all_reduce_mul_add 2.39% : 0.007715s : 32845: predicate.arithmetic_simplify 1.23% : 0.003966s : 25237: predicate.cast_eliminate 0.61% : 0.001975s : 5893: predicate.check_bprop_eliminate 0.85% : 0.002762s : 8454: predicate.compare_switch_simplify 0.05% : 0.000176s : 1964: predicate.const_output_eliminate 0.86% : 0.002776s : 8479: predicate.depend_value_elim 1.17% : 0.003788s : 25237: predicate.dict_get_item_const_eliminator 2.69% : 0.008707s : 25237: predicate.dict_get_item_eliminator 1.11% : 0.003579s : 25237: predicate.dict_set_item_eliminator 0.27% : 0.000881s : 3443: predicate.dumpgradient_eliminate 0.04% : 0.000139s : 1793: predicate.elim_not_effective 0.16% : 0.000517s : 1963: predicate.elim_shapecalc_of_broadcastargs 1.38% : 0.004444s : 27201: predicate.environ_add_const_eliminate 1.36% : 0.004397s : 27201: predicate.environ_get_add_eliminate 1.31% : 0.004226s : 27201: predicate.environ_get_depend_swap 2.28% : 0.007380s : 35655: predicate.environ_get_eliminate 1.29% : 0.004170s : 27201: predicate.environ_get_set_eliminate 1.42% : 0.004590s : 31672: predicate.exchange_switch_depend_value 1.82% : 0.005896s : 31672: predicate.float_depend_g_call 0.86% : 0.002783s : 8454: predicate.float_environ_get_switch 0.99% : 0.003203s : 10418: predicate.float_tuple_getitem_switch 0.04% : 0.000115s : 1479: predicate.fold_const_symbol 0.66% : 0.002131s : 6179: predicate.get_grad_eliminate 0.06% : 0.000179s : 1963: predicate.graph_param_transform 0.75% : 0.002427s : 8454: predicate.incorporate_call 0.74% : 0.002403s : 8454: predicate.incorporate_call_switch 5.52% : 0.017839s : 74624: predicate.inline 0.75% : 0.002415s : 6179: predicate.inline_without_move 0.15% : 0.000469s : 6179: predicate.j_node_and_user_rematch 0.70% : 0.002264s : 6186: predicate.less_batch_normalization 1.56% : 0.005043s : 29923: predicate.list_to_tuple_eliminator_ 2.55% : 0.008230s : 54315: predicate.load_eliminater 0.21% : 0.000675s : 1964: predicate.loop_unroll_after_grad 3.17% : 0.010254s : 31204: predicate.loop_unroll_before_grad 1.53% : 0.004941s : 29285: predicate.make_slice_get_slice_eliminator 0.86% : 0.002775s : 8454: predicate.merge_addn 0.61% : 0.001959s : 5893: predicate.micro_step_allgather_replace 0.61% : 0.001961s : 5893: 
predicate.mini_step_allgather_replace 1.14% : 0.003668s : 24391: predicate.minmaximum_grad 0.11% : 0.000362s : 1479: predicate.mutable_eliminate 0.11% : 0.000366s : 1479: predicate.opt_reshape 0.22% : 0.000708s : 1964: predicate.parallel_virtual_node 3.44% : 0.011105s : 31672: predicate.partial_defer_inline 1.42% : 0.004577s : 27960: predicate.partial_eliminate 1.06% : 0.003423s : 24391: predicate.print_const_string_wrapper 0.87% : 0.002807s : 8430: predicate.reduce_all_const_elim 1.45% : 0.004688s : 24391: predicate.reduce_eliminate 2.48% : 0.008024s : 54315: predicate.redundant_stop_gradient_eliminater 0.14% : 0.000437s : 6179: predicate.remove_not_recompute_node 0.92% : 0.002987s : 31889: predicate.replace_applicator 0.14% : 0.000450s : 6179: predicate.replace_old_param 0.04% : 0.000142s : 1964: predicate.reset_defer_inline 1.23% : 0.003979s : 24561: predicate.reshape_eliminate 0.62% : 0.002000s : 5893: predicate.row_tensor_add_zeros_like 0.22% : 0.000708s : 1964: predicate.row_tensor_eliminate 0.63% : 0.002050s : 5893: predicate.same_eliminate 0.21% : 0.000684s : 8875: predicate.set_cell_output_no_recompute 0.65% : 0.002106s : 6179: predicate.shard_identity_eliminate 0.27% : 0.000876s : 3443: predicate.special_op_eliminate 0.94% : 0.003053s : 8454: predicate.specialize_transform 0.64% : 0.002053s : 5893: predicate.split_environ_get_set_with_tuple_value 0.31% : 0.001016s : 6179: predicate.stack_unstack_eliminate 0.09% : 0.000275s : 1964: predicate.switch_call_monad_eliminater 1.59% : 0.005141s : 31672: predicate.switch_defer_inline 2.10% : 0.006795s : 37565: predicate.switch_layer_defer_inline 5.53% : 0.017881s : 72522: predicate.switch_simplify 1.07% : 0.003464s : 24391: predicate.tile_eliminate 1.18% : 0.003810s : 24391: predicate.transpose_eliminate 1.47% : 0.004748s : 29164: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.005009s : 29284: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.004673s : 29284: predicate.tuple_list_get_item_depend_reorder 2.58% : 0.008351s : 38377: predicate.tuple_list_get_item_eliminator 1.44% : 0.004640s : 29284: predicate.tuple_list_get_set_item_eliminator 2.55% : 0.008234s : 37738: predicate.tuple_list_set_item_eliminator 1.46% : 0.004706s : 29923: predicate.tuple_to_list_eliminator_ 2.67% : 0.008637s : 54316: predicate.updatestate_pure_node_eliminater 3.49% : 0.011262s : 62770: predicate.updatestate_useless_node_eliminater 0.22% : 0.000696s : 1964: predicate.value_based_eliminate 0.65% : 0.002098s : 6179: predicate.virtual_dataset_eliminate 0.65% : 0.002095s : 6179: predicate.virtual_output_eliminate 0.20% : 0.000653s : 1964: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.664182 7279 62.18% : 0.412958s : 2038: func_graph_cloner_run.FuncGraphClonerGraph 4.49% : 0.029835s : 304: func_graph_cloner_run.FuncGraphClonerNode 33.33% : 0.221388s : 4937: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 
20.147926 253 0.00% : 0.000004s : 1: ForceFp32Comm 0.51% : 0.103478s : 1: add_attr 0.51% : 0.103411s : 1: add_attr_with_inline 0.01% : 0.002863s : 1: add_cache_embedding 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.016614s : 1: add_recomputation 0.00% : 0.000021s : 1: assign_add_opt 0.61% : 0.123285s : 1: auto_monad 0.02% : 0.004009s : 1: auto_monad_reorder 0.00% : 0.000014s : 1: backend_pass 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.06% : 0.011887s : 1: bootstrap 0.01% : 0.001423s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.004698s : 1: control_data_broadcast_order 0.01% : 0.002721s : 1: convert_after_rewriter 0.02% : 0.004961s : 1: cse_after_recomputation 0.00% : 0.000008s : 1: dataset_repeat_opt 0.00% : 0.000009s : 1: detach_backward 0.02% : 0.004738s : 1: distribtued_split 0.01% : 0.001750s : 1: environ_conv 0.00% : 0.000019s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.03% : 0.005062s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000011s : 1: inline 0.00% : 0.000017s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.02% : 0.003779s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.01% : 0.001056s : 1: offloading_packed_experts 0.01% : 0.002908s : 1: opt.transform.loop_unroll_optimizer 9.03% : 1.819649s : 134: opt.transform.opt_a 0.07% : 0.013183s : 1: opt.transform.opt_after_cconv 0.03% : 0.005867s : 2: opt.transform.opt_after_jit_grad 0.39% : 0.078020s : 28: opt.transform.opt_b 0.09% : 0.018089s : 1: opt.transform.opt_trans_graph 0.07% : 0.014889s : 4: opt.transform.symbol_engine_opt 15.44% : 3.111785s : 1: opt_a 0.15% : 0.029990s : 1: opt_after_cconv 0.03% : 0.006589s : 1: opt_after_jit_grad 0.47% : 0.094855s : 1: opt_b 17.75% : 3.576890s : 1: optimize 0.02% : 0.004852s : 1: optimize_parallel_all_gather_comm 0.01% : 0.002331s : 1: order_py_execute_after_rewriter 0.03% : 0.005229s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.001039s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000042s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000016s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.001057s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000013s : 1: parallel-infer-symbol 0.00% : 0.000008s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.16% : 0.032505s : 1: pre_auto_parallel 0.18% : 0.035568s : 1: py_interpret_to_execute 0.02% : 0.004044s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.014607s : 1: remove_dup_value 2.50% : 0.503126s : 2: renormalize.infer 2.61% : 0.525118s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.037097s : 1: rewriter_after_opt_a 0.72% : 0.144361s : 1: rewriter_before_opt_a 0.00% : 
0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.002745s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.028870s : 1: symbol_engine_optimizer 6.57% : 1.323523s : 1: task_emit 0.09% : 0.018141s : 1: tuple_transform 41.14% : 8.288043s : 1: type_inference 0.03% : 0.006857s : 1: validate [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:10.519.618 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:10.521.052 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:10.524.050 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:10.548.549 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty TotalTime = 12.5765, [24] [bootstrap]: 0.00860439 [type_inference]: 7.85671 [auto_monad]: 0.123259 [graph_reusing]: 0.00201983 [inline]: 9.55001e-06 [add_attr]: 0.10655, [1] [add_attr_with_inline]: 0.10653, [1] [Cycle 1]: 0.0407475, [2] [tag_attr]: 0.0267062 [meta_addattr_fg_expand]: 0.0139255 [parallel-infer-symbol]: 6.08002e-06 [pre_auto_parallel]: 0.0359282 [insert-virtual-dataset]: 9.52999e-06 [parallel-infer-symbol-second]: 3.29001e-06 [dataset_repeat_opt]: 3.16999e-06 [pipeline_split]: 2.65997e-06 [optimize]: 3.4221, [53] [py_interpret_to_execute]: 0.0370955 [rewriter_before_opt_a]: 0.145968 [opt_a]: 2.95208, [3] [Cycle 1]: 2.30867, [45] [expand_dump_flag]: 0.00210036 [switch_simplify]: 0.0502427 [loop_unroll]: 0.0350753 [a_1]: 1.04405 [invalid_dout_check]: 0.00948477 [recompute_prepare]: 0.0065766 [updatestate_depend_eliminate]: 0.0261943 [updatestate_assign_eliminate]: 0.00416837 [updatestate_loads_eliminate]: 0.0139653 [parameter_eliminate]: 8.91997e-06 [a_2]: 0.100985 [accelerated_algorithm]: 0.0065883 [shard]: 3.04997e-06 [meta_shard_fg_expand]: 0.00285241 [shard_inline]: 0.00311483 [merge_send_recv]: 0.00288624 [auto_parallel]: 0.00291549 [parallel]: 1.707e-05 [flash_sp]: 0.00165811 [merge_comm]: 0.00292124 [allreduce_fusion]: 0.00289749 [matmul_add_comm_reduction]: 0.00382895 [allreduce_slice_to_reducescatter]: 1.06997e-06 [virtual_shard_identity]: 0.00317269 [virtual_dataset]: 0.00311434 [get_grad_eliminate_]: 0.00322516 [virtual_output]: 0.00308346 [merge_forward]: 0.00287822 [offload_activation]: 0.00388856 [cell_reuse_recompute_pass]: 3.83996e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00515255 [merge_recompute_call_nodes]: 2.41993e-06 [before_grad]: 0.00500132 [set_forward_comm_id_for_comm_node_pass]: 0.00313951 [meta_fg_expand]: 0.00658035 [flash_sp_send_recv_attached]: 5.14998e-06 [receive_attached]: 3.35998e-06 [after_resolve]: 0.00337084 [a_after_grad]: 0.00473709 [renormalize]: 0.869761 [add_forward_monad_depend]: 2.42799e-05 [auto_monad_grad]: 4.00003e-06 
[auto_monad_eliminator]: 0.0289592 [cse]: 0.0176837 [a_3]: 0.0213768 [Cycle 2]: 0.405959, [45] [expand_dump_flag]: 4.73997e-06 [switch_simplify]: 0.00289075 [loop_unroll]: 0.00292831 [a_1]: 0.0704485 [invalid_dout_check]: 0.0026925 [recompute_prepare]: 0.00269325 [updatestate_depend_eliminate]: 0.00265477 [updatestate_assign_eliminate]: 0.00270248 [updatestate_loads_eliminate]: 0.00273066 [parameter_eliminate]: 6.54999e-06 [a_2]: 0.0463536 [accelerated_algorithm]: 0.00360057 [shard]: 3.41004e-06 [meta_shard_fg_expand]: 0.00149653 [shard_inline]: 0.00287002 [merge_send_recv]: 0.00290001 [auto_parallel]: 0.00288931 [parallel]: 1.09799e-05 [flash_sp]: 5.33997e-06 [merge_comm]: 0.00290324 [allreduce_fusion]: 0.00287325 [matmul_add_comm_reduction]: 0.00366218 [allreduce_slice_to_reducescatter]: 1.03004e-06 [virtual_shard_identity]: 0.00289965 [virtual_dataset]: 0.00289248 [get_grad_eliminate_]: 0.00290048 [virtual_output]: 0.00740295 [merge_forward]: 0.0031021 [offload_activation]: 0.0038158 [cell_reuse_recompute_pass]: 3.86999e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.004976 [merge_recompute_call_nodes]: 2.61003e-06 [before_grad]: 0.00474971 [set_forward_comm_id_for_comm_node_pass]: 0.00311934 [meta_fg_expand]: 0.00426716 [flash_sp_send_recv_attached]: 3.44997e-06 [receive_attached]: 3.62005e-06 [after_resolve]: 0.00326999 [a_after_grad]: 0.00445708 [renormalize]: 0.15998 [add_forward_monad_depend]: 2.003e-05 [auto_monad_grad]: 3.75998e-06 [auto_monad_eliminator]: 0.0053453 [cse]: 0.0114671 [a_3]: 0.0212239 [Cycle 3]: 0.237422, [45] [expand_dump_flag]: 4.15999e-06 [switch_simplify]: 0.00286283 [loop_unroll]: 0.0028854 [a_1]: 0.0699749 [invalid_dout_check]: 0.00229596 [recompute_prepare]: 0.00268187 [updatestate_depend_eliminate]: 0.00258562 [updatestate_assign_eliminate]: 0.00259563 [updatestate_loads_eliminate]: 0.00261936 [parameter_eliminate]: 5.30994e-06 [a_2]: 0.0468688 [accelerated_algorithm]: 0.00358627 [shard]: 3.21993e-06 [meta_shard_fg_expand]: 0.00129288 [shard_inline]: 0.00289359 [merge_send_recv]: 0.00292644 [auto_parallel]: 0.00316499 [parallel]: 1.173e-05 [flash_sp]: 2.72994e-06 [merge_comm]: 0.00319262 [allreduce_fusion]: 0.00303173 [matmul_add_comm_reduction]: 0.00402436 [allreduce_slice_to_reducescatter]: 1.12993e-06 [virtual_shard_identity]: 0.00289077 [virtual_dataset]: 0.00288189 [get_grad_eliminate_]: 0.00301731 [virtual_output]: 0.00287377 [merge_forward]: 0.0029585 [offload_activation]: 0.00390813 [cell_reuse_recompute_pass]: 3.80003e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00478832 [merge_recompute_call_nodes]: 2.82994e-06 [before_grad]: 0.0047065 [set_forward_comm_id_for_comm_node_pass]: 0.00320387 [meta_fg_expand]: 0.00373802 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 3.40992e-06 [after_resolve]: 0.00331235 [a_after_grad]: 0.00440888 [renormalize]: 1.40048e-07 [add_forward_monad_depend]: 3.82995e-06 [auto_monad_grad]: 3.05998e-06 [auto_monad_eliminator]: 0.00484135 [cse]: 0.00856898 [a_3]: 0.021099 [py_interpret_to_execute_after_opt_a]: 0.00427762 [slice_cell_reuse_recomputed_activation]: 3.46999e-06 [rewriter_after_opt_a]: 0.0363521 [convert_after_rewriter]: 0.00275299 [order_py_execute_after_rewriter]: 0.00236413 [opt_b]: 0.0936148, [1] [Cycle 1]: 0.0936041, [7] [b_1]: 0.074008 [b_2]: 0.00292924 [updatestate_depend_eliminate]: 0.002608 [updatestate_assign_eliminate]: 0.00262652 [updatestate_loads_eliminate]: 0.00265571 [renormalize]: 5.50062e-07 [cse]: 0.00865843 [optimize_parallel_all_gather_comm]: 0.00496926 
[overlap_param_gather]: 9.75002e-06 [cconv]: 0.00143598 [loop_unroll]: 0.00378811 [opt_after_cconv]: 0.0301394, [1] [Cycle 1]: 0.0301316, [7] [c_1]: 0.013241 [parameter_eliminate]: 4.43996e-06 [updatestate_depend_eliminate]: 0.00301204 [updatestate_assign_eliminate]: 0.00261641 [updatestate_loads_eliminate]: 0.00264502 [cse]: 0.00850944 [renormalize]: 4.7998e-07 [remove_dup_value]: 0.0134542 [tuple_transform]: 0.0181608, [1] [Cycle 1]: 0.0181496, [2] [d_1]: 0.0181197 [renormalize]: 4.4005e-07 [partial_unused_args_eliminate]: 5.25999e-06 [add_cache_embedding]: 0.00284565 [add_recomputation]: 0.0180126 [cse_after_recomputation]: 0.00563421, [1] [Cycle 1]: 0.00551236, [1] [cse]: 0.00548883 [environ_conv]: 0.00167048 [swap_dp_allreduce_reducescatter]: 0.00329139 [bias_add_comm_swap]: 3.35998e-06 [label_micro_interleaved_index]: 8.18993e-06 [label_fine_grained_interleaved_index]: 3.85998e-06 [merge_cast_opt]: 1.74996e-06 [slice_recompute_activation]: 2.42004e-06 [micro_interleaved_order_control]: 3.51097e-06 [assign_add_opt]: 2.077e-05 [ForceFp32Comm]: 1.42003e-06 [remove_cast_before_assign_add]: 1.65997e-06 [full_micro_interleaved_order_control]: 2.91003e-06 [reorder_send_recv_between_fp_bp]: 2.94007e-06 [comm_op_add_attrs]: 1.35996e-06 [add_comm_op_reuse_tag]: 1.51002e-06 [interleave_split_concat_branches]: 1.56998e-06 [interleave_parallel_branches]: 1.33994e-06 [overlap_opt_shard_in_pipeline]: 3.62995e-06 [overlap_opt_shard_grad_in_pipeline]: 2.65997e-06 [control_data_broadcast_order]: 0.00529118 [grouped_pairwise_exchange_alltoall]: 2.06009e-06 [offloading_packed_experts]: 0.00109607 [overlap_recompute_and_grad_model_parallel]: 0.0010796 [overlap_grad_matmul_and_grad_allreduce]: 3.59002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.90991e-06 [overlap_recompute_comm]: 2.78e-06 [overlap_grad_ring_attention]: 0.00107442 [overlap_grad_flash_sp]: 0.00563351 [begin_end_overlap_inline]: 1.11002e-06 [split_matmul_comm_elemetwise]: 3.23995e-06 [split_layernorm_comm]: 2.45997e-06 [handle_group_info]: 1.24006e-06 [symbol_engine_optimizer]: 0.0290801, [1] [Cycle 1]: 0.0290712, [6] [build]: 0.0140752 [elim_shapecalc]: 0.00258388 [elim_not_effective]: 0.00639053 [opt_reshape]: 0.00191136 [fold_const_symbol]: 0.00400626 [renormalize]: 4.7998e-07 [detach_backward]: 3.78001e-06 [pipeline_parallel_scheduler]: 2.09e-06 [auto_monad_reorder]: 0.0045597 [get_jit_bprop_graph]: 3.05998e-06 [rewriter_after_jit_bprop_graph]: 7.41007e-06 [opt_after_jit_grad]: 0.00687943 [distribtued_split]: 0.00482522 [validate]: 0.00346407 [backend_pass]: 2.29001e-06 [task_emit]: 0.997225 [execute]: 1.139e-05 Sums bootstrap : 0.008604s : 0.07% type_inference : 7.856708s : 62.84% auto_monad : 0.123259s : 0.99% graph_reusing : 0.002020s : 0.02% inline : 0.000010s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.026706s : 0.21% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.013925s : 0.11% parallel-infer-symbol : 0.000006s : 0.00% pre_auto_parallel : 0.035928s : 0.29% insert-virtual-dataset : 0.000010s : 0.00% parallel-infer-symbol-second : 0.000003s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.037095s : 0.30% optimize.rewriter_before_opt_a : 0.145968s : 1.17% optimize.opt_a.expand_dump_flag : 0.002109s : 0.02% optimize.opt_a.switch_simplify : 0.055996s : 0.45% optimize.opt_a.loop_unroll : 0.040889s : 0.33% optimize.opt_a.a_1 : 1.184472s : 9.47% optimize.opt_a.invalid_dout_check : 0.014473s : 0.12% optimize.opt_a.recompute_prepare : 0.011952s : 
0.10% optimize.opt_a.updatestate_depend_eliminate : 0.031435s : 0.25% optimize.opt_a.updatestate_assign_eliminate : 0.009466s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.019315s : 0.15% optimize.opt_a.parameter_eliminate : 0.000021s : 0.00% optimize.opt_a.a_2 : 0.194208s : 1.55% optimize.opt_a.accelerated_algorithm : 0.013775s : 0.11% optimize.opt_a.shard : 0.000010s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.005642s : 0.05% optimize.opt_a.shard_inline : 0.008878s : 0.07% optimize.opt_a.merge_send_recv : 0.008713s : 0.07% optimize.opt_a.auto_parallel : 0.008970s : 0.07% optimize.opt_a.parallel : 0.000040s : 0.00% optimize.opt_a.flash_sp : 0.001666s : 0.01% optimize.opt_a.merge_comm : 0.009017s : 0.07% optimize.opt_a.allreduce_fusion : 0.008802s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.011515s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.008963s : 0.07% optimize.opt_a.virtual_dataset : 0.008889s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.009143s : 0.07% optimize.opt_a.virtual_output : 0.013360s : 0.11% optimize.opt_a.merge_forward : 0.008939s : 0.07% optimize.opt_a.offload_activation : 0.011613s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000012s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.014917s : 0.12% optimize.opt_a.merge_recompute_call_nodes : 0.000008s : 0.00% optimize.opt_a.before_grad : 0.014458s : 0.12% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.009463s : 0.08% optimize.opt_a.meta_fg_expand : 0.014586s : 0.12% optimize.opt_a.flash_sp_send_recv_attached : 0.000011s : 0.00% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.009953s : 0.08% optimize.opt_a.a_after_grad : 0.013603s : 0.11% optimize.opt_a.renormalize : 1.029742s : 8.24% optimize.opt_a.add_forward_monad_depend : 0.000048s : 0.00% optimize.opt_a.auto_monad_grad : 0.000011s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.039146s : 0.31% optimize.opt_a.cse : 0.037720s : 0.30% optimize.opt_a.a_3 : 0.063700s : 0.51% optimize.py_interpret_to_execute_after_opt_a : 0.004278s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.036352s : 0.29% optimize.convert_after_rewriter : 0.002753s : 0.02% optimize.order_py_execute_after_rewriter : 0.002364s : 0.02% optimize.opt_b.b_1 : 0.074008s : 0.59% optimize.opt_b.b_2 : 0.002929s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.002608s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.002627s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.002656s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.008658s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.004969s : 0.04% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.001436s : 0.01% optimize.loop_unroll : 0.003788s : 0.03% optimize.opt_after_cconv.c_1 : 0.013241s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.003012s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.002616s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.002645s : 0.02% optimize.opt_after_cconv.cse : 0.008509s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.013454s : 0.11% optimize.tuple_transform.d_1 : 0.018120s : 0.14% optimize.tuple_transform.renormalize : 0.000000s : 0.00% 
optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_cache_embedding : 0.002846s : 0.02% optimize.add_recomputation : 0.018013s : 0.14% optimize.cse_after_recomputation.cse : 0.005489s : 0.04% optimize.environ_conv : 0.001670s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.003291s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000004s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000004s : 0.00% optimize.assign_add_opt : 0.000021s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000002s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.005291s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.001096s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.001080s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.001074s : 0.01% optimize.overlap_grad_flash_sp : 0.005634s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.014075s : 0.11% optimize.symbol_engine_optimizer.elim_shapecalc : 0.002584s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.006391s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.001911s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.004006s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.004560s : 0.04% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.006879s : 0.06% distribtued_split : 0.004825s : 0.04% validate : 0.003464s : 0.03% backend_pass : 0.000002s : 0.00% task_emit : 0.997225s : 7.98% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 
0.291707 64790 1.25% : 0.003656s : 1231: substitution.arithmetic_simplify 1.65% : 0.004805s : 1027: substitution.cast_eliminate 0.07% : 0.000190s : 124: substitution.depend_value_elim 0.36% : 0.001062s : 1468: substitution.elim_not_effective 0.37% : 0.001083s : 1114: substitution.float_tuple_getitem_switch 0.47% : 0.001373s : 1154: substitution.fold_const_symbol 0.48% : 0.001404s : 1984: substitution.graph_param_transform 75.23% : 0.219449s : 5025: substitution.inline 0.93% : 0.002710s : 5110: substitution.j_node_and_user_rematch 1.19% : 0.003475s : 1354: substitution.less_batch_normalization 0.55% : 0.001596s : 3090: substitution.load_eliminater 0.49% : 0.001435s : 1305: substitution.minmaximum_grad 0.05% : 0.000150s : 196: substitution.reduce_all_const_elim 0.93% : 0.002716s : 5110: substitution.remove_not_recompute_node 0.18% : 0.000520s : 1035: substitution.replace_old_param 1.26% : 0.003667s : 1449: substitution.reshape_eliminate 0.47% : 0.001358s : 588: substitution.switch_simplify 0.01% : 0.000030s : 6: substitution.transpose_eliminate 1.38% : 0.004038s : 1693: substitution.tuple_list_convert_item_index_to_positive 0.71% : 0.002077s : 1741: substitution.tuple_list_get_item_const_eliminator 1.12% : 0.003274s : 1741: substitution.tuple_list_get_item_depend_reorder 2.66% : 0.007759s : 3349: substitution.tuple_list_get_item_eliminator 0.96% : 0.002798s : 1741: substitution.tuple_list_get_set_item_eliminator 3.15% : 0.009191s : 10976: substitution.updatestate_pure_node_eliminater 4.07% : 0.011874s : 11178: substitution.updatestate_useless_node_eliminater 0.01% : 0.000015s : 1: substitution.value_based_eliminate ------[type_inference.] 7.824486 2 86.04% : 6.732369s : 1: type_inference.infer 13.96% : 1.092117s : 1: type_inference.specialize ------[replace.] 0.073427 7390 8.34% : 0.006127s : 676: replace.cast_eliminate 0.34% : 0.000251s : 24: replace.depend_value_elim 2.38% : 0.001749s : 169: replace.elim_not_effective 65.06% : 0.047771s : 5025: replace.inline 2.15% : 0.001580s : 170: replace.reshape_eliminate 10.37% : 0.007614s : 588: replace.switch_simplify 0.68% : 0.000500s : 48: replace.tuple_list_get_item_depend_reorder 10.64% : 0.007811s : 688: replace.tuple_list_get_item_eliminator 0.03% : 0.000024s : 2: replace.updatestate_pure_node_eliminater ------[match.] 0.223091 7390 1.27% : 0.002833s : 676: match.cast_eliminate 0.01% : 0.000015s : 24: match.depend_value_elim 0.11% : 0.000241s : 169: match.elim_not_effective 96.91% : 0.216192s : 5025: match.inline 0.23% : 0.000515s : 170: match.reshape_eliminate 0.45% : 0.001004s : 588: match.switch_simplify 0.16% : 0.000356s : 48: match.tuple_list_get_item_depend_reorder 0.87% : 0.001931s : 688: match.tuple_list_get_item_eliminator 0.00% : 0.000005s : 2: match.updatestate_pure_node_eliminater ------[predicate.] 
0.3021291525361 1.02% : 0.003078s : 22122: predicate.accumulaten_eliminater 0.12% : 0.000369s : 1500: predicate.ad_related_special_op_eliminate 0.95% : 0.002856s : 8356: predicate.addn_check_dump 1.04% : 0.003141s : 22122: predicate.addn_zero_filter 1.07% : 0.003231s : 22122: predicate.adjust_all_reduce_mul_add 2.46% : 0.007430s : 30478: predicate.arithmetic_simplify 1.19% : 0.003592s : 22968: predicate.cast_eliminate 0.67% : 0.002017s : 5957: predicate.check_bprop_eliminate 0.92% : 0.002793s : 8356: predicate.compare_switch_simplify 0.05% : 0.000143s : 1985: predicate.const_output_eliminate 0.93% : 0.002810s : 8426: predicate.depend_value_elim 1.15% : 0.003474s : 22968: predicate.dict_get_item_const_eliminator 1.24% : 0.003745s : 22968: predicate.dict_get_item_eliminator 1.09% : 0.003279s : 22968: predicate.dict_set_item_eliminator 0.31% : 0.000926s : 3485: predicate.dumpgradient_eliminate 0.05% : 0.000143s : 1814: predicate.elim_not_effective 0.17% : 0.000507s : 1984: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.003869s : 24953: predicate.environ_add_const_eliminate 1.26% : 0.003817s : 24953: predicate.environ_get_add_eliminate 1.27% : 0.003848s : 24953: predicate.environ_get_depend_swap 2.23% : 0.006729s : 33309: predicate.environ_get_eliminate 1.32% : 0.003988s : 24953: predicate.environ_get_set_eliminate 1.42% : 0.004284s : 28731: predicate.exchange_switch_depend_value 1.73% : 0.005228s : 28731: predicate.float_depend_g_call 0.93% : 0.002797s : 8356: predicate.float_environ_get_switch 1.09% : 0.003282s : 10341: predicate.float_tuple_getitem_switch 0.04% : 0.000119s : 1500: predicate.fold_const_symbol 0.71% : 0.002137s : 6151: predicate.get_grad_eliminate 0.06% : 0.000190s : 1984: predicate.graph_param_transform 0.81% : 0.002457s : 8356: predicate.incorporate_call 0.81% : 0.002437s : 8356: predicate.incorporate_call_switch 5.59% : 0.016892s : 69330: predicate.inline 0.81% : 0.002439s : 6151: predicate.inline_without_move 0.16% : 0.000472s : 6151: predicate.j_node_and_user_rematch 0.76% : 0.002296s : 6160: predicate.less_batch_normalization 1.45% : 0.004390s : 27673: predicate.list_to_tuple_eliminator_ 2.48% : 0.007497s : 49796: predicate.load_eliminater 0.23% : 0.000693s : 1985: predicate.loop_unroll_after_grad 3.56% : 0.010752s : 31312: predicate.loop_unroll_before_grad 1.45% : 0.004368s : 26986: predicate.make_slice_get_slice_eliminator 0.93% : 0.002810s : 8356: predicate.merge_addn 0.67% : 0.002020s : 5957: predicate.micro_step_allgather_replace 0.67% : 0.002016s : 5957: predicate.mini_step_allgather_replace 1.05% : 0.003163s : 22122: predicate.minmaximum_grad 0.12% : 0.000370s : 1500: predicate.mutable_eliminate 0.12% : 0.000369s : 1500: predicate.opt_reshape 0.24% : 0.000738s : 1985: predicate.parallel_virtual_node 3.32% : 0.010029s : 28731: predicate.partial_defer_inline 1.34% : 0.004061s : 25689: predicate.partial_eliminate 1.04% : 0.003135s : 22122: predicate.print_const_string_wrapper 0.95% : 0.002872s : 8332: predicate.reduce_all_const_elim 1.38% : 0.004166s : 22122: predicate.reduce_eliminate 2.48% : 0.007496s : 49796: predicate.redundant_stop_gradient_eliminater 0.14% : 0.000431s : 6151: predicate.remove_not_recompute_node 0.92% : 0.002771s : 29661: predicate.replace_applicator 0.15% : 0.000453s : 6151: predicate.replace_old_param 0.05% : 0.000143s : 1985: predicate.reset_defer_inline 1.20% : 0.003637s : 22292: predicate.reshape_eliminate 0.66% : 0.002003s : 5957: predicate.row_tensor_add_zeros_like 0.24% : 0.000723s : 1985: predicate.row_tensor_eliminate 0.69% : 
0.002071s : 5957: predicate.same_eliminate 0.22% : 0.000677s : 8821: predicate.set_cell_output_no_recompute 0.71% : 0.002157s : 6151: predicate.shard_identity_eliminate 0.30% : 0.000896s : 3485: predicate.special_op_eliminate 1.02% : 0.003093s : 8356: predicate.specialize_transform 0.70% : 0.002119s : 5957: predicate.split_environ_get_set_with_tuple_value 0.34% : 0.001031s : 6151: predicate.stack_unstack_eliminate 0.09% : 0.000277s : 1985: predicate.switch_call_monad_eliminater 1.48% : 0.004485s : 28731: predicate.switch_defer_inline 2.11% : 0.006365s : 34688: predicate.switch_layer_defer_inline 6.01% : 0.018154s : 69575: predicate.switch_simplify 1.05% : 0.003183s : 22122: predicate.tile_eliminate 1.05% : 0.003161s : 22122: predicate.transpose_eliminate 1.51% : 0.004565s : 26937: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.004705s : 26985: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.004421s : 26985: predicate.tuple_list_get_item_depend_reorder 2.59% : 0.007813s : 36029: predicate.tuple_list_get_item_eliminator 1.47% : 0.004436s : 26985: predicate.tuple_list_get_set_item_eliminator 2.52% : 0.007619s : 35341: predicate.tuple_list_set_item_eliminator 1.42% : 0.004293s : 27673: predicate.tuple_to_list_eliminator_ 2.72% : 0.008219s : 49798: predicate.updatestate_pure_node_eliminater 3.50% : 0.010565s : 58154: predicate.updatestate_useless_node_eliminater 0.23% : 0.000704s : 1985: predicate.value_based_eliminate 0.71% : 0.002160s : 6151: predicate.virtual_dataset_eliminate 0.79% : 0.002393s : 6151: predicate.virtual_output_eliminate 0.21% : 0.000648s : 1985: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.673538 6967 64.29% : 0.433017s : 1979: func_graph_cloner_run.FuncGraphClonerGraph 0.90% : 0.006039s : 116: func_graph_cloner_run.FuncGraphClonerNode 34.81% : 0.234482s : 4872: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 
18.933029 253 0.00% : 0.000005s : 1: ForceFp32Comm 0.56% : 0.106568s : 1: add_attr 0.56% : 0.106537s : 1: add_attr_with_inline 0.02% : 0.002859s : 1: add_cache_embedding 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.10% : 0.018033s : 1: add_recomputation 0.00% : 0.000025s : 1: assign_add_opt 0.65% : 0.123319s : 1: auto_monad 0.02% : 0.004581s : 1: auto_monad_reorder 0.00% : 0.000016s : 1: backend_pass 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000013s : 1: bias_add_comm_swap 0.05% : 0.008650s : 1: bootstrap 0.01% : 0.001450s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.03% : 0.005305s : 1: control_data_broadcast_order 0.01% : 0.002767s : 1: convert_after_rewriter 0.03% : 0.005642s : 1: cse_after_recomputation 0.00% : 0.000009s : 1: dataset_repeat_opt 0.00% : 0.000009s : 1: detach_backward 0.03% : 0.004846s : 1: distribtued_split 0.01% : 0.001682s : 1: environ_conv 0.00% : 0.000023s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000010s : 1: get_jit_bprop_graph 0.01% : 0.002041s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000016s : 1: inline 0.00% : 0.000019s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000012s : 1: label_micro_interleaved_index 0.02% : 0.003800s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.01% : 0.001107s : 1: offloading_packed_experts 0.02% : 0.002907s : 1: opt.transform.loop_unroll_optimizer 8.80% : 1.666544s : 134: opt.transform.opt_a 0.07% : 0.013238s : 1: opt.transform.opt_after_cconv 0.03% : 0.006153s : 2: opt.transform.opt_after_jit_grad 0.41% : 0.076812s : 28: opt.transform.opt_b 0.10% : 0.018114s : 1: opt.transform.opt_trans_graph 0.08% : 0.014877s : 4: opt.transform.symbol_engine_opt 15.59% : 2.952092s : 1: opt_a 0.16% : 0.030145s : 1: opt_after_cconv 0.04% : 0.006896s : 1: opt_after_jit_grad 0.49% : 0.093620s : 1: opt_b 18.07% : 3.422116s : 1: optimize 0.03% : 0.004983s : 1: optimize_parallel_all_gather_comm 0.01% : 0.002377s : 1: order_py_execute_after_rewriter 0.03% : 0.005647s : 1: overlap_grad_flash_sp 0.00% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.001083s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000016s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.001090s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000014s : 1: parallel-infer-symbol 0.00% : 0.000008s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.19% : 0.035986s : 1: pre_auto_parallel 0.20% : 0.037135s : 1: py_interpret_to_execute 0.02% : 0.004290s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.07% : 0.013698s : 1: remove_dup_value 2.57% : 0.486934s : 2: renormalize.infer 2.87% : 0.542504s : 2: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.19% : 0.036364s : 1: rewriter_after_opt_a 0.77% : 0.146015s : 1: rewriter_before_opt_a 0.00% : 
0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.003305s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.029086s : 1: symbol_engine_optimizer 5.27% : 0.997261s : 1: task_emit 0.10% : 0.018167s : 1: tuple_transform 41.50% : 7.856779s : 1: type_inference 0.04% : 0.007256s : 1: validate [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:23.742.487 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:23.742.771 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:23.743.609 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:23.743.769 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:23.744.516 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:23,853 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 28.80747652053833 s; generated tokens: 8 tokens; generate speed: 0.27770568499106085 tokens/s 2025-07-24 11:03:23,854 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0018775463104248047 s; prefill predict time: 15.510640859603882 s; prefill post time: 0.05953335762023926 s; decode prepare time: 0.0009551388876778739 s; decode predict time: 0.007169802983601888 s; decode post time: 0.005802801677158901 s 2025-07-24 11:03:23,858 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. Building prefix dict from the default dictionary ... DEBUG:jieba:Building prefix dict from the default dictionary ... Loading model from cache /tmp/jieba.cache DEBUG:jieba:Loading model from cache /tmp/jieba.cache Loading model cost 1.253 seconds. DEBUG:jieba:Loading model cost 1.253 seconds. Prefix dict has been built successfully. DEBUG:jieba:Prefix dict has been built successfully. 
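The DEBUG lines above show jieba being loaded just before the output/answer comparisons that follow; each comparison then prints a "calculate sim is:" score. The harness code computing that score is not part of this log, and the 1.0 results below are reported even where the generated tail diverges from the reference answer, so the real metric presumably compares only a shared prefix or applies a threshold. Purely as an illustration of the idea (calculate_sim is a hypothetical name; jieba tokenization plus difflib.SequenceMatcher are assumptions, not the harness's actual method):

```python
# Hypothetical sketch only: the test's real "calculate sim" implementation
# is not shown in this log. Assumes jieba tokenization + a difflib ratio.
import difflib

import jieba


def calculate_sim(output_text: str, answer: str) -> float:
    """Similarity in [0, 1] between two strings over jieba tokens."""
    out_tokens = list(jieba.cut(output_text))
    ans_tokens = list(jieba.cut(answer))
    return difflib.SequenceMatcher(None, out_tokens, ans_tokens).ratio()


# Identical strings give 1.0, matching the "calculate sim is:1.0" lines below.
print(f"calculate sim is:{calculate_sim('你好!', '你好!')}")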
parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
你好!<|im_end|>
<|im_start|>assistant
你好!有什么可以帮助你的吗?<|im_end|>
parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
你好!<|im_end|>
<|im_start|>assistant
你好!有什么可以帮助你的吗?<|im_end|>
calculate sim is:1.0
2025-07-24 11:03:25,123 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True}
2025-07-24 11:03:25,124 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**.
2025-07-24 11:03:25,124 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success.
2025-07-24 11:03:25,125 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama.
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:25.130.250 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:25.130.409 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:25.130.875 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:25.136.981 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:25.173.584 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:25.173.760 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:25.174.134 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:25.174.221 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:25.174.399 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
2025-07-24 11:03:26,348 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.2234437465667725 s; generated tokens: 416 tokens; generate speed: 340.02380670740206 tokens/s
2025-07-24 11:03:26,349 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0016663074493408203 s; prefill predict time: 0.034148454666137695 s; prefill post time: 0.007892370223999023 s; decode prepare time: 0.0009008078899198365 s; decode predict time: 0.0050388200610291724 s; decode post time: 0.005396947119999858 s
2025-07-24 11:03:26,353 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines.
parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
用python编写快速排序<|im_end|>
<|im_start|>assistant
以下是一个使用Python实现的快速排序算法:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quick_sort(left) + middle + quick_sort(right)

# 示例输入
arr = [
parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
用python编写快速排序<|im_end|>
<|im_start|>assistant
以下是一个使用Python实现的快速排序算法:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    else:
        pivot = arr[0]
        left = [x for x in arr[1:] if x < pivot]
        right = [x for x in arr[1:] if x >= pivot]
        return quick_sort(left) + [pivot] + quick_sort(right)

# 示例输入
arr = [3,6,8,1
calculate sim is:1.0
parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
用python编写快速排序<|im_end|>
<|im_start|>assistant
以下是一个使用Python实现的快速排序算法:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quick_sort(left) + middle + quick_sort(right)

# 示例输入
arr = [
parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
用python编写快速排序<|im_end|>
<|im_start|>assistant
以下是一个使用Python实现的快速排序算法:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    else:
        pivot = arr[0]
        left = [x for x in arr[1:] if x < pivot]
        right = [x for x in arr[1:] if x >= pivot]
        return quick_sort(left) + [pivot] + quick_sort(right)

# 示例输入
arr = [3,6,8,1
calculate sim is:1.0
parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
用python编写快速排序<|im_end|>
<|im_start|>assistant
以下是一个使用Python实现的快速排序算法:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quick_sort(left) + middle + quick_sort(right)

# 示例输入
arr = [
parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
用python编写快速排序<|im_end|>
<|im_start|>assistant
以下是一个使用Python实现的快速排序算法:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    else:
        pivot = arr[0]
        left = [x for x in arr[1:] if x < pivot]
        right = [x for x in arr[1:] if x >= pivot]
        return quick_sort(left) + [pivot] + quick_sort(right)

# 示例输入
arr = [3,6,8,1
calculate sim is:1.0
parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
用python编写快速排序<|im_end|>
<|im_start|>assistant
以下是一个使用Python实现的快速排序算法:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quick_sort(left) + middle + quick_sort(right)

# 示例输入
arr = [
parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
用python编写快速排序<|im_end|>
<|im_start|>assistant
以下是一个使用Python实现的快速排序算法:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    else:
        pivot = arr[0]
        left = [x for x in arr[1:] if x < pivot]
        right = [x for x in arr[1:] if x >= pivot]
        return quick_sort(left) + [pivot] + quick_sort(right)

# 示例输入
arr = [3,6,8,1
calculate sim is:1.0
2025-07-24 11:03:26,380 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True}
2025-07-24 11:03:26,380 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**.
2025-07-24 11:03:26,381 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success.
2025-07-24 11:03:26,381 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama.
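The "generate speed" reported in the INFO lines is simply generated tokens divided by total wall-clock generation time; the first decode run above reports 416 tokens in 1.2234437465667725 s. A one-line check:

```python
# Reproduce the reported "generate speed" from the INFO line above.
total_time_s = 1.2234437465667725
generated_tokens = 416
print(generated_tokens / total_time_s)  # 340.02380670740206 tokens/s, as reported
```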
[WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:26.386.352 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:26.386.523 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:26.387.004 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:26.393.107 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:26.410.230 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:26.410.386 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:26.410.737 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:26.410.841 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806944,fffd7f75a060,python):2025-07-24-11:03:26.411.046 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:27,957 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.5749998092651367 s; generated tokens: 816 tokens; generate speed: 518.0953008373564 tokens/s 2025-07-24 11:03:27,957 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0014197826385498047 s; prefill predict time: 0.012376785278320312 s; prefill post time: 0.009716987609863281 s; decode prepare time: 0.001100462261993106 s; decode predict time: 0.0063411545753479 s; decode post time: 0.0077996135938285605 s 2025-07-24 11:03:27,961 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. 
Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to live a good life, to contribute positively to society, and to make a positive impact on the world around us. Others may believe that the meaning of life is to seek knowledge and understanding parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0
parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to live a good life, to make a positive impact on the world, and to contribute to society in some way. Others may believe that the meaning of life is to seek knowledge and understanding parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0
large_models/parallel_qwen2_0_5b_predict_mp2/worker_0.log0000644000175100017500000042177615040321200024153 0ustar jenkinsHwHiAiUser/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) 2025-07-24 11:02:48,703 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config moe_config is empty. 2025-07-24 11:02:48,736 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on [WARNING] ME(3806933:281473875971888,MainProcess):2025-07-24-11:02:48.738.374 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(3806933:281473875971888,MainProcess):2025-07-24-11:02:48.739.199 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead. [WARNING] ME(3806933:281473875971888,MainProcess):2025-07-24-11:02:48.739.692 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead. [WARNING] ME(3806933:281473875971888,MainProcess):2025-07-24-11:02:48.739.824 [mindspore/context.py:1655] For 'context.set_context', 'enable_graph_kernel' parameter is deprecated, and will be removed in the next version. Please use jit_config={'jit_level': 'O1'} instead.
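The "calculate sim is:1.0" entries in the predict log above pair each generated output_text with a stored reference answer and report a similarity score. The metric the harness actually uses is not visible in these logs; below is a minimal sketch of one plausible check, assuming a difflib ratio over the overlapping prefix (both the metric and the truncation rule are assumptions, not the harness's code):

```python
# Hypothetical reconstruction of the "calculate sim is:<score>" check above.
# The real harness's metric is not shown in these logs; difflib and the
# prefix truncation below are illustrative assumptions.
from difflib import SequenceMatcher

def calculate_sim(output_text: str, answer: str) -> float:
    """Return a similarity ratio in [0, 1] between generated and expected text."""
    # Both texts are cut off by max_length, so compare only the common prefix.
    n = min(len(output_text), len(answer))
    return SequenceMatcher(None, output_text[:n], answer[:n]).ratio()

print(calculate_sim("quick brown fox", "quick brown fox"))  # 1.0
```

On identical spans SequenceMatcher returns exactly 1.0, which matches the scores logged here.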
[WARNING] ME(3806933:281473875971888,MainProcess):2025-07-24-11:02:48.739.969 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. [WARNING] ME(3806933:281473875971888,MainProcess):2025-07-24-11:02:48.740.127 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(3806933:281473875971888,MainProcess):2025-07-24-11:02:48.740.258 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. 2025-07-24 11:02:48,740 - mindformers./output/log[mindformers/core/context/parallel.py:73] - INFO - full_batch is set to False for non-parallel modes [WARNING] DISTRIBUTED(3806933,ffff2eca9060,python):2025-07-24-11:02:48.742.679 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:38716 to 127.0.0.1:8222 is successfully created. System errno: Success [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:48.742.673 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 20 source: 127.0.0.1:38716, destination: 127.0.0.1:8222 [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:48.742.871 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:38718, destination: 127.0.0.1:8222 [WARNING] DISTRIBUTED(3806933,ffff2fcab060,python):2025-07-24-11:02:48.742.900 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:38718 to 127.0.0.1:8222 is successfully created. System errno: Success [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:48.742.911 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8222 to be connected...Retry number: 1 [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:49.243.485 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200). [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:49.743.573 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200). [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:50.243.671 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. 
[WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:50.243.697 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 0 rank id: 0 [MS_ALLOC_CONF]Runtime config: enable_vmm:False [WARNING] DEVICE(3806933,ffffbe640f30,python):2025-07-24-11:02:50.467.030 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:155] Initialize] Reserved memory size for other components(2101346304) is less than recommend size(4068242176), It may lead to Out Of Memory in HCCL or other components, Please double check context key 'variable_memory_max_size'/'max_device_memory' [WARNING] DEVICE(3806933,ffffbe640f30,python):2025-07-24-11:02:51.768.414 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:84] Initialize] Loading LCCL because env MS_ENABLE_LCCL is set to on. Pay attention that LCCL only supports communication group within single node in KernelByKernel for now. [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:51.773.281 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:51.773.480 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(3806933,fffed6314060,python):2025-07-24-11:02:51.773.659 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:8222, node_rank:2130706433, total_rank_size:2, local_rank_size2 [WARNING] HCCL_ADPT(3806933,fffed6314060,python):2025-07-24-11:02:51.773.728 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(3806933,fffed6314060,python):2025-07-24-11:02:51.773.752 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:632] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. 
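The DlsymAscend warning above is benign: the adapter probes libhccl_plugin.so for the optional symbol HcclSetGlobalCommInfo and falls back when the installed CANN package does not export it. A rough Python analogue of that probe-and-fallback pattern (illustrative only; the real probe is C++ dlopen/dlsym):

```python
# Illustrative analogue of the probe-and-fallback logged above: resolve an
# optional symbol from a shared library and degrade gracefully if absent.
import ctypes

def load_optional_symbol(lib_path: str, symbol: str):
    """Return the native function if the shared library exports it, else None."""
    lib = ctypes.CDLL(lib_path)
    try:
        return getattr(lib, symbol)
    except AttributeError:
        # Same outcome as the warning above: the symbol is missing from this
        # CANN build, so the caller skips the optional feature instead of failing.
        return None

# fn = load_optional_symbol("libhccl_plugin.so", "HcclSetGlobalCommInfo")
# if fn is None: continue without setting global communicator info
```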
[WARNING] DEVICE(3806933,fffed6314060,python):2025-07-24-11:02:51.773.769 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group 2025-07-24 11:02:51,774 - mindformers./output/log[mindformers/tools/utils.py:185] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' 2025-07-24 11:02:51,777 - mindformers./output/log[mindformers/core/context/build_context.py:383] - INFO - cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191] 2025-07-24 11:02:51,777 - mindformers./output/log[mindformers/core/context/build_context.py:387] - WARNING - CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores 2025-07-24 11:02:51,777 - mindformers./output/log[mindformers/core/context/build_context.py:395] - INFO - cpu_affinity, rank_id: 0, device_num: 2 [WARNING] DISTRIBUTED(3806933,fffed6314060,python):2025-07-24-11:02:51.778.165 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group 2025-07-24 11:02:51,778 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 
'topk_group': None, 'n_group': None, 'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} 2025-07-24 11:02:51,778 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 1, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} [WARNING] DEVICE(3806933,fffe7ffff060,python):2025-07-24-11:02:52.228.922 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 tp_group is:True dp_group is:True 2025-07-24 11:02:52,286 - mindformers./output/log[mindformers/parallel_core/inference/parallel_state.py:358] - INFO - expert_model_parallel_size(1) is not equal to world_size(2), so we will use 2 as the MOE_tensor_parallel_size. [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:52.289.184 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: tp-0-1 [const vector]{0, 1}, async: 0, submit_now: 1 [WARNING] DEVICE(3806933,fffe7ffff060,python):2025-07-24-11:02:52.571.132 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group [WARNING] DISTRIBUTED(3806933,fffe7ffff060,python):2025-07-24-11:02:52.571.289 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: hccl_world_group [WARNING] DISTRIBUTED(3806933,fffe7ffff060,python):2025-07-24-11:02:52.571.318 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: hccl_world_group end. 
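The parallel_config logged above requests data_parallel=1, model_parallel=2 and pipeline_stage=1 on a 2-rank job. A minimal sanity check for such a config is the standard factorization rule, sketched below (field names mirror the logged dict; the function is illustrative, not mindformers code):

```python
# Sketch: verify that the logged parallel degrees factor the world size.
def check_parallel_config(cfg: dict, world_size: int) -> None:
    dp = cfg.get("data_parallel", 1)
    mp = cfg.get("model_parallel", 1)
    pp = cfg.get("pipeline_stage", 1)
    if dp * mp * pp != world_size:
        raise ValueError(f"dp({dp}) * mp({mp}) * pp({pp}) != world_size({world_size})")

# The run above uses 2 ranks: 1 * 2 * 1 == 2, so the check passes.
check_parallel_config({"data_parallel": 1, "model_parallel": 2, "pipeline_stage": 1}, 2)
```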
[WARNING] DISTRIBUTED(3806933,fffed6314060,python):2025-07-24-11:02:52.571.388 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: hccl_world_group [WARNING] DISTRIBUTED(3806933,fffed6314060,python):2025-07-24-11:02:52.571.988 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: tp-0-1 [WARNING] DEVICE(3806933,fffe7c97c060,python):2025-07-24-11:02:52.801.909 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806933,fffe7c97c060,python):2025-07-24-11:02:52.855.750 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1 [WARNING] DISTRIBUTED(3806933,fffe7c97c060,python):2025-07-24-11:02:52.855.885 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: tp-0-1 [WARNING] DISTRIBUTED(3806933,fffe7c97c060,python):2025-07-24-11:02:52.855.913 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: tp-0-1 end. [WARNING] DISTRIBUTED(3806933,fffed6314060,python):2025-07-24-11:02:52.856.013 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: tp-0-1 [WARNING] DISTRIBUTED(3806933,ffffbe640f30,python):2025-07-24-11:02:52.856.275 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: dp-0 [const vector]{0}, async: 0, submit_now: 1 [WARNING] DISTRIBUTED(3806933,fffed6314060,python):2025-07-24-11:02:52.857.173 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: dp-0 [WARNING] DEVICE(3806933,fffe6f0bf060,python):2025-07-24-11:02:52.868.392 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for dp-0, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806933,fffe6f0bf060,python):2025-07-24-11:02:52.919.300 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for dp-0 [WARNING] DISTRIBUTED(3806933,fffe6f0bf060,python):2025-07-24-11:02:52.919.449 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: dp-0 [WARNING] DISTRIBUTED(3806933,fffe6f0bf060,python):2025-07-24-11:02:52.919.477 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: dp-0 end. 
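The groups created above follow the usual rank-grid layout: with model_parallel=2 on 2 ranks, consecutive ranks form the tensor-parallel group tp-0-1 = [0, 1], and each tensor-parallel index forms a one-member data-parallel group (dp-0 = [0] on this rank; the other rank gets the complementary group). A small illustrative derivation of that grid (pure Python, not MindFormers source):

```python
# Derive tensor-parallel and data-parallel rank groups from a rank grid.
def build_groups(world_size: int, model_parallel: int):
    assert world_size % model_parallel == 0
    tp_groups = [list(range(i, i + model_parallel))
                 for i in range(0, world_size, model_parallel)]
    dp_groups = [list(range(j, world_size, model_parallel))
                 for j in range(model_parallel)]
    return tp_groups, dp_groups

# Matches the log: tp group [0, 1]; dp groups [0] and [1].
print(build_groups(2, 2))  # ([[0, 1]], [[0], [1]])
```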
[WARNING] DISTRIBUTED(3806933,fffed6314060,python):2025-07-24-11:02:52.919.551 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: dp-0 [WARNING] ME(3806933:281473875971888,MainProcess):2025-07-24-11:02:52.935.066 [mindspore/ops/primitive.py:220] The in_strategy/in_layout of the operator in your network will not take effect in stand_alone mode. This means the shard function called in the network is ignored. If you want to enable it, please use semi auto or auto parallel mode by context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL or context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL) data_parallel_group:dp-0 tensor_model_parallel_group:tp-0-1 2025-07-24 11:02:53,576 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-24 11:02:53,576 - mindformers./output/log[/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:85] - INFO - ----------------Transform and load checkpoint---------------- Weight loading: 0%| | 0/24 [00:00<?, ?it/s] parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> calculate sim is:1.0 2025-07-24 11:03:25,146 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:25,147 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:25,147 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:25,148 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama.
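In the Generation Config above, num_beams is 1 and do_sample is False, which is why the next line selects GREEDY_SEARCH. A condensed sketch of that selection rule (the exact branching inside mindformers' text_generator.py may differ):

```python
# Sketch of the decode-mode choice implied by the logged config: greedy when
# sampling is off and a single beam is used. Branch details are assumed.
def generation_mode(cfg: dict) -> str:
    if cfg.get("num_beams", 1) > 1:
        return "BEAM_SEARCH"
    if cfg.get("do_sample", False):
        return "SAMPLE"
    return "GREEDY_SEARCH"

assert generation_mode({"num_beams": 1, "do_sample": False}) == "GREEDY_SEARCH"
```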
[WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:25.153.901 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:25.154.056 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:25.154.509 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:25.161.130 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:25.171.762 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:25.171.907 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:25.172.229 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:25.172.341 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:25.172.512 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:26,348 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.1993143558502197 s; generated tokens: 416 tokens; generate speed: 346.86485488209524 tokens/s 2025-07-24 11:03:26,349 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0017671585083007812 s; prefill predict time: 0.010278463363647461 s; prefill post time: 0.006461620330810547 s; decode prepare time: 0.0009049674839649386 s; decode predict time: 0.0057804561128803325 s; decode post time: 0.004656923627390445 s 2025-07-24 11:03:26,353 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. 
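The throughput line above is plain arithmetic, generated tokens divided by wall time (numbers copied from the log):

```python
# Reproduce the "generate speed" figure logged above.
generated_tokens = 416
total_time_s = 1.1993143558502197
print(f"generate speed: {generated_tokens / total_time_s:.2f} tokens/s")
# -> generate speed: 346.86 tokens/s
```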
parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] return quick_sort(left) + middle + quick_sort(right) # 示例输入 arr = [ parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr else: pivot = arr[0] left = [x for x in arr[1:] if x < pivot] right = [x for x in arr[1:] if x >= pivot] return quick_sort(left) + [pivot] + quick_sort(right) # 示例输入 arr = [3,6,8,1 calculate sim is:1.0
2025-07-24 11:03:26,381 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:26,382 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:26,382 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:26,383 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:26.387.455 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:26.387.609 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:26.388.018 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:26.393.999 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:26.408.879 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:26.409.029 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:26.409.358 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:26.409.443 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806933,fffdaa662060,python):2025-07-24-11:03:26.409.607 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty
2025-07-24 11:03:27,956 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.5726771354675293 s; generated tokens: 816 tokens; generate speed: 518.8604714834985 tokens/s 2025-07-24 11:03:27,956 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0013928413391113281 s; prefill predict time: 0.010842323303222656 s; prefill post time: 0.008968353271484375 s; decode prepare time: 0.0010875522500217551 s; decode predict time: 0.005750880241394043 s; decode post time: 0.008397107077117013 s 2025-07-24 11:03:27,961 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines.
parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to live a good life, to contribute positively to society, and to make a positive impact on the world around us. Others may believe that the meaning of life is to seek knowledge and understanding parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0
parallel_qwen2_0.5b_predict_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to live a good life, to make a positive impact on the world, and to contribute to society in some way. Others may believe that the meaning of life is to seek knowledge and understanding parallel_qwen2_0.5b_predict_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0
large_models/parallel_qwen2_0_5b_predict_mp2/scheduler.log0000644000175100017500000002507515040321200024371 0ustar jenkinsHwHiAiUser/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) 2025-07-24 11:02:48,491 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config moe_config is empty. 2025-07-24 11:02:48,526 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on [WARNING] ME(3806921:281473051811632,MainProcess):2025-07-24-11:02:48.527.412 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(3806921:281473051811632,MainProcess):2025-07-24-11:02:48.527.595 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead. [WARNING] ME(3806921:281473051811632,MainProcess):2025-07-24-11:02:48.527.696 [mindspore/context.py:1346] For 'context.set_context', when set the argument 'max_device_memory', the argument 'device_target' only supports devices in '['Ascend', 'GPU']', but got 'CPU', ignore it. [WARNING] ME(3806921:281473051811632,MainProcess):2025-07-24-11:02:48.527.777 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead. [WARNING] ME(3806921:281473051811632,MainProcess):2025-07-24-11:02:48.527.868 [mindspore/context.py:1655] For 'context.set_context', 'enable_graph_kernel' parameter is deprecated, and will be removed in the next version. Please use jit_config={'jit_level': 'O1'} instead. [WARNING] ME(3806921:281473051811632,MainProcess):2025-07-24-11:02:48.527.989 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. [WARNING] ME(3806921:281473051811632,MainProcess):2025-07-24-11:02:48.528.070 [mindspore/context.py:1346] For 'context.set_context', when set the argument 'ascend_config', the argument 'device_target' only supports devices in '['Ascend']', but got 'CPU', ignore it. [WARNING] ME(3806921:281473051811632,MainProcess):2025-07-24-11:02:48.528.141 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(3806921:281473051811632,MainProcess):2025-07-24-11:02:48.528.252 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. 2025-07-24 11:02:48,528 - mindformers./output/log[mindformers/core/context/parallel.py:73] - INFO - full_batch is set to False for non-parallel modes [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:02:48.529.378 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200). [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:02:49.029.487 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200). [WARNING] DISTRIBUTED(3806921,fffefeaa9060,python):2025-07-24-11:02:49.243.229 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 0(role: MS_WORKER), rank id: 0, device id: 0, hostname: devserver-dfb8-1, ip: 127.0.0.1 is registered successfully. Currently registered node number: 1, expected node number: 2 [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:02:49.529.566 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/1200). 
[WARNING] DISTRIBUTED(3806921,fffefeaa9060,python):2025-07-24-11:02:49.689.458 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:768] ReassignNodeRank] Rank ids are already set by numeric node ids. No need to reassign them. [WARNING] DISTRIBUTED(3806921,fffefeaa9060,python):2025-07-24-11:02:49.689.498 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 1(role: MS_WORKER), rank id: 1, device id: 1, hostname: devserver-dfb8-1, ip: 127.0.0.1 is registered successfully. Currently registered node number: 2, expected node number: 2 [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:02:50.029.657 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:02:50.029.708 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:02:50.029.721 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:02:55.029.844 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:02:55.029.887 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:00.029.993 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:00.030.039 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:05.030.160 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:05.030.214 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:10.030.335 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:10.030.417 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:15.030.541 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. 
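The scheduler above polls in bounded loops: topology build is retried roughly every 0.5 s up to 1200 times ("retry(n/1200)"), and finalize is retried about every 5 s while worker nodes remain registered. A generic sketch of that bounded-retry pattern (intervals are inferred from the log timestamps; the actual implementation is C++ and differs):

```python
# Generic bounded retry, shaped like the scheduler's
# "Topology build timed out., retry(n/1200)" loop above.
import time

def retry_until(predicate, interval_s: float, max_retries: int) -> bool:
    """Poll predicate() until it returns True or retries run out."""
    for attempt in range(1, max_retries + 1):
        if predicate():
            return True
        print(f"retry({attempt}/{max_retries})")
        time.sleep(interval_s)
    return False

# e.g. retry_until(all_nodes_registered, interval_s=0.5, max_retries=1200),
# where all_nodes_registered() checks that every expected worker has joined.
```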
[WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:15.030.597 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:20.030.717 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:20.030.774 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:25.030.895 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806921,ffff8d445f30,python):2025-07-24-11:03:25.030.939 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806921,fffefeaa9060,python):2025-07-24-11:03:28.205.176 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:323] ProcessUnregister] Node 1 has unregistered. [WARNING] DISTRIBUTED(3806921,fffefeaa9060,python):2025-07-24-11:03:28.205.436 [mindspore/ccsrc/distributed/rpc/tcp/connection.cc:79] SocketEventHandler] Event value fd: 20, events: 8193, state: 4, errcode: 11, errno: 11 Resource temporarily unavailable, remote peer: 127.0.0.1:38720, type:1, remote: 1, count: 1, this peer: 127.0.0.1:8222, please check remote peer address: 127.0.0.1:38720 in worker log to find out which worker disconnected. [WARNING] DISTRIBUTED(3806921,fffefeaa9060,python):2025-07-24-11:03:28.265.408 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:323] ProcessUnregister] Node 0 has unregistered. large_models/parallel_deepseek_r1_bf16_predict_mp2/0000700000175100017500000000000015040321111023032 5ustar jenkinsHwHiAiUserlarge_models/parallel_deepseek_r1_bf16_predict_mp2/worker_1.log0000644000175100017500000030551715040321253025322 0ustar jenkinsHwHiAiUser/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) 2025-07-24 11:02:48,611 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config runner_config is empty. 
2025-07-24 11:02:48,644 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on
[WARNING] ME(3806946:281473008537392,MainProcess):2025-07-24-11:02:48.646.239 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(3806946:281473008537392,MainProcess):2025-07-24-11:02:48.647.057 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(3806946:281473008537392,MainProcess):2025-07-24-11:02:48.647.541 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:48.649.646 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 20 source: 127.0.0.1:53600, destination: 127.0.0.1:8230
[WARNING] DISTRIBUTED(3806946,fffefb15c060,python):2025-07-24-11:02:48.649.659 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53600 to 127.0.0.1:8230 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:48.649.698 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8230 to be connected...Retry number: 1
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:49.149.876 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53606, destination: 127.0.0.1:8230
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:49.149.901 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8230 to be connected...Retry number: 2
[WARNING] DISTRIBUTED(3806946,fffefc15e060,python):2025-07-24-11:02:49.149.911 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53606 to 127.0.0.1:8230 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:49.650.342 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200).
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:50.150.452 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized.
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:50.150.478 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1
[MS_ALLOC_CONF]Runtime config: enable_vmm:False
[WARNING] DEVICE(3806946,ffff8ab00f30,python):2025-07-24-11:02:50.368.270 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:155] Initialize] Reserved memory size for other components(2101346304) is less than recommend size(4067995648), It may lead to Out Of Memory in HCCL or other components, Please double check context key 'variable_memory_max_size'/'max_device_memory'
[WARNING] DEVICE(3806946,ffff8ab00f30,python):2025-07-24-11:02:51.651.045 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:84] Initialize] Loading LCCL because env MS_ENABLE_LCCL is set to on. Pay attention that LCCL only supports communication group within single node in KernelByKernel for now.
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:51.654.648 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1}, async: 1, submit_now: 1
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:51.654.817 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group
[WARNING] DEVICE(3806946,fffe66ffd060,python):2025-07-24-11:02:51.654.972 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:8230, node_rank:2130706433, total_rank_size:2, local_rank_size2
[WARNING] HCCL_ADPT(3806946,fffe66ffd060,python):2025-07-24-11:02:51.655.036 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo
[WARNING] HCCL_ADPT(3806946,fffe66ffd060,python):2025-07-24-11:02:51.655.057 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:632] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package.
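Editor's note: the world group above (and the named groups such as tp-0-1 created a few entries below) are driven from Python through mindspore.communication. A minimal sketch of that call pattern, assuming the standard MindSpore 2.x API and a distributed launch (e.g. msrun) that has already set the worker environment; the group name and ranks mirror this log:

# Hedged sketch, not code from this repository.
from mindspore.communication import init, create_group, get_rank

init()                          # joins the cluster built in the log above
create_group("tp-0-1", [0, 1])  # tensor-parallel group over ranks 0 and 1
print(f"rank {get_rank()} joined tp-0-1")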
[WARNING] DEVICE(3806946,fffe66ffd060,python):2025-07-24-11:02:51.655.071 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group
[WARNING] DEVICE(3806946,fffe66ffd060,python):2025-07-24-11:02:51.655.339 [mindspore/ccsrc/plugin/device/cpu/hal/hardware/ms_collective_comm_lib.cc:251] QueryUniqueID] Retry to lookup the unique id for group hccl_world_group from the meta server node...Retry time: 399/400, sleep 1
2025-07-24 11:02:51,656 - mindformers./output/log[mindformers/tools/utils.py:185] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_1.ckpt'
2025-07-24 11:02:51,658 - mindformers./output/log[mindformers/core/context/build_context.py:383] - INFO - cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191]
2025-07-24 11:02:51,658 - mindformers./output/log[mindformers/core/context/build_context.py:387] - WARNING - CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores
2025-07-24 11:02:51,659 - mindformers./output/log[mindformers/core/context/build_context.py:395] - INFO - cpu_affinity, rank_id: 1, device_num: 2
2025-07-24 11:02:51,659 - mindformers./output/log[mindformers/core/context/build_context.py:366] - WARNING - custom bind policy affinity_cpu_list must be dict, but got None.
2025-07-24 11:02:51,659 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 256, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 8, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 4, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True}
2025-07-24 11:02:51,660 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 1, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1}
[WARNING] DISTRIBUTED(3806946,fffe66ffd060,python):2025-07-24-11:02:52.155.608 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group
[WARNING] DEVICE(3806946,fffe667fc060,python):2025-07-24-11:02:52.170.688 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0
2025-07-24 11:02:52,300 - mindformers./output/log[mindformers/version_control.py:119] - INFO - The Lazy Inline compilation acceleration feature does not support single-card mode.This feature is disabled by default. ENABLE_LAZY_INLINE=1 does not take effect.
2025-07-24 11:02:52,364 - mindformers./output/log[mindformers/parallel_core/inference/parallel_state.py:358] - INFO - expert_model_parallel_size(1) is not equal to world_size(2), so we will use 2 as the MOE_tensor_parallel_size.
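Editor's note: the parallel_config above implies the number of devices this job needs, which is why world_size is 2 here. A minimal sketch of that arithmetic, assuming the usual MindFormers convention that the product of the parallel degrees must equal the device count:

# Hedged sketch: device count implied by the parallel_config logged above.
# Assumption: required devices = dp * mp * cp * pp (MindFormers convention).
cfg = {'data_parallel': 1, 'model_parallel': 2,
       'context_parallel': 1, 'pipeline_stage': 1}
required = (cfg['data_parallel'] * cfg['model_parallel']
            * cfg['context_parallel'] * cfg['pipeline_stage'])
assert required == 2  # matches the two registered MS_WORKER nodes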
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:52.366.853 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: tp-0-1 [const vector]{0, 1}, async: 0, submit_now: 1
[WARNING] DEVICE(3806946,fffe667fc060,python):2025-07-24-11:02:52.714.043 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group
[WARNING] DISTRIBUTED(3806946,fffe667fc060,python):2025-07-24-11:02:52.714.179 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: hccl_world_group
[WARNING] DISTRIBUTED(3806946,fffe667fc060,python):2025-07-24-11:02:52.714.204 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: hccl_world_group end.
[WARNING] DISTRIBUTED(3806946,fffe66ffd060,python):2025-07-24-11:02:52.714.271 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: hccl_world_group
[WARNING] DISTRIBUTED(3806946,fffe66ffd060,python):2025-07-24-11:02:52.714.668 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: tp-0-1
[WARNING] DEVICE(3806946,fffe667fc060,python):2025-07-24-11:02:52.726.364 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1, hcclBufferSize is 200 MB, hcclDeterministic is 0
[WARNING] DEVICE(3806946,fffe667fc060,python):2025-07-24-11:02:52.959.235 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1
[WARNING] DISTRIBUTED(3806946,fffe667fc060,python):2025-07-24-11:02:52.959.333 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: tp-0-1
[WARNING] DISTRIBUTED(3806946,fffe667fc060,python):2025-07-24-11:02:52.959.356 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: tp-0-1 end.
[WARNING] DISTRIBUTED(3806946,fffe66ffd060,python):2025-07-24-11:02:52.959.400 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: tp-0-1
[WARNING] ME(3806946:281473008537392,MainProcess):2025-07-24-11:02:53.398.589 [mindspore/ops/primitive.py:220] The in_strategy/in_layout of the operator in your network will not take effect in stand_alone mode. This means the the shard function called in the network is ignored. If you want to enable it, please use semi auto or auto parallel mode by context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL or context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL)
2025-07-24 11:02:53,406 - mindformers./output/log[mindformers/models/utils.py:205] - INFO - num_layers per stage: [[4]]
2025-07-24 11:02:53,406 - mindformers./output/log[mindformers/models/utils.py:206] - INFO - Accumulated num_layers per stage: [[4]]
2025-07-24 11:02:53,406 - mindformers./output/log[mindformers/models/utils.py:208] - INFO - Pipeline id list with start_stage: [0, 0, 0, 0]
2025-07-24 11:02:53,407 - mindformers./output/log[mindformers/models/utils.py:209] - INFO - Interleave id list: [0, 0, 0, 0]
2025-07-24 11:02:53,407 - mindformers./output/log[mindformers/models/utils.py:227] - INFO - Formative layer_recompute: [[0]]
2025-07-24 11:02:53,407 - mindformers./output/log[mindformers/models/utils.py:229] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority.
2025-07-24 11:02:53,407 - mindformers./output/log[mindformers/models/utils.py:235] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0]], 'feed_forward\\.w1\\.activation\\.silu': [[0]]}
2025-07-24 11:02:53,407 - mindformers./output/log[mindformers/models/utils.py:236] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0]]}
2025-07-24 11:02:53,408 - mindformers./output/log[mindformers/models/utils.py:237] - INFO - Formative select_recompute_exclude: {}
2025-07-24 11:02:53,408 - mindformers./output/log[mindformers/models/utils.py:238] - INFO - Formative select_comm_recompute_exclude: {}
2025-07-24 11:02:53,427 - mindformers./output/log[mindformers/research/deepseek3/deepseek3_model_infer.py:927] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block.
2025-07-24 11:02:53,437 - mindformers./output/log[mindformers/version_control.py:76] - INFO - Predict enable lazy inline.
[WARNING] DISTRIBUTED(3806946,ffff8ab00f30,python):2025-07-24-11:02:53.461.243 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: moe_tp-0-1 [const vector]{0, 1}, async: 0, submit_now: 1
[WARNING] DISTRIBUTED(3806946,fffe66ffd060,python):2025-07-24-11:02:53.461.880 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: moe_tp-0-1
[WARNING] DEVICE(3806946,fffe667fc060,python):2025-07-24-11:02:53.473.652 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for moe_tp-0-1, hcclBufferSize is 200 MB, hcclDeterministic is 0
[WARNING] DEVICE(3806946,fffe667fc060,python):2025-07-24-11:02:53.710.018 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for moe_tp-0-1
[WARNING] DISTRIBUTED(3806946,fffe667fc060,python):2025-07-24-11:02:53.710.122 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: moe_tp-0-1
[WARNING] DISTRIBUTED(3806946,fffe667fc060,python):2025-07-24-11:02:53.710.146 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: moe_tp-0-1 end.
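Editor's note: the escaped dots in the select_recompute keys above indicate they are regular expressions matched against layer names. A minimal sketch of that matching, assuming re.search semantics; the layer name below is illustrative only, not taken from this log:

# Hedged sketch: keys of select_recompute are regex patterns, values are
# per-stage layer lists. The matched name here is a made-up example.
import re

select_recompute = {r'feed_forward\.mul': [[0]],
                    r'feed_forward\.w1\.activation\.silu': [[0]]}
name = 'model.layers.0.feed_forward.mul'
matched = [p for p in select_recompute if re.search(p, name)]
print(matched)  # ['feed_forward\\.mul']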
[WARNING] DISTRIBUTED(3806946,fffe66ffd060,python):2025-07-24-11:02:53.710.196 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: moe_tp-0-1
2025-07-24 11:02:53,742 - mindformers./output/log[mindformers/version_control.py:76] - INFO - Predict enable lazy inline.
2025-07-24 11:02:53,797 - mindformers./output/log[mindformers/version_control.py:76] - INFO - Predict enable lazy inline.
2025-07-24 11:02:53,852 - mindformers./output/log[mindformers/version_control.py:76] - INFO - Predict enable lazy inline.
2025-07-24 11:02:53,858 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None.
2025-07-24 11:02:53,859 - mindformers./output/log[mindformers/research/deepseek3/deepseek3_model_infer.py:1247] - INFO - Predict run mode:True
2025-07-24 11:02:53,859 - mindformers./output/log[/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:274] - INFO - ----------------Transform and load checkpoint----------------
[WARNING] ME(3806946:281473008537392,MainProcess):2025-07-24-11:03:32.980.928 [mindspore/train/serialization.py:1789] For 'load_param_into_net', 23 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint.
[WARNING] ME(3806946:281473008537392,MainProcess):2025-07-24-11:03:32.981.514 [mindspore/train/serialization.py:1793] ['model.layers.0.attention.infer_attention.paged_attention_mgr.key_cache', 'model.layers.1.attention.infer_attention.paged_attention_mgr.key_cache', 'model.layers.1.feed_forward.routed_experts.ffn.w1.weight', 'model.layers.1.feed_forward.routed_experts.ffn.w3.weight', 'model.layers.1.feed_forward.routed_experts.ffn.w2.weight', 'model.layers.1.feed_forward.routed_experts.router.e_score_correction_bias', 'model.layers.1.feed_forward.routed_experts.router.dense.weight', 'model.layers.1.feed_forward.routed_experts.router.router.topk_bias', 'model.layers.1.feed_forward.shared_experts.w1.weight', 'model.layers.1.feed_forward.shared_experts.w3.weight', 'model.layers.1.feed_forward.shared_experts.w2.weight', 'model.layers.2.attention.infer_attention.paged_attention_mgr.key_cache', 'model.layers.2.feed_forward.routed_experts.ffn.w1.weight', 'model.layers.2.feed_forward.routed_experts.ffn.w3.weight', 'model.layers.2.feed_forward.routed_experts.ffn.w2.weight', 'model.layers.2.feed_forward.routed_experts.router.e_score_correction_bias', 'model.layers.2.feed_forward.routed_experts.router.dense.weight', 'model.layers.2.feed_forward.routed_experts.router.router.topk_bias', 'model.layers.2.feed_forward.shared_experts.w1.weight', 'model.layers.2.feed_forward.shared_experts.w3.weight', 'model.layers.2.feed_forward.shared_experts.w2.weight', 'model.layers.3.attention.infer_attention.paged_attention_mgr.key_cache', 'model.layers.3.feed_forward.routed_experts.router.router.topk_bias'] are not loaded.
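Editor's note: the two serialization warnings above, and the param_not_load / ckpt_not_load lists printed next, correspond to the return value of mindspore.load_param_into_net. A minimal sketch of that call pattern, assuming the MindSpore 2.x API; net and ckpt_path are placeholders, not values from this run:

# Hedged sketch, not code from this repository.
import mindspore as ms

def load_and_report(net, ckpt_path):
    """Load a checkpoint into `net` and report unmatched parameters."""
    param_dict = ms.load_checkpoint(ckpt_path)
    param_not_load, ckpt_not_load = ms.load_param_into_net(net, param_dict)
    if param_not_load:  # net parameters with no matching checkpoint entry
        print(f"param_not_load: {param_not_load}")
    if ckpt_not_load:   # checkpoint entries with no matching net parameter
        print(f"ckpt_not_load: {ckpt_not_load}")
    return param_not_load, ckpt_not_load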
param_not_load: ['model.layers.0.attention.infer_attention.paged_attention_mgr.key_cache', 'model.layers.1.attention.infer_attention.paged_attention_mgr.key_cache', 'model.layers.1.feed_forward.routed_experts.ffn.w1.weight', 'model.layers.1.feed_forward.routed_experts.ffn.w3.weight', 'model.layers.1.feed_forward.routed_experts.ffn.w2.weight', 'model.layers.1.feed_forward.routed_experts.router.e_score_correction_bias', 'model.layers.1.feed_forward.routed_experts.router.dense.weight', 'model.layers.1.feed_forward.routed_experts.router.router.topk_bias', 'model.layers.1.feed_forward.shared_experts.w1.weight', 'model.layers.1.feed_forward.shared_experts.w3.weight', 'model.layers.1.feed_forward.shared_experts.w2.weight', 'model.layers.2.attention.infer_attention.paged_attention_mgr.key_cache', 'model.layers.2.feed_forward.routed_experts.ffn.w1.weight', 'model.layers.2.feed_forward.routed_experts.ffn.w3.weight', 'model.layers.2.feed_forward.routed_experts.ffn.w2.weight', 'model.layers.2.feed_forward.routed_experts.router.e_score_correction_bias', 'model.layers.2.feed_forward.routed_experts.router.dense.weight', 'model.layers.2.feed_forward.routed_experts.router.router.topk_bias', 'model.layers.2.feed_forward.shared_experts.w1.weight', 'model.layers.2.feed_forward.shared_experts.w3.weight', 'model.layers.2.feed_forward.shared_experts.w2.weight', 'model.layers.3.attention.infer_attention.paged_attention_mgr.key_cache', 'model.layers.3.feed_forward.routed_experts.router.router.topk_bias'], ckpt_not_load: ['model.layers.1.feed_forward.w1.weight', 'model.layers.1.feed_forward.w3.weight', 'model.layers.1.feed_forward.w2.weight', 'model.layers.2.feed_forward.w1.weight', 'model.layers.2.feed_forward.w3.weight', 'model.layers.2.feed_forward.w2.weight']
2025-07-24 11:03:33,595 - mindformers./output/log[mindformers/models/tokenization_utils_base.py:1807] - WARNING - No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template.
2025-07-24 11:03:33,660 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 1.0, 'top_k': 1, 'top_p': 1, 'repetition_penalty': 1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 1, 'bos_token_id': 0, 'eos_token_id': [1], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True}
2025-07-24 11:03:33,662 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**.
2025-07-24 11:03:33,662 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success.
2025-07-24 11:03:33,663 - mindformers./output/log[mindformers/research/deepseek3/deepseek3_model_infer.py:1419] - INFO - Set dynamic input for DeepseekV3.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) TotalTime = 10.2606, [24] [bootstrap]: 0.00414295 [type_inference]: 3.5839 [auto_monad]: 0.030906 [graph_reusing]: 0.0896324 [inline]: 3.93996e-06 [add_attr]: 0.0303593, [1] [add_attr_with_inline]: 0.030339, [1] [Cycle 1]: 0.0102953, [2] [tag_attr]: 0.00728943 [meta_addattr_fg_expand]: 0.00290694 [parallel-infer-symbol]: 4.27e-06 [pre_auto_parallel]: 0.00894544 [insert-virtual-dataset]: 5.81995e-06 [parallel-infer-symbol-second]: 1.32003e-06 [dataset_repeat_opt]: 2.24996e-06 [pipeline_split]: 2.40002e-06 [optimize]: 1.00916, [53] [py_interpret_to_execute]: 0.00826204 [rewriter_before_opt_a]: 0.0402562 [opt_a]: 0.862275, [3] [Cycle 1]: 0.680568, [45] [expand_dump_flag]: 0.000609204 [switch_simplify]: 0.0146618 [loop_unroll]: 0.0094148 [a_1]: 0.278374 [invalid_dout_check]: 0.00162714 [recompute_prepare]: 0.00159886 [updatestate_depend_eliminate]: 0.00515522 [updatestate_assign_eliminate]: 0.000904596 [updatestate_loads_eliminate]: 0.00267248 [parameter_eliminate]: 5.05999e-06 [a_2]: 0.0291739 [accelerated_algorithm]: 0.00180039 [shard]: 3.45998e-06 [meta_shard_fg_expand]: 0.000737285 [shard_inline]: 0.000945446 [merge_send_recv]: 0.000766775 [auto_parallel]: 0.000685475 [parallel]: 1.29e-05 [flash_sp]: 0.000377112 [merge_comm]: 0.000700915 [allreduce_fusion]: 0.000688235 [matmul_add_comm_reduction]: 0.000854505 [allreduce_slice_to_reducescatter]: 1.25996e-06 [virtual_shard_identity]: 0.000948337 [virtual_dataset]: 0.000898946 [get_grad_eliminate_]: 0.000901296 [virtual_output]: 0.000899106 [merge_forward]: 0.000695484 [offload_activation]: 0.000903696 [cell_reuse_recompute_pass]: 2.60992e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00165162 [merge_recompute_call_nodes]: 2.22994e-06 [before_grad]: 0.00159146 [set_forward_comm_id_for_comm_node_pass]: 0.000776755 [meta_fg_expand]: 0.00127898 [flash_sp_send_recv_attached]: 5.02006e-06 [receive_attached]: 1.762e-05 [after_resolve]: 0.000998656 [a_after_grad]: 0.00147887 [renormalize]: 0.299993 [add_forward_monad_depend]: 1.739e-05 [auto_monad_grad]: 3.91004e-06 [auto_monad_eliminator]: 0.00468799 [cse]: 0.00392916 [a_3]: 0.00633629 [Cycle 2]: 0.114071, [45] [expand_dump_flag]: 3.99002e-06 [switch_simplify]: 0.000869826 [loop_unroll]: 0.000858196 [a_1]: 0.0246977 [invalid_dout_check]: 0.000733655 [recompute_prepare]: 0.000802795 [updatestate_depend_eliminate]: 0.000622794 [updatestate_assign_eliminate]: 0.000603904 
[updatestate_loads_eliminate]: 0.000600554 [parameter_eliminate]: 4.431e-06 [a_2]: 0.0128166 [accelerated_algorithm]: 0.00100154 [shard]: 3.06999e-06 [meta_shard_fg_expand]: 0.000312822 [shard_inline]: 0.000846615 [merge_send_recv]: 0.000651254 [auto_parallel]: 0.000615254 [parallel]: 1.146e-05 [flash_sp]: 5.13997e-06 [merge_comm]: 0.000632864 [allreduce_fusion]: 0.000614234 [matmul_add_comm_reduction]: 0.000762195 [allreduce_slice_to_reducescatter]: 1.29e-06 [virtual_shard_identity]: 0.000836816 [virtual_dataset]: 0.000805275 [get_grad_eliminate_]: 0.000807306 [virtual_output]: 0.000790895 [merge_forward]: 0.000633904 [offload_activation]: 0.000779625 [cell_reuse_recompute_pass]: 4.11994e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00142799 [merge_recompute_call_nodes]: 2.02004e-06 [before_grad]: 0.0014028 [set_forward_comm_id_for_comm_node_pass]: 0.000709515 [meta_fg_expand]: 0.000657834 [flash_sp_send_recv_attached]: 2.73995e-06 [receive_attached]: 3.65009e-06 [after_resolve]: 0.000918456 [a_after_grad]: 0.00130471 [renormalize]: 0.0439665 [add_forward_monad_depend]: 1.093e-05 [auto_monad_grad]: 3.26999e-06 [auto_monad_eliminator]: 0.00128841 [cse]: 0.00315247 [a_3]: 0.00590789 [Cycle 3]: 0.067607, [45] [expand_dump_flag]: 3.26999e-06 [switch_simplify]: 0.000806605 [loop_unroll]: 0.000804666 [a_1]: 0.0231525 [invalid_dout_check]: 0.000628554 [recompute_prepare]: 0.000791355 [updatestate_depend_eliminate]: 0.000606654 [updatestate_assign_eliminate]: 0.000598934 [updatestate_loads_eliminate]: 0.000603844 [parameter_eliminate]: 3.88001e-06 [a_2]: 0.0128816 [accelerated_algorithm]: 0.00120487 [shard]: 2.84996e-06 [meta_shard_fg_expand]: 0.000251392 [shard_inline]: 0.000839895 [merge_send_recv]: 0.000656435 [auto_parallel]: 0.000618774 [parallel]: 9.71998e-06 [flash_sp]: 2.16998e-06 [merge_comm]: 0.000644634 [allreduce_fusion]: 0.000645624 [matmul_add_comm_reduction]: 0.000732864 [allreduce_slice_to_reducescatter]: 8.70903e-07 [virtual_shard_identity]: 0.000854595 [virtual_dataset]: 0.000842656 [get_grad_eliminate_]: 0.000805505 [virtual_output]: 0.000845706 [merge_forward]: 0.000616584 [offload_activation]: 0.000754265 [cell_reuse_recompute_pass]: 3.65998e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00143912 [merge_recompute_call_nodes]: 1.67009e-06 [before_grad]: 0.0014554 [set_forward_comm_id_for_comm_node_pass]: 0.000711745 [meta_fg_expand]: 0.000629184 [flash_sp_send_recv_attached]: 2.62004e-06 [receive_attached]: 3.33006e-06 [after_resolve]: 0.000917506 [a_after_grad]: 0.00134535 [renormalize]: 1.59955e-07 [add_forward_monad_depend]: 3.35998e-06 [auto_monad_grad]: 1.61002e-06 [auto_monad_eliminator]: 0.00108683 [cse]: 0.00237681 [a_3]: 0.0058813 [py_interpret_to_execute_after_opt_a]: 0.000819685 [slice_cell_reuse_recomputed_activation]: 3.09001e-06 [rewriter_after_opt_a]: 0.0277043 [convert_after_rewriter]: 0.000707125 [order_py_execute_after_rewriter]: 0.000523303 [opt_b]: 0.0259996, [1] [Cycle 1]: 0.02599, [7] [b_1]: 0.0207348 [b_2]: 0.000817485 [updatestate_depend_eliminate]: 0.000622264 [updatestate_assign_eliminate]: 0.000618494 [updatestate_loads_eliminate]: 0.000614874 [renormalize]: 1.0099e-06 [cse]: 0.00247277 [optimize_parallel_all_gather_comm]: 0.00115248 [overlap_param_gather]: 1.06799e-05 [cconv]: 0.000343572 [loop_unroll]: 0.00185375 [opt_after_cconv]: 0.00884002, [1] [Cycle 1]: 0.00883143, [7] [c_1]: 0.00434606 [parameter_eliminate]: 3.35008e-06 [updatestate_depend_eliminate]: 0.000712755 [updatestate_assign_eliminate]: 0.000626274 
[updatestate_loads_eliminate]: 0.000620674 [cse]: 0.00242634 [renormalize]: 7.00005e-07 [remove_dup_value]: 0.00469013 [tuple_transform]: 0.00598844, [1] [Cycle 1]: 0.00598063, [2] [d_1]: 0.00595356 [renormalize]: 5.10016e-07 [partial_unused_args_eliminate]: 4.18001e-06 [add_cache_embedding]: 0.000683235 [add_recomputation]: 0.00413416 [cse_after_recomputation]: 0.00121317, [1] [Cycle 1]: 0.00120516, [1] [cse]: 0.00118672 [environ_conv]: 0.000360893 [swap_dp_allreduce_reducescatter]: 0.000625654 [bias_add_comm_swap]: 3.43006e-06 [label_micro_interleaved_index]: 6.32997e-06 [label_fine_grained_interleaved_index]: 3.21004e-06 [merge_cast_opt]: 2.02993e-06 [slice_recompute_activation]: 1.91003e-06 [micro_interleaved_order_control]: 2.51003e-06 [assign_add_opt]: 1.57499e-05 [ForceFp32Comm]: 1.11992e-06 [remove_cast_before_assign_add]: 1.16008e-06 [full_micro_interleaved_order_control]: 2.86999e-06 [reorder_send_recv_between_fp_bp]: 2.43005e-06 [comm_op_add_attrs]: 1.06997e-06 [add_comm_op_reuse_tag]: 9.69972e-07 [interleave_split_concat_branches]: 1.37999e-06 [interleave_parallel_branches]: 1.25996e-06 [overlap_opt_shard_in_pipeline]: 1.357e-05 [overlap_opt_shard_grad_in_pipeline]: 2.89001e-06 [control_data_broadcast_order]: 0.00103267 [grouped_pairwise_exchange_alltoall]: 1.61002e-06 [offloading_packed_experts]: 0.000234591 [overlap_recompute_and_grad_model_parallel]: 0.000227382 [overlap_grad_matmul_and_grad_allreduce]: 2.24996e-06 [overlap_recompute_allgather_and_fa_grad]: 1.46998e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 0.000226641 [overlap_grad_flash_sp]: 0.00132391 [begin_end_overlap_inline]: 1.05996e-06 [split_matmul_comm_elemetwise]: 2.55997e-06 [split_layernorm_comm]: 1.92004e-06 [handle_group_info]: 1.1099e-06 [symbol_engine_optimizer]: 0.00911001, [1] [Cycle 1]: 0.00910265, [6] [build]: 0.0042297 [elim_shapecalc]: 0.000769965 [elim_not_effective]: 0.00165589 [opt_reshape]: 0.000965667 [fold_const_symbol]: 0.00138071 [renormalize]: 6.79982e-07 [detach_backward]: 3.09991e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 0.00113003 [get_jit_bprop_graph]: 2.03005e-06 [rewriter_after_jit_bprop_graph]: 5.01995e-06 [opt_after_jit_grad]: 0.00248957 [distribtued_split]: 0.00145091 [validate]: 0.00109133 [backend_pass]: 2.31003e-06 [task_emit]: 5.49358 [execute]: 1.164e-05 Sums bootstrap : 0.004143s : 0.04% type_inference : 3.583899s : 35.02% auto_monad : 0.030906s : 0.30% graph_reusing : 0.089632s : 0.88% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.007289s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.002907s : 0.03% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.008945s : 0.09% insert-virtual-dataset : 0.000006s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.008262s : 0.08% optimize.rewriter_before_opt_a : 0.040256s : 0.39% optimize.opt_a.expand_dump_flag : 0.000616s : 0.01% optimize.opt_a.switch_simplify : 0.016338s : 0.16% optimize.opt_a.loop_unroll : 0.011078s : 0.11% optimize.opt_a.a_1 : 0.326224s : 3.19% optimize.opt_a.invalid_dout_check : 0.002989s : 0.03% optimize.opt_a.recompute_prepare : 0.003193s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.006385s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.002107s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.003877s : 0.04% optimize.opt_a.parameter_eliminate : 0.000013s : 0.00% 
optimize.opt_a.a_2 : 0.054872s : 0.54% optimize.opt_a.accelerated_algorithm : 0.004007s : 0.04% optimize.opt_a.shard : 0.000009s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.001301s : 0.01% optimize.opt_a.shard_inline : 0.002632s : 0.03% optimize.opt_a.merge_send_recv : 0.002074s : 0.02% optimize.opt_a.auto_parallel : 0.001920s : 0.02% optimize.opt_a.parallel : 0.000034s : 0.00% optimize.opt_a.flash_sp : 0.000384s : 0.00% optimize.opt_a.merge_comm : 0.001978s : 0.02% optimize.opt_a.allreduce_fusion : 0.001948s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.002350s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.002640s : 0.03% optimize.opt_a.virtual_dataset : 0.002547s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.002514s : 0.02% optimize.opt_a.virtual_output : 0.002536s : 0.02% optimize.opt_a.merge_forward : 0.001946s : 0.02% optimize.opt_a.offload_activation : 0.002438s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000010s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.004519s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.004450s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.002198s : 0.02% optimize.opt_a.meta_fg_expand : 0.002566s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.00% optimize.opt_a.receive_attached : 0.000025s : 0.00% optimize.opt_a.after_resolve : 0.002835s : 0.03% optimize.opt_a.a_after_grad : 0.004129s : 0.04% optimize.opt_a.renormalize : 0.343960s : 3.36% optimize.opt_a.add_forward_monad_depend : 0.000032s : 0.00% optimize.opt_a.auto_monad_grad : 0.000009s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.007063s : 0.07% optimize.opt_a.cse : 0.009458s : 0.09% optimize.opt_a.a_3 : 0.018125s : 0.18% optimize.py_interpret_to_execute_after_opt_a : 0.000820s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.027704s : 0.27% optimize.convert_after_rewriter : 0.000707s : 0.01% optimize.order_py_execute_after_rewriter : 0.000523s : 0.01% optimize.opt_b.b_1 : 0.020735s : 0.20% optimize.opt_b.b_2 : 0.000817s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000622s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000618s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000615s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.002473s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.001152s : 0.01% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000344s : 0.00% optimize.loop_unroll : 0.001854s : 0.02% optimize.opt_after_cconv.c_1 : 0.004346s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000713s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000626s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000621s : 0.01% optimize.opt_after_cconv.cse : 0.002426s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.004690s : 0.05% optimize.tuple_transform.d_1 : 0.005954s : 0.06% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.partial_unused_args_eliminate : 0.000004s : 0.00% optimize.add_cache_embedding : 0.000683s : 0.01% optimize.add_recomputation : 0.004134s : 0.04% optimize.cse_after_recomputation.cse : 0.001187s : 0.01% optimize.environ_conv : 0.000361s : 0.00% 
optimize.swap_dp_allreduce_reducescatter : 0.000626s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000016s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.001033s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000235s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000227s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000227s : 0.00% optimize.overlap_grad_flash_sp : 0.001324s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.004230s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000770s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.001656s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000966s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.001381s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.001130s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.002490s : 0.02% distribtued_split : 0.001451s : 0.01% validate : 0.001091s : 0.01% backend_pass : 0.000002s : 0.00% task_emit : 5.493584s : 53.68% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 
0.087690 20183 0.64% : 0.000563s : 159: substitution.arithmetic_simplify 1.43% : 0.001258s : 262: substitution.cast_eliminate 0.05% : 0.000047s : 24: substitution.depend_value_elim 0.36% : 0.000316s : 492: substitution.elim_not_effective 0.43% : 0.000379s : 506: substitution.float_tuple_getitem_switch 0.51% : 0.000449s : 424: substitution.fold_const_symbol 0.46% : 0.000405s : 613: substitution.graph_param_transform 74.72% : 0.065526s : 1725: substitution.inline 0.81% : 0.000712s : 1664: substitution.j_node_and_user_rematch 0.97% : 0.000854s : 377: substitution.less_batch_normalization 0.26% : 0.000228s : 18: substitution.list_to_tuple_eliminator_ 0.36% : 0.000320s : 636: substitution.load_eliminater 0.68% : 0.000600s : 551: substitution.minmaximum_grad 0.15% : 0.000131s : 62: substitution.opt_reshape 0.03% : 0.000027s : 32: substitution.reduce_all_const_elim 1.01% : 0.000884s : 1664: substitution.remove_not_recompute_node 0.14% : 0.000127s : 237: substitution.replace_old_param 1.76% : 0.001546s : 555: substitution.reshape_eliminate 0.68% : 0.000600s : 302: substitution.switch_simplify 0.05% : 0.000046s : 12: substitution.tile_eliminate 0.03% : 0.000028s : 6: substitution.transpose_eliminate 1.93% : 0.001692s : 753: substitution.tuple_list_convert_item_index_to_positive 1.03% : 0.000906s : 753: substitution.tuple_list_get_item_const_eliminator 1.35% : 0.001183s : 753: substitution.tuple_list_get_item_depend_reorder 3.17% : 0.002782s : 1386: substitution.tuple_list_get_item_eliminator 1.30% : 0.001144s : 753: substitution.tuple_list_get_set_item_eliminator 2.52% : 0.002207s : 2702: substitution.updatestate_pure_node_eliminater 3.11% : 0.002731s : 2762: substitution.updatestate_useless_node_eliminater ------[type_inference.] 3.575276 2 89.94% : 3.215597s : 1: type_inference.infer 10.06% : 0.359680s : 1: type_inference.specialize ------[replace.] 0.023057 2551 6.87% : 0.001584s : 186: replace.cast_eliminate 0.17% : 0.000039s : 4: replace.depend_value_elim 1.13% : 0.000261s : 25: replace.elim_not_effective 63.50% : 0.014641s : 1725: replace.inline 1.04% : 0.000239s : 18: replace.list_to_tuple_eliminator_ 0.53% : 0.000122s : 9: replace.opt_reshape 2.97% : 0.000684s : 59: replace.reshape_eliminate 13.41% : 0.003091s : 302: replace.switch_simplify 10.24% : 0.002360s : 219: replace.tuple_list_get_item_eliminator 0.16% : 0.000037s : 4: replace.updatestate_pure_node_eliminater ------[match.] 0.066967 2551 1.24% : 0.000834s : 186: match.cast_eliminate 0.00% : 0.000003s : 4: match.depend_value_elim 0.04% : 0.000028s : 25: match.elim_not_effective 96.36% : 0.064530s : 1725: match.inline 0.32% : 0.000217s : 18: match.list_to_tuple_eliminator_ 0.12% : 0.000083s : 9: match.opt_reshape 0.33% : 0.000218s : 59: match.reshape_eliminate 0.72% : 0.000485s : 302: match.switch_simplify 0.84% : 0.000564s : 219: match.tuple_list_get_item_eliminator 0.01% : 0.000006s : 4: match.updatestate_pure_node_eliminater ------[predicate.] 
0.074173439327 1.13% : 0.000836s : 5875: predicate.accumulaten_eliminater 0.15% : 0.000112s : 524: predicate.ad_related_special_op_eliminate 0.77% : 0.000569s : 2850: predicate.addn_check_dump 1.10% : 0.000817s : 5875: predicate.addn_zero_filter 1.08% : 0.000804s : 5875: predicate.adjust_all_reduce_mul_add 2.22% : 0.001650s : 8725: predicate.arithmetic_simplify 1.25% : 0.000930s : 6120: predicate.cast_eliminate 0.58% : 0.000431s : 2034: predicate.check_bprop_eliminate 0.78% : 0.000576s : 2850: predicate.compare_switch_simplify 0.07% : 0.000050s : 664: predicate.const_output_eliminate 0.79% : 0.000582s : 2853: predicate.depend_value_elim 1.23% : 0.000916s : 6120: predicate.dict_get_item_const_eliminator 1.32% : 0.000976s : 6120: predicate.dict_get_item_eliminator 1.16% : 0.000862s : 6120: predicate.dict_set_item_eliminator 0.28% : 0.000210s : 1188: predicate.dumpgradient_eliminate 0.06% : 0.000046s : 583: predicate.elim_not_effective 0.17% : 0.000125s : 613: predicate.elim_shapecalc_of_broadcastargs 1.35% : 0.000999s : 6784: predicate.environ_add_const_eliminate 1.33% : 0.000988s : 6784: predicate.environ_get_add_eliminate 1.31% : 0.000974s : 6784: predicate.environ_get_depend_swap 2.10% : 0.001561s : 9634: predicate.environ_get_eliminate 1.31% : 0.000975s : 6784: predicate.environ_get_set_eliminate 1.57% : 0.001164s : 8086: predicate.exchange_switch_depend_value 2.04% : 0.001513s : 8086: predicate.float_depend_g_call 0.78% : 0.000575s : 2850: predicate.float_environ_get_switch 0.94% : 0.000698s : 3514: predicate.float_tuple_getitem_switch 0.06% : 0.000042s : 524: predicate.fold_const_symbol 0.59% : 0.000437s : 2087: predicate.get_grad_eliminate 0.08% : 0.000056s : 613: predicate.graph_param_transform 0.74% : 0.000546s : 2850: predicate.incorporate_call 0.71% : 0.000524s : 2850: predicate.incorporate_call_switch 5.66% : 0.004196s : 20226: predicate.inline 0.70% : 0.000522s : 2087: predicate.inline_without_move 0.22% : 0.000160s : 2087: predicate.j_node_and_user_rematch 0.64% : 0.000478s : 2087: predicate.less_batch_normalization 1.54% : 0.001143s : 7634: predicate.list_to_tuple_eliminator_ 2.75% : 0.002038s : 13560: predicate.load_eliminater 0.24% : 0.000177s : 664: predicate.loop_unroll_after_grad 3.24% : 0.002403s : 10083: predicate.loop_unroll_before_grad 1.57% : 0.001164s : 7448: predicate.make_slice_get_slice_eliminator 0.79% : 0.000582s : 2850: predicate.merge_addn 0.55% : 0.000410s : 2034: predicate.micro_step_allgather_replace 0.57% : 0.000420s : 2034: predicate.mini_step_allgather_replace 1.13% : 0.000839s : 5875: predicate.minmaximum_grad 0.12% : 0.000088s : 524: predicate.mutable_eliminate 0.14% : 0.000106s : 542: predicate.opt_reshape 0.18% : 0.000137s : 664: predicate.parallel_virtual_node 3.19% : 0.002367s : 8086: predicate.partial_defer_inline 1.46% : 0.001086s : 7021: predicate.partial_eliminate 1.12% : 0.000832s : 5875: predicate.print_const_string_wrapper 0.79% : 0.000583s : 2846: predicate.reduce_all_const_elim 1.46% : 0.001080s : 5875: predicate.reduce_eliminate 2.61% : 0.001934s : 13560: predicate.redundant_stop_gradient_eliminater 0.20% : 0.000150s : 2087: predicate.remove_not_recompute_node 1.07% : 0.000792s : 8391: predicate.replace_applicator 0.21% : 0.000153s : 2087: predicate.replace_old_param 0.07% : 0.000051s : 664: predicate.reset_defer_inline 1.26% : 0.000931s : 5934: predicate.reshape_eliminate 0.55% : 0.000409s : 2034: predicate.row_tensor_add_zeros_like 0.20% : 0.000147s : 664: predicate.row_tensor_eliminate 0.66% : 0.000492s : 2034: 
predicate.same_eliminate 0.29% : 0.000215s : 2665: predicate.set_cell_output_no_recompute 0.61% : 0.000450s : 2087: predicate.shard_identity_eliminate 0.27% : 0.000204s : 1188: predicate.special_op_eliminate 0.87% : 0.000646s : 2850: predicate.specialize_transform 0.57% : 0.000424s : 2034: predicate.split_environ_get_set_with_tuple_value 0.44% : 0.000330s : 2087: predicate.stack_unstack_eliminate 0.13% : 0.000093s : 664: predicate.switch_call_monad_eliminater 1.70% : 0.001258s : 8086: predicate.switch_defer_inline 2.19% : 0.001623s : 10120: predicate.switch_layer_defer_inline 5.90% : 0.004379s : 21623: predicate.switch_simplify 1.14% : 0.000847s : 5875: predicate.tile_eliminate 1.10% : 0.000820s : 5875: predicate.transpose_eliminate 1.58% : 0.001175s : 7397: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.001218s : 7397: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.001128s : 7397: predicate.tuple_list_get_item_depend_reorder 2.66% : 0.001975s : 10466: predicate.tuple_list_get_item_eliminator 1.57% : 0.001163s : 7397: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.001784s : 10247: predicate.tuple_list_set_item_eliminator 1.51% : 0.001120s : 7616: predicate.tuple_to_list_eliminator_ 2.77% : 0.002051s : 13564: predicate.updatestate_pure_node_eliminater 3.64% : 0.002698s : 16414: predicate.updatestate_useless_node_eliminater 0.18% : 0.000136s : 664: predicate.value_based_eliminate 0.60% : 0.000443s : 2087: predicate.virtual_dataset_eliminate 0.59% : 0.000439s : 2087: predicate.virtual_output_eliminate 0.19% : 0.000141s : 664: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.240932 2585 66.70% : 0.160694s : 959: func_graph_cloner_run.FuncGraphClonerGraph 1.19% : 0.002864s : 25: func_graph_cloner_run.FuncGraphClonerNode 32.11% : 0.077374s : 1601: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 
12.145490 253 0.00% : 0.000005s : 1: ForceFp32Comm 0.25% : 0.030370s : 1: add_attr 0.25% : 0.030344s : 1: add_attr_with_inline 0.01% : 0.000695s : 1: add_cache_embedding 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.004150s : 1: add_recomputation 0.00% : 0.000019s : 1: assign_add_opt 0.25% : 0.030950s : 1: auto_monad 0.01% : 0.001157s : 1: auto_monad_reorder 0.00% : 0.000016s : 1: backend_pass 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.03% : 0.004240s : 1: bootstrap 0.00% : 0.000353s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.01% : 0.001043s : 1: control_data_broadcast_order 0.01% : 0.000721s : 1: convert_after_rewriter 0.01% : 0.001219s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.00% : 0.000009s : 1: detach_backward 0.01% : 0.001469s : 1: distribtued_split 0.00% : 0.000372s : 1: environ_conv 0.00% : 0.000026s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000008s : 1: get_jit_bprop_graph 0.74% : 0.089695s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000013s : 1: inline 0.00% : 0.000013s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.02% : 0.001866s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.00% : 0.000241s : 1: offloading_packed_experts 0.01% : 0.001049s : 1: opt.transform.loop_unroll_optimizer 3.81% : 0.462169s : 134: opt.transform.opt_a 0.04% : 0.004343s : 1: opt.transform.opt_after_cconv 0.02% : 0.001937s : 2: opt.transform.opt_after_jit_grad 0.18% : 0.021456s : 28: opt.transform.opt_b 0.05% : 0.005949s : 1: opt.transform.opt_trans_graph 0.04% : 0.004760s : 4: opt.transform.symbol_engine_opt 7.10% : 0.862282s : 1: opt_a 0.07% : 0.008845s : 1: opt_after_cconv 0.02% : 0.002505s : 1: opt_after_jit_grad 0.21% : 0.026004s : 1: opt_b 8.31% : 1.009171s : 1: optimize 0.01% : 0.001165s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000534s : 1: order_py_execute_after_rewriter 0.01% : 0.001334s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000232s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000018s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000233s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000011s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.07% : 0.008968s : 1: pre_auto_parallel 0.07% : 0.008280s : 1: py_interpret_to_execute 0.01% : 0.000831s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.004706s : 1: remove_dup_value 1.38% : 0.167693s : 2: renormalize.infer 1.45% : 0.176188s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.23% : 0.027716s : 1: rewriter_after_opt_a 0.33% : 0.040280s : 1: rewriter_before_opt_a 0.00% : 
0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000636s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.009115s : 1: symbol_engine_optimizer 45.23% : 5.493644s : 1: task_emit 0.05% : 0.005994s : 1: tuple_transform 29.51% : 3.583957s : 1: type_inference 0.04% : 0.004290s : 1: validate [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:03:57.828.730 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:03:57.829.320 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:03:57.829.490 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:03:57.829.856 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:03:57.878.854 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:03:57.882.085 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:03:58.585.529 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:03:58.586.166 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:03:58.608.945 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty TotalTime = 4.50075, [24] [bootstrap]: 0.00336455 [type_inference]: 3.03728 [auto_monad]: 0.0295035 [graph_reusing]: 0.0882091 [inline]: 3.80003e-06 [add_attr]: 0.0285597, [1] [add_attr_with_inline]: 0.028536, [1] [Cycle 1]: 0.0101432, [2] [tag_attr]: 0.00706075 [meta_addattr_fg_expand]: 0.00301783 [parallel-infer-symbol]: 4.07e-06 [pre_auto_parallel]: 0.0085073 [insert-virtual-dataset]: 5.2799e-06 [parallel-infer-symbol-second]: 1.51002e-06 [dataset_repeat_opt]: 2.51003e-06 [pipeline_split]: 2.42994e-06 [optimize]: 0.970811, [53] [py_interpret_to_execute]: 0.00861216 [rewriter_before_opt_a]: 0.0388916 [opt_a]: 0.830139, [3] [Cycle 1]: 0.657154, [45] [expand_dump_flag]: 0.000592504 
[switch_simplify]: 0.0134226 [loop_unroll]: 0.0100559 [a_1]: 0.252799 [invalid_dout_check]: 0.00135468 [recompute_prepare]: 0.00154577 [updatestate_depend_eliminate]: 0.00512581 [updatestate_assign_eliminate]: 0.000976036 [updatestate_loads_eliminate]: 0.00294621 [parameter_eliminate]: 5.52996e-06 [a_2]: 0.0276398 [accelerated_algorithm]: 0.00172176 [shard]: 3.35998e-06 [meta_shard_fg_expand]: 0.000673444 [shard_inline]: 0.000932026 [merge_send_recv]: 0.000729924 [auto_parallel]: 0.000689335 [parallel]: 1.353e-05 [flash_sp]: 0.000387223 [merge_comm]: 0.000713005 [allreduce_fusion]: 0.000690814 [matmul_add_comm_reduction]: 0.000887486 [allreduce_slice_to_reducescatter]: 1.34995e-06 [virtual_shard_identity]: 0.000903936 [virtual_dataset]: 0.000873526 [get_grad_eliminate_]: 0.000870196 [virtual_output]: 0.000871326 [merge_forward]: 0.000722984 [offload_activation]: 0.000911036 [cell_reuse_recompute_pass]: 2.4999e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00154632 [merge_recompute_call_nodes]: 2.20991e-06 [before_grad]: 0.00151436 [set_forward_comm_id_for_comm_node_pass]: 0.000808005 [meta_fg_expand]: 0.0014226 [flash_sp_send_recv_attached]: 5.23997e-06 [receive_attached]: 3.21004e-06 [after_resolve]: 0.00100423 [a_after_grad]: 0.00138009 [renormalize]: 0.306051 [add_forward_monad_depend]: 1.65e-05 [auto_monad_grad]: 3.58e-06 [auto_monad_eliminator]: 0.00402999 [cse]: 0.00358433 [a_3]: 0.00595214 [Cycle 2]: 0.109038, [45] [expand_dump_flag]: 4.30003e-06 [switch_simplify]: 0.000817245 [loop_unroll]: 0.000807645 [a_1]: 0.0225021 [invalid_dout_check]: 0.000726935 [recompute_prepare]: 0.000752235 [updatestate_depend_eliminate]: 0.000629634 [updatestate_assign_eliminate]: 0.000633354 [updatestate_loads_eliminate]: 0.000616974 [parameter_eliminate]: 3.78001e-06 [a_2]: 0.0124528 [accelerated_algorithm]: 0.000957827 [shard]: 3.21993e-06 [meta_shard_fg_expand]: 0.000314872 [shard_inline]: 0.000808605 [merge_send_recv]: 0.000677805 [auto_parallel]: 0.000627104 [parallel]: 1.02001e-05 [flash_sp]: 5.2501e-06 [merge_comm]: 0.000641684 [allreduce_fusion]: 0.000620715 [matmul_add_comm_reduction]: 0.000759455 [allreduce_slice_to_reducescatter]: 1.30001e-06 [virtual_shard_identity]: 0.000803165 [virtual_dataset]: 0.000801595 [get_grad_eliminate_]: 0.000781686 [virtual_output]: 0.000766805 [merge_forward]: 0.000644324 [offload_activation]: 0.000795025 [cell_reuse_recompute_pass]: 3.45998e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00140911 [merge_recompute_call_nodes]: 2.06998e-06 [before_grad]: 0.00135461 [set_forward_comm_id_for_comm_node_pass]: 0.000718424 [meta_fg_expand]: 0.000737184 [flash_sp_send_recv_attached]: 2.60002e-06 [receive_attached]: 3.84997e-06 [after_resolve]: 0.000904086 [a_after_grad]: 0.00124057 [renormalize]: 0.0422666 [add_forward_monad_depend]: 1.11499e-05 [auto_monad_grad]: 3.14997e-06 [auto_monad_eliminator]: 0.00127133 [cse]: 0.00299972 [a_3]: 0.00553406 [Cycle 3]: 0.0639171, [45] [expand_dump_flag]: 2.13005e-06 [switch_simplify]: 0.000755575 [loop_unroll]: 0.000766355 [a_1]: 0.0216206 [invalid_dout_check]: 0.000632274 [recompute_prepare]: 0.000736205 [updatestate_depend_eliminate]: 0.000596224 [updatestate_assign_eliminate]: 0.000603964 [updatestate_loads_eliminate]: 0.000588504 [parameter_eliminate]: 3.15998e-06 [a_2]: 0.0121958 [accelerated_algorithm]: 0.000942767 [shard]: 2.71993e-06 [meta_shard_fg_expand]: 0.000258301 [shard_inline]: 0.000799736 [merge_send_recv]: 0.000643844 [auto_parallel]: 0.000617044 [parallel]: 1.08e-05 [flash_sp]: 2.03005e-06 [merge_comm]: 
0.000620145 [allreduce_fusion]: 0.000605334 [matmul_add_comm_reduction]: 0.000762275 [allreduce_slice_to_reducescatter]: 8.2003e-07 [virtual_shard_identity]: 0.000791425 [virtual_dataset]: 0.000775145 [get_grad_eliminate_]: 0.000777485 [virtual_output]: 0.000750675 [merge_forward]: 0.000611314 [offload_activation]: 0.000773775 [cell_reuse_recompute_pass]: 3.65998e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00139555 [merge_recompute_call_nodes]: 1.75997e-06 [before_grad]: 0.00133598 [set_forward_comm_id_for_comm_node_pass]: 0.000692404 [meta_fg_expand]: 0.000668064 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 3.01003e-06 [after_resolve]: 0.000899266 [a_after_grad]: 0.00122529 [renormalize]: 1.30036e-07 [add_forward_monad_depend]: 3.35998e-06 [auto_monad_grad]: 1.55007e-06 [auto_monad_eliminator]: 0.00107525 [cse]: 0.00231583 [a_3]: 0.00548512 [py_interpret_to_execute_after_opt_a]: 0.000913236 [slice_cell_reuse_recomputed_activation]: 2.99001e-06 [rewriter_after_opt_a]: 0.0248569 [convert_after_rewriter]: 0.000679835 [order_py_execute_after_rewriter]: 0.000495403 [opt_b]: 0.0246852, [1] [Cycle 1]: 0.0246759, [7] [b_1]: 0.0196602 [b_2]: 0.000769865 [updatestate_depend_eliminate]: 0.000597124 [updatestate_assign_eliminate]: 0.000596844 [updatestate_loads_eliminate]: 0.000591834 [renormalize]: 1.11002e-06 [cse]: 0.00234916 [optimize_parallel_all_gather_comm]: 0.00116216 [overlap_param_gather]: 6.13998e-06 [cconv]: 0.000348632 [loop_unroll]: 0.00167837 [opt_after_cconv]: 0.00828532, [1] [Cycle 1]: 0.00827681, [7] [c_1]: 0.00393996 [parameter_eliminate]: 2.94007e-06 [updatestate_depend_eliminate]: 0.000699185 [updatestate_assign_eliminate]: 0.000604104 [updatestate_loads_eliminate]: 0.000600184 [cse]: 0.00233132 [renormalize]: 5.59958e-07 [remove_dup_value]: 0.00442566 [tuple_transform]: 0.00575233, [1] [Cycle 1]: 0.005744, [2] [d_1]: 0.00571704 [renormalize]: 5.19911e-07 [partial_unused_args_eliminate]: 4.12995e-06 [add_cache_embedding]: 0.000722535 [add_recomputation]: 0.00418561 [cse_after_recomputation]: 0.00124629, [1] [Cycle 1]: 0.00123835, [1] [cse]: 0.00121864 [environ_conv]: 0.000377122 [swap_dp_allreduce_reducescatter]: 0.000658745 [bias_add_comm_swap]: 3.56999e-06 [label_micro_interleaved_index]: 5.92007e-06 [label_fine_grained_interleaved_index]: 2.96999e-06 [merge_cast_opt]: 1.90001e-06 [slice_recompute_activation]: 2.06009e-06 [micro_interleaved_order_control]: 2.55008e-06 [assign_add_opt]: 1.509e-05 [ForceFp32Comm]: 1.27009e-06 [remove_cast_before_assign_add]: 1.2099e-06 [full_micro_interleaved_order_control]: 3.22994e-06 [reorder_send_recv_between_fp_bp]: 2.61003e-06 [comm_op_add_attrs]: 1.34006e-06 [add_comm_op_reuse_tag]: 1.29e-06 [interleave_split_concat_branches]: 1.53005e-06 [interleave_parallel_branches]: 1.24995e-06 [overlap_opt_shard_in_pipeline]: 2.20002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.22004e-06 [control_data_broadcast_order]: 0.000980486 [grouped_pairwise_exchange_alltoall]: 1.55007e-06 [offloading_packed_experts]: 0.000220602 [overlap_recompute_and_grad_model_parallel]: 0.000214851 [overlap_grad_matmul_and_grad_allreduce]: 2.26998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.34996e-06 [overlap_grad_ring_attention]: 0.000213602 [overlap_grad_flash_sp]: 0.00125006 [begin_end_overlap_inline]: 9.59961e-07 [split_matmul_comm_elemetwise]: 2.46009e-06 [split_layernorm_comm]: 2.05997e-06 [handle_group_info]: 1.16008e-06 [symbol_engine_optimizer]: 0.00926327, [1] [Cycle 1]: 0.00925581, [6] [build]: 
0.00413584 [elim_shapecalc]: 0.000737744 [elim_not_effective]: 0.0017723 [opt_reshape]: 0.00102824 [fold_const_symbol]: 0.00147787 [renormalize]: 2.70084e-07 [detach_backward]: 3.16999e-06 [pipeline_parallel_scheduler]: 1.85997e-06 [auto_monad_reorder]: 0.00158198 [get_jit_bprop_graph]: 2.25997e-06 [rewriter_after_jit_bprop_graph]: 7.08993e-06 [opt_after_jit_grad]: 0.00266588 [distribtued_split]: 0.00140036 [validate]: 0.00120036 [backend_pass]: 2.2999e-06 [task_emit]: 0.324083 [execute]: 1.02799e-05 Sums bootstrap : 0.003365s : 0.08% type_inference : 3.037285s : 67.86% auto_monad : 0.029503s : 0.66% graph_reusing : 0.088209s : 1.97% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.007061s : 0.16% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.003018s : 0.07% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.008507s : 0.19% insert-virtual-dataset : 0.000005s : 0.00% parallel-infer-symbol-second : 0.000002s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.008612s : 0.19% optimize.rewriter_before_opt_a : 0.038892s : 0.87% optimize.opt_a.expand_dump_flag : 0.000599s : 0.01% optimize.opt_a.switch_simplify : 0.014995s : 0.34% optimize.opt_a.loop_unroll : 0.011630s : 0.26% optimize.opt_a.a_1 : 0.296922s : 6.63% optimize.opt_a.invalid_dout_check : 0.002714s : 0.06% optimize.opt_a.recompute_prepare : 0.003034s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.006352s : 0.14% optimize.opt_a.updatestate_assign_eliminate : 0.002213s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.004152s : 0.09% optimize.opt_a.parameter_eliminate : 0.000012s : 0.00% optimize.opt_a.a_2 : 0.052288s : 1.17% optimize.opt_a.accelerated_algorithm : 0.003622s : 0.08% optimize.opt_a.shard : 0.000009s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.001247s : 0.03% optimize.opt_a.shard_inline : 0.002540s : 0.06% optimize.opt_a.merge_send_recv : 0.002052s : 0.05% optimize.opt_a.auto_parallel : 0.001933s : 0.04% optimize.opt_a.parallel : 0.000035s : 0.00% optimize.opt_a.flash_sp : 0.000395s : 0.01% optimize.opt_a.merge_comm : 0.001975s : 0.04% optimize.opt_a.allreduce_fusion : 0.001917s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.002409s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.002499s : 0.06% optimize.opt_a.virtual_dataset : 0.002450s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.002429s : 0.05% optimize.opt_a.virtual_output : 0.002389s : 0.05% optimize.opt_a.merge_forward : 0.001979s : 0.04% optimize.opt_a.offload_activation : 0.002480s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000010s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.004351s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.004205s : 0.09% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.002219s : 0.05% optimize.opt_a.meta_fg_expand : 0.002828s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000011s : 0.00% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.002808s : 0.06% optimize.opt_a.a_after_grad : 0.003846s : 0.09% optimize.opt_a.renormalize : 0.348318s : 7.78% optimize.opt_a.add_forward_monad_depend : 0.000031s : 0.00% optimize.opt_a.auto_monad_grad : 0.000008s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.006377s : 0.14% optimize.opt_a.cse : 0.008900s : 0.20% optimize.opt_a.a_3 : 0.016971s : 0.38% 
optimize.py_interpret_to_execute_after_opt_a : 0.000913s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.024857s : 0.56% optimize.convert_after_rewriter : 0.000680s : 0.02% optimize.order_py_execute_after_rewriter : 0.000495s : 0.01% optimize.opt_b.b_1 : 0.019660s : 0.44% optimize.opt_b.b_2 : 0.000770s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000597s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000597s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000592s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.002349s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.001162s : 0.03% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000349s : 0.01% optimize.loop_unroll : 0.001678s : 0.04% optimize.opt_after_cconv.c_1 : 0.003940s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000699s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000604s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000600s : 0.01% optimize.opt_after_cconv.cse : 0.002331s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.004426s : 0.10% optimize.tuple_transform.d_1 : 0.005717s : 0.13% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.partial_unused_args_eliminate : 0.000004s : 0.00% optimize.add_cache_embedding : 0.000723s : 0.02% optimize.add_recomputation : 0.004186s : 0.09% optimize.cse_after_recomputation.cse : 0.001219s : 0.03% optimize.environ_conv : 0.000377s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000659s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000015s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000980s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000221s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000215s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000214s : 0.00% optimize.overlap_grad_flash_sp : 0.001250s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.004136s : 0.09% 
optimize.symbol_engine_optimizer.elim_shapecalc : 0.000738s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.001772s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.001028s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.001478s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.001582s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.002666s : 0.06% distribtued_split : 0.001400s : 0.03% validate : 0.001200s : 0.03% backend_pass : 0.000002s : 0.00% task_emit : 0.324083s : 7.24% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.080102 18799 0.65% : 0.000524s : 159: substitution.arithmetic_simplify 1.29% : 0.001036s : 206: substitution.cast_eliminate 0.06% : 0.000047s : 24: substitution.depend_value_elim 0.43% : 0.000347s : 467: substitution.elim_not_effective 0.47% : 0.000376s : 470: substitution.float_tuple_getitem_switch 0.61% : 0.000487s : 399: substitution.fold_const_symbol 0.49% : 0.000391s : 589: substitution.graph_param_transform 74.08% : 0.059342s : 1504: substitution.inline 0.87% : 0.000700s : 1545: substitution.j_node_and_user_rematch 1.03% : 0.000825s : 321: substitution.less_batch_normalization 0.27% : 0.000216s : 18: substitution.list_to_tuple_eliminator_ 0.40% : 0.000316s : 668: substitution.load_eliminater 0.68% : 0.000542s : 515: substitution.minmaximum_grad 0.17% : 0.000134s : 50: substitution.opt_reshape 0.03% : 0.000027s : 32: substitution.reduce_all_const_elim 1.05% : 0.000839s : 1545: substitution.remove_not_recompute_node 0.16% : 0.000131s : 243: substitution.replace_old_param 1.56% : 0.001253s : 443: substitution.reshape_eliminate 0.63% : 0.000503s : 244: substitution.switch_simplify 0.16% : 0.000132s : 54: substitution.transpose_eliminate 2.00% : 0.001600s : 709: substitution.tuple_list_convert_item_index_to_positive 1.03% : 0.000828s : 709: substitution.tuple_list_get_item_const_eliminator 1.36% : 0.001087s : 709: substitution.tuple_list_get_item_depend_reorder 3.35% : 0.002681s : 1355: substitution.tuple_list_get_item_eliminator 1.33% : 0.001068s : 709: substitution.tuple_list_get_set_item_eliminator 2.56% : 0.002051s : 2526: substitution.updatestate_pure_node_eliminater 3.27% : 0.002617s : 2586: substitution.updatestate_useless_node_eliminater ------[type_inference.] 3.025880 2 88.15% : 2.667283s : 1: type_inference.infer 11.85% : 0.358597s : 1: type_inference.specialize ------[replace.] 0.020499 2277 6.49% : 0.001330s : 162: replace.cast_eliminate 0.21% : 0.000044s : 4: replace.depend_value_elim 1.35% : 0.000277s : 25: replace.elim_not_effective 62.96% : 0.012906s : 1504: replace.inline 1.12% : 0.000229s : 18: replace.list_to_tuple_eliminator_ 0.63% : 0.000130s : 9: replace.opt_reshape 2.21% : 0.000453s : 43: replace.reshape_eliminate 12.90% : 0.002644s : 244: replace.switch_simplify 11.94% : 0.002447s : 264: replace.tuple_list_get_item_eliminator 0.19% : 0.000038s : 4: replace.updatestate_pure_node_eliminater ------[match.] 
0.060608 2277 1.17% : 0.000710s : 162: match.cast_eliminate 0.00% : 0.000002s : 4: match.depend_value_elim 0.05% : 0.000032s : 25: match.elim_not_effective 96.37% : 0.058407s : 1504: match.inline 0.34% : 0.000203s : 18: match.list_to_tuple_eliminator_ 0.15% : 0.000088s : 9: match.opt_reshape 0.25% : 0.000153s : 43: match.reshape_eliminate 0.66% : 0.000403s : 244: match.switch_simplify 1.00% : 0.000604s : 264: match.tuple_list_get_item_eliminator 0.01% : 0.000006s : 4: match.updatestate_pure_node_eliminater ------[predicate.] 0.069485396436 1.08% : 0.000754s : 5279: predicate.accumulaten_eliminater 0.16% : 0.000110s : 500: predicate.ad_related_special_op_eliminate 0.84% : 0.000586s : 2550: predicate.addn_check_dump 1.05% : 0.000730s : 5279: predicate.addn_zero_filter 1.03% : 0.000717s : 5279: predicate.adjust_all_reduce_mul_add 2.26% : 0.001573s : 7829: predicate.arithmetic_simplify 1.17% : 0.000811s : 5484: predicate.cast_eliminate 0.61% : 0.000423s : 1846: predicate.check_bprop_eliminate 0.84% : 0.000583s : 2550: predicate.compare_switch_simplify 0.07% : 0.000045s : 608: predicate.const_output_eliminate 0.86% : 0.000600s : 2555: predicate.depend_value_elim 1.17% : 0.000811s : 5484: predicate.dict_get_item_const_eliminator 1.23% : 0.000858s : 5484: predicate.dict_get_item_eliminator 1.12% : 0.000779s : 5484: predicate.dict_set_item_eliminator 0.29% : 0.000203s : 1108: predicate.dumpgradient_eliminate 0.06% : 0.000042s : 559: predicate.elim_not_effective 0.18% : 0.000125s : 589: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000898s : 6092: predicate.environ_add_const_eliminate 1.29% : 0.000898s : 6092: predicate.environ_get_add_eliminate 1.26% : 0.000878s : 6092: predicate.environ_get_depend_swap 2.13% : 0.001477s : 8642: predicate.environ_get_eliminate 1.32% : 0.000916s : 6092: predicate.environ_get_set_eliminate 1.50% : 0.001044s : 7274: predicate.exchange_switch_depend_value 1.94% : 0.001347s : 7274: predicate.float_depend_g_call 0.84% : 0.000582s : 2550: predicate.float_environ_get_switch 1.00% : 0.000693s : 3158: predicate.float_tuple_getitem_switch 0.06% : 0.000039s : 500: predicate.fold_const_symbol 0.64% : 0.000445s : 1881: predicate.get_grad_eliminate 0.08% : 0.000053s : 589: predicate.graph_param_transform 0.76% : 0.000529s : 2550: predicate.incorporate_call 0.75% : 0.000519s : 2550: predicate.incorporate_call_switch 5.62% : 0.003903s : 18200: predicate.inline 0.77% : 0.000534s : 1881: predicate.inline_without_move 0.21% : 0.000143s : 1881: predicate.j_node_and_user_rematch 0.69% : 0.000476s : 1881: predicate.less_batch_normalization 1.55% : 0.001075s : 6963: predicate.list_to_tuple_eliminator_ 2.65% : 0.001842s : 12261: predicate.load_eliminater 0.25% : 0.000174s : 608: predicate.loop_unroll_after_grad 3.24% : 0.002250s : 9240: predicate.loop_unroll_before_grad 1.45% : 0.001005s : 6700: predicate.make_slice_get_slice_eliminator 0.84% : 0.000584s : 2550: predicate.merge_addn 0.57% : 0.000399s : 1846: predicate.micro_step_allgather_replace 0.58% : 0.000401s : 1846: predicate.mini_step_allgather_replace 1.19% : 0.000825s : 5279: predicate.minmaximum_grad 0.13% : 0.000093s : 500: predicate.mutable_eliminate 0.15% : 0.000108s : 518: predicate.opt_reshape 0.21% : 0.000145s : 608: predicate.parallel_virtual_node 3.19% : 0.002219s : 7274: predicate.partial_defer_inline 1.42% : 0.000986s : 6374: predicate.partial_eliminate 1.12% : 0.000777s : 5279: predicate.print_const_string_wrapper 0.87% : 0.000601s : 2546: predicate.reduce_all_const_elim 1.74% : 0.001209s : 5279: 
predicate.reduce_eliminate 2.57% : 0.001785s : 12261: predicate.redundant_stop_gradient_eliminater 0.19% : 0.000134s : 1881: predicate.remove_not_recompute_node 0.99% : 0.000689s : 7612: predicate.replace_applicator 0.20% : 0.000138s : 1881: predicate.replace_old_param 0.07% : 0.000046s : 608: predicate.reset_defer_inline 1.17% : 0.000811s : 5322: predicate.reshape_eliminate 0.60% : 0.000415s : 1846: predicate.row_tensor_add_zeros_like 0.20% : 0.000142s : 608: predicate.row_tensor_eliminate 0.69% : 0.000479s : 1846: predicate.same_eliminate 0.28% : 0.000194s : 2423: predicate.set_cell_output_no_recompute 0.64% : 0.000445s : 1881: predicate.shard_identity_eliminate 0.29% : 0.000199s : 1108: predicate.special_op_eliminate 0.94% : 0.000654s : 2550: predicate.specialize_transform 0.60% : 0.000414s : 1846: predicate.split_environ_get_set_with_tuple_value 0.43% : 0.000297s : 1881: predicate.stack_unstack_eliminate 0.12% : 0.000085s : 608: predicate.switch_call_monad_eliminater 1.62% : 0.001124s : 7274: predicate.switch_defer_inline 2.21% : 0.001538s : 9120: predicate.switch_layer_defer_inline 5.92% : 0.004114s : 19552: predicate.switch_simplify 1.07% : 0.000746s : 5279: predicate.tile_eliminate 1.06% : 0.000739s : 5279: predicate.transpose_eliminate 1.52% : 0.001053s : 6681: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.001095s : 6681: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.001020s : 6681: predicate.tuple_list_get_item_depend_reorder 2.62% : 0.001823s : 9495: predicate.tuple_list_get_item_eliminator 1.55% : 0.001075s : 6681: predicate.tuple_list_get_set_item_eliminator 2.56% : 0.001779s : 9231: predicate.tuple_list_set_item_eliminator 1.47% : 0.001024s : 6945: predicate.tuple_to_list_eliminator_ 2.79% : 0.001936s : 12265: predicate.updatestate_pure_node_eliminater 3.59% : 0.002495s : 14815: predicate.updatestate_useless_node_eliminater 0.19% : 0.000132s : 608: predicate.value_based_eliminate 0.67% : 0.000465s : 1881: predicate.virtual_dataset_eliminate 0.64% : 0.000444s : 1881: predicate.virtual_output_eliminate 0.19% : 0.000134s : 608: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.218882 2327 66.55% : 0.145672s : 871: func_graph_cloner_run.FuncGraphClonerGraph 0.49% : 0.001076s : 17: func_graph_cloner_run.FuncGraphClonerNode 32.96% : 0.072134s : 1439: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 
6.312529 253 0.00% : 0.000005s : 1: ForceFp32Comm 0.45% : 0.028571s : 1: add_attr 0.45% : 0.028541s : 1: add_attr_with_inline 0.01% : 0.000735s : 1: add_cache_embedding 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.07% : 0.004202s : 1: add_recomputation 0.00% : 0.000019s : 1: assign_add_opt 0.47% : 0.029545s : 1: auto_monad 0.03% : 0.001608s : 1: auto_monad_reorder 0.00% : 0.000016s : 1: backend_pass 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.05% : 0.003416s : 1: bootstrap 0.01% : 0.000358s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.02% : 0.000992s : 1: control_data_broadcast_order 0.01% : 0.000693s : 1: convert_after_rewriter 0.02% : 0.001252s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.00% : 0.000009s : 1: detach_backward 0.02% : 0.001420s : 1: distribtued_split 0.01% : 0.000389s : 1: environ_conv 0.00% : 0.000018s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 1.40% : 0.088267s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000011s : 1: inline 0.00% : 0.000012s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.03% : 0.001691s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.00% : 0.000227s : 1: offloading_packed_experts 0.01% : 0.000851s : 1: opt.transform.loop_unroll_optimizer 6.76% : 0.426525s : 134: opt.transform.opt_a 0.06% : 0.003937s : 1: opt.transform.opt_after_cconv 0.03% : 0.001980s : 2: opt.transform.opt_after_jit_grad 0.32% : 0.020334s : 28: opt.transform.opt_b 0.09% : 0.005713s : 1: opt.transform.opt_trans_graph 0.08% : 0.005004s : 4: opt.transform.symbol_engine_opt 13.15% : 0.830145s : 1: opt_a 0.13% : 0.008290s : 1: opt_after_cconv 0.04% : 0.002683s : 1: opt_after_jit_grad 0.39% : 0.024690s : 1: opt_b 15.38% : 0.970825s : 1: optimize 0.02% : 0.001176s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000506s : 1: order_py_execute_after_rewriter 0.02% : 0.001260s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000219s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000011s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000220s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.14% : 0.008529s : 1: pre_auto_parallel 0.14% : 0.008632s : 1: py_interpret_to_execute 0.01% : 0.000925s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.07% : 0.004442s : 1: remove_dup_value 2.64% : 0.166865s : 2: renormalize.infer 2.87% : 0.181374s : 2: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.39% : 0.024868s : 1: rewriter_after_opt_a 0.62% : 0.038916s : 1: rewriter_before_opt_a 0.00% : 
0.000007s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000669s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.009268s : 1: symbol_engine_optimizer 5.13% : 0.324109s : 1: task_emit 0.09% : 0.005757s : 1: tuple_transform 48.12% : 3.037344s : 1: type_inference 0.07% : 0.004254s : 1: validate [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.314.601 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.314.740 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.314.881 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.316.021 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.316.562 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.316.665 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.316.845 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.317.337 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.318.491 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.319.125 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806946,fff151008060,python):2025-07-24-11:04:11.319.585 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:04:11,682 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 38.018627643585205 s; generated tokens: 220 
tokens; generate speed: 5.786637068082601 tokens/s 2025-07-24 11:04:11,683 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0036611557006835938 s; prefill predict time: 32.50302577018738 s; prefill post time: 0.012887954711914062 s; decode prepare time: 0.0009113682640923394 s; decode predict time: 0.005116939544677734 s; decode post time: 0.0005336249316180194 s 2025-07-24 11:04:11,685 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. large_models/parallel_deepseek_r1_bf16_predict_mp2/worker_0.log0000644000175100017500000030625415040321253025320 0ustar jenkinsHwHiAiUser/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) 2025-07-24 11:02:48,695 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config runner_config is empty. 2025-07-24 11:02:48,729 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on [WARNING] ME(3806931:281472833470256,MainProcess):2025-07-24-11:02:48.730.340 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(3806931:281472833470256,MainProcess):2025-07-24-11:02:48.731.147 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead. [WARNING] ME(3806931:281472833470256,MainProcess):2025-07-24-11:02:48.731.613 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead. [WARNING] DISTRIBUTED(3806931,fffef0a67060,python):2025-07-24-11:02:48.733.806 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53602 to 127.0.0.1:8230 is successfully created. 
System errno: Success [WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:48.733.799 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 20 source: 127.0.0.1:53602, destination: 127.0.0.1:8230 [WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:48.733.949 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:53604, destination: 127.0.0.1:8230 [WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:48.733.981 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8230 to be connected...Retry number: 1 [WARNING] DISTRIBUTED(3806931,fffef1a69060,python):2025-07-24-11:02:48.733.996 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:53604 to 127.0.0.1:8230 is successfully created. System errno: Success [WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:49.234.493 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200). [WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:49.734.572 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200). [WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:50.234.664 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:50.234.687 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 0 rank id: 0 [MS_ALLOC_CONF]Runtime config: enable_vmm:False [WARNING] DEVICE(3806931,ffff8040bf30,python):2025-07-24-11:02:50.453.397 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:155] Initialize] Reserved memory size for other components(2101346304) is less than recommend size(4068090624), It may lead to Out Of Memory in HCCL or other components, Please double check context key 'variable_memory_max_size'/'max_device_memory' [WARNING] DEVICE(3806931,ffff8040bf30,python):2025-07-24-11:02:51.751.261 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:84] Initialize] Loading LCCL because env MS_ENABLE_LCCL is set to on. Pay attention that LCCL only supports communication group within single node in KernelByKernel for now. 
[WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:51.754.828 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:51.755.017 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(3806931,fffe988a0060,python):2025-07-24-11:02:51.755.183 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:8230, node_rank:2130706433, total_rank_size:2, local_rank_size2 [WARNING] HCCL_ADPT(3806931,fffe988a0060,python):2025-07-24-11:02:51.755.246 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(3806931,fffe988a0060,python):2025-07-24-11:02:51.755.267 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:632] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(3806931,fffe988a0060,python):2025-07-24-11:02:51.755.282 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group 2025-07-24 11:02:51,756 - mindformers./output/log[mindformers/tools/utils.py:185] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' 2025-07-24 11:02:51,758 - mindformers./output/log[mindformers/core/context/build_context.py:383] - INFO - cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191] 2025-07-24 11:02:51,759 - mindformers./output/log[mindformers/core/context/build_context.py:387] - WARNING - CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 
148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores 2025-07-24 11:02:51,759 - mindformers./output/log[mindformers/core/context/build_context.py:395] - INFO - cpu_affinity, rank_id: 0, device_num: 2 [WARNING] DISTRIBUTED(3806931,fffe988a0060,python):2025-07-24-11:02:51.759.601 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group 2025-07-24 11:02:51,759 - mindformers./output/log[mindformers/core/context/build_context.py:366] - WARNING - custom bind policy affinity_cpu_list must be dict, but got None. 2025-07-24 11:02:51,759 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 256, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 8, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV2', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 1, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': 4, 'n_group': 8, 'first_k_dense_replace': 1, 'moe_intermediate_size': 2048, 'routed_scaling_factor': 2.5, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} 2025-07-24 11:02:51,760 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 1, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} [WARNING] DEVICE(3806931,fffe4a7fc060,python):2025-07-24-11:02:52.170.685 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 2025-07-24 11:02:52,397 - mindformers./output/log[mindformers/version_control.py:119] - INFO - The Lazy Inline compilation acceleration feature does not support single-card mode.This feature is disabled by default. ENABLE_LAZY_INLINE=1 does not take effect. 2025-07-24 11:02:52,448 - mindformers./output/log[mindformers/parallel_core/inference/parallel_state.py:358] - INFO - expert_model_parallel_size(1) is not equal to world_size(2), so we will use 2 as the MOE_tensor_parallel_size. 
[WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:52.450.185 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: tp-0-1 [const vector]{0, 1}, async: 0, submit_now: 1 [WARNING] DEVICE(3806931,fffe4a7fc060,python):2025-07-24-11:02:52.515.950 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group [WARNING] DISTRIBUTED(3806931,fffe4a7fc060,python):2025-07-24-11:02:52.516.137 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: hccl_world_group [WARNING] DISTRIBUTED(3806931,fffe4a7fc060,python):2025-07-24-11:02:52.516.163 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: hccl_world_group end. [WARNING] DISTRIBUTED(3806931,fffe988a0060,python):2025-07-24-11:02:52.516.228 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: hccl_world_group [WARNING] DISTRIBUTED(3806931,fffe988a0060,python):2025-07-24-11:02:52.516.841 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: tp-0-1 [WARNING] DEVICE(3806931,fffe48ff9060,python):2025-07-24-11:02:52.726.675 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806931,fffe48ff9060,python):2025-07-24-11:02:52.779.218 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1 [WARNING] DISTRIBUTED(3806931,fffe48ff9060,python):2025-07-24-11:02:52.779.398 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: tp-0-1 [WARNING] DISTRIBUTED(3806931,fffe48ff9060,python):2025-07-24-11:02:52.779.425 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: tp-0-1 end. [WARNING] DISTRIBUTED(3806931,fffe988a0060,python):2025-07-24-11:02:52.779.491 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: tp-0-1 [WARNING] ME(3806931:281472833470256,MainProcess):2025-07-24-11:02:53.199.790 [mindspore/ops/primitive.py:220] The in_strategy/in_layout of the operator in your network will not take effect in stand_alone mode. This means the the shard function called in the network is ignored. 
If you want to enable it, please use semi auto or auto parallel mode by context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL or context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL) 2025-07-24 11:02:53,207 - mindformers./output/log[mindformers/models/utils.py:205] - INFO - num_layers per stage: [[4]] 2025-07-24 11:02:53,207 - mindformers./output/log[mindformers/models/utils.py:206] - INFO - Accumulated num_layers per stage: [[4]] 2025-07-24 11:02:53,208 - mindformers./output/log[mindformers/models/utils.py:208] - INFO - Pipeline id list with start_stage: [0, 0, 0, 0] 2025-07-24 11:02:53,208 - mindformers./output/log[mindformers/models/utils.py:209] - INFO - Interleave id list: [0, 0, 0, 0] 2025-07-24 11:02:53,208 - mindformers./output/log[mindformers/models/utils.py:227] - INFO - Formative layer_recompute: [[0]] 2025-07-24 11:02:53,208 - mindformers./output/log[mindformers/models/utils.py:229] - INFO - The configuration of select_recompute_exclude and select_comm_recompute_exclude have the highest priority. 2025-07-24 11:02:53,208 - mindformers./output/log[mindformers/models/utils.py:235] - INFO - Formative select_recompute: {'feed_forward\\.mul': [[0]], 'feed_forward\\.w1\\.activation\\.silu': [[0]]} 2025-07-24 11:02:53,209 - mindformers./output/log[mindformers/models/utils.py:236] - INFO - Formative select_comm_recompute: {'.*\\.norm': [[0]]} 2025-07-24 11:02:53,209 - mindformers./output/log[mindformers/models/utils.py:237] - INFO - Formative select_recompute_exclude: {} 2025-07-24 11:02:53,209 - mindformers./output/log[mindformers/models/utils.py:238] - INFO - Formative select_comm_recompute_exclude: {} 2025-07-24 11:02:53,228 - mindformers./output/log[mindformers/research/deepseek3/deepseek3_model_infer.py:927] - WARNING - first_k_dense_replace is provided in MoEConfig, a normal dense FFN will be used in this block. 2025-07-24 11:02:53,238 - mindformers./output/log[mindformers/version_control.py:76] - INFO - Predict enable lazy inline. [WARNING] DISTRIBUTED(3806931,ffff8040bf30,python):2025-07-24-11:02:53.262.107 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: moe_tp-0-1 [const vector]{0, 1}, async: 0, submit_now: 1 [WARNING] DISTRIBUTED(3806931,fffe988a0060,python):2025-07-24-11:02:53.263.135 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: moe_tp-0-1 [WARNING] DEVICE(3806931,fffe2f0bf060,python):2025-07-24-11:02:53.473.755 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for moe_tp-0-1, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806931,fffe2f0bf060,python):2025-07-24-11:02:53.531.160 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for moe_tp-0-1 [WARNING] DISTRIBUTED(3806931,fffe2f0bf060,python):2025-07-24-11:02:53.531.348 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: moe_tp-0-1 [WARNING] DISTRIBUTED(3806931,fffe2f0bf060,python):2025-07-24-11:02:53.531.377 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: moe_tp-0-1 end. 
[WARNING] DISTRIBUTED(3806931,fffe988a0060,python):2025-07-24-11:02:53.531.465 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: moe_tp-0-1 2025-07-24 11:02:53,561 - mindformers./output/log[mindformers/version_control.py:76] - INFO - Predict enable lazy inline. 2025-07-24 11:02:53,611 - mindformers./output/log[mindformers/version_control.py:76] - INFO - Predict enable lazy inline. 2025-07-24 11:02:53,665 - mindformers./output/log[mindformers/version_control.py:76] - INFO - Predict enable lazy inline. 2025-07-24 11:02:53,671 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-24 11:02:53,671 - mindformers./output/log[mindformers/research/deepseek3/deepseek3_model_infer.py:1247] - INFO - Predict run mode:True 2025-07-24 11:02:53,671 - mindformers./output/log[/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:274] - INFO - ----------------Transform and load checkpoint---------------- Weight loading: 0%| | 0/4 [00:00 type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) 2025-07-24 11:02:48,609 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config runner_config is empty. 2025-07-24 11:02:48,642 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on [WARNING] ME(3806923:281473850801968,MainProcess):2025-07-24-11:02:48.643.248 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(3806923:281473850801968,MainProcess):2025-07-24-11:02:48.643.421 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead. [WARNING] ME(3806923:281473850801968,MainProcess):2025-07-24-11:02:48.643.514 [mindspore/context.py:1346] For 'context.set_context', when set the argument 'max_device_memory', the argument 'device_target' only supports devices in '['Ascend', 'GPU']', but got 'CPU', ignore it. [WARNING] ME(3806923:281473850801968,MainProcess):2025-07-24-11:02:48.643.588 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:02:48.644.439 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200). 
[WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:02:49.144.545 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200). [WARNING] DISTRIBUTED(3806923,ffff2e4d9060,python):2025-07-24-11:02:49.234.288 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 0(role: MS_WORKER), rank id: 0, device id: 0, hostname: devserver-dfb8-1, ip: 127.0.0.1 is registered successfully. Currently registered node number: 1, expected node number: 2 [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:02:49.644.622 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/1200). [WARNING] DISTRIBUTED(3806923,ffff2e4d9060,python):2025-07-24-11:02:49.650.097 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:768] ReassignNodeRank] Rank ids are already set by numeric node ids. No need to reassign them. [WARNING] DISTRIBUTED(3806923,ffff2e4d9060,python):2025-07-24-11:02:49.650.140 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 1(role: MS_WORKER), rank id: 1, device id: 1, hostname: devserver-dfb8-1, ip: 127.0.0.1 is registered successfully. Currently registered node number: 2, expected node number: 2 [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:02:50.144.714 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:02:50.144.767 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:02:50.144.798 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:02:55.144.933 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:02:55.145.002 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:00.145.113 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:00.145.174 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:05.145.289 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:05.145.352 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... 
[WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:10.145.455 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:10.145.533 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:15.145.674 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:15.145.745 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:20.145.865 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:20.145.928 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:25.146.052 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:25.146.107 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:30.146.235 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:30.146.332 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:35.146.464 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:35.146.517 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:40.146.641 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:40.146.687 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... 
[WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:45.146.823 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:45.146.891 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:50.147.034 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:50.147.104 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:55.147.254 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:03:55.147.333 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:04:00.147.471 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:04:00.147.524 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:04:05.147.653 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:04:05.147.706 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:04:10.147.840 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 2 alive nodes. [WARNING] DISTRIBUTED(3806923,ffffbce3ff30,python):2025-07-24-11:04:10.147.896 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806923,ffff2e4d9060,python):2025-07-24-11:04:13.229.451 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:323] ProcessUnregister] Node 1 has unregistered. 
[WARNING] DISTRIBUTED(3806923,ffff2e4d9060,python):2025-07-24-11:04:13.229.913 [mindspore/ccsrc/distributed/rpc/tcp/connection.cc:79] SocketEventHandler] Event value fd: 20, events: 8193, state: 4, errcode: 11, errno: 11 Resource temporarily unavailable, remote peer: 127.0.0.1:53606, type:1, remote: 1, count: 1, this peer: 127.0.0.1:8230, please check remote peer address: 127.0.0.1:53606 in worker log to find out which worker disconnected. [WARNING] DISTRIBUTED(3806923,ffff2e4d9060,python):2025-07-24-11:04:13.324.948 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:323] ProcessUnregister] Node 0 has unregistered. large_models/deepseek/0000755000175100017500000000000015040315702015471 5ustar jenkinsHwHiAiUserlarge_models/deepseek/configs/0000755000175100017500000000000015040315702017121 5ustar jenkinsHwHiAiUserlarge_models/deepseek/configs/ci_predict_deepseek3_671b.yaml0000644000175100017500000000563115040315702024606 0ustar jenkinsHwHiAiUserseed: 0 output_dir: './output' # path to save checkpoint/strategy run_mode: 'predict' use_parallel: True load_checkpoint: "/path/to/deepseekv3/model.safetensors" load_ckpt_format: "safetensors" auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model # trainer config trainer: type: CausalLanguageModelingTrainer model_name: 'DeepSeekV3' # default parallel of device num = 32 for Atlas 800T A2 parallel_config: model_parallel: 32 pipeline_stage: 1 expert_parallel: 1 vocab_emb_dp: False # mindspore context init config context: mode: 0 # 0--Graph Mode; 1--Pynative Mode max_device_memory: "59GB" device_id: 0 affinity_cpu_list: None # parallel context config parallel: parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend full_batch: False strategy_ckpt_save_file: "./ckpt_strategy.ckpt" # model config model: model_config: type: DeepseekV3Config auto_register: deepseek3_config.DeepseekV3Config batch_size: 1 # add for incre predict seq_length: 4096 hidden_size: 7168 num_layers: 4 num_heads: 128 max_position_embeddings: 163840 intermediate_size: 18432 kv_lora_rank: 512 q_lora_rank: 1536 qk_rope_head_dim: 64 v_head_dim: 128 qk_nope_head_dim: 128 vocab_size: 129280 multiple_of: 256 rms_norm_eps: 1.0e-6 bos_token_id: 0 eos_token_id: 1 pad_token_id: 1 ignore_token_id: -100 compute_dtype: "bfloat16" layernorm_compute_type: "bfloat16" softmax_compute_type: "bfloat16" rotary_dtype: "bfloat16" router_dense_type: "bfloat16" param_init_type: "bfloat16" scaling_factor: beta_fast: 32.0 beta_slow: 1.0 factor: 40.0 mscale: 1.0 mscale_all_dim: 1.0 original_max_position_embeddings: 4096 use_past: True extend_method: "YARN" use_flash_attention: True block_size: 16 num_blocks: 512 offset: 0 checkpoint_name_or_path: "" repetition_penalty: 1 max_decode_length: 1024 top_k: 1 top_p: 1 theta: 10000.0 do_sample: False is_dynamic: True qkv_concat: False ffn_concat: False auto_map: AutoConfig: deepseek3_config.DeepseekV3Config AutoModel: deepseek3.DeepseekV3ForCausalLM arch: type: DeepseekV3ForCausalLM auto_register: deepseek3.DeepseekV3ForCausalLM moe_config: expert_num: 256 num_experts_chosen: 8 routing_policy: "TopkRouterV2" shared_expert_num: 1 routed_scaling_factor: 2.5 first_k_dense_replace: 1 moe_intermediate_size: 2048 topk_group: 4 n_group: 8 processor: return_tensors: ms tokenizer: unk_token: '' bos_token: '<|begin_of_sentence|>' eos_token: '<|end_of_sentence|>' pad_token: '<|end_of_sentence|>' type: LlamaTokenizerFast vocab_file: '/path/to/deepseekv3/tokenizer.json' tokenizer_file: 
'/path/to/deepseekv3/tokenizer.json' type: LlamaProcessor large_models/__init__.py0000644000175100017500000000000015040315702016003 0ustar jenkinsHwHiAiUserlarge_models/deepseekv3_weight_processor.py0000644000175100017500000006341115040315702021767 0ustar jenkinsHwHiAiUser# Copyright 2025 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """ transform huggingface model to mindspore safetensor. """ import os import json import gc import numpy as np from tqdm import tqdm import mindspore as ms from mindspore.communication.management import get_rank from weight_processor import BaseWeightProcessor def convert_np_to_ms_dtype(value): """convert_np_to_ms_dtype""" if value.dtype == np.int8: value_dtype = ms.int8 elif value.dtype == np.int32: value_dtype = ms.int32 elif value.dtype == np.int64: value_dtype = ms.int64 elif value.dtype == np.float64: value_dtype = ms.float64 elif value.dtype == np.float32: value_dtype = ms.float32 else: value_dtype = ms.bfloat16 return value_dtype class DeepseekV3WeightProcessor(BaseWeightProcessor): r""" Provide DeepseekV3/R1 Model weight load and shards. Args: config (DeepseekV3/R1Config): The config of DeepseekV3/R1 model. network (InferenceDeepseekV3ForCausalLM): The network of DeepseekV3/R1. 
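
    A minimal usage sketch (the construction of `config` and `network` is
    assumed from the surrounding harness and not shown here):

    Examples:
        >>> processor = DeepseekV3WeightProcessor(config, network, is_quant=False)
        >>> processor.load_safetensors_shard("/path/to/deepseekv3")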
""" def __init__(self, config, network, is_quant): super().__init__(config, network, is_quant) self.num_layers = self.config.model.model_config.num_layers self.expert_num = self.config.moe_config.expert_num self.num_router_experts = self.config.moe_config.expert_num if self.config.moe_config.expert_num else 1 def infer_trans_rope_weight(self, weight, qk_rope_head_dim): """process rope router weight""" w1 = weight[..., -qk_rope_head_dim::2, :] w2 = weight[..., -qk_rope_head_dim + 1::2, :] weight[..., -qk_rope_head_dim:, :] = np.concatenate([w1, w2], axis=-2) return weight def convert_weight_name(self, weight_name: str): """replace weight name""" weight_name = weight_name.replace('embed_tokens.weight', 'tok_embeddings.embedding_weight') weight_name = weight_name.replace('.self_attn.q_a_proj.', '.attention.q2l_proj.') weight_name = weight_name.replace('.self_attn.q_a_layernorm.', '.attention.lq_norm.') weight_name = weight_name.replace('.self_attn.q_b_proj.', '.attention.l2q_proj.') weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.', '.attention.kv2l.') weight_name = weight_name.replace('.self_attn.kv_a_layernorm.', '.attention.lkv_norm.') weight_name = weight_name.replace('.self_attn.kv_b_proj.', '.attention.lkv2kv.') weight_name = weight_name.replace('.self_attn.o_proj.', '.attention.wo.') weight_name = weight_name.replace('mlp.gate_proj.', 'feed_forward.w1.') weight_name = weight_name.replace('mlp.down_proj.', 'feed_forward.w2.') weight_name = weight_name.replace('mlp.up_proj.', 'feed_forward.w3.') weight_name = weight_name.replace('mlp.experts.', 'feed_forward.routed_experts.ffn.') weight_name = weight_name.replace('mlp.shared_experts.gate_proj.', 'feed_forward.shared_experts.w1.') weight_name = weight_name.replace('mlp.shared_experts.down_proj.', 'feed_forward.shared_experts.w2.') weight_name = weight_name.replace('mlp.shared_experts.up_proj.', 'feed_forward.shared_experts.w3.') weight_name = weight_name.replace('mlp.gate.weight', 'feed_forward.routed_experts.router.dense.weight') weight_name = weight_name.replace('mlp.gate.e_score_correction_bias', 'feed_forward.routed_experts.router.e_score_correction_bias') weight_name = weight_name.replace('.input_layernorm.', '.attention_norm.') weight_name = weight_name.replace('.post_attention_layernorm.', '.ffn_norm.') weight_name = weight_name.replace('model.norm.weight', 'model.norm_out.weight') return weight_name def infer_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """process moe router expert weight""" ffn_concat = self.config.model.model_config.ffn_concat # router expert dense router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" router_dense_ms_name = self.convert_weight_name(router_dense_hf_name) router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[router_dense_ms_name] = ms.Parameter( ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), name=router_dense_ms_name, requires_grad=False) # e_score_correction_bias e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" e_score_correction_bias_ms_name = self.convert_weight_name(e_score_correction_bias_hf_name) e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( ms.from_numpy(e_score_correction_bias_ms_param).astype(ms.float32), name=e_score_correction_bias_ms_name, 
requires_grad=False) w1_list = [] w2_list = [] w3_list = [] w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1.weight" w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2.weight" w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3.weight" for index in range(0, self.num_router_experts): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, split_axis=0) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) w3_list.append(w3_ms_param) w1_ms_stack_param = np.stack(w1_list, axis=0) w2_ms_stack_param = np.stack(w2_list, axis=0) w3_ms_stack_param = np.stack(w3_list, axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden.weight" w_gate_hidden_np = np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=1) w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).permute(0, 2, 1).astype(dtype=ms.bfloat16) self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, requires_grad=False) else: w1_ms_stack_param = ms.from_numpy(w1_ms_stack_param).permute(0, 2, 1).astype(ms.bfloat16) self.parameter_dict[w1_ms_name] = ms.Parameter(w1_ms_stack_param, name=w1_ms_name, requires_grad=False) w3_ms_stack_param = ms.from_numpy(w3_ms_stack_param).permute(0, 2, 1).astype(ms.bfloat16) self.parameter_dict[w3_ms_name] = ms.Parameter(w3_ms_stack_param, name=w3_ms_name, requires_grad=False) w2_ms_stack_param = ms.from_numpy(w2_ms_stack_param).permute(0, 2, 1).astype(ms.bfloat16) self.parameter_dict[w2_ms_name] = ms.Parameter(w2_ms_stack_param, name=w2_ms_name, requires_grad=False) def get_moe_shared_expert_weight(self, w1_hf_name, w2_hf_name, w3_hf_name, src_hf_dir, hf_weight_map): w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, split_axis=0) w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, split_axis=1) w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, split_axis=0) return w1_ms_param, w2_ms_param, w3_ms_param def infer_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process moe shared expert ffn weight""" ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) w2_ms_name = self.convert_weight_name(w2_hf_name) w3_ms_name = self.convert_weight_name(w3_hf_name) w1_ms_param, w2_ms_param, w3_ms_param = self.get_moe_shared_expert_weight(w1_hf_name, w2_hf_name, w3_hf_name, src_hf_dir, hf_weight_map) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) 
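            # ffn_concat stacks the shared-expert gate (w1/gate_proj) and up
            # (w3/up_proj) weights along the output axis, so the gated
            # feed-forward can be served by a single fused w_gate_hidden matmul.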
w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(ms.bfloat16) self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, requires_grad=False) else: self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.bfloat16), name=w1_ms_name, requires_grad=False) self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.bfloat16), name=w3_ms_name, requires_grad=False) self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.bfloat16), name=w2_ms_name, requires_grad=False) def infer_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process dense ffn weight""" ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(ms.bfloat16) self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, requires_grad=False) else: self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.bfloat16), name=w1_ms_name, requires_grad=False) self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.bfloat16), name=w3_ms_name, requires_grad=False) self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.bfloat16), name=w2_ms_name, requires_grad=False) def infer_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process attention weight""" num_heads = self.config.model.model_config.num_heads kv_lora_rank = self.config.model.model_config.kv_lora_rank qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim v_head_dim = self.config.model.model_config.v_head_dim qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim rope_dim = qk_rope_head_dim + qk_nope_head_dim kv_head_dim = kv_lora_rank + qk_rope_head_dim qkv_concat = self.config.model.model_config.qkv_concat # q2l_proj q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" q2l_proj_ms_name = self.convert_weight_name(q2l_proj_hf_name) q_a_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) # kv2l kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" kv2l_ms_name = self.convert_weight_name(kv2l_hf_name) kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) if qkv_concat: wqkv2l_weight = np.concatenate((q_a_proj_ms_param, kv2l_ms_param), 0) wqkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l.weight" 
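            # qkv_concat stitches the low-rank q projection (q_a_proj) and the
            # compressed kv/rope projection (kv_a_proj_with_mqa) into one
            # qkv2l weight along axis 0, matching the fused linear layer the
            # inference network expects.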
self.parameter_dict[wqkv2l_weight_name] = ms.Parameter(ms.from_numpy(wqkv2l_weight).astype(ms.bfloat16), name=wqkv2l_weight_name, requires_grad=False) else: self.parameter_dict[q2l_proj_ms_name] = ms.Parameter(ms.from_numpy(q_a_proj_ms_param).astype(ms.bfloat16), name=q2l_proj_ms_name, requires_grad=False) self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.bfloat16), name=kv2l_ms_name, requires_grad=False) # lq_norm lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" lq_norm_ms_name = self.convert_weight_name(lq_norm_hf_name) lq_norm_ms_param, _ = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.from_numpy(lq_norm_ms_param).astype(ms.bfloat16), name=lq_norm_ms_name, requires_grad=False) # l2q_proj l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" l2q_proj_ms_name = self.convert_weight_name(l2q_proj_hf_name) l2q_proj_ms_param, _ = self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads, rope_dim, -1) l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) l2q_proj_ms_param = self.split_weight_by_rank(l2q_proj_ms_param, split_axis=0) self.parameter_dict[l2q_proj_ms_name] = ms.Parameter( ms.from_numpy(l2q_proj_ms_param).astype(ms.bfloat16), name=l2q_proj_ms_name, requires_grad=False) # lkv_norm lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" lkv_norm_ms_name = self.convert_weight_name(lkv_norm_hf_name) lkv_norm_ms_param, _ = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lkv_norm_ms_name] = ms.Parameter( ms.from_numpy(lkv_norm_ms_param).astype(ms.bfloat16), name=lkv_norm_ms_name, requires_grad=False) # lkv2kv lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" lkv2kv_ms_name = self.convert_weight_name(lkv2kv_hf_name) lkv2kv_ms_param, _ = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) lkv2kv_head = qk_nope_head_dim + v_head_dim lkv2kv_ms_param = lkv2kv_ms_param.reshape(num_heads, lkv2kv_head, -1) value_k_nope, value_v = lkv2kv_ms_param[:, :qk_nope_head_dim, :], lkv2kv_ms_param[:, qk_nope_head_dim:, :] # value_k_nope value_k_nope = value_k_nope.reshape(-1, value_k_nope.shape[-1]) value_k_nope = self.split_weight_by_rank(value_k_nope, split_axis=0) name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_k_nope.") self.parameter_dict[name_k_nope] = ms.Parameter(ms.from_numpy(value_k_nope).astype(ms.bfloat16), name=name_k_nope, requires_grad=False) # value_v value_v = value_v.reshape(-1, value_v.shape[-1]) value_v = self.split_weight_by_rank(value_v, split_axis=0) name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_v.") self.parameter_dict[name_v] = ms.Parameter(ms.from_numpy(value_v).astype(ms.bfloat16), name=name_v, requires_grad=False) # wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.convert_weight_name(wo_hf_name) wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.bfloat16), name=wo_ms_name, requires_grad=False) def infer_process_norm_weight(self, src_hf_dir, layer_id, 
hf_weight_map): """infer process attention weight""" # attention_norm attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) attention_norm_ms_param, _ = self.get_safetensor_from_file(attention_norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[attention_norm_ms_name] = ms.Parameter( ms.from_numpy(attention_norm_ms_param).astype(ms.bfloat16), name=attention_norm_ms_name, requires_grad=False) # ffn_norm ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) ffn_norm_ms_param, _ = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[ffn_norm_ms_name] = ms.Parameter( ms.from_numpy(ffn_norm_ms_param).astype(ms.bfloat16), name=ffn_norm_ms_name, requires_grad=False) def infer_convert_outer_weight(self, src_hf_dir, hf_weight_map): """convert weight not in model""" embed_tokens_hf_name = "model.embed_tokens.weight" embed_tokens_ms_name = self.convert_weight_name(embed_tokens_hf_name) np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), name=embed_tokens_ms_name, requires_grad=False) norm_hf_name = "model.norm.weight" norm_ms_name = self.convert_weight_name(norm_hf_name) np_data, _ = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[norm_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), name=norm_ms_name, requires_grad=False) lm_head_hf_name = "lm_head.weight" lm_head_ms_name = self.convert_weight_name(lm_head_hf_name) if not self.config.parallel_config.vocab_emb_dp: np_data, _ = self.get_safetensor_from_file_split_tp_group(lm_head_hf_name, src_hf_dir, hf_weight_map, split_axis=0) else: np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), name=lm_head_ms_name, requires_grad=False) def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer convert layer weight""" if layer_id >= 3: self.infer_process_moe_routed_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_moe_shared_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) else: self.infer_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) def load_safetensors_shard(self, src_hf_dir, is_mtp_model=False): """deepseek load safetensors and shard """ rank_id = get_rank() param_json_path = "" for file in os.listdir(src_hf_dir): if file.endswith('index.json'): # mtp model do not support quantization, needs to load bf16 weight. 
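                # Select the index file matching the quantization mode: a
                # 'quant*index.json' for quantized weights, the plain bf16
                # 'index.json' otherwise (MTP layers always take the bf16 path).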
if ('quant' in file and self.is_quant) or \ ('quant' not in file and (not self.is_quant or is_mtp_model)): param_json_path = os.path.join(src_hf_dir, file) with open(param_json_path, "r") as fp: hf_weight_map = json.load(fp)['weight_map'] break elif file.endswith('_name_map.json'): param_json_path = os.path.join(src_hf_dir, file) with open(param_json_path, "r") as fp: hf_weight_map = json.load(fp) if hf_weight_map.get('weight_map'): hf_weight_map = hf_weight_map['weight_map'] break if not param_json_path: raise ValueError(f"Not found param_json_path in {src_hf_dir}") enable_tqdm = rank_id == 0 mtp_layers = self.config.model.model_config.num_nextn_predict_layers start_layer = 0 if not is_mtp_model else self.num_layers end_layer = self.num_layers if not is_mtp_model else self.num_layers + mtp_layers self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) for layer_id in tqdm(range(start_layer, end_layer), desc="Weight loading", disable=not enable_tqdm): self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, self.parameter_dict) print("param_not_load: %s, ckpt_not_load: %s" % (str(param_not_load), str(ckpt_not_load))) del self.parameter_dict gc.collect() large_models/test_parallel_infer.py0000644000175100017500000000617015040315702020300 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ import os from multiprocessing.pool import Pool from tests.mark_utils import arg_mark cur_dir = os.path.dirname(os.path.abspath(__file__)) def run_command(command_info): cmd, log_path = command_info ret = os.system(cmd) return ret, log_path def check_results(commands, results): error_idx = [_ for _ in range(len(results)) if results[_][0] != 0] for idx in error_idx: print(f"testcase {commands[idx]} failed. please check log {results[idx][1]}.") os.system(f"grep -E 'ERROR|error|Error' {results[idx][1]} -C 5") assert error_idx == [] class TestInferParallel: """A test class for testing pipeline.""" @staticmethod def setup_method(): ascend_home_path = os.getenv('ASCEND_HOME_PATH') if not ascend_home_path: os.environ['ASCEND_HOME_PATH'] = "/usr/local/Ascend/latest" @arg_mark(plat_marks=['platform_ascend910b'], level_mark='level0', card_mark='allcards', essential_mark='essential') def test_base_cases(self): """ Feature: Infer interface Description: Test parallel interface for training and prediction. 
Expectation: AssertionError """ commands = [ (f"export ASCEND_RT_VISIBLE_DEVICES=0,1 && export LCAL_COMM_ID=127.0.0.1:10068 && msrun --worker_num=2 " f"--local_worker_num=2 --master_port=8222 --log_dir=parallel_qwen2_0_5b_predict_mp2 --join=True " f"{cur_dir}/run_parallel.py --mode parallel_qwen2_0_5b_predict_mp2", 'parallel_qwen2_0_5b_predict_mp2/worker_0.log'), # command, log_path (f"export ASCEND_RT_VISIBLE_DEVICES=2,3 && export LCAL_COMM_ID=127.0.0.1:10070 && msrun --worker_num=2 " f"--local_worker_num=2 --master_port=8230 --log_dir=parallel_deepseek_r1_bf16_predict_mp2 --join=True " f"{cur_dir}/run_parallel.py --mode parallel_deepseek_r1_bf16_predict_mp2", 'parallel_deepseek_r1_bf16_predict_mp2/worker_0.log'), (f"export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 && export LCAL_COMM_ID=127.0.0.1:10074 && msrun --worker_num=4 " f"--local_worker_num=4 --master_port=8240 --log_dir=parallel_qwen2_0_5b_predict_dp2_mp2 --join=True " f"{cur_dir}/run_parallel.py --mode parallel_qwen2_0_5b_predict_dp2_mp2", 'parallel_qwen2_0_5b_predict_dp2_mp2/worker_0.log'), ] with Pool(len(commands)) as pool: results = list(pool.imap(run_command, commands)) check_results(commands, results) large_models/llama/0000755000175100017500000000000015040315764015002 5ustar jenkinsHwHiAiUserlarge_models/llama/training_checker.py0000644000175100017500000001624315040315702020651 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """Training checker""" import os import sys import time import mindspore as ms from mindspore import Callback from mindspore.communication import get_rank, get_group_size workspace = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) sys.path.insert(0, os.path.join(workspace, "mindformers")) from mindformers.core.callback.callback import _get_loss_output class TrainingChecker(Callback): """ Callback function for precision and performance checking. Raise an AssertionError once the difference between a step's loss and the corresponding expected value is greater than the error value or the difference ratio between average step time and expected value is greater than the error ratio. Args: loss_list_std (list[float]): A list of expected loss values. avg_step_time_std (float): Expected average step time value (in millisecond). Defaults to None. loss_error (float, optional): Allowable loss error between true and expected values. Defaults to 1e-3. time_error_ratio (float, optional): Allowable time error ratio between true and expected values. Defaults to 0.1. skip_step_num (int, optional): Skip a certain number of steps before counting the time. Defaults to 2. skip_time_num (int, optional): Remove the largest values in collected step time list. Defaults to 5. micro_batch_num (int, optional): The number of micro-batch in a pipeline stage. Defaults to 1. micro_batch_interleave_num (int, optional): Multi-copy parallel configuration. Defaults to 1. 
gradient_accumulation_steps (int, optional): The number of gradient accumulation steps. Defaults to 1.
        loss_mode (str, optional): The mode of checking loss; 'abs' and 'relative' are supported. Defaults to 'abs'.
        experiment_mode (bool, optional): Enables or disables the developer debugging mode. Defaults to False.
            If set to True, loss and time values are not checked.

    Raises:
        AssertionError
    """

    def __init__(self, loss_list_std: list, avg_step_time_std: float = None,
                 loss_error: float = 1e-3, time_error_ratio: float = 0.1,
                 skip_step_num: int = 2, skip_time_num: int = 5,
                 micro_batch_num: int = 1, micro_batch_interleave_num: int = 1,
                 gradient_accumulation_steps: int = 1, loss_mode: str = 'abs',
                 experiment_mode: bool = False):
        super(TrainingChecker, self).__init__()
        self.loss_list_std = loss_list_std
        self.avg_step_time_std = avg_step_time_std
        self.loss_error = loss_error
        self.time_error_ratio = time_error_ratio
        self.step_time = time.time()
        self.total_time = []
        self.skip_step_num = skip_step_num
        self.skip_time_num = skip_time_num

        # init pipeline parallel status
        self.pipeline_parallel = False
        self.is_last_stage = True
        self.micro_size = micro_batch_num
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.micro_batch_interleave_num = micro_batch_interleave_num
        self.loss_mode = loss_mode
        self.experiment_mode = experiment_mode
        self.loss_recoder = []
        self.time_recoder = []

    def on_train_begin(self, run_context):
        """Called once before the network training."""
        self.begin(run_context)
        # Check pipeline parallel training status.
        pipeline_stages = ms.get_auto_parallel_context('pipeline_stages')
        self.pipeline_parallel = pipeline_stages > 1
        if self.pipeline_parallel:
            rank_id = get_rank()
            device_num = get_group_size()
            per_stage_device_num = device_num // pipeline_stages
            stage_id = rank_id // per_stage_device_num
            self.is_last_stage = (stage_id == pipeline_stages - 1)

    def on_train_step_begin(self, run_context):
        """Called on each training step begin."""
        _ = run_context
        self.step_time = time.time()

    def on_train_step_end(self, run_context):
        """Called on each training step end."""
        cb_params = run_context.original_args()
        net_outputs = cb_params.net_outputs
        loss = _get_loss_output(net_outputs)[0]
        cur_step_num = cb_params.cur_step_num
        cur_step_time = (time.time() - self.step_time) * 1000
        if cur_step_num > self.skip_step_num:
            self.total_time.append(cur_step_time)

        if self.pipeline_parallel:
            loss = loss / self.micro_size
        if self.micro_batch_interleave_num > 1:
            loss = loss / self.micro_batch_interleave_num
        if self.gradient_accumulation_steps > 1:
            loss = loss / self.gradient_accumulation_steps
        self.loss_recoder.append(loss)

        # With pipeline parallelism enabled, the loss is only available on the last stage.
        if (not self.pipeline_parallel or self.is_last_stage) and not self.experiment_mode:
            real_loss = self.loss_list_std[cur_step_num - 1]
            if self.loss_mode == 'abs':
                loss_diff = abs(loss - real_loss)
            elif self.loss_mode == 'relative':
                loss_diff = abs((loss - real_loss) / real_loss)
            else:
                raise ValueError(f"support 'abs' and 'relative' loss checking mode, but got {self.loss_mode}.")
            print(f"loss check mode: {self.loss_mode}.")
            assert loss_diff < self.loss_error, \
                f"The error between loss: {loss} and loss_list_std: {real_loss} is larger than {self.loss_error}"

    def on_train_end(self, run_context):
        _ = run_context
        self.total_time.sort()
        self.total_time = self.total_time[:-self.skip_time_num]
        avg_step_time = sum(self.total_time) / len(self.total_time)
        self.time_recoder.append(avg_step_time)
        if self.avg_step_time_std is not None and not
self.experiment_mode: assert (avg_step_time - self.avg_step_time_std) / self.avg_step_time_std < self.time_error_ratio, \ f"The error ratio between avg_step_time: {avg_step_time} and " \ f"avg_step_time_std: {self.avg_step_time_std} is larger than {self.time_error_ratio}" def get_experiment_results(self): print("\nexperiment loss: ") for i in range(0, len(self.loss_recoder), 5): loss = [f"{item:.6f}" for item in self.loss_recoder[i:i + 5]] print(', '.join(loss) + ',') print(f"\nexperiment time: {int(self.time_recoder[0])}") large_models/llama/env.sh0000644000175100017500000000166715040315702016130 0ustar jenkinsHwHiAiUser#!/bin/bash # Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ export ASCEND_PATH=/usr/local/Ascend if [ -d "${ASCEND_PATH}/ascend-toolkit" ]; then source ${ASCEND_PATH}/ascend-toolkit/set_env.sh else source ${ASCEND_PATH}/latest/bin/setenv.bash fi export DEVICE_MEMORY_CAPACITY=1073741824000 export NOT_FULLY_USE_DEVICES=off large_models/llama/train_llama.py0000644000175100017500000002200315040315702017624 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """ Test module for testing the paralleled llama interface used for mindformers. 
How to run this: pytest tests/st/test_model/test_llama_model/test_parallel_train.py pytest tests/st/test_model/test_llama_model/test_parallel_predict.py """ import os import sys import argparse import numpy as np import mindspore as ms from mindspore import set_seed from mindspore.communication import init from mindspore.dataset import GeneratorDataset workspace = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.insert(0, os.path.join(workspace, "mindformers")) from training_checker import TrainingChecker from mindformers.models.llama.llama_config import LlamaConfig from mindformers.models.llama.llama import LlamaForCausalLM from mindformers import Trainer, TrainingArguments ms.set_context(jit_config={"jit_level": "O1"}) ms.set_context(mode=ms.GRAPH_MODE) init() def generator_train(): """train dataset generator""" seq_len = 1025 step_num = 10 batch_size = 8 vocab_size = 32000 input_ids = np.random.randint(low=0, high=vocab_size, size=( step_num * batch_size, seq_len,)).astype(np.int32) for idx in range(len(input_ids)): yield input_ids[idx] def build_model(test_mode, is_dynamic=False, compute_dtype="float16", softmax_compute_type="float32", layernorm_compute_type="float32", rotary_dtype="float32", param_init_type="float16", gradient_accumulation_steps=1, fine_grain_interleave=1): """init task trainer.""" set_seed(0) np.random.seed(0) args = TrainingArguments( batch_size=8, num_train_epochs=1, use_parallel=True) model_config = LlamaConfig(num_layers=2, hidden_size=1536, num_heads=12, seq_length=1024, batch_size=8, use_flash_attention=True, use_past=False, is_dynamic=is_dynamic, compute_dtype=compute_dtype, layernorm_compute_type=layernorm_compute_type, softmax_compute_type=softmax_compute_type, rotary_dtype=rotary_dtype, param_init_type=param_init_type, block_size=32, num_blocks=20, do_sample=False, fine_grain_interleave=fine_grain_interleave) model = LlamaForCausalLM(model_config) train_dataset = GeneratorDataset( generator_train, column_names=["input_ids"]) train_dataset = train_dataset.batch(batch_size=8) loss_list_std = [10.451367, 10.455378, 10.465119, 10.463621, 10.476261, 10.462841, 10.472476, 10.468395, 10.469678, 10.461041,] avg_step_time_std = 10000 if test_mode == 'test_train_cp': loss_list_std = [10.448591, 10.450175, 10.458983, 10.466015, 10.473140, 10.459602, 10.472231, 10.466570, 10.462967, 10.467032,] avg_step_time_std = 10000 if test_mode == 'test_train_dp': loss_list_std = [10.448593, 10.450171, 10.458986, 10.466034, 10.473145, 10.459610, 10.472258, 10.466605, 10.462999, 10.467015,] avg_step_time_std = 10000 callback = TrainingChecker(loss_list_std=loss_list_std, avg_step_time_std=avg_step_time_std, micro_batch_num=2, micro_batch_interleave_num=2, gradient_accumulation_steps=gradient_accumulation_steps) task_trainer = Trainer(task='text_generation', model=model, args=args, train_dataset=train_dataset, callbacks=callback) return task_trainer def run_llama_4p_train(): """test msrun launch llama on 4p for Trainer.train().""" ms.reset_auto_parallel_context() ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=True, full_batch=True, enable_parallel_optimizer=True) ms.set_auto_parallel_context(pipeline_config={'pipeline_scheduler': '1f1b', 'pipeline_interleave': True}) task_trainer = build_model('test_train', fine_grain_interleave=2) task_trainer.config.callbacks[1].save_checkpoint_steps = 100 task_trainer.config.callbacks = task_trainer.config.callbacks[:1] task_trainer.config.runner_config.epochs = 1 
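    # 4-card layout: model_parallel=2 x pipeline_stage=2 with 2 micro-batches
    # and 2 interleaved copies (set below); trimming the callbacks and raising
    # save_checkpoint_steps keeps the 10-step run from writing checkpoints.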
task_trainer.config.runner_config.sink_mode = False task_trainer.config.runner_wrapper.scale_sense.loss_scale_value = 1024 ms.set_auto_parallel_context(pipeline_stages=2) task_trainer.set_parallel_config(data_parallel=1, model_parallel=2, pipeline_stage=2, micro_batch_num=2, micro_batch_interleave_num=2, vocab_emb_dp=False) task_trainer.train() sys.exit(0) def run_llama_2p_train_cp(): """test msrun launch llama on context parallel for Trainer.train().""" ms.reset_auto_parallel_context() ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=True, full_batch=True, enable_parallel_optimizer=True) task_trainer = build_model('test_train_cp', gradient_accumulation_steps=2) task_trainer.config.callbacks[1].save_checkpoint_steps = 100 task_trainer.config.callbacks = task_trainer.config.callbacks[:1] task_trainer.config.runner_config.epochs = 1 task_trainer.config.runner_config.sink_mode = False task_trainer.config.runner_wrapper.scale_sense.loss_scale_value = 1024 task_trainer.config.runner_config.gradient_accumulation_steps = 2 task_trainer.config.model.model_config.use_flash_attention = True task_trainer.set_parallel_config(data_parallel=1, model_parallel=1, context_parallel=2, pipeline_stage=1, micro_batch_num=1, micro_batch_interleave_num=2) task_trainer.train() sys.exit(0) def run_llama_2p_train_dp(): """test msrun launch llama on data parallel for Trainer.train().""" ms.reset_auto_parallel_context() ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=True, full_batch=True, enable_parallel_optimizer=True) task_trainer = build_model('test_train_dp') task_trainer.config.callbacks[1].save_checkpoint_steps = 100 task_trainer.config.callbacks = task_trainer.config.callbacks[:1] task_trainer.config.runner_config.epochs = 1 task_trainer.config.runner_config.sink_mode = False task_trainer.config.runner_wrapper.scale_sense.loss_scale_value = 1024 task_trainer.config.parallel.parallel_optimizer_config.optimizer_weight_shard_size = 1 task_trainer.set_parallel_config(data_parallel=2, model_parallel=1, context_parallel=1, pipeline_stage=1, micro_batch_num=1, micro_batch_interleave_num=2) task_trainer.train() sys.exit(0) def run_llama(): """ Feature: Trainer.train() Trainer.predict() Description: Test trainer for train/predict on parallel mode. Expectation: TypeError, ValueError, RuntimeError """ parser = argparse.ArgumentParser() parser.add_argument( '--test_mode', default="", type=str, help='test_mode.') args = parser.parse_args() if args.test_mode == "test_train": run_llama_4p_train() elif args.test_mode == "test_train_cp": run_llama_2p_train_cp() elif args.test_mode == "test_train_dp": run_llama_2p_train_dp() run_llama() large_models/llama/test_dryrun_llama_semi_compile.py0000644000175100017500000001215215040315702023622 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ """ Test module for parallel training of Llama models using Mindformers at jit_level O0. """ import os import sys import argparse import numpy as np import mindspore as ms from mindspore import set_seed from mindspore.communication import init from mindspore.dataset import GeneratorDataset workspace = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) sys.path.insert(0, os.path.join(workspace, "mindformers")) from mindformers.models.llama.llama_config import LlamaConfig from mindformers.models.llama.llama import LlamaForCausalLM from mindformers import Trainer, TrainingArguments ms.set_context(jit_config={"jit_level": "O0"}) ms.set_context(mode=ms.GRAPH_MODE) ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=True, full_batch=True, enable_parallel_optimizer=True) init() def generator_train(): """train dataset generator""" seq_len = 4097 step_num = 5 batch_size = 32 vocab_size = 32000 input_ids = np.random.randint(low=0, high=vocab_size, size=( step_num * batch_size, seq_len,)).astype(np.int32) for idx in range(len(input_ids)): yield input_ids[idx] def build_model(test_mode, is_dynamic=False, compute_dtype="float16", softmax_compute_type="float32", layernorm_compute_type="float32", rotary_dtype="float32", param_init_type="float16", gradient_accumulation_steps=1, fine_grain_inteleave=1): """init task trainer.""" set_seed(0) np.random.seed(0) args = TrainingArguments( batch_size=32, num_train_epochs=1, use_parallel=True) model_config = LlamaConfig(num_layers=80, hidden_size=8192, num_heads=64, seq_length=4096, batch_size=32, use_flash_attention=True, use_past=False, is_dynamic=is_dynamic, compute_dtype=compute_dtype, layernorm_compute_type=layernorm_compute_type, softmax_compute_type=softmax_compute_type, rotary_dtype=rotary_dtype, param_init_type=param_init_type, block_size=32, num_blocks=20, do_sample=False, fine_grain_inteleave=fine_grain_inteleave) model = LlamaForCausalLM(model_config) train_dataset = GeneratorDataset( generator_train, column_names=["input_ids"]) train_dataset = train_dataset.batch(batch_size=32) task_trainer = Trainer(task='text_generation', model=model, args=args, train_dataset=train_dataset) return task_trainer def run_llama_compile(): """test llama compile.""" task_trainer = build_model('llama_compile', compute_dtype="float16") task_trainer.config.callbacks[1].save_checkpoint_steps = 100 task_trainer.config.callbacks = task_trainer.config.callbacks[:1] task_trainer.config.runner_config.epochs = 1 task_trainer.config.runner_config.sink_mode = False task_trainer.config.runner_wrapper.scale_sense.loss_scale_value = 1024 task_trainer.config.parallel.parallel_optimizer_config.optimizer_weight_shard_size = 1 task_trainer.config.runner_config.gradient_accumulation_steps = 4 ms.set_auto_parallel_context(pipeline_stages=8) task_trainer.set_parallel_config(data_parallel=1, model_parallel=4, context_parallel=1, pipeline_stage=8, micro_batch_num=32) task_trainer.train() sys.exit(0) def run_llama(): """ Feature: Trainer.train() Trainer.predict() Description: Test trainer for train/predict on parallel mode. 
Expectation: TypeError, ValueError, RuntimeError """ parser = argparse.ArgumentParser() parser.add_argument( '--test_mode', default="", type=str, help='test_mode.') args = parser.parse_args() if args.test_mode == "compile": run_llama_compile() run_llama() large_models/llama/mpirun_launch_llama.sh0000644000175100017500000000304015040315702021335 0ustar jenkinsHwHiAiUser#!/bin/bash # Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ set -e BASE_PATH=$( cd "$(dirname $0)" pwd ) CONFIG_FILE=$1 USE_DEVICE_NUM=$2 TEST_MODE=$3 TEST_CASE=$4 MF_PATH=${BASE_PATH}/../../mindformers pip install -r ${MF_PATH}/requirements.txt export PATH=${ASCEND_HOME_PATH}/latest/tools/profiler/bin:$PATH if [ "$TEST_MODE" == "predict" ]; then mpirun --allow-run-as-root -n ${USE_DEVICE_NUM} \ python ${BASE_PATH}/infer_llama.py \ --yaml_file ${CONFIG_FILE} \ --test_mode ${TEST_CASE} >${BASE_PATH}/${TEST_CASE}.log 2>&1 elif [ "$TEST_MODE" == "train" ]; then export MS_FORMAT_MODE=1 export MS_GE_TRAIN=1 export MS_ENABLE_REF_MODE=1 export MS_ENABLE_GE=1 export MS_DEV_CELL_REUSE=1 export MS_GE_ATOMIC_CLEAN_POLICY=1 export MS_MEMORY_POOL_RECYCLE=1 mpirun --allow-run-as-root -n ${USE_DEVICE_NUM} \ python ${BASE_PATH}/train_llama.py \ --test_mode ${TEST_CASE} >${BASE_PATH}/${TEST_CASE}.log 2>&1 fi large_models/llama/msrun_launch_llama.sh0000644000175100017500000000203215040315702021167 0ustar jenkinsHwHiAiUser#!/bin/bash # Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ set -e BASE_PATH=$(cd "$(dirname $0)"; pwd) USE_DEVICE_NUM=$1 TEST_MODE=$2 PORT=$3 export GLOG_v=1 source ${BASE_PATH}/env.sh export MS_MEMORY_POOL_RECYCLE=1 msrun --worker_num=${USE_DEVICE_NUM} --local_worker_num=${USE_DEVICE_NUM} --master_port=${PORT} --log_dir=${TEST_MODE} --join=True \ ${BASE_PATH}/train_llama.py --test_mode ${TEST_MODE} >${TEST_MODE}.log 2>&1 large_models/llama/test_dryrun_llama.py0000644000175100017500000001400415040315702021073 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """ Test module for parallel training of Llama models using Mindformers at jit_level O2. """ import os import sys import argparse import numpy as np import mindspore as ms from mindspore import set_seed from mindspore.communication import init from mindspore.dataset import GeneratorDataset workspace = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) sys.path.insert(0, os.path.join(workspace, "mindformers")) from mindformers.models.llama.llama_config import LlamaConfig from mindformers.models.llama.llama import LlamaForCausalLM from mindformers import Trainer, TrainingArguments ms.set_context(jit_config={"jit_level": "O2"}) ms.set_context(mode=ms.GRAPH_MODE) ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, gradients_mean=True, full_batch=True, enable_parallel_optimizer=True) init() def generator_train(): """train dataset generator""" seq_len = 1025 step_num = 10 batch_size = 8 vocab_size = 32000 input_ids = np.random.randint(low=0, high=vocab_size, size=( step_num * batch_size, seq_len,)).astype(np.int32) for idx in range(len(input_ids)): yield input_ids[idx] def build_model(test_mode, is_dynamic=False, compute_dtype="float16", softmax_compute_type="float32", layernorm_compute_type="float32", rotary_dtype="float32", param_init_type="float16", gradient_accumulation_steps=1, fine_grain_inteleave=1): """init task trainer.""" set_seed(0) np.random.seed(0) args = TrainingArguments( batch_size=8, num_train_epochs=1, use_parallel=True) model_config = LlamaConfig(num_layers=2, hidden_size=1536, num_heads=12, seq_length=1024, batch_size=8, use_flash_attention=True, use_past=False, is_dynamic=is_dynamic, compute_dtype=compute_dtype, layernorm_compute_type=layernorm_compute_type, softmax_compute_type=softmax_compute_type, rotary_dtype=rotary_dtype, param_init_type=param_init_type, block_size=32, num_blocks=20, do_sample=False, fine_grain_inteleave=fine_grain_inteleave) model = LlamaForCausalLM(model_config) train_dataset = GeneratorDataset( generator_train, column_names=["input_ids"]) train_dataset = train_dataset.batch(batch_size=8) task_trainer = Trainer(task='text_generation', model=model, args=args, train_dataset=train_dataset) return task_trainer def run_llama_pipeline(): """test llama pipeline.""" task_trainer = build_model('llama_pipeline', fine_grain_inteleave=2) task_trainer.config.callbacks[1].save_checkpoint_steps = 100 task_trainer.config.callbacks = task_trainer.config.callbacks[:1] task_trainer.config.runner_config.epochs = 1 task_trainer.config.runner_config.sink_mode = False task_trainer.config.runner_wrapper.scale_sense.loss_scale_value = 1024 ms.set_auto_parallel_context(pipeline_stages=2) task_trainer.set_parallel_config(data_parallel=1, model_parallel=2, pipeline_stage=2, micro_batch_num=2, vocab_emb_dp=False) task_trainer.train() sys.exit(0) def run_llama_grad_accu(): """test llama grad accu.""" task_trainer = build_model('llama_grad_accu', gradient_accumulation_steps=4) task_trainer.config.callbacks[1].save_checkpoint_steps = 100 
task_trainer.config.callbacks = task_trainer.config.callbacks[:1] task_trainer.config.runner_config.epochs = 1 task_trainer.config.runner_config.sink_mode = False task_trainer.config.runner_wrapper.scale_sense.loss_scale_value = 1024 task_trainer.config.parallel.parallel_optimizer_config.optimizer_weight_shard_size = 1 task_trainer.config.runner_config.gradient_accumulation_steps = 4 task_trainer.set_parallel_config(data_parallel=2, model_parallel=2, context_parallel=1, pipeline_stage=1, micro_batch_num=2) task_trainer.train() sys.exit(0) def run_llama(): """ Feature: Trainer.train() Trainer.predict() Description: Test trainer for train/predict on parallel mode. Expectation: TypeError, ValueError, RuntimeError """ parser = argparse.ArgumentParser() parser.add_argument( '--test_mode', default="", type=str, help='test_mode.') args = parser.parse_args() ms.set_context(save_graphs=True, save_graphs_path=f"./{args.test_mode}") if args.test_mode == "pipeline": run_llama_pipeline() elif args.test_mode == "grad_accu": run_llama_grad_accu() run_llama() large_models/llama/test_parallel_train.py0000644000175100017500000000535415040315702021403 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """ Test module for testing the paralleled llama interface used for mindformers. How to run this: pytest tests/st/test_model/test_llama_model/test_parallel_train.py """ import os from multiprocessing.pool import Pool from tests.mark_utils import arg_mark import subprocess def run_command(command_info): cmd, log_path = command_info ret = os.system(cmd) return ret, log_path def check_results(commands, results): error_idx = [_ for _ in range(len(results)) if results[_][0] != 0] for idx in error_idx: print(f"testcase {commands[idx]} failed. please check log {results[idx][1]}.") os.system(f"grep -E 'ERROR|error|Error' {results[idx][1]} -C 5") os.system(f"cat {results[idx][1]}") assert error_idx == [] subprocess.check_output(["grep", "MS_DEV_P2P_HCCL_BUFFSIZE, and the value is 24 MB.", commands[0][1]]) @arg_mark(plat_marks=['platform_ascend910b'], level_mark='level1', card_mark='allcards', essential_mark='essential') def test_train(): """ Feature: Trainer.train() Description: Test context parallel trainer for train. 
Expectation: AssertionError """ ascend_home_path = os.getenv('ASCEND_HOME_PATH') if not ascend_home_path: os.environ['ASCEND_HOME_PATH'] = "/usr/local/Ascend/latest" sh_path = os.path.split(os.path.realpath(__file__))[0] commands = [(f"export MS_DEV_P2P_HCCL_BUFFSIZE=24 && export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 && " f"bash {sh_path}/msrun_launch_llama.sh 4 test_train 8128", f"{sh_path}/test_train/worker_0.log"), (f"export ASCEND_RT_VISIBLE_DEVICES=4,5 && " f"bash {sh_path}/msrun_launch_llama.sh 2 test_train_cp 8129", f"{sh_path}/test_train_cp/worker_0.log"), (f"export ASCEND_RT_VISIBLE_DEVICES=6,7 && " f"bash {sh_path}/msrun_launch_llama.sh 2 test_train_dp 8131", f"{sh_path}/test_train_dp/worker_0.log") ] with Pool(len(commands)) as pool: results = list(pool.imap(run_command, commands)) check_results(commands, results) large_models/llama/__init__.py0000644000175100017500000000000015040315764017101 0ustar jenkinsHwHiAiUserlarge_models/llama/test_dryrun_llama_auto_compile.py0000644000175100017500000001215515040315702023640 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """ Test module for parallel training of Llama models using sharding propagation. 
""" import os import sys import argparse import numpy as np import mindspore as ms from mindspore import set_seed from mindspore.communication import init from mindspore.dataset import GeneratorDataset workspace = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) sys.path.insert(0, os.path.join(workspace, "mindformers")) from mindformers.models.llama.llama_config import LlamaConfig from mindformers.models.llama.llama import LlamaForCausalLM from mindformers import Trainer, TrainingArguments ms.set_context(jit_config={"jit_level": "O1"}) ms.set_context(mode=ms.GRAPH_MODE) ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.AUTO_PARALLEL, search_mode="sharding_propagation", full_batch=True, enable_parallel_optimizer=True) init() def generator_train(): """train dataset generator""" seq_len = 4097 step_num = 5 batch_size = 32 vocab_size = 32000 input_ids = np.random.randint(low=0, high=vocab_size, size=( step_num * batch_size, seq_len,)).astype(np.int32) for idx in range(len(input_ids)): yield input_ids[idx] def build_model(test_mode, is_dynamic=False, compute_dtype="float16", softmax_compute_type="float32", layernorm_compute_type="float32", rotary_dtype="float32", param_init_type="float16", gradient_accumulation_steps=1, fine_grain_inteleave=1): """init task trainer.""" set_seed(0) np.random.seed(0) args = TrainingArguments( batch_size=32, num_train_epochs=1, use_parallel=True) model_config = LlamaConfig(num_layers=80, hidden_size=8192, num_heads=64, seq_length=4096, batch_size=32, use_flash_attention=True, use_past=False, is_dynamic=is_dynamic, compute_dtype=compute_dtype, layernorm_compute_type=layernorm_compute_type, softmax_compute_type=softmax_compute_type, rotary_dtype=rotary_dtype, param_init_type=param_init_type, block_size=32, num_blocks=20, do_sample=False, fine_grain_inteleave=fine_grain_inteleave) model = LlamaForCausalLM(model_config) train_dataset = GeneratorDataset( generator_train, column_names=["input_ids"]) train_dataset = train_dataset.batch(batch_size=32) task_trainer = Trainer(task='text_generation', model=model, args=args, train_dataset=train_dataset) return task_trainer def run_llama_compile(): """test llama compile.""" task_trainer = build_model('llama_compile', compute_dtype="float16") task_trainer.config.callbacks[1].save_checkpoint_steps = 100 task_trainer.config.callbacks = task_trainer.config.callbacks[:1] task_trainer.config.runner_config.epochs = 1 task_trainer.config.runner_config.sink_mode = False task_trainer.config.runner_wrapper.scale_sense.loss_scale_value = 1024 task_trainer.config.parallel.parallel_optimizer_config.optimizer_weight_shard_size = 1 task_trainer.config.runner_config.gradient_accumulation_steps = 1 ms.set_auto_parallel_context(pipeline_stages=8) task_trainer.set_parallel_config(data_parallel=1, model_parallel=4, context_parallel=1, pipeline_stage=8, micro_batch_num=32) task_trainer.train() sys.exit(0) def run_llama(): """ Feature: Trainer.train() Trainer.predict() Description: Test trainer for train/predict on parallel mode. 
Expectation: TypeError, ValueError, RuntimeError """ parser = argparse.ArgumentParser() parser.add_argument( '--test_mode', default="", type=str, help='test_mode.') args = parser.parse_args() if args.test_mode == "compile": run_llama_compile() run_llama() large_models/llama/test_dry_run.py0000644000175100017500000000615515040315702020074 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """ Test module for parallel training of Llama models using Mindformers at jit_level O2. """ import os import subprocess from tests.mark_utils import arg_mark def run_command_semi_compile(cmd, log_path, backend_time, compile_time): if os.path.isfile(log_path): os.remove(log_path) os.system(cmd) log_backend = "compile_backend_graph costs" log_output = subprocess.check_output( ["grep -r '%s' %s | head -1 | awk '{print $3}'" % (log_backend, log_path)], shell=True) log_time = str(log_output, 'utf-8').strip() assert float(log_time) <= backend_time * 1.1 log_compile = "compile_graph costs" log_output = subprocess.check_output( ["grep -r '%s' %s | head -1 | awk '{print $3}'" % (log_compile, log_path)], shell=True) log_time = str(log_output, 'utf-8').strip() assert float(log_time) <= compile_time * 1.1 def run_command_auto_compile(cmd, log_path, sharding_time): if os.path.isfile(log_path): os.remove(log_path) os.system(cmd) log_sharding = "parallel_strategy_search costs" log_output = subprocess.check_output( ["grep -r '%s' %s | awk '{print $3}'" % (log_sharding, log_path)], shell=True) log_time = str(log_output, 'utf-8').strip() assert float(log_time) <= sharding_time if os.path.isfile(log_path): os.remove(log_path) @arg_mark(plat_marks=['platform_ascend910b'], level_mark='level0', card_mark='dryrun_only', essential_mark='essential') def test_train_semi_compile(): """ Feature: Trainer.train() Description: Test llama2 70b semi compile time when parallel_mode=SEMI_AUTO_PARALLEL. Expectation: Throw AssertionError when compile_backend_graph time > 60000 ms or compile_graph > 200000 """ sh_path = os.path.split(os.path.realpath(__file__))[0] run_command_semi_compile(f"bash {sh_path}/dry_compile.sh semi compile", f"{sh_path}/compile.log", 60000, 200000) @arg_mark(plat_marks=['platform_ascend910b'], level_mark='level0', card_mark='dryrun_only', essential_mark='essential') def test_train_auto_compile(): """ Feature: refactor sharding propagation when AUTO_PARALLEL. Description: Test llama2 70b compile time when parallel_mode=AUTO_PARALLEL. 
Expectation: Throw AssertionError when parallel_strategy_search time > 11000 ms
    """
    sh_path = os.path.split(os.path.realpath(__file__))[0]
    run_command_auto_compile(f"bash {sh_path}/dry_compile.sh auto compile",
                             f"{sh_path}/compile_auto.log", 11000)
large_models/llama/dry.sh0000644000175100017500000000175715040315702016136 0ustar jenkinsHwHiAiUser#!/bin/bash
# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
set -e
BASE_PATH=$(cd "$(dirname $0)"; pwd)
TEST_MODE=$2
export MS_SIMULATION_LEVEL=1
export RANK_SIZE=4
export RANK_ID=0
export GLOG_v=1
export MS_DEV_DUMP_IR_PASSES="hwopt_d_after_stream_assign"
export ENABLE_LAZY_INLINE_NO_PIPELINE=$1
python ${BASE_PATH}/test_dryrun_llama.py --test_mode ${TEST_MODE} > ${TEST_MODE}.log 2>&1
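Note: dry.sh above and dry_compile.sh below rely on MindSpore's dry-run simulation: MS_SIMULATION_LEVEL=1 with a fixed RANK_SIZE/RANK_ID lets a single host process compile the full distributed graph without occupying any NPU, which is what keeps the compile-time assertions in test_dry_run.py cheap to run. The two invocations test_dry_run.py actually exercises are:

bash dry_compile.sh semi compile   # semi-auto parallel, asserts backend/graph compile time
bash dry_compile.sh auto compile   # sharding propagation, asserts parallel_strategy_search time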
large_models/llama/dry_compile.sh0000644000175100017500000000227215040315702017637 0ustar jenkinsHwHiAiUser#!/bin/bash
# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
set -e
BASE_PATH=$(cd "$(dirname $0)"; pwd)
PARALLEL_MODE=$1
TEST_MODE=$2
export MS_SIMULATION_LEVEL=1
export GLOG_v=2
export RANK_SIZE=32
export RANK_ID=0
export MS_DEV_RUNTIME_CONF="compile_statistics:True"
export MS_ENABLE_NUMA=1
if [ "$PARALLEL_MODE" = "semi" ]; then
    python ${BASE_PATH}/test_dryrun_llama_semi_compile.py --test_mode ${TEST_MODE} > ${TEST_MODE}.log 2>&1
elif [ "$PARALLEL_MODE" = "auto" ]; then
    python ${BASE_PATH}/test_dryrun_llama_auto_compile.py --test_mode ${TEST_MODE} > ${TEST_MODE}_auto.log 2>&1
fi
large_models/test_standalone_infer.py0000644000175100017500000001434415040315702020636 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Test module for testing the stand alone Infer interface used for mindformers.

How to run this:
pytest tests/st/networks/large_models/test_standalone_infer.py
"""
import os
import sys

workspace = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
sys.path.insert(0, os.path.join(workspace, "networks/mindformers"))

import mindspore as ms
from mindspore.nn.utils import no_init_parameters

from tests.mark_utils import arg_mark
from mindformers import build_context, MindFormerConfig, LlamaConfig, LlamaForCausalLM
from research.qwen2.qwen2_tokenizer import Qwen2Tokenizer
from similarity import compare_distance


@arg_mark(plat_marks=['platform_ascend910b'], level_mark='level0', card_mark='onecard',
          essential_mark='essential')
def test_qwen2_0_5b_predict_standalone():
    """
    Feature: Infer standalone st
    Description: Test infer interface for prediction with standalone.
Expectation: AssertionError """ ms.runtime.set_kernel_launch_group() ascend_home_path = os.getenv('ASCEND_HOME_PATH') if not ascend_home_path: os.environ['ASCEND_HOME_PATH'] = "/usr/local/Ascend/latest" cur_dir = os.path.dirname(os.path.realpath(__file__)) config_path = os.path.join(cur_dir, "./qwen/configs/ci_predict_qwen2_0_5b_instruct.yaml") vocab_file_path = "/home/workspace/mindspore_dataset/weight/Qwen2.5-0.5B-Instruct-tokenizer/vocab.json" merges_file_path = "/home/workspace/mindspore_dataset/weight/Qwen2.5-0.5B-Instruct-tokenizer/merges.txt" load_safetensors = "/home/workspace/mindspore_dataset/weight/ms_safetensor_qwen2_0.5/model.safetensors" seq_length = 128 # init config with yaml config = MindFormerConfig(config_path) config.use_parallel = False config.load_checkpoint = load_safetensors config.model.model_config.seq_length = seq_length config.processor.tokenizer.vocab_file = vocab_file_path config.processor.tokenizer.merges_file = merges_file_path config.context.device_id = int(os.environ.get("DEVICE_ID", "0")) # init context build_context(config) model_config = LlamaConfig(**config.model.model_config) model_config.checkpoint_name_or_path = None # build tokenizer tokenizer = Qwen2Tokenizer(**config.processor.tokenizer) # build model with no_init_parameters(): network = LlamaForCausalLM(model_config) ms.load_checkpoint( ckpt_file_name=load_safetensors, net=network, format='safetensors' ) # predict batch_datas = {1: {"prompt": "你好!", "answer": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n你好!" "<|im_end|>\n<|im_start|>assistant\n你好!有什么可以帮助你的吗?<|im_end|>"}, 4: {"prompt": "用python编写快速排序", "answer": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n用" "python编写快速排序<|im_end|>\n<|im_start|>assistant\n以下是一个使用Python实现的快速排序" "算法:\n\n```python\ndef quick_sort(arr):\n if len(arr) <= 1:\n " "return arr\n else:\n pivot = arr[0]\n left = [x for x in arr[1:] " "if x < pivot]\n right = [x for x in arr[1:] if x >= pivot]\n " "return quick_sort(left) + [pivot] + quick_sort(right)\n\n# 示例输入\narr = [3,6,8,1"}, 8: {"prompt": "I believe the meaning of life is", "answer": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nI " "believe the meaning of life is<|im_end|>\n<|im_start|>assistant\nThe meaning of life " "is a philosophical question that has been debated for centuries, and there is no one " "definitive answer to it. Some people believe that the meaning of life is to find " "happiness and fulfillment in their lives, while others believe that it is to achieve " "success or recognition.\n\nOthers may argue that the meaning of life is to live a " "good life, to make a positive impact on the world, and to contribute to society in " "some way. 
Others may believe that the meaning of life is to seek knowledge and "
                                 "understanding"}
                   }
    for batch_size, batch_data in batch_datas.items():
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": batch_data["prompt"]}
        ]
        input_ids = tokenizer.encode(tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        ))
        input_ids_list = []
        answer = batch_data["answer"]
        for i in range(0, batch_size):
            input_ids_list.append(input_ids)

        outputs = network.generate(input_ids_list,
                                   max_length=seq_length,
                                   do_sample=False,
                                   return_dict_in_generate=False)
        for i in range(0, len(outputs)):
            output_text = tokenizer.decode(outputs[i])
            print("test_qwen2_0_5b_predict_standalone, output_text:", output_text)
            compare_distance(output_text, answer, bench_sim=0.95)
large_models/run_parallel.py0000644000175100017500000003726215040315702016750 0ustar jenkinsHwHiAiUser# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Test module for testing the paralleled infer interface used for mindformers.

How to run this:
pytest tests/st/networks/large_models/test_parallel_infer.py
"""
import argparse
import os
import sys

import numpy as np

from similarity import compare_distance

workspace = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
sys.path.insert(0, os.path.join(workspace, "networks/mindformers"))

from mindspore.nn.utils import no_init_parameters
import mindspore as ms

from mindformers.models.llama.llama_tokenizer_fast import LlamaTokenizerFast
from mindformers import build_context, MindFormerConfig, build_parallel_config, LlamaConfig
from mindformers.tools.logger import logger
from research.qwen2_5.qwen2_5_tokenizer import Qwen2Tokenizer
from research.qwen2_5.infer.qwen2_5 import (
    ParallelQwenForCausalLM as ParallelQwenForCausalLM_MF,
)
from research.deepseek3.deepseek3_config import DeepseekV3Config
from research.deepseek3.deepseek3_model_infer import InferenceDeepseekV3ForCausalLM

from deepseekv3_weight_processor import DeepseekV3WeightProcessor
from qwen2_weight_processor import Qwen2WeightProcessor


def parallel_qwen2_0_5b_predict_mp2():
    """test qwen2 0.5B predict in model_parallel=2 with dynamic shape"""
    ms.runtime.set_kernel_launch_group()
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    config_path = os.path.join(cur_dir, "qwen/configs/ci_predict_qwen2_0_5b_instruct.yaml")

    vocab_file_path = "/home/workspace/mindspore_dataset/weight/Qwen2-0.5B-Instruct/vocab.json"
    merges_file_path = "/home/workspace/mindspore_dataset/weight/Qwen2-0.5B-Instruct/merges.txt"
    load_checkpoint = "/home/workspace/mindspore_dataset/weight/Qwen2-0.5B-Instruct/"
    seq_length = 128
    # init config with yaml
    config = MindFormerConfig(config_path)
    config.use_parallel = True
    config.parallel_config.model_parallel = 2
    config.parallel_config.data_parallel = 1
    config.parallel_config.pipeline_stage = 1
    config.load_checkpoint = load_checkpoint
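    # CI overrides on top of the yaml config: short sequence length, separate
    # (non-concatenated) q/k/v weights, and tokenizer files from the local dataset mirror.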
config.model.model_config.seq_length = seq_length config.model.model_config.qkv_concat = False config.processor.tokenizer.vocab_file = vocab_file_path config.processor.tokenizer.merges_file = merges_file_path config.parallel.parallel_mode = "STAND_ALONE" # init context build_context(config) build_parallel_config(config) config.model.model_config.parallel_config = config.parallel_config model_config = LlamaConfig(**config.model.model_config) model_config.checkpoint_name_or_path = None # build tokenizer tokenizer = Qwen2Tokenizer(**config.processor.tokenizer) # build model with no_init_parameters(): network = ParallelQwenForCausalLM_MF(model_config) # load checkpoint if config.load_checkpoint: logger.info("----------------Transform and load checkpoint----------------") weight_processor = Qwen2WeightProcessor(config, network, False) weight_processor.load_safetensors_shard(config.load_checkpoint) # predict batch_datas = {1: {"prompt": "你好!", "answer": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n你好!" "<|im_end|>\n<|im_start|>assistant\n你好!有什么可以帮助你的吗?<|im_end|>"}, 4: {"prompt": "用python编写快速排序", "answer": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n用" "python编写快速排序<|im_end|>\n<|im_start|>assistant\n以下是一个使用Python实现的快速排序" "算法:\n\n```python\ndef quick_sort(arr):\n if len(arr) <= 1:\n " "return arr\n else:\n pivot = arr[0]\n left = [x for x in arr[1:] " "if x < pivot]\n right = [x for x in arr[1:] if x >= pivot]\n " "return quick_sort(left) + [pivot] + quick_sort(right)\n\n# 示例输入\narr = [3,6,8,1"}, 8: {"prompt": "I believe the meaning of life is", "answer": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nI " "believe the meaning of life is<|im_end|>\n<|im_start|>assistant\nThe meaning of " "life is a philosophical question that has been debated for centuries, and there " "is no one definitive answer to it. Some people believe that the meaning of life " "is to find happiness and fulfillment in their lives, while others believe that it " "is to achieve success or recognition.\n\nOthers may argue that the meaning of life " "is to make a positive impact on the world, to help others, and to contribute to " "society as a whole. 
Others may believe that the meaning of life is to pursue " "knowledge and understanding, to"}, } for batch_size, batch_data in batch_datas.items(): messages = [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": batch_data["prompt"]} ] input_ids = tokenizer.encode(tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True )) input_ids_list = [] answer = batch_data["answer"] for i in range(0, batch_size): input_ids_list.append(input_ids) outputs = network.generate(input_ids_list, max_length=seq_length, do_sample=False, return_dict_in_generate=False) for i in range(0, len(outputs)): output_text = tokenizer.decode(outputs[i]) print("parallel_qwen2_0.5b_predict_mp2, output_text:", output_text) print("parallel_qwen2_0.5b_predict_mp2, answer:", answer) compare_distance(output_text, answer, bench_sim=0.95) def parallel_qwen2_0_5b_predict_dp2_mp2(): """test qwen2 0.5B predict in data_parallel=2 and model_parallel=2 with dynamic shape""" ms.runtime.set_kernel_launch_group() cur_dir = os.path.dirname(os.path.realpath(__file__)) config_path = os.path.join(cur_dir, "qwen/configs/ci_predict_qwen2_0_5b_instruct.yaml") vocab_file_path = "/home/workspace/mindspore_dataset/weight/Qwen2-0.5B-Instruct/vocab.json" merges_file_path = "/home/workspace/mindspore_dataset/weight/Qwen2-0.5B-Instruct/merges.txt" load_checkpoint = "/home/workspace/mindspore_dataset/weight/Qwen2-0.5B-Instruct/" seq_length = 128 # init config with yaml config = MindFormerConfig(config_path) config.use_parallel = True config.parallel_config.model_parallel = 2 config.parallel_config.data_parallel = 2 config.parallel_config.pipeline_stage = 1 config.load_checkpoint = load_checkpoint config.model.model_config.seq_length = seq_length config.model.model_config.qkv_concat = False config.processor.tokenizer.vocab_file = vocab_file_path config.processor.tokenizer.merges_file = merges_file_path config.parallel.parallel_mode = "STAND_ALONE" # init context build_context(config) build_parallel_config(config) config.model.model_config.parallel_config = config.parallel_config model_config = LlamaConfig(**config.model.model_config) model_config.checkpoint_name_or_path = None # build tokenizer tokenizer = Qwen2Tokenizer(**config.processor.tokenizer) # build model with no_init_parameters(): network = ParallelQwenForCausalLM_MF(model_config) # load checkpoint if config.load_checkpoint: logger.info("----------------Transform and load checkpoint----------------") weight_processor = Qwen2WeightProcessor(config, network, False) weight_processor.load_safetensors_shard(config.load_checkpoint) # predict batch_datas = {1: {"prompt": "你好!", "answer": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n你好!" 
"<|im_end|>\n<|im_start|>assistant\n你好!有什么可以帮助你的吗?<|im_end|>"}, 4: {"prompt": "用python编写快速排序", "answer": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n用" "python编写快速排序<|im_end|>\n<|im_start|>assistant\n以下是一个使用Python实现的快速排序" "算法:\n\n```python\ndef quick_sort(arr):\n if len(arr) <= 1:\n " "return arr\n else:\n pivot = arr[0]\n left = [x for x in arr[1:] " "if x < pivot]\n right = [x for x in arr[1:] if x >= pivot]\n " "return quick_sort(left) + [pivot] + quick_sort(right)\n\n# 示例输入\narr = [3,6,8,1"}, 8: {"prompt": "I believe the meaning of life is", "answer": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nI " "believe the meaning of life is<|im_end|>\n<|im_start|>assistant\nThe meaning of " "life is a philosophical question that has been debated for centuries, and there " "is no one definitive answer to it. Some people believe that the meaning of life " "is to find happiness and fulfillment in their lives, while others believe that it " "is to achieve success or recognition.\n\nOthers may argue that the meaning of life " "is to make a positive impact on the world, to help others, and to contribute to " "society as a whole. Others may believe that the meaning of life is to pursue " "knowledge and understanding, to"}, } for batch_size, batch_data in batch_datas.items(): messages = [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": batch_data["prompt"]} ] input_ids = tokenizer.encode(tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True )) input_ids_list = [] answer = batch_data["answer"] for i in range(0, batch_size): input_ids_list.append(input_ids) outputs = network.generate(input_ids_list, max_length=seq_length, do_sample=False, return_dict_in_generate=False) for i in range(0, len(outputs)): output_text = tokenizer.decode(outputs[i]) print("parallel_qwen2_0_5b_predict_dp2_mp2, output_text:", output_text) print("parallel_qwen2_0_5b_predict_dp2_mp2, answer:", answer) compare_distance(output_text, answer, bench_sim=0.95) def parallel_deepseek_r1_bf16_predict_mp2(): """test deepseek r1 bf16 predict in model_parallel=2 with dynamic shape""" ms.runtime.set_kernel_launch_group() cur_dir = os.path.dirname(os.path.realpath(__file__)) config_path = os.path.join(cur_dir, "deepseek/configs/ci_predict_deepseek3_671b.yaml") vocab_file_path = "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16/tokenizer.json" tokenizer_file_path = "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16/tokenizer.json" load_checkpoint = "/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16/" seq_length = 128 # init config with yaml config = MindFormerConfig(config_path) config.use_parallel = True config.parallel_config.model_parallel = 2 config.parallel_config.data_parallel = 1 config.parallel_config.pipeline_stage = 1 config.load_checkpoint = load_checkpoint config.model.model_config.seq_length = seq_length config.processor.tokenizer.vocab_file = vocab_file_path config.processor.tokenizer.tokenizer_file = tokenizer_file_path # init context build_context(config) build_parallel_config(config) config.model.model_config.parallel_config = config.parallel_config config.model.model_config.moe_config = config.moe_config model_config = DeepseekV3Config(**config.model.model_config) model_config.checkpoint_name_or_path = None # build tokenizer tokenizer = LlamaTokenizerFast(config.processor.tokenizer.vocab_file, config.processor.tokenizer.tokenizer_file, 
unk_token=config.processor.tokenizer.unk_token,
                                    bos_token=config.processor.tokenizer.bos_token,
                                    eos_token=config.processor.tokenizer.eos_token,
                                    fast_tokenizer=True, trust_remote_code=True)

    # build model
    with no_init_parameters():
        network = InferenceDeepseekV3ForCausalLM(model_config)

    # load checkpoint
    if config.load_checkpoint:
        logger.info("----------------Transform and load checkpoint----------------")
        weight_processor = DeepseekV3WeightProcessor(config, network, False)
        weight_processor.load_safetensors_shard(config.load_checkpoint)

    # predict
    batch_datas = {
        4: {"prompt": "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n",
            "answer": ["ugs611ాలు sic辨hara的开璞 SquaresInsp"]}
    }
    for batch_size, batch_data in batch_datas.items():
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": batch_data["prompt"]}
        ]
        input_ids = tokenizer.encode(tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        ))
        input_ids_list = []
        for _ in range(0, batch_size):
            input_ids_list.append(input_ids)

        outputs = network.generate(input_ids_list,
                                   max_length=seq_length,
                                   do_sample=False,
                                   return_dict_in_generate=False)
        assert np.array(outputs).shape == (4, 128)


TEST_MAP = {
    'parallel_qwen2_0_5b_predict_mp2': parallel_qwen2_0_5b_predict_mp2,
    'parallel_qwen2_0_5b_predict_dp2_mp2': parallel_qwen2_0_5b_predict_dp2_mp2,
    'parallel_deepseek_r1_bf16_predict_mp2': parallel_deepseek_r1_bf16_predict_mp2,
}

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, help='test mode of parallel predict.')
    args = parser.parse_args()

    TEST_MAP[args.mode]()
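Note: run_parallel.py is not collected by pytest itself; the companion test_parallel_infer.py spawns one msrun process group per TEST_MAP mode. A representative launch, reconstructed from that test (device list, LCAL_COMM_ID and port are the CI values and may need adapting):

export ASCEND_RT_VISIBLE_DEVICES=0,1 && export LCAL_COMM_ID=127.0.0.1:10068 && msrun --worker_num=2 --local_worker_num=2 --master_port=8222 --log_dir=parallel_qwen2_0_5b_predict_mp2 --join=True run_parallel.py --mode parallel_qwen2_0_5b_predict_mp2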
large_models/qwen/0000755000175100017500000000000015040315702014656 5ustar jenkinsHwHiAiUserlarge_models/qwen/configs/0000755000175100017500000000000015040315702016306 5ustar jenkinsHwHiAiUserlarge_models/qwen/configs/ci_predict_qwen2_0_5b_instruct.yaml0000644000175100017500000000646515040315702025164 0ustar jenkinsHwHiAiUserseed: 0
output_dir: './output' # path to save checkpoint/strategy
load_checkpoint: ''
src_strategy_path_or_dir: ''
auto_trans_ckpt: True  # If true, auto transform load_checkpoint to load in distributed model
only_save_strategy: False
resume_training: False
use_parallel: False
run_mode: 'predict'

# trainer config
trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'qwen2_0_5b'

# runner config
runner_config:
  epochs: 5
  batch_size: 1
  sink_mode: True
  sink_size: 2

# default parallel of device num = 8 for Atlas 800T A2
parallel_config:
  data_parallel: 1
  model_parallel: 1
  pipeline_stage: 1
  micro_batch_num: 1
  vocab_emb_dp: False
  gradient_aggregation_group: 4
# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process.
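# (kept at 1 in this CI predict config since model_parallel above is 1)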
micro_batch_interleave_num: 1 model: model_config: type: LlamaConfig batch_size: 1 seq_length: 8192 hidden_size: 896 num_layers: 24 num_heads: 14 n_kv_heads: 2 vocab_size: 151936 intermediate_size: 4864 max_position_embeddings: 32768 qkv_has_bias: True rms_norm_eps: 1.0e-6 theta: 1000000.0 emb_dropout_prob: 0.0 eos_token_id: [151643,151645] pad_token_id: 151643 bos_token_id: 151643 compute_dtype: "bfloat16" layernorm_compute_type: "float32" softmax_compute_type: "float32" rotary_dtype: "bfloat16" param_init_type: "bfloat16" use_past: True use_flash_attention: True block_size: 32 num_blocks: 1024 use_past_shard: False offset: 0 checkpoint_name_or_path: "" repetition_penalty: 1.1 max_decode_length: 512 temperature: 0.7 top_k: 20 top_p: 0.8 do_sample: True is_dynamic: True qkv_concat: True tie_word_embeddings: True auto_map: AutoTokenizer: [qwen2_tokenizer.Qwen2Tokenizer, null] arch: type: LlamaForCausalLM processor: return_tensors: ms tokenizer: model_max_length: 32768 vocab_file: "/path/vocab.json" merges_file: "/path/merges.txt" unk_token: "<|endoftext|>" pad_token: "<|endoftext|>" eos_token: "<|im_end|>" chat_template: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" type: Qwen2Tokenizer auto_register: qwen2_tokenizer.Qwen2Tokenizer type: Qwen2Processor # mindspore context init config context: mode: 0 #0--Graph Mode; 1--Pynative Mode device_target: "Ascend" enable_graph_kernel: False ascend_config: precision_mode: "must_keep_origin_dtype" max_call_depth: 10000 max_device_memory: "59GB" save_graphs: False save_graphs_path: "./graph" device_id: 0 # parallel context config parallel: parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel gradients_mean: False enable_alltoall: False full_batch: True search_mode: "sharding_propagation" enable_parallel_optimizer: False strategy_ckpt_config: save_file: "./ckpt_strategy.ckpt" only_trainable_params: False parallel_optimizer_config: gradient_accumulation_shard: False parallel_optimizer_threshold: 64 large_models/.pytest_cache/0000755000175100017500000000000015040321273016435 5ustar jenkinsHwHiAiUserlarge_models/.pytest_cache/README.md0000644000175100017500000000044715040321273017721 0ustar jenkinsHwHiAiUser# pytest cache directory # This directory contains data from the pytest's cache plugin, which provides the `--lf` and `--ff` options, as well as the `cache` fixture. **Do not** commit this to version control. See [the docs](https://docs.pytest.org/en/stable/cache.html) for more information. large_models/.pytest_cache/CACHEDIR.TAG0000644000175100017500000000030215040321273020127 0ustar jenkinsHwHiAiUserSignature: 8a477f597d28d172789f06886806bc55 # This file is a cache directory tag created by pytest. 
# For information about cache directory tags, see: # http://www.bford.info/cachedir/spec.html large_models/.pytest_cache/v/0000755000175100017500000000000015040321273016702 5ustar jenkinsHwHiAiUserlarge_models/.pytest_cache/v/cache/0000755000175100017500000000000015040321273017745 5ustar jenkinsHwHiAiUserlarge_models/.pytest_cache/v/cache/lastfailed0000644000175100017500000000011015040321273021770 0ustar jenkinsHwHiAiUser{ "test_parallel_infer.py::TestInferParallel::test_base_cases": true }large_models/.pytest_cache/v/cache/nodeids0000644000175100017500000000010215040321273021306 0ustar jenkinsHwHiAiUser[ "test_parallel_infer.py::TestInferParallel::test_base_cases" ]large_models/.pytest_cache/v/cache/stepwise0000644000175100017500000000000215040321273021523 0ustar jenkinsHwHiAiUser[]large_models/.pytest_cache/.gitignore0000644000175100017500000000004515040321273020424 0ustar jenkinsHwHiAiUser# Created by pytest automatically. * large_models/parallel_qwen2_0_5b_predict_dp2_mp2/0000700000175100017500000000000015040321111022433 5ustar jenkinsHwHiAiUserlarge_models/parallel_qwen2_0_5b_predict_dp2_mp2/worker_1.log0000644000175100017500000040203115040321202024702 0ustar jenkinsHwHiAiUser/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) 2025-07-24 11:02:48,505 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config moe_config is empty. 2025-07-24 11:02:48,539 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:48.540.571 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:48.541.360 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead. [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:48.541.822 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead. [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:48.541.939 [mindspore/context.py:1655] For 'context.set_context', 'enable_graph_kernel' parameter is deprecated, and will be removed in the next version. Please use jit_config={'jit_level': 'O1'} instead. 
[WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:48.542.071 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:48.542.217 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:48.542.335 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. 2025-07-24 11:02:48,542 - mindformers./output/log[mindformers/core/context/parallel.py:73] - INFO - full_batch is set to False for non-parallel modes [WARNING] DISTRIBUTED(3806915,ffff1f7fe060,python):2025-07-24-11:02:48.544.594 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:59690 to 127.0.0.1:8240 is successfully created. System errno: Success [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:48.544.593 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 20 source: 127.0.0.1:59690, destination: 127.0.0.1:8240 [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:48.544.739 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:59692, destination: 127.0.0.1:8240 [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:48.544.774 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8240 to be connected...Retry number: 1 [WARNING] DISTRIBUTED(3806915,ffff24fde060,python):2025-07-24-11:02:48.544.772 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:59692 to 127.0.0.1:8240 is successfully created. System errno: Success [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:49.045.193 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200). [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:49.545.280 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200). [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:50.045.371 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. 
[WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:50.045.395 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 1 rank id: 1 [MS_ALLOC_CONF]Runtime config: enable_vmm:False [WARNING] DEVICE(3806915,ffffb3989f30,python):2025-07-24-11:02:50.268.632 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:155] Initialize] Reserved memory size for other components(2101346304) is less than recommend size(4068122112), It may lead to Out Of Memory in HCCL or other components, Please double check context key 'variable_memory_max_size'/'max_device_memory' [WARNING] DEVICE(3806915,ffffb3989f30,python):2025-07-24-11:02:51.558.446 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:84] Initialize] Loading LCCL because env MS_ENABLE_LCCL is set to on. Pay attention that LCCL only supports communication group within single node in KernelByKernel for now. [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:51.563.743 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:51.563.943 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(3806915,fffe8f7fe060,python):2025-07-24-11:02:51.564.132 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:8240, node_rank:2130706433, total_rank_size:4, local_rank_size4 [WARNING] HCCL_ADPT(3806915,fffe8f7fe060,python):2025-07-24-11:02:51.564.214 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(3806915,fffe8f7fe060,python):2025-07-24-11:02:51.564.236 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:632] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. 
[WARNING] DEVICE(3806915,fffe8f7fe060,python):2025-07-24-11:02:51.564.252 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(3806915,fffe8f7fe060,python):2025-07-24-11:02:51.564.651 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group 2025-07-24 11:02:51,565 - mindformers./output/log[mindformers/tools/utils.py:185] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_1.ckpt' 2025-07-24 11:02:51,567 - mindformers./output/log[mindformers/core/context/build_context.py:383] - INFO - cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191] 2025-07-24 11:02:51,567 - mindformers./output/log[mindformers/core/context/build_context.py:387] - WARNING - CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores 2025-07-24 11:02:51,568 - mindformers./output/log[mindformers/core/context/build_context.py:395] - INFO - cpu_affinity, rank_id: 1, device_num: 4 2025-07-24 11:02:51,569 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 
'topk_group': None, 'n_group': None, 'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} 2025-07-24 11:02:51,570 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} [WARNING] DEVICE(3806915,fffe8effd060,python):2025-07-24-11:02:52.004.654 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 tp_group is:True dp_group is:True 2025-07-24 11:02:52,079 - mindformers./output/log[mindformers/parallel_core/inference/parallel_state.py:358] - INFO - expert_model_parallel_size(1) is not equal to world_size(4), so we will use 4 as the MOE_tensor_parallel_size. [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:52.083.074 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: dp-1-3 [const vector]{1, 3}, async: 0, submit_now: 1 [WARNING] DEVICE(3806915,fffe8effd060,python):2025-07-24-11:02:52.550.084 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group [WARNING] DISTRIBUTED(3806915,fffe8effd060,python):2025-07-24-11:02:52.550.242 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: hccl_world_group [WARNING] DISTRIBUTED(3806915,fffe8effd060,python):2025-07-24-11:02:52.550.279 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: hccl_world_group end. 
[WARNING] DISTRIBUTED(3806915,fffe8f7fe060,python):2025-07-24-11:02:52.550.409 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: hccl_world_group [WARNING] DISTRIBUTED(3806915,fffe8f7fe060,python):2025-07-24-11:02:52.554.055 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: dp-1-3 [WARNING] DEVICE(3806915,fffe6d0bf060,python):2025-07-24-11:02:52.575.950 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for dp-1-3, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806915,fffe6d0bf060,python):2025-07-24-11:02:52.635.238 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for dp-1-3 [WARNING] DISTRIBUTED(3806915,fffe6d0bf060,python):2025-07-24-11:02:52.635.361 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: dp-1-3 [WARNING] DISTRIBUTED(3806915,fffe6d0bf060,python):2025-07-24-11:02:52.635.387 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: dp-1-3 end. [WARNING] DISTRIBUTED(3806915,fffe8f7fe060,python):2025-07-24-11:02:52.635.452 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: dp-1-3 [WARNING] DISTRIBUTED(3806915,ffffb3989f30,python):2025-07-24-11:02:52.635.712 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: tp-0-1 [const vector]{0, 1}, async: 0, submit_now: 1 [WARNING] DISTRIBUTED(3806915,fffe8f7fe060,python):2025-07-24-11:02:52.636.155 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: tp-0-1 [WARNING] DEVICE(3806915,fffe6d0bf060,python):2025-07-24-11:02:52.647.753 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806915,fffe6d0bf060,python):2025-07-24-11:02:52.702.062 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1 [WARNING] DISTRIBUTED(3806915,fffe6d0bf060,python):2025-07-24-11:02:52.702.426 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: tp-0-1 [WARNING] DISTRIBUTED(3806915,fffe6d0bf060,python):2025-07-24-11:02:52.702.455 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: tp-0-1 end. 
[WARNING] DISTRIBUTED(3806915,fffe8f7fe060,python):2025-07-24-11:02:52.702.512 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: tp-0-1 [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:52.718.889 [mindspore/ops/primitive.py:220] The in_strategy/in_layout of the operator in your network will not take effect in stand_alone mode. This means the the shard function called in the network is ignored. If you want to enable it, please use semi auto or auto parallel mode by context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL or context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL) data_parallel_group:dp-1-3 tensor_model_parallel_group:tp-0-1 2025-07-24 11:02:53,418 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-24 11:02:53,419 - mindformers./output/log[/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:178] - INFO - ----------------Transform and load checkpoint---------------- [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.577.098 [mindspore/train/serialization.py:333] The type of model.layers.0.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.578.205 [mindspore/train/serialization.py:333] The type of model.layers.0.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.578.804 [mindspore/train/serialization.py:333] The type of model.layers.1.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.579.524 [mindspore/train/serialization.py:333] The type of model.layers.1.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.580.072 [mindspore/train/serialization.py:333] The type of model.layers.2.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.580.806 [mindspore/train/serialization.py:333] The type of model.layers.2.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.581.407 [mindspore/train/serialization.py:333] The type of model.layers.3.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.582.113 [mindspore/train/serialization.py:333] The type of model.layers.3.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.582.620 [mindspore/train/serialization.py:333] The type of model.layers.4.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.583.324 [mindspore/train/serialization.py:333] The type of model.layers.4.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.583.851 [mindspore/train/serialization.py:333] The type of model.layers.5.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.584.563 [mindspore/train/serialization.py:333] The type of model.layers.5.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.585.058 [mindspore/train/serialization.py:333] The type of model.layers.6.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.585.790 [mindspore/train/serialization.py:333] The type of model.layers.6.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.586.294 [mindspore/train/serialization.py:333] The type of model.layers.7.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.586.964 [mindspore/train/serialization.py:333] The type of model.layers.7.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.587.529 [mindspore/train/serialization.py:333] The type of model.layers.8.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.588.185 [mindspore/train/serialization.py:333] The type of model.layers.8.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.588.727 [mindspore/train/serialization.py:333] The type of model.layers.9.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.589.396 [mindspore/train/serialization.py:333] The type of model.layers.9.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.589.906 [mindspore/train/serialization.py:333] The type of model.layers.10.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.590.566 [mindspore/train/serialization.py:333] The type of model.layers.10.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.591.131 [mindspore/train/serialization.py:333] The type of model.layers.11.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.591.792 [mindspore/train/serialization.py:333] The type of model.layers.11.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.592.284 [mindspore/train/serialization.py:333] The type of model.layers.12.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.592.953 [mindspore/train/serialization.py:333] The type of model.layers.12.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.593.500 [mindspore/train/serialization.py:333] The type of model.layers.13.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.594.172 [mindspore/train/serialization.py:333] The type of model.layers.13.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.594.728 [mindspore/train/serialization.py:333] The type of model.layers.14.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.595.390 [mindspore/train/serialization.py:333] The type of model.layers.14.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.595.935 [mindspore/train/serialization.py:333] The type of model.layers.15.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.596.646 [mindspore/train/serialization.py:333] The type of model.layers.15.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.597.170 [mindspore/train/serialization.py:333] The type of model.layers.16.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.597.895 [mindspore/train/serialization.py:333] The type of model.layers.16.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.598.427 [mindspore/train/serialization.py:333] The type of model.layers.17.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.599.086 [mindspore/train/serialization.py:333] The type of model.layers.17.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.599.575 [mindspore/train/serialization.py:333] The type of model.layers.18.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.600.240 [mindspore/train/serialization.py:333] The type of model.layers.18.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.600.780 [mindspore/train/serialization.py:333] The type of model.layers.19.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.601.493 [mindspore/train/serialization.py:333] The type of model.layers.19.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.602.010 [mindspore/train/serialization.py:333] The type of model.layers.20.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.602.662 [mindspore/train/serialization.py:333] The type of model.layers.20.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.603.174 [mindspore/train/serialization.py:333] The type of model.layers.21.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.603.884 [mindspore/train/serialization.py:333] The type of model.layers.21.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.604.458 [mindspore/train/serialization.py:333] The type of model.layers.22.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.605.119 [mindspore/train/serialization.py:333] The type of model.layers.22.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.605.611 [mindspore/train/serialization.py:333] The type of model.layers.23.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.606.278 [mindspore/train/serialization.py:333] The type of model.layers.23.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.606.769 [mindspore/train/serialization.py:333] The type of model.norm_out.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.609.970 [mindspore/train/serialization.py:1789] For 'load_param_into_net', 48 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint. [WARNING] ME(3806915:281473694867248,MainProcess):2025-07-24-11:02:54.610.163 [mindspore/train/serialization.py:1793] ['model.layers.0.attention.paged_attention_mgr.key_cache', 'model.layers.0.attention.paged_attention_mgr.value_cache', 'model.layers.1.attention.paged_attention_mgr.key_cache', 'model.layers.1.attention.paged_attention_mgr.value_cache', 'model.layers.2.attention.paged_attention_mgr.key_cache', 'model.layers.2.attention.paged_attention_mgr.value_cache', 'model.layers.3.attention.paged_attention_mgr.key_cache', 'model.layers.3.attention.paged_attention_mgr.value_cache', 'model.layers.4.attention.paged_attention_mgr.key_cache', 'model.layers.4.attention.paged_attention_mgr.value_cache', 'model.layers.5.attention.paged_attention_mgr.key_cache', 'model.layers.5.attention.paged_attention_mgr.value_cache', 'model.layers.6.attention.paged_attention_mgr.key_cache', 'model.layers.6.attention.paged_attention_mgr.value_cache', 'model.layers.7.attention.paged_attention_mgr.key_cache', 'model.layers.7.attention.paged_attention_mgr.value_cache', 'model.layers.8.attention.paged_attention_mgr.key_cache', 'model.layers.8.attention.paged_attention_mgr.value_cache', 'model.layers.9.attention.paged_attention_mgr.key_cache', 'model.layers.9.attention.paged_attention_mgr.value_cache', 'model.layers.10.attention.paged_attention_mgr.key_cache', 'model.layers.10.attention.paged_attention_mgr.value_cache', 'model.layers.11.attention.paged_attention_mgr.key_cache', 'model.layers.11.attention.paged_attention_mgr.value_cache', 'model.layers.12.attention.paged_attention_mgr.key_cache', 'model.layers.12.attention.paged_attention_mgr.value_cache', 'model.layers.13.attention.paged_attention_mgr.key_cache', 'model.layers.13.attention.paged_attention_mgr.value_cache', 'model.layers.14.attention.paged_attention_mgr.key_cache', 'model.layers.14.attention.paged_attention_mgr.value_cache', 'model.layers.15.attention.paged_attention_mgr.key_cache', 'model.layers.15.attention.paged_attention_mgr.value_cache', 'model.layers.16.attention.paged_attention_mgr.key_cache', 'model.layers.16.attention.paged_attention_mgr.value_cache', 
'model.layers.17.attention.paged_attention_mgr.key_cache', 'model.layers.17.attention.paged_attention_mgr.value_cache', 'model.layers.18.attention.paged_attention_mgr.key_cache', 'model.layers.18.attention.paged_attention_mgr.value_cache', 'model.layers.19.attention.paged_attention_mgr.key_cache', 'model.layers.19.attention.paged_attention_mgr.value_cache', 'model.layers.20.attention.paged_attention_mgr.key_cache', 'model.layers.20.attention.paged_attention_mgr.value_cache', 'model.layers.21.attention.paged_attention_mgr.key_cache', 'model.layers.21.attention.paged_attention_mgr.value_cache', 'model.layers.22.attention.paged_attention_mgr.key_cache', 'model.layers.22.attention.paged_attention_mgr.value_cache', 'model.layers.23.attention.paged_attention_mgr.key_cache', 'model.layers.23.attention.paged_attention_mgr.value_cache'] are not loaded. param_not_load: ['model.layers.0.attention.paged_attention_mgr.key_cache', 'model.layers.0.attention.paged_attention_mgr.value_cache', 'model.layers.1.attention.paged_attention_mgr.key_cache', 'model.layers.1.attention.paged_attention_mgr.value_cache', 'model.layers.2.attention.paged_attention_mgr.key_cache', 'model.layers.2.attention.paged_attention_mgr.value_cache', 'model.layers.3.attention.paged_attention_mgr.key_cache', 'model.layers.3.attention.paged_attention_mgr.value_cache', 'model.layers.4.attention.paged_attention_mgr.key_cache', 'model.layers.4.attention.paged_attention_mgr.value_cache', 'model.layers.5.attention.paged_attention_mgr.key_cache', 'model.layers.5.attention.paged_attention_mgr.value_cache', 'model.layers.6.attention.paged_attention_mgr.key_cache', 'model.layers.6.attention.paged_attention_mgr.value_cache', 'model.layers.7.attention.paged_attention_mgr.key_cache', 'model.layers.7.attention.paged_attention_mgr.value_cache', 'model.layers.8.attention.paged_attention_mgr.key_cache', 'model.layers.8.attention.paged_attention_mgr.value_cache', 'model.layers.9.attention.paged_attention_mgr.key_cache', 'model.layers.9.attention.paged_attention_mgr.value_cache', 'model.layers.10.attention.paged_attention_mgr.key_cache', 'model.layers.10.attention.paged_attention_mgr.value_cache', 'model.layers.11.attention.paged_attention_mgr.key_cache', 'model.layers.11.attention.paged_attention_mgr.value_cache', 'model.layers.12.attention.paged_attention_mgr.key_cache', 'model.layers.12.attention.paged_attention_mgr.value_cache', 'model.layers.13.attention.paged_attention_mgr.key_cache', 'model.layers.13.attention.paged_attention_mgr.value_cache', 'model.layers.14.attention.paged_attention_mgr.key_cache', 'model.layers.14.attention.paged_attention_mgr.value_cache', 'model.layers.15.attention.paged_attention_mgr.key_cache', 'model.layers.15.attention.paged_attention_mgr.value_cache', 'model.layers.16.attention.paged_attention_mgr.key_cache', 'model.layers.16.attention.paged_attention_mgr.value_cache', 'model.layers.17.attention.paged_attention_mgr.key_cache', 'model.layers.17.attention.paged_attention_mgr.value_cache', 'model.layers.18.attention.paged_attention_mgr.key_cache', 'model.layers.18.attention.paged_attention_mgr.value_cache', 'model.layers.19.attention.paged_attention_mgr.key_cache', 'model.layers.19.attention.paged_attention_mgr.value_cache', 'model.layers.20.attention.paged_attention_mgr.key_cache', 'model.layers.20.attention.paged_attention_mgr.value_cache', 'model.layers.21.attention.paged_attention_mgr.key_cache', 'model.layers.21.attention.paged_attention_mgr.value_cache', 'model.layers.22.attention.paged_attention_mgr.key_cache', 
'model.layers.22.attention.paged_attention_mgr.value_cache', 'model.layers.23.attention.paged_attention_mgr.key_cache', 'model.layers.23.attention.paged_attention_mgr.value_cache'], ckpt_not_load: [] 2025-07-24 11:02:55,352 - mindformers./output/log[mindformers/generation/text_generator.py:726] - WARNING - batch size {batch} can not be divisible by data_parallel {data_parallel}, and would not split. 2025-07-24 11:02:55,354 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:02:55,355 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:02:55,356 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:02:55,357 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. TotalTime = 14.8424, [24] [bootstrap]: 0.0124656 [type_inference]: 9.28456 [auto_monad]: 0.142586 [graph_reusing]: 0.00626681 [inline]: 1.206e-05 [add_attr]: 0.128321, [1] [add_attr_with_inline]: 0.128265, [1] [Cycle 1]: 0.0516695, [2] [tag_attr]: 0.0346878 [meta_addattr_fg_expand]: 0.0167631 [parallel-infer-symbol]: 6.51996e-06 [pre_auto_parallel]: 0.0425494 [insert-virtual-dataset]: 1.308e-05 [parallel-infer-symbol-second]: 3.76999e-06 [dataset_repeat_opt]: 3.85998e-06 [pipeline_split]: 2.20095e-06 [optimize]: 3.72622, [53] [py_interpret_to_execute]: 0.0435477 [rewriter_before_opt_a]: 0.149485 [opt_a]: 3.24663, [3] [Cycle 1]: 2.5919, [45] [expand_dump_flag]: 0.00225024 [switch_simplify]: 0.052573 [loop_unroll]: 0.0368141 [a_1]: 1.23054 [invalid_dout_check]: 0.00763234 [recompute_prepare]: 0.00720342 [updatestate_depend_eliminate]: 0.0284419 [updatestate_assign_eliminate]: 0.00494715 [updatestate_loads_eliminate]: 0.0133085 [parameter_eliminate]: 1.11e-05 [a_2]: 0.100832 [accelerated_algorithm]: 0.0069519 [shard]: 8.37003e-06 [meta_shard_fg_expand]: 0.00303763 [shard_inline]: 0.00323994 [merge_send_recv]: 0.00305102 [auto_parallel]: 0.0028944 [parallel]: 1.6461e-05 [flash_sp]: 0.00165641 [merge_comm]: 0.00290961 [allreduce_fusion]: 0.00285057 [matmul_add_comm_reduction]: 0.00372084 [allreduce_slice_to_reducescatter]: 1.83994e-06 [virtual_shard_identity]: 0.00313966 [virtual_dataset]: 0.00305274 [get_grad_eliminate_]: 0.00310936 [virtual_output]: 0.00305019 [merge_forward]: 0.00302402 [offload_activation]: 0.00389596 [cell_reuse_recompute_pass]: 5.18095e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00555129 [merge_recompute_call_nodes]: 3.84008e-06 [before_grad]: 0.00530586 [set_forward_comm_id_for_comm_node_pass]: 0.0032352 [meta_fg_expand]: 0.00691886 [flash_sp_send_recv_attached]: 1.067e-05 [receive_attached]: 3.42994e-06 [after_resolve]: 0.00376386 [a_after_grad]: 0.00485509 [renormalize]: 0.959256 [add_forward_monad_depend]: 2.456e-05 [auto_monad_grad]: 3.98001e-06 [auto_monad_eliminator]: 0.0292441 [cse]: 
0.0177348 [a_3]: 0.0205787 [Cycle 2]: 0.423703, [45] [expand_dump_flag]: 3.88001e-06 [switch_simplify]: 0.00275925 [loop_unroll]: 0.00275867 [a_1]: 0.0702839 [invalid_dout_check]: 0.00278241 [recompute_prepare]: 0.00264968 [updatestate_depend_eliminate]: 0.00259212 [updatestate_assign_eliminate]: 0.00258808 [updatestate_loads_eliminate]: 0.00257786 [parameter_eliminate]: 7.89994e-06 [a_2]: 0.0455822 [accelerated_algorithm]: 0.00359192 [shard]: 4.6799e-06 [meta_shard_fg_expand]: 0.00152727 [shard_inline]: 0.00285753 [merge_send_recv]: 0.0028541 [auto_parallel]: 0.0027149 [parallel]: 1.364e-05 [flash_sp]: 5.89993e-06 [merge_comm]: 0.00267616 [allreduce_fusion]: 0.00262827 [matmul_add_comm_reduction]: 0.00333085 [allreduce_slice_to_reducescatter]: 1.86998e-06 [virtual_shard_identity]: 0.00286445 [virtual_dataset]: 0.00280161 [get_grad_eliminate_]: 0.00279408 [virtual_output]: 0.0027643 [merge_forward]: 0.00278799 [offload_activation]: 0.00341054 [cell_reuse_recompute_pass]: 5.03997e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00491136 [merge_recompute_call_nodes]: 5.01995e-06 [before_grad]: 0.00474815 [set_forward_comm_id_for_comm_node_pass]: 0.00291606 [meta_fg_expand]: 0.00355729 [flash_sp_send_recv_attached]: 4.22995e-06 [receive_attached]: 4.50003e-06 [after_resolve]: 0.0033455 [a_after_grad]: 0.0043968 [renormalize]: 0.185262 [add_forward_monad_depend]: 1.778e-05 [auto_monad_grad]: 4.40003e-06 [auto_monad_eliminator]: 0.00540465 [cse]: 0.0133462 [a_3]: 0.0206577 [Cycle 3]: 0.230989, [45] [expand_dump_flag]: 5.54009e-06 [switch_simplify]: 0.00276275 [loop_unroll]: 0.00276805 [a_1]: 0.0721771 [invalid_dout_check]: 0.00260894 [recompute_prepare]: 0.00271329 [updatestate_depend_eliminate]: 0.00244965 [updatestate_assign_eliminate]: 0.00236961 [updatestate_loads_eliminate]: 0.00231073 [parameter_eliminate]: 9.69099e-06 [a_2]: 0.0440067 [accelerated_algorithm]: 0.0034881 [shard]: 4.82006e-06 [meta_shard_fg_expand]: 0.00142623 [shard_inline]: 0.00279102 [merge_send_recv]: 0.00271947 [auto_parallel]: 0.00255049 [parallel]: 1.458e-05 [flash_sp]: 3.61993e-06 [merge_comm]: 0.00251458 [allreduce_fusion]: 0.0024897 [matmul_add_comm_reduction]: 0.00323496 [allreduce_slice_to_reducescatter]: 1.55997e-06 [virtual_shard_identity]: 0.00280046 [virtual_dataset]: 0.00272041 [get_grad_eliminate_]: 0.00275331 [virtual_output]: 0.00269816 [merge_forward]: 0.0024947 [offload_activation]: 0.00328451 [cell_reuse_recompute_pass]: 5.24998e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00491696 [merge_recompute_call_nodes]: 4.21004e-06 [before_grad]: 0.00476487 [set_forward_comm_id_for_comm_node_pass]: 0.00300481 [meta_fg_expand]: 0.00342464 [flash_sp_send_recv_attached]: 3.94997e-06 [receive_attached]: 3.69002e-06 [after_resolve]: 0.00334245 [a_after_grad]: 0.00428787 [renormalize]: 4.29922e-07 [add_forward_monad_depend]: 1.09599e-05 [auto_monad_grad]: 3.88001e-06 [auto_monad_eliminator]: 0.00468292 [cse]: 0.00892469 [a_3]: 0.0205689 [py_interpret_to_execute_after_opt_a]: 0.00437708 [slice_cell_reuse_recomputed_activation]: 5.19003e-06 [rewriter_after_opt_a]: 0.0382039 [convert_after_rewriter]: 0.0030176 [order_py_execute_after_rewriter]: 0.00221238 [opt_b]: 0.0915321, [1] [Cycle 1]: 0.091516, [7] [b_1]: 0.0717381 [b_2]: 0.0028071 [updatestate_depend_eliminate]: 0.00268633 [updatestate_assign_eliminate]: 0.00254298 [updatestate_loads_eliminate]: 0.00251238 [renormalize]: 1.77009e-06 [cse]: 0.00904201 [optimize_parallel_all_gather_comm]: 0.00476067 [overlap_param_gather]: 1.73009e-05 [cconv]: 0.00142767 
[loop_unroll]: 0.00384953 [opt_after_cconv]: 0.0307627, [1] [Cycle 1]: 0.0307471, [7] [c_1]: 0.0134531 [parameter_eliminate]: 9.54e-06 [updatestate_depend_eliminate]: 0.00330037 [updatestate_assign_eliminate]: 0.00253679 [updatestate_loads_eliminate]: 0.00250536 [cse]: 0.00877746 [renormalize]: 2.27999e-06 [remove_dup_value]: 0.0158421 [tuple_transform]: 0.0188374, [1] [Cycle 1]: 0.0188198, [2] [d_1]: 0.018768 [renormalize]: 6.6997e-07 [partial_unused_args_eliminate]: 9.82999e-06 [add_cache_embedding]: 0.00296544 [add_recomputation]: 0.016909 [cse_after_recomputation]: 0.00508988, [1] [Cycle 1]: 0.00507195, [1] [cse]: 0.00503965 [environ_conv]: 0.00177976 [swap_dp_allreduce_reducescatter]: 0.002705 [bias_add_comm_swap]: 5.86105e-06 [label_micro_interleaved_index]: 1.259e-05 [label_fine_grained_interleaved_index]: 3.39001e-06 [merge_cast_opt]: 1.71002e-06 [slice_recompute_activation]: 2.11003e-06 [micro_interleaved_order_control]: 3.32005e-06 [assign_add_opt]: 2.035e-05 [ForceFp32Comm]: 1.01991e-06 [remove_cast_before_assign_add]: 1.40001e-06 [full_micro_interleaved_order_control]: 2.68e-06 [reorder_send_recv_between_fp_bp]: 2.46998e-06 [comm_op_add_attrs]: 1.07998e-06 [add_comm_op_reuse_tag]: 9.59961e-07 [interleave_split_concat_branches]: 1.37009e-06 [interleave_parallel_branches]: 1.12003e-06 [overlap_opt_shard_in_pipeline]: 5.7291e-05 [overlap_opt_shard_grad_in_pipeline]: 2.36998e-06 [control_data_broadcast_order]: 0.00454246 [grouped_pairwise_exchange_alltoall]: 1.5999e-06 [offloading_packed_experts]: 0.000997416 [overlap_recompute_and_grad_model_parallel]: 0.000990397 [overlap_grad_matmul_and_grad_allreduce]: 3.89002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.52003e-06 [overlap_recompute_comm]: 2.58e-06 [overlap_grad_ring_attention]: 0.000999557 [overlap_grad_flash_sp]: 0.00496228 [begin_end_overlap_inline]: 1.49e-06 [split_matmul_comm_elemetwise]: 3.12098e-06 [split_layernorm_comm]: 1.96998e-06 [handle_group_info]: 1.55997e-06 [symbol_engine_optimizer]: 0.0287836, [1] [Cycle 1]: 0.0287704, [6] [build]: 0.0134338 [elim_shapecalc]: 0.00257387 [elim_not_effective]: 0.00672967 [opt_reshape]: 0.00181444 [fold_const_symbol]: 0.00406441 [renormalize]: 1.41002e-06 [detach_backward]: 4.87105e-06 [pipeline_parallel_scheduler]: 1.87999e-06 [auto_monad_reorder]: 0.00432174 [get_jit_bprop_graph]: 2.64996e-06 [rewriter_after_jit_bprop_graph]: 1.002e-05 [opt_after_jit_grad]: 0.00657484 [distribtued_split]: 0.00475912 [validate]: 0.00356218 [backend_pass]: 3.53996e-06 [task_emit]: 1.47591 [execute]: 1.202e-05 Sums bootstrap : 0.012466s : 0.08% type_inference : 9.284557s : 62.92% auto_monad : 0.142586s : 0.97% graph_reusing : 0.006267s : 0.04% inline : 0.000012s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.034688s : 0.24% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.016763s : 0.11% parallel-infer-symbol : 0.000007s : 0.00% pre_auto_parallel : 0.042549s : 0.29% insert-virtual-dataset : 0.000013s : 0.00% parallel-infer-symbol-second : 0.000004s : 0.00% dataset_repeat_opt : 0.000004s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.043548s : 0.30% optimize.rewriter_before_opt_a : 0.149485s : 1.01% optimize.opt_a.expand_dump_flag : 0.002260s : 0.02% optimize.opt_a.switch_simplify : 0.058095s : 0.39% optimize.opt_a.loop_unroll : 0.042341s : 0.29% optimize.opt_a.a_1 : 1.372997s : 9.30% optimize.opt_a.invalid_dout_check : 0.013024s : 0.09% optimize.opt_a.recompute_prepare : 0.012566s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.033484s : 
0.23% optimize.opt_a.updatestate_assign_eliminate : 0.009905s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.018197s : 0.12% optimize.opt_a.parameter_eliminate : 0.000029s : 0.00% optimize.opt_a.a_2 : 0.190421s : 1.29% optimize.opt_a.accelerated_algorithm : 0.014032s : 0.10% optimize.opt_a.shard : 0.000018s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.005991s : 0.04% optimize.opt_a.shard_inline : 0.008888s : 0.06% optimize.opt_a.merge_send_recv : 0.008625s : 0.06% optimize.opt_a.auto_parallel : 0.008160s : 0.06% optimize.opt_a.parallel : 0.000045s : 0.00% optimize.opt_a.flash_sp : 0.001666s : 0.01% optimize.opt_a.merge_comm : 0.008100s : 0.05% optimize.opt_a.allreduce_fusion : 0.007969s : 0.05% optimize.opt_a.matmul_add_comm_reduction : 0.010287s : 0.07% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000005s : 0.00% optimize.opt_a.virtual_shard_identity : 0.008805s : 0.06% optimize.opt_a.virtual_dataset : 0.008575s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.008657s : 0.06% optimize.opt_a.virtual_output : 0.008513s : 0.06% optimize.opt_a.merge_forward : 0.008307s : 0.06% optimize.opt_a.offload_activation : 0.010591s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000015s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.015380s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000013s : 0.00% optimize.opt_a.before_grad : 0.014819s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.009156s : 0.06% optimize.opt_a.meta_fg_expand : 0.013901s : 0.09% optimize.opt_a.flash_sp_send_recv_attached : 0.000019s : 0.00% optimize.opt_a.receive_attached : 0.000012s : 0.00% optimize.opt_a.after_resolve : 0.010452s : 0.07% optimize.opt_a.a_after_grad : 0.013540s : 0.09% optimize.opt_a.renormalize : 1.144518s : 7.76% optimize.opt_a.add_forward_monad_depend : 0.000053s : 0.00% optimize.opt_a.auto_monad_grad : 0.000012s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.039332s : 0.27% optimize.opt_a.cse : 0.040006s : 0.27% optimize.opt_a.a_3 : 0.061805s : 0.42% optimize.py_interpret_to_execute_after_opt_a : 0.004377s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.038204s : 0.26% optimize.convert_after_rewriter : 0.003018s : 0.02% optimize.order_py_execute_after_rewriter : 0.002212s : 0.01% optimize.opt_b.b_1 : 0.071738s : 0.49% optimize.opt_b.b_2 : 0.002807s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.002686s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.002543s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.002512s : 0.02% optimize.opt_b.renormalize : 0.000002s : 0.00% optimize.opt_b.cse : 0.009042s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.004761s : 0.03% optimize.overlap_param_gather : 0.000017s : 0.00% optimize.cconv : 0.001428s : 0.01% optimize.loop_unroll : 0.003850s : 0.03% optimize.opt_after_cconv.c_1 : 0.013453s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.003300s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.002537s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.002505s : 0.02% optimize.opt_after_cconv.cse : 0.008777s : 0.06% optimize.opt_after_cconv.renormalize : 0.000002s : 0.00% optimize.remove_dup_value : 0.015842s : 0.11% optimize.tuple_transform.d_1 : 0.018768s : 0.13% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.partial_unused_args_eliminate : 0.000010s : 0.00% 
optimize.add_cache_embedding : 0.002965s : 0.02% optimize.add_recomputation : 0.016909s : 0.11% optimize.cse_after_recomputation.cse : 0.005040s : 0.03% optimize.environ_conv : 0.001780s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.002705s : 0.02% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000020s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000057s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.004542s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000997s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000990s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.001000s : 0.01% optimize.overlap_grad_flash_sp : 0.004962s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.013434s : 0.09% optimize.symbol_engine_optimizer.elim_shapecalc : 0.002574s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.006730s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.001814s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.004064s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.004322s : 0.03% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000010s : 0.00% opt_after_jit_grad : 0.006575s : 0.04% distribtued_split : 0.004759s : 0.03% validate : 0.003562s : 0.02% backend_pass : 0.000004s : 0.00% task_emit : 1.475906s : 10.00% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 
0.357715 67729 1.23% : 0.004411s : 1231: substitution.arithmetic_simplify 2.12% : 0.007574s : 1800: substitution.cast_eliminate 0.05% : 0.000190s : 124: substitution.depend_value_elim 0.30% : 0.001080s : 1449: substitution.elim_not_effective 0.32% : 0.001142s : 1234: substitution.float_tuple_getitem_switch 0.40% : 0.001416s : 1135: substitution.fold_const_symbol 0.40% : 0.001423s : 1963: substitution.graph_param_transform 77.66% : 0.277789s : 5675: substitution.inline 0.75% : 0.002691s : 5144: substitution.j_node_and_user_rematch 1.02% : 0.003661s : 1354: substitution.less_batch_normalization 0.41% : 0.001460s : 2708: substitution.load_eliminater 0.48% : 0.001734s : 1473: substitution.minmaximum_grad 0.04% : 0.000151s : 196: substitution.reduce_all_const_elim 0.78% : 0.002802s : 5144: substitution.remove_not_recompute_node 0.16% : 0.000555s : 1029: substitution.replace_old_param 1.08% : 0.003852s : 1449: substitution.reshape_eliminate 0.41% : 0.001454s : 596: substitution.switch_simplify 0.01% : 0.000034s : 6: substitution.transpose_eliminate 1.23% : 0.004413s : 1909: substitution.tuple_list_convert_item_index_to_positive 0.67% : 0.002413s : 2029: substitution.tuple_list_get_item_const_eliminator 1.18% : 0.004206s : 2029: substitution.tuple_list_get_item_depend_reorder 2.38% : 0.008517s : 3684: substitution.tuple_list_get_item_eliminator 0.89% : 0.003187s : 2029: substitution.tuple_list_get_set_item_eliminator 2.66% : 0.009529s : 11069: substitution.updatestate_pure_node_eliminater 3.34% : 0.011965s : 11269: substitution.updatestate_useless_node_eliminater 0.02% : 0.000064s : 1: substitution.value_based_eliminate ------[type_inference.] 9.247307 2 86.69% : 8.016483s : 1: type_inference.infer 13.31% : 1.230824s : 1: type_inference.specialize ------[replace.] 0.093144 8070 7.34% : 0.006835s : 676: replace.cast_eliminate 0.27% : 0.000248s : 24: replace.depend_value_elim 2.03% : 0.001888s : 169: replace.elim_not_effective 69.52% : 0.064755s : 5675: replace.inline 1.88% : 0.001755s : 170: replace.reshape_eliminate 8.77% : 0.008167s : 596: replace.switch_simplify 1.55% : 0.001446s : 120: replace.tuple_list_get_item_depend_reorder 8.62% : 0.008032s : 639: replace.tuple_list_get_item_eliminator 0.02% : 0.000018s : 1: replace.updatestate_pure_node_eliminater ------[match.] 0.282128 8070 1.14% : 0.003219s : 676: match.cast_eliminate 0.00% : 0.000013s : 24: match.depend_value_elim 0.09% : 0.000265s : 169: match.elim_not_effective 97.15% : 0.274079s : 5675: match.inline 0.21% : 0.000589s : 170: match.reshape_eliminate 0.40% : 0.001132s : 596: match.switch_simplify 0.32% : 0.000912s : 120: match.tuple_list_get_item_depend_reorder 0.68% : 0.001915s : 639: match.tuple_list_get_item_eliminator 0.00% : 0.000004s : 1: match.updatestate_pure_node_eliminater ------[predicate.] 
0.319865 1635139 1.10% : 0.003533s : 24391: predicate.accumulaten_eliminater 0.11% : 0.000350s : 1479: predicate.ad_related_special_op_eliminate 0.78% : 0.002500s : 8454: predicate.addn_check_dump 1.14% : 0.003638s : 24391: predicate.addn_zero_filter 1.10% : 0.003505s : 24391: predicate.adjust_all_reduce_mul_add 2.49% : 0.007949s : 32845: predicate.arithmetic_simplify 1.45% : 0.004637s : 25237: predicate.cast_eliminate 0.59% : 0.001872s : 5893: predicate.check_bprop_eliminate 0.77% : 0.002477s : 8454: predicate.compare_switch_simplify 0.04% : 0.000143s : 1964: predicate.const_output_eliminate 0.79% : 0.002532s : 8479: predicate.depend_value_elim 1.78% : 0.005689s : 25237: predicate.dict_get_item_const_eliminator 1.28% : 0.004087s : 25237: predicate.dict_get_item_eliminator 1.16% : 0.003698s : 25237: predicate.dict_set_item_eliminator 0.25% : 0.000800s : 3443: predicate.dumpgradient_eliminate 0.04% : 0.000140s : 1793: predicate.elim_not_effective 0.15% : 0.000485s : 1963: predicate.elim_shapecalc_of_broadcastargs 1.41% : 0.004526s : 27201: predicate.environ_add_const_eliminate 1.29% : 0.004124s : 27201: predicate.environ_get_add_eliminate 1.39% : 0.004444s : 27201: predicate.environ_get_depend_swap 2.15% : 0.006877s : 35655: predicate.environ_get_eliminate 1.32% : 0.004226s : 27201: predicate.environ_get_set_eliminate 1.43% : 0.004572s : 31672: predicate.exchange_switch_depend_value 1.87% : 0.005984s : 31672: predicate.float_depend_g_call 0.79% : 0.002512s : 8454: predicate.float_environ_get_switch 0.92% : 0.002950s : 10418: predicate.float_tuple_getitem_switch 0.04% : 0.000118s : 1479: predicate.fold_const_symbol 0.60% : 0.001933s : 6179: predicate.get_grad_eliminate 0.06% : 0.000181s : 1963: predicate.graph_param_transform 0.71% : 0.002269s : 8454: predicate.incorporate_call 0.70% : 0.002246s : 8454: predicate.incorporate_call_switch 5.44% : 0.017415s : 74624: predicate.inline 0.71% : 0.002284s : 6179: predicate.inline_without_move 0.15% : 0.000476s : 6179: predicate.j_node_and_user_rematch 0.67% : 0.002134s : 6186: predicate.less_batch_normalization 1.64% : 0.005241s : 29923: predicate.list_to_tuple_eliminator_ 2.64% : 0.008440s : 54315: predicate.load_eliminater 0.20% : 0.000639s : 1964: predicate.loop_unroll_after_grad 3.43% : 0.010973s : 31204: predicate.loop_unroll_before_grad 1.45% : 0.004642s : 29285: predicate.make_slice_get_slice_eliminator 0.79% : 0.002526s : 8454: predicate.merge_addn 0.58% : 0.001848s : 5893: predicate.micro_step_allgather_replace 0.58% : 0.001840s : 5893: predicate.mini_step_allgather_replace 1.09% : 0.003476s : 24391: predicate.minmaximum_grad 0.10% : 0.000330s : 1479: predicate.mutable_eliminate 0.11% : 0.000339s : 1479: predicate.opt_reshape 0.20% : 0.000649s : 1964: predicate.parallel_virtual_node 3.76% : 0.012036s : 31672: predicate.partial_defer_inline 1.41% : 0.004521s : 27960: predicate.partial_eliminate 1.09% : 0.003489s : 24391: predicate.print_const_string_wrapper 0.81% : 0.002594s : 8430: predicate.reduce_all_const_elim 1.31% : 0.004193s : 24391: predicate.reduce_eliminate 2.70% : 0.008623s : 54315: predicate.redundant_stop_gradient_eliminater 0.15% : 0.000484s : 6179: predicate.remove_not_recompute_node 0.98% : 0.003130s : 31889: predicate.replace_applicator 0.15% : 0.000473s : 6179: predicate.replace_old_param 0.04% : 0.000142s : 1964: predicate.reset_defer_inline 1.26% : 0.004038s : 24561: predicate.reshape_eliminate 0.58% : 0.001864s : 5893: predicate.row_tensor_add_zeros_like 0.20% : 0.000651s : 1964: predicate.row_tensor_eliminate 0.64% :
0.002048s : 5893: predicate.same_eliminate 0.22% : 0.000694s : 8875: predicate.set_cell_output_no_recompute 0.60% : 0.001929s : 6179: predicate.shard_identity_eliminate 0.25% : 0.000812s : 3443: predicate.special_op_eliminate 0.89% : 0.002837s : 8454: predicate.specialize_transform 0.62% : 0.001993s : 5893: predicate.split_environ_get_set_with_tuple_value 0.32% : 0.001021s : 6179: predicate.stack_unstack_eliminate 0.09% : 0.000280s : 1964: predicate.switch_call_monad_eliminater 1.62% : 0.005178s : 31672: predicate.switch_defer_inline 2.10% : 0.006718s : 37565: predicate.switch_layer_defer_inline 5.96% : 0.019051s : 72522: predicate.switch_simplify 1.10% : 0.003527s : 24391: predicate.tile_eliminate 1.07% : 0.003425s : 24391: predicate.transpose_eliminate 1.55% : 0.004969s : 29164: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.005169s : 29284: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.004571s : 29284: predicate.tuple_list_get_item_depend_reorder 2.57% : 0.008210s : 38377: predicate.tuple_list_get_item_eliminator 1.45% : 0.004633s : 29284: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.007705s : 37738: predicate.tuple_list_set_item_eliminator 1.62% : 0.005192s : 29923: predicate.tuple_to_list_eliminator_ 2.75% : 0.008790s : 54316: predicate.updatestate_pure_node_eliminater 3.61% : 0.011534s : 62770: predicate.updatestate_useless_node_eliminater 0.21% : 0.000662s : 1964: predicate.value_based_eliminate 0.60% : 0.001928s : 6179: predicate.virtual_dataset_eliminate 0.60% : 0.001922s : 6179: predicate.virtual_output_eliminate 0.18% : 0.000579s : 1964: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.743565 7279 62.91% : 0.467750s : 2038: func_graph_cloner_run.FuncGraphClonerGraph 4.51% : 0.033571s : 304: func_graph_cloner_run.FuncGraphClonerNode 32.58% : 0.242245s : 4937: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 
21.820044 253 0.00% : 0.000004s : 1: ForceFp32Comm 0.59% : 0.128341s : 1: add_attr 0.59% : 0.128271s : 1: add_attr_with_inline 0.01% : 0.002982s : 1: add_cache_embedding 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.016943s : 1: add_recomputation 0.00% : 0.000025s : 1: assign_add_opt 0.65% : 0.142646s : 1: auto_monad 0.02% : 0.004344s : 1: auto_monad_reorder 0.00% : 0.000020s : 1: backend_pass 0.00% : 0.000008s : 1: begin_end_overlap_inline 0.00% : 0.000013s : 1: bias_add_comm_swap 0.06% : 0.012538s : 1: bootstrap 0.01% : 0.001442s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.004560s : 1: control_data_broadcast_order 0.01% : 0.003038s : 1: convert_after_rewriter 0.02% : 0.005098s : 1: cse_after_recomputation 0.00% : 0.000009s : 1: dataset_repeat_opt 0.00% : 0.000011s : 1: detach_backward 0.02% : 0.004786s : 1: distribtued_split 0.01% : 0.001796s : 1: environ_conv 0.00% : 0.000025s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000009s : 1: get_jit_bprop_graph 0.03% : 0.006302s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000019s : 1: inline 0.00% : 0.000023s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.003863s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.00% : 0.001007s : 1: offloading_packed_experts 0.01% : 0.002865s : 1: opt.transform.loop_unroll_optimizer 8.47% : 1.849031s : 134: opt.transform.opt_a 0.06% : 0.013447s : 1: opt.transform.opt_after_cconv 0.03% : 0.005683s : 2: opt.transform.opt_after_jit_grad 0.34% : 0.074341s : 28: opt.transform.opt_b 0.09% : 0.018761s : 1: opt.transform.opt_trans_graph 0.07% : 0.015161s : 4: opt.transform.symbol_engine_opt 14.88% : 3.246640s : 1: opt_a 0.14% : 0.030772s : 1: opt_after_cconv 0.03% : 0.006594s : 1: opt_after_jit_grad 0.42% : 0.091539s : 1: opt_b 17.08% : 3.726242s : 1: optimize 0.02% : 0.004780s : 1: optimize_parallel_all_gather_comm 0.01% : 0.002227s : 1: order_py_execute_after_rewriter 0.02% : 0.004983s : 1: overlap_grad_flash_sp 0.00% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.001009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000063s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000024s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.001000s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000015s : 1: parallel-infer-symbol 0.00% : 0.000009s : 1: parallel-infer-symbol-second 0.00% : 0.000014s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.20% : 0.042606s : 1: pre_auto_parallel 0.20% : 0.043588s : 1: py_interpret_to_execute 0.02% : 0.004399s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.015877s : 1: remove_dup_value 2.61% : 0.568507s : 2: renormalize.infer 2.64% : 0.575687s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000015s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.038230s : 1: rewriter_after_opt_a 0.69% : 0.149528s : 1: rewriter_before_opt_a 0.00% : 
0.000010s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.002722s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.028790s : 1: symbol_engine_optimizer 6.76% : 1.475948s : 1: task_emit 0.09% : 0.018845s : 1: tuple_transform 42.55% : 9.284658s : 1: type_inference 0.03% : 0.007176s : 1: validate [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:12.388.413 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:12.389.960 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:12.393.122 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:12.418.979 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty TotalTime = 13.2573, [24] [bootstrap]: 0.00940556 [type_inference]: 8.36256 [auto_monad]: 0.13441 [graph_reusing]: 0.00217782 [inline]: 3.86999e-06 [add_attr]: 0.123144, [1] [add_attr_with_inline]: 0.123119, [1] [Cycle 1]: 0.0490547, [2] [tag_attr]: 0.0325394 [meta_addattr_fg_expand]: 0.0163998 [parallel-infer-symbol]: 6.85104e-06 [pre_auto_parallel]: 0.0383896 [insert-virtual-dataset]: 1.24e-05 [parallel-infer-symbol-second]: 3.62005e-06 [dataset_repeat_opt]: 3.30003e-06 [pipeline_split]: 2.48989e-06 [optimize]: 3.57741, [53] [py_interpret_to_execute]: 0.0413638 [rewriter_before_opt_a]: 0.146828 [opt_a]: 3.10436, [3] [Cycle 1]: 2.46751, [45] [expand_dump_flag]: 0.00218035 [switch_simplify]: 0.0527313 [loop_unroll]: 0.0363683 [a_1]: 1.13328 [invalid_dout_check]: 0.00876289 [recompute_prepare]: 0.00694056 [updatestate_depend_eliminate]: 0.0277743 [updatestate_assign_eliminate]: 0.00425738 [updatestate_loads_eliminate]: 0.0143925 [parameter_eliminate]: 8.35001e-06 [a_2]: 0.0992364 [accelerated_algorithm]: 0.00656454 [shard]: 3.83996e-06 [meta_shard_fg_expand]: 0.00321135 [shard_inline]: 0.00322388 [merge_send_recv]: 0.00296159 [auto_parallel]: 0.00288151 [parallel]: 1.68e-05 [flash_sp]: 0.00169022 [merge_comm]: 0.00289561 [allreduce_fusion]: 0.00288934 [matmul_add_comm_reduction]: 0.0038275 [allreduce_slice_to_reducescatter]: 1.26997e-06 [virtual_shard_identity]: 0.00302575 [virtual_dataset]: 0.00297011 [get_grad_eliminate_]: 0.00300853 [virtual_output]: 0.00298422 [merge_forward]: 0.00285916 [offload_activation]: 0.0039758 [cell_reuse_recompute_pass]: 3.63996e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00526415 [merge_recompute_call_nodes]: 2.40002e-06 [before_grad]: 0.00522055 [set_forward_comm_id_for_comm_node_pass]: 0.00313489 [meta_fg_expand]: 0.00711631 [flash_sp_send_recv_attached]: 8.42009e-06 [receive_attached]: 3.32005e-06 [after_resolve]: 0.00345179 [a_after_grad]: 0.00475133 [renormalize]: 0.932532 [add_forward_monad_depend]: 2.603e-05 [auto_monad_grad]: 3.79002e-06 
[auto_monad_eliminator]: 0.0311688 [cse]: 0.0178241 [a_3]: 0.0210389 [Cycle 2]: 0.401032, [45] [expand_dump_flag]: 5.17e-06 [switch_simplify]: 0.00282117 [loop_unroll]: 0.0028251 [a_1]: 0.0703426 [invalid_dout_check]: 0.00284866 [recompute_prepare]: 0.00268149 [updatestate_depend_eliminate]: 0.00262052 [updatestate_assign_eliminate]: 0.00258965 [updatestate_loads_eliminate]: 0.00259159 [parameter_eliminate]: 7.97003e-06 [a_2]: 0.0457627 [accelerated_algorithm]: 0.00361953 [shard]: 4.22006e-06 [meta_shard_fg_expand]: 0.00153291 [shard_inline]: 0.00286699 [merge_send_recv]: 0.00287113 [auto_parallel]: 0.00281905 [parallel]: 1.31901e-05 [flash_sp]: 6.29004e-06 [merge_comm]: 0.00281048 [allreduce_fusion]: 0.0027579 [matmul_add_comm_reduction]: 0.00349789 [allreduce_slice_to_reducescatter]: 1.27999e-06 [virtual_shard_identity]: 0.00279675 [virtual_dataset]: 0.00280288 [get_grad_eliminate_]: 0.00282278 [virtual_output]: 0.00276478 [merge_forward]: 0.00272479 [offload_activation]: 0.00358175 [cell_reuse_recompute_pass]: 3.73996e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00489469 [merge_recompute_call_nodes]: 2.25997e-06 [before_grad]: 0.0048862 [set_forward_comm_id_for_comm_node_pass]: 0.0030123 [meta_fg_expand]: 0.00386087 [flash_sp_send_recv_attached]: 3.29001e-06 [receive_attached]: 3.34007e-06 [after_resolve]: 0.00325667 [a_after_grad]: 0.00439316 [renormalize]: 0.163176 [add_forward_monad_depend]: 1.77399e-05 [auto_monad_grad]: 4.14997e-06 [auto_monad_eliminator]: 0.00546159 [cse]: 0.0113869 [a_3]: 0.0205318 [Cycle 3]: 0.235786, [45] [expand_dump_flag]: 3.73996e-06 [switch_simplify]: 0.00274263 [loop_unroll]: 0.00273549 [a_1]: 0.0720391 [invalid_dout_check]: 0.00252398 [recompute_prepare]: 0.00264053 [updatestate_depend_eliminate]: 0.00273099 [updatestate_assign_eliminate]: 0.00283081 [updatestate_loads_eliminate]: 0.00272093 [parameter_eliminate]: 7.57002e-06 [a_2]: 0.0449274 [accelerated_algorithm]: 0.00357619 [shard]: 3.82995e-06 [meta_shard_fg_expand]: 0.00144186 [shard_inline]: 0.00281985 [merge_send_recv]: 0.00288419 [auto_parallel]: 0.00300157 [parallel]: 1.39601e-05 [flash_sp]: 3.09991e-06 [merge_comm]: 0.00300647 [allreduce_fusion]: 0.00289672 [matmul_add_comm_reduction]: 0.0038423 [allreduce_slice_to_reducescatter]: 1.39e-06 [virtual_shard_identity]: 0.00275755 [virtual_dataset]: 0.00272025 [get_grad_eliminate_]: 0.0028743 [virtual_output]: 0.00272229 [merge_forward]: 0.00275718 [offload_activation]: 0.0037511 [cell_reuse_recompute_pass]: 3.9899e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00490517 [merge_recompute_call_nodes]: 2.78e-06 [before_grad]: 0.00481968 [set_forward_comm_id_for_comm_node_pass]: 0.00302369 [meta_fg_expand]: 0.00364388 [flash_sp_send_recv_attached]: 3.69002e-06 [receive_attached]: 3.41993e-06 [after_resolve]: 0.00330317 [a_after_grad]: 0.00431424 [renormalize]: 2.69967e-07 [add_forward_monad_depend]: 7.73999e-06 [auto_monad_grad]: 4.21004e-06 [auto_monad_eliminator]: 0.00485905 [cse]: 0.00877694 [a_3]: 0.0203712 [py_interpret_to_execute_after_opt_a]: 0.00423202 [slice_cell_reuse_recomputed_activation]: 4.23996e-06 [rewriter_after_opt_a]: 0.0353943 [convert_after_rewriter]: 0.00278136 [order_py_execute_after_rewriter]: 0.00231252 [opt_b]: 0.0922975, [1] [Cycle 1]: 0.0922844, [7] [b_1]: 0.0726571 [b_2]: 0.00285839 [updatestate_depend_eliminate]: 0.00263444 [updatestate_assign_eliminate]: 0.00258728 [updatestate_loads_eliminate]: 0.00260666 [renormalize]: 1.84006e-06 [cse]: 0.00879646 [optimize_parallel_all_gather_comm]: 0.00498407 
[overlap_param_gather]: 6.38003e-06 [cconv]: 0.00149267 [loop_unroll]: 0.0037055 [opt_after_cconv]: 0.0301159, [1] [Cycle 1]: 0.0301059, [7] [c_1]: 0.0132437 [parameter_eliminate]: 6.64999e-06 [updatestate_depend_eliminate]: 0.00296794 [updatestate_assign_eliminate]: 0.00259896 [updatestate_loads_eliminate]: 0.00259878 [cse]: 0.00857736 [renormalize]: 9.69972e-07 [remove_dup_value]: 0.0141094 [tuple_transform]: 0.0184038, [1] [Cycle 1]: 0.0183916, [2] [d_1]: 0.0183555 [renormalize]: 1.36998e-06 [partial_unused_args_eliminate]: 6.59004e-06 [add_cache_embedding]: 0.0027561 [add_recomputation]: 0.0177873 [cse_after_recomputation]: 0.00553543, [1] [Cycle 1]: 0.00542699, [1] [cse]: 0.0054026 [environ_conv]: 0.00174098 [swap_dp_allreduce_reducescatter]: 0.00321709 [bias_add_comm_swap]: 4.10993e-06 [label_micro_interleaved_index]: 1.111e-05 [label_fine_grained_interleaved_index]: 3.43006e-06 [merge_cast_opt]: 1.75997e-06 [slice_recompute_activation]: 2.24006e-06 [micro_interleaved_order_control]: 2.76999e-06 [assign_add_opt]: 2.0531e-05 [ForceFp32Comm]: 1.1801e-06 [remove_cast_before_assign_add]: 1.33994e-06 [full_micro_interleaved_order_control]: 2.91993e-06 [reorder_send_recv_between_fp_bp]: 2.64996e-06 [comm_op_add_attrs]: 1.12003e-06 [add_comm_op_reuse_tag]: 1.30001e-06 [interleave_split_concat_branches]: 1.42993e-06 [interleave_parallel_branches]: 1.20001e-06 [overlap_opt_shard_in_pipeline]: 2.29001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.27999e-06 [control_data_broadcast_order]: 0.00514002 [grouped_pairwise_exchange_alltoall]: 1.92004e-06 [offloading_packed_experts]: 0.00106981 [overlap_recompute_and_grad_model_parallel]: 0.00106529 [overlap_grad_matmul_and_grad_allreduce]: 3.73996e-06 [overlap_recompute_allgather_and_fa_grad]: 1.60991e-06 [overlap_recompute_comm]: 2.42994e-06 [overlap_grad_ring_attention]: 0.00107444 [overlap_grad_flash_sp]: 0.00537742 [begin_end_overlap_inline]: 1.17999e-06 [split_matmul_comm_elemetwise]: 3.84997e-06 [split_layernorm_comm]: 2.21992e-06 [handle_group_info]: 1.61992e-06 [symbol_engine_optimizer]: 0.0292142, [1] [Cycle 1]: 0.0292015, [6] [build]: 0.0135729 [elim_shapecalc]: 0.00259908 [elim_not_effective]: 0.00680571 [opt_reshape]: 0.00181446 [fold_const_symbol]: 0.004225 [renormalize]: 1.85997e-06 [detach_backward]: 5.00109e-06 [pipeline_parallel_scheduler]: 2.72004e-06 [auto_monad_reorder]: 0.00510603 [get_jit_bprop_graph]: 4.17e-06 [rewriter_after_jit_bprop_graph]: 1.8541e-05 [opt_after_jit_grad]: 0.00650962 [distribtued_split]: 0.00483099 [validate]: 0.00377619 [backend_pass]: 3.57011e-06 [task_emit]: 0.985043 [execute]: 1.2071e-05 Sums bootstrap : 0.009406s : 0.07% type_inference : 8.362560s : 63.48% auto_monad : 0.134410s : 1.02% graph_reusing : 0.002178s : 0.02% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.032539s : 0.25% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.016400s : 0.12% parallel-infer-symbol : 0.000007s : 0.00% pre_auto_parallel : 0.038390s : 0.29% insert-virtual-dataset : 0.000012s : 0.00% parallel-infer-symbol-second : 0.000004s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.041364s : 0.31% optimize.rewriter_before_opt_a : 0.146828s : 1.11% optimize.opt_a.expand_dump_flag : 0.002189s : 0.02% optimize.opt_a.switch_simplify : 0.058295s : 0.44% optimize.opt_a.loop_unroll : 0.041929s : 0.32% optimize.opt_a.a_1 : 1.275663s : 9.68% optimize.opt_a.invalid_dout_check : 0.014136s : 0.11% optimize.opt_a.recompute_prepare : 0.012263s : 
0.09% optimize.opt_a.updatestate_depend_eliminate : 0.033126s : 0.25% optimize.opt_a.updatestate_assign_eliminate : 0.009678s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.019705s : 0.15% optimize.opt_a.parameter_eliminate : 0.000024s : 0.00% optimize.opt_a.a_2 : 0.189926s : 1.44% optimize.opt_a.accelerated_algorithm : 0.013760s : 0.10% optimize.opt_a.shard : 0.000012s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.006186s : 0.05% optimize.opt_a.shard_inline : 0.008911s : 0.07% optimize.opt_a.merge_send_recv : 0.008717s : 0.07% optimize.opt_a.auto_parallel : 0.008702s : 0.07% optimize.opt_a.parallel : 0.000044s : 0.00% optimize.opt_a.flash_sp : 0.001700s : 0.01% optimize.opt_a.merge_comm : 0.008713s : 0.07% optimize.opt_a.allreduce_fusion : 0.008544s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.011168s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.008580s : 0.07% optimize.opt_a.virtual_dataset : 0.008493s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.008706s : 0.07% optimize.opt_a.virtual_output : 0.008471s : 0.06% optimize.opt_a.merge_forward : 0.008341s : 0.06% optimize.opt_a.offload_activation : 0.011309s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000011s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.015064s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000007s : 0.00% optimize.opt_a.before_grad : 0.014926s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.009171s : 0.07% optimize.opt_a.meta_fg_expand : 0.014621s : 0.11% optimize.opt_a.flash_sp_send_recv_attached : 0.000015s : 0.00% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.010012s : 0.08% optimize.opt_a.a_after_grad : 0.013459s : 0.10% optimize.opt_a.renormalize : 1.095708s : 8.32% optimize.opt_a.add_forward_monad_depend : 0.000052s : 0.00% optimize.opt_a.auto_monad_grad : 0.000012s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.041489s : 0.31% optimize.opt_a.cse : 0.037988s : 0.29% optimize.opt_a.a_3 : 0.061942s : 0.47% optimize.py_interpret_to_execute_after_opt_a : 0.004232s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.00% optimize.rewriter_after_opt_a : 0.035394s : 0.27% optimize.convert_after_rewriter : 0.002781s : 0.02% optimize.order_py_execute_after_rewriter : 0.002313s : 0.02% optimize.opt_b.b_1 : 0.072657s : 0.55% optimize.opt_b.b_2 : 0.002858s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.002634s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.002587s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.002607s : 0.02% optimize.opt_b.renormalize : 0.000002s : 0.00% optimize.opt_b.cse : 0.008796s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.004984s : 0.04% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.001493s : 0.01% optimize.loop_unroll : 0.003706s : 0.03% optimize.opt_after_cconv.c_1 : 0.013244s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.002968s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.002599s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.002599s : 0.02% optimize.opt_after_cconv.cse : 0.008577s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.014109s : 0.11% optimize.tuple_transform.d_1 : 0.018355s : 0.14% optimize.tuple_transform.renormalize : 0.000001s : 0.00% 
optimize.partial_unused_args_eliminate : 0.000007s : 0.00% optimize.add_cache_embedding : 0.002756s : 0.02% optimize.add_recomputation : 0.017787s : 0.14% optimize.cse_after_recomputation.cse : 0.005403s : 0.04% optimize.environ_conv : 0.001741s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.003217s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000011s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000021s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.005140s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.001070s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.001065s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.001074s : 0.01% optimize.overlap_grad_flash_sp : 0.005377s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.013573s : 0.10% optimize.symbol_engine_optimizer.elim_shapecalc : 0.002599s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.006806s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.001814s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.004225s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000002s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000003s : 0.00% auto_monad_reorder : 0.005106s : 0.04% get_jit_bprop_graph : 0.000004s : 0.00% rewriter_after_jit_bprop_graph : 0.000019s : 0.00% opt_after_jit_grad : 0.006510s : 0.05% distribtued_split : 0.004831s : 0.04% validate : 0.003776s : 0.03% backend_pass : 0.000004s : 0.00% task_emit : 0.985043s : 7.48% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 
0.347511 64790 1.13% : 0.003931s : 1231: substitution.arithmetic_simplify 1.61% : 0.005599s : 1027: substitution.cast_eliminate 0.05% : 0.000188s : 124: substitution.depend_value_elim 0.32% : 0.001102s : 1468: substitution.elim_not_effective 0.31% : 0.001066s : 1114: substitution.float_tuple_getitem_switch 0.42% : 0.001469s : 1154: substitution.fold_const_symbol 0.41% : 0.001409s : 1984: substitution.graph_param_transform 78.13% : 0.271522s : 5025: substitution.inline 0.81% : 0.002807s : 5110: substitution.j_node_and_user_rematch 0.99% : 0.003435s : 1354: substitution.less_batch_normalization 0.46% : 0.001595s : 3090: substitution.load_eliminater 0.41% : 0.001413s : 1305: substitution.minmaximum_grad 0.04% : 0.000149s : 196: substitution.reduce_all_const_elim 0.79% : 0.002756s : 5110: substitution.remove_not_recompute_node 0.16% : 0.000557s : 1035: substitution.replace_old_param 1.15% : 0.003999s : 1449: substitution.reshape_eliminate 0.43% : 0.001497s : 588: substitution.switch_simplify 0.01% : 0.000036s : 6: substitution.transpose_eliminate 1.15% : 0.003987s : 1693: substitution.tuple_list_convert_item_index_to_positive 0.59% : 0.002062s : 1741: substitution.tuple_list_get_item_const_eliminator 0.98% : 0.003409s : 1741: substitution.tuple_list_get_item_depend_reorder 2.36% : 0.008200s : 3349: substitution.tuple_list_get_item_eliminator 0.81% : 0.002816s : 1741: substitution.tuple_list_get_set_item_eliminator 2.80% : 0.009741s : 10976: substitution.updatestate_pure_node_eliminater 3.67% : 0.012754s : 11178: substitution.updatestate_useless_node_eliminater 0.00% : 0.000013s : 1: substitution.value_based_eliminate ------[type_inference.] 8.327712 2 85.29% : 7.102631s : 1: type_inference.infer 14.71% : 1.225081s : 1: type_inference.specialize ------[replace.] 0.081791 7390 9.07% : 0.007417s : 676: replace.cast_eliminate 0.33% : 0.000271s : 24: replace.depend_value_elim 2.35% : 0.001918s : 169: replace.elim_not_effective 64.24% : 0.052544s : 5025: replace.inline 2.35% : 0.001924s : 170: replace.reshape_eliminate 9.92% : 0.008114s : 588: replace.switch_simplify 0.67% : 0.000545s : 48: replace.tuple_list_get_item_depend_reorder 11.04% : 0.009029s : 688: replace.tuple_list_get_item_eliminator 0.04% : 0.000029s : 2: replace.updatestate_pure_node_eliminater ------[match.] 0.276293 7390 1.25% : 0.003451s : 676: match.cast_eliminate 0.01% : 0.000015s : 24: match.depend_value_elim 0.10% : 0.000283s : 169: match.elim_not_effective 97.04% : 0.268109s : 5025: match.inline 0.27% : 0.000743s : 170: match.reshape_eliminate 0.41% : 0.001124s : 588: match.switch_simplify 0.16% : 0.000429s : 48: match.tuple_list_get_item_depend_reorder 0.77% : 0.002133s : 688: match.tuple_list_get_item_eliminator 0.00% : 0.000005s : 2: match.updatestate_pure_node_eliminater ------[predicate.] 
0.302937 1525361 1.06% : 0.003223s : 22122: predicate.accumulaten_eliminater 0.12% : 0.000354s : 1500: predicate.ad_related_special_op_eliminate 0.89% : 0.002689s : 8356: predicate.addn_check_dump 1.08% : 0.003258s : 22122: predicate.addn_zero_filter 1.05% : 0.003191s : 22122: predicate.adjust_all_reduce_mul_add 2.41% : 0.007304s : 30478: predicate.arithmetic_simplify 1.23% : 0.003721s : 22968: predicate.cast_eliminate 0.64% : 0.001949s : 5957: predicate.check_bprop_eliminate 0.88% : 0.002677s : 8356: predicate.compare_switch_simplify 0.05% : 0.000143s : 1985: predicate.const_output_eliminate 0.89% : 0.002691s : 8426: predicate.depend_value_elim 1.15% : 0.003476s : 22968: predicate.dict_get_item_const_eliminator 1.22% : 0.003709s : 22968: predicate.dict_get_item_eliminator 1.09% : 0.003309s : 22968: predicate.dict_set_item_eliminator 0.28% : 0.000859s : 3485: predicate.dumpgradient_eliminate 0.05% : 0.000145s : 1814: predicate.elim_not_effective 0.16% : 0.000490s : 1984: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.003831s : 24953: predicate.environ_add_const_eliminate 1.28% : 0.003864s : 24953: predicate.environ_get_add_eliminate 1.27% : 0.003857s : 24953: predicate.environ_get_depend_swap 2.19% : 0.006628s : 33309: predicate.environ_get_eliminate 1.27% : 0.003852s : 24953: predicate.environ_get_set_eliminate 1.37% : 0.004147s : 28731: predicate.exchange_switch_depend_value 1.72% : 0.005205s : 28731: predicate.float_depend_g_call 0.89% : 0.002707s : 8356: predicate.float_environ_get_switch 1.06% : 0.003212s : 10341: predicate.float_tuple_getitem_switch 0.04% : 0.000125s : 1500: predicate.fold_const_symbol 0.67% : 0.002044s : 6151: predicate.get_grad_eliminate 0.06% : 0.000184s : 1984: predicate.graph_param_transform 0.79% : 0.002402s : 8356: predicate.incorporate_call 0.79% : 0.002402s : 8356: predicate.incorporate_call_switch 5.47% : 0.016579s : 69330: predicate.inline 0.78% : 0.002358s : 6151: predicate.inline_without_move 0.16% : 0.000493s : 6151: predicate.j_node_and_user_rematch 0.74% : 0.002251s : 6160: predicate.less_batch_normalization 1.61% : 0.004886s : 27673: predicate.list_to_tuple_eliminator_ 2.56% : 0.007770s : 49796: predicate.load_eliminater 0.22% : 0.000674s : 1985: predicate.loop_unroll_after_grad 3.70% : 0.011210s : 31312: predicate.loop_unroll_before_grad 1.42% : 0.004298s : 26986: predicate.make_slice_get_slice_eliminator 0.89% : 0.002692s : 8356: predicate.merge_addn 0.63% : 0.001921s : 5957: predicate.micro_step_allgather_replace 0.63% : 0.001922s : 5957: predicate.mini_step_allgather_replace 1.07% : 0.003246s : 22122: predicate.minmaximum_grad 0.11% : 0.000334s : 1500: predicate.mutable_eliminate 0.11% : 0.000343s : 1500: predicate.opt_reshape 0.23% : 0.000702s : 1985: predicate.parallel_virtual_node 3.56% : 0.010794s : 28731: predicate.partial_defer_inline 1.30% : 0.003943s : 25689: predicate.partial_eliminate 1.05% : 0.003190s : 22122: predicate.print_const_string_wrapper 0.90% : 0.002714s : 8332: predicate.reduce_all_const_elim 1.39% : 0.004204s : 22122: predicate.reduce_eliminate 2.50% : 0.007569s : 49796: predicate.redundant_stop_gradient_eliminater 0.15% : 0.000453s : 6151: predicate.remove_not_recompute_node 0.90% : 0.002737s : 29661: predicate.replace_applicator 0.16% : 0.000476s : 6151: predicate.replace_old_param 0.05% : 0.000141s : 1985: predicate.reset_defer_inline 1.15% : 0.003473s : 22292: predicate.reshape_eliminate 0.64% : 0.001941s : 5957: predicate.row_tensor_add_zeros_like 0.24% : 0.000712s : 1985: predicate.row_tensor_eliminate 0.71% :
0.002153s : 5957: predicate.same_eliminate 0.23% : 0.000688s : 8821: predicate.set_cell_output_no_recompute 0.67% : 0.002023s : 6151: predicate.shard_identity_eliminate 0.29% : 0.000877s : 3485: predicate.special_op_eliminate 0.99% : 0.003010s : 8356: predicate.specialize_transform 0.69% : 0.002087s : 5957: predicate.split_environ_get_set_with_tuple_value 0.35% : 0.001050s : 6151: predicate.stack_unstack_eliminate 0.09% : 0.000283s : 1985: predicate.switch_call_monad_eliminater 1.58% : 0.004796s : 28731: predicate.switch_defer_inline 2.10% : 0.006360s : 34688: predicate.switch_layer_defer_inline 6.13% : 0.018576s : 69575: predicate.switch_simplify 1.05% : 0.003187s : 22122: predicate.tile_eliminate 1.04% : 0.003147s : 22122: predicate.transpose_eliminate 1.49% : 0.004517s : 26937: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.004546s : 26985: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.004592s : 26985: predicate.tuple_list_get_item_depend_reorder 2.60% : 0.007878s : 36029: predicate.tuple_list_get_item_eliminator 1.47% : 0.004441s : 26985: predicate.tuple_list_get_set_item_eliminator 2.51% : 0.007608s : 35341: predicate.tuple_list_set_item_eliminator 1.56% : 0.004740s : 27673: predicate.tuple_to_list_eliminator_ 2.70% : 0.008170s : 49798: predicate.updatestate_pure_node_eliminater 3.67% : 0.011117s : 58154: predicate.updatestate_useless_node_eliminater 0.23% : 0.000695s : 1985: predicate.value_based_eliminate 0.67% : 0.002035s : 6151: predicate.virtual_dataset_eliminate 0.67% : 0.002032s : 6151: predicate.virtual_output_eliminate 0.21% : 0.000650s : 1985: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.721429 6967 63.94% : 0.461251s : 1979: func_graph_cloner_run.FuncGraphClonerGraph 1.02% : 0.007391s : 116: func_graph_cloner_run.FuncGraphClonerNode 35.04% : 0.252787s : 4872: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 
19.933550 253 0.00% : 0.000004s : 1: ForceFp32Comm 0.62% : 0.123163s : 1: add_attr 0.62% : 0.123126s : 1: add_attr_with_inline 0.01% : 0.002769s : 1: add_cache_embedding 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.017807s : 1: add_recomputation 0.00% : 0.000024s : 1: assign_add_opt 0.67% : 0.134462s : 1: auto_monad 0.03% : 0.005141s : 1: auto_monad_reorder 0.00% : 0.000021s : 1: backend_pass 0.00% : 0.000007s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.05% : 0.009449s : 1: bootstrap 0.01% : 0.001504s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.005154s : 1: control_data_broadcast_order 0.01% : 0.002798s : 1: convert_after_rewriter 0.03% : 0.005543s : 1: cse_after_recomputation 0.00% : 0.000010s : 1: dataset_repeat_opt 0.00% : 0.000011s : 1: detach_backward 0.02% : 0.004860s : 1: distribtued_split 0.01% : 0.001754s : 1: environ_conv 0.00% : 0.000021s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000011s : 1: get_jit_bprop_graph 0.01% : 0.002200s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000022s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000009s : 1: label_fine_grained_interleaved_index 0.00% : 0.000015s : 1: label_micro_interleaved_index 0.02% : 0.003717s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.01% : 0.001080s : 1: offloading_packed_experts 0.01% : 0.002839s : 1: opt.transform.loop_unroll_optimizer 8.78% : 1.749698s : 134: opt.transform.opt_a 0.07% : 0.013240s : 1: opt.transform.opt_after_cconv 0.03% : 0.005603s : 2: opt.transform.opt_after_jit_grad 0.38% : 0.075361s : 28: opt.transform.opt_b 0.09% : 0.018349s : 1: opt.transform.opt_trans_graph 0.08% : 0.015420s : 4: opt.transform.symbol_engine_opt 15.57% : 3.104373s : 1: opt_a 0.15% : 0.030233s : 1: opt_after_cconv 0.03% : 0.006535s : 1: opt_after_jit_grad 0.46% : 0.092303s : 1: opt_b 17.95% : 3.577433s : 1: optimize 0.03% : 0.004999s : 1: optimize_parallel_all_gather_comm 0.01% : 0.002325s : 1: order_py_execute_after_rewriter 0.03% : 0.005397s : 1: overlap_grad_flash_sp 0.00% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.001085s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000012s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.001075s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000015s : 1: parallel-infer-symbol 0.00% : 0.000009s : 1: parallel-infer-symbol-second 0.00% : 0.000010s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.19% : 0.038445s : 1: pre_auto_parallel 0.21% : 0.041405s : 1: py_interpret_to_execute 0.02% : 0.004246s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.014325s : 1: remove_dup_value 2.60% : 0.517381s : 2: renormalize.infer 2.90% : 0.578021s : 2: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000024s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.035411s : 1: rewriter_after_opt_a 0.74% : 0.146876s : 1: rewriter_before_opt_a 0.00% : 
0.000010s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.003229s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.029221s : 1: symbol_engine_optimizer 4.94% : 0.985082s : 1: task_emit 0.09% : 0.018410s : 1: tuple_transform 41.95% : 8.362618s : 1: type_inference 0.04% : 0.007703s : 1: validate [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:26.876.355 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:26.876.666 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:26.877.546 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:26.877.736 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:26.878.527 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:27,026 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 31.66836953163147 s; generated tokens: 8 tokens; generate speed: 0.25261799449476935 tokens/s 2025-07-24 11:03:27,027 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.002757549285888672 s; prefill predict time: 17.069538831710815 s; prefill post time: 0.06441974639892578 s; decode prepare time: 0.0012051037379673549 s; decode predict time: 0.010319113731384277 s; decode post time: 0.00853729248046875 s 2025-07-24 11:03:27,031 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. Building prefix dict from the default dictionary ... DEBUG:jieba:Building prefix dict from the default dictionary ... Loading model from cache /tmp/jieba.cache DEBUG:jieba:Loading model from cache /tmp/jieba.cache Loading model cost 1.284 seconds. DEBUG:jieba:Loading model cost 1.284 seconds. Prefix dict has been built successfully. DEBUG:jieba:Prefix dict has been built successfully. 
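The INFO record above reports 8 generated tokens over 31.67 s, i.e. roughly 0.25 tokens/s, with more than half of that wall time in the prefill predict step (17.07 s), in line with the one-off graph compilation reported in the compile-time profiling dumps earlier in this log. A minimal sketch of how the logged "generate speed" figure follows from the logged values (the helper name here is illustrative, not mindformers API):
```python
# Illustrative check, not part of mindformers: the logged "generate speed"
# is simply generated tokens divided by total wall time.
def tokens_per_second(generated_tokens: int, total_time_s: float) -> float:
    return generated_tokens / total_time_s

# Values copied from the INFO record above.
speed = tokens_per_second(8, 31.66836953163147)
assert abs(speed - 0.25261799449476935) < 1e-9
```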
parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> calculate sim is:1.0 2025-07-24 11:03:28,326 - mindformers./output/log[mindformers/generation/text_generator.py:735] - INFO - The batch is: 4, and the split_size is: 2, and the global_rank_id is: 1, and the dp_rank_id is: 0 and start is: 0, and stop is: 2 2025-07-24 11:03:28,328 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:28,329 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:28,329 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:28,330 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:28.335.713 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:28.335.888 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:28.336.413 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:28.343.281 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:28.357.052 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:28.357.226 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:28.357.601 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env 
INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:28.357.781 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:28.357.964 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:29,678 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.3481841087341309 s; generated tokens: 208 tokens; generate speed: 154.2815989689274 tokens/s 2025-07-24 11:03:29,679 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0016849040985107422 s; prefill predict time: 0.011673927307128906 s; prefill post time: 0.008545160293579102 s; decode prepare time: 0.0008681514888133818 s; decode predict time: 0.008306054507984835 s; decode post time: 0.0036244693311672767 s 2025-07-24 11:03:29,683 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] return quick_sort(left) + middle + quick_sort(right) # 示例输入 arr = [ parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr else: pivot = arr[0] left = [x for x in arr[1:] if x < pivot] right = [x for x in arr[1:] if x >= pivot] return quick_sort(left) + [pivot] + quick_sort(right) # 示例输入 arr = [3,6,8,1 calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] return quick_sort(left) + middle + quick_sort(right) # 示例输入 arr = [ parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr else: pivot = arr[0] left = [x for x in arr[1:] if x < pivot] right = [x for x in arr[1:] if x >= pivot] return quick_sort(left) + [pivot] + quick_sort(right) # 示例输入 arr = [3,6,8,1 calculate sim is:1.0 2025-07-24 11:03:29,699 - mindformers./output/log[mindformers/generation/text_generator.py:735] - INFO - The batch is: 8, and the split_size is: 4, and the global_rank_id is: 1, and the dp_rank_id is: 0 and start is: 0, and stop is: 4 2025-07-24 11:03:29,701 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 
'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:29,701 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:29,702 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:29,702 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:29.707.471 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:29.707.659 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:29.708.114 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:29.714.146 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:29.728.061 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:29.728.229 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:29.728.574 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:29.728.665 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806915,fffd83fff060,python):2025-07-24-11:03:29.728.859 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:30,809 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.1067819595336914 s; generated tokens: 408 tokens; generate speed: 368.63629415490135 tokens/s 2025-07-24 11:03:30,810 
- mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0013515949249267578 s; prefill predict time: 0.01081395149230957 s; prefill post time: 0.007124185562133789 s; decode prepare time: 0.0009387884989823446 s; decode predict time: 0.005060186386108398 s; decode post time: 0.004634642364955185 s 2025-07-24 11:03:30,814 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. 
Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 large_models/parallel_qwen2_0_5b_predict_dp2_mp2/worker_2.log0000644000175100017500000040201115040321202024701 0ustar jenkinsHwHiAiUser/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. 
return self._float_to_str(self.smallest_subnormal) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) 2025-07-24 11:02:48,654 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config moe_config is empty. 2025-07-24 11:02:48,689 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:48.690.533 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:48.691.351 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead. [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:48.691.832 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead. [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:48.691.961 [mindspore/context.py:1655] For 'context.set_context', 'enable_graph_kernel' parameter is deprecated, and will be removed in the next version. Please use jit_config={'jit_level': 'O1'} instead. [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:48.692.103 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:48.692.260 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:48.692.385 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. 2025-07-24 11:02:48,692 - mindformers./output/log[mindformers/core/context/parallel.py:73] - INFO - full_batch is set to False for non-parallel modes [WARNING] DISTRIBUTED(3806939,ffff2487b060,python):2025-07-24-11:02:48.695.061 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:59694 to 127.0.0.1:8240 is successfully created. 
System errno: Success [WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:48.695.058 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 20 source: 127.0.0.1:59694, destination: 127.0.0.1:8240 [WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:48.695.234 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:59696, destination: 127.0.0.1:8240 [WARNING] DISTRIBUTED(3806939,ffff2587d060,python):2025-07-24-11:02:48.695.264 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:59696 to 127.0.0.1:8240 is successfully created. System errno: Success [WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:48.695.274 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8240 to be connected...Retry number: 1 [WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:49.195.740 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200). [WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:49.695.827 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200). [WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:50.195.927 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:50.195.953 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 2 rank id: 2 [MS_ALLOC_CONF]Runtime config: enable_vmm:False [WARNING] DEVICE(3806939,ffffb4211f30,python):2025-07-24-11:02:50.415.415 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:155] Initialize] Reserved memory size for other components(2101346304) is less than recommend size(4068166400), It may lead to Out Of Memory in HCCL or other components, Please double check context key 'variable_memory_max_size'/'max_device_memory' [WARNING] DEVICE(3806939,ffffb4211f30,python):2025-07-24-11:02:51.747.728 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:84] Initialize] Loading LCCL because env MS_ENABLE_LCCL is set to on. Pay attention that LCCL only supports communication group within single node in KernelByKernel for now. 
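The 'context.set_context' deprecation warnings above each name their replacement API. A minimal migration sketch, assuming a recent MindSpore where the named replacements are available; the concrete values (memory size, recursion depth) are illustrative, not taken from this test's config:

import mindspore as ms

# Deprecated style flagged in the warnings above:
#   ms.set_context(device_target="Ascend", max_device_memory="28GB",
#                  max_call_depth=10000, enable_graph_kernel=True)

ms.set_device("Ascend")                  # replaces the 'device_target' parameter
ms.runtime.set_memory(max_size="28GB")   # replaces 'max_device_memory' (illustrative size)
ms.set_recursion_limit(10000)            # replaces 'max_call_depth' (illustrative depth)
ms.set_context(jit_config={"jit_level": "O1"})  # replaces 'enable_graph_kernel', per the warning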
[WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:51.751.283 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:51.751.495 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(3806939,fffecc8a0060,python):2025-07-24-11:02:51.751.724 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:8240, node_rank:2130706433, total_rank_size:4, local_rank_size4 [WARNING] HCCL_ADPT(3806939,fffecc8a0060,python):2025-07-24-11:02:51.751.812 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(3806939,fffecc8a0060,python):2025-07-24-11:02:51.751.838 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:632] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(3806939,fffecc8a0060,python):2025-07-24-11:02:51.751.856 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group [WARNING] DISTRIBUTED(3806939,fffecc8a0060,python):2025-07-24-11:02:51.752.295 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group 2025-07-24 11:02:51,753 - mindformers./output/log[mindformers/tools/utils.py:185] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_2.ckpt' 2025-07-24 11:02:51,755 - mindformers./output/log[mindformers/core/context/build_context.py:383] - INFO - cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191] 2025-07-24 11:02:51,755 - mindformers./output/log[mindformers/core/context/build_context.py:387] - WARNING - CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 
97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores 2025-07-24 11:02:51,756 - mindformers./output/log[mindformers/core/context/build_context.py:395] - INFO - cpu_affinity, rank_id: 2, device_num: 4 2025-07-24 11:02:51,757 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': None, 'n_group': None, 'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} 2025-07-24 11:02:51,758 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} [WARNING] DEVICE(3806939,fffe7ffff060,python):2025-07-24-11:02:52.004.778 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 tp_group is:True dp_group is:True 2025-07-24 11:02:52,275 - mindformers./output/log[mindformers/parallel_core/inference/parallel_state.py:358] - INFO - expert_model_parallel_size(1) is not equal to world_size(4), so we will use 4 as the MOE_tensor_parallel_size. 
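With data_parallel=2 and model_parallel=2 over 4 devices (the parallel_config above), each rank's communication groups can be derived from its rank id alone. An illustrative computation, not mindformers' actual implementation, that reproduces the dp-0-2 and tp-2-3 groups this rank-2 worker joins below:

def comm_groups(rank, dp=2, mp=2):
    """Derive dp/tp group names for a dp*mp device mesh with contiguous tp ranks."""
    tp_ranks = [(rank // mp) * mp + i for i in range(mp)]  # neighbours within the same tp slice
    dp_ranks = [i * mp + rank % mp for i in range(dp)]     # same position across tp slices
    return ("dp-" + "-".join(map(str, dp_ranks)),
            "tp-" + "-".join(map(str, tp_ranks)))

print(comm_groups(2))  # ('dp-0-2', 'tp-2-3'), matching the groups created below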
[WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:52.277.638 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: dp-0-2 [const vector]{0, 2}, async: 0, submit_now: 1 [WARNING] DEVICE(3806939,fffe7ffff060,python):2025-07-24-11:02:52.552.528 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group [WARNING] DISTRIBUTED(3806939,fffe7ffff060,python):2025-07-24-11:02:52.552.688 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: hccl_world_group [WARNING] DISTRIBUTED(3806939,fffe7ffff060,python):2025-07-24-11:02:52.552.718 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: hccl_world_group end. [WARNING] DISTRIBUTED(3806939,fffecc8a0060,python):2025-07-24-11:02:52.552.811 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: hccl_world_group [WARNING] DISTRIBUTED(3806939,fffecc8a0060,python):2025-07-24-11:02:52.553.212 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: dp-0-2 [WARNING] DEVICE(3806939,fffe7ffff060,python):2025-07-24-11:02:52.565.239 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for dp-0-2, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806939,fffe7ffff060,python):2025-07-24-11:02:52.799.870 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for dp-0-2 [WARNING] DISTRIBUTED(3806939,fffe7ffff060,python):2025-07-24-11:02:52.799.984 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: dp-0-2 [WARNING] DISTRIBUTED(3806939,fffe7ffff060,python):2025-07-24-11:02:52.800.013 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: dp-0-2 end. 
[WARNING] DISTRIBUTED(3806939,fffecc8a0060,python):2025-07-24-11:02:52.800.067 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: dp-0-2 [WARNING] DISTRIBUTED(3806939,ffffb4211f30,python):2025-07-24-11:02:52.800.278 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: tp-2-3 [const vector]{2, 3}, async: 0, submit_now: 1 [WARNING] DISTRIBUTED(3806939,fffecc8a0060,python):2025-07-24-11:02:52.804.666 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: tp-2-3 [WARNING] DEVICE(3806939,fffe69dd9060,python):2025-07-24-11:02:52.823.638 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for tp-2-3, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806939,fffe69dd9060,python):2025-07-24-11:02:52.878.342 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for tp-2-3 [WARNING] DISTRIBUTED(3806939,fffe69dd9060,python):2025-07-24-11:02:52.878.435 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: tp-2-3 [WARNING] DISTRIBUTED(3806939,fffe69dd9060,python):2025-07-24-11:02:52.878.462 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: tp-2-3 end. [WARNING] DISTRIBUTED(3806939,fffecc8a0060,python):2025-07-24-11:02:52.878.511 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: tp-2-3 [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:52.894.912 [mindspore/ops/primitive.py:220] The in_strategy/in_layout of the operator in your network will not take effect in stand_alone mode. This means the the shard function called in the network is ignored. If you want to enable it, please use semi auto or auto parallel mode by context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL or context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL) data_parallel_group:dp-0-2 tensor_model_parallel_group:tp-2-3 2025-07-24 11:02:53,547 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 2025-07-24 11:02:53,547 - mindformers./output/log[/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:178] - INFO - ----------------Transform and load checkpoint---------------- [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.605.809 [mindspore/train/serialization.py:333] The type of model.layers.0.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.606.664 [mindspore/train/serialization.py:333] The type of model.layers.0.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.607.165 [mindspore/train/serialization.py:333] The type of model.layers.1.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.607.827 [mindspore/train/serialization.py:333] The type of model.layers.1.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.608.305 [mindspore/train/serialization.py:333] The type of model.layers.2.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.608.945 [mindspore/train/serialization.py:333] The type of model.layers.2.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.609.448 [mindspore/train/serialization.py:333] The type of model.layers.3.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.610.097 [mindspore/train/serialization.py:333] The type of model.layers.3.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.610.577 [mindspore/train/serialization.py:333] The type of model.layers.4.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.611.205 [mindspore/train/serialization.py:333] The type of model.layers.4.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.611.673 [mindspore/train/serialization.py:333] The type of model.layers.5.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.612.301 [mindspore/train/serialization.py:333] The type of model.layers.5.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.612.773 [mindspore/train/serialization.py:333] The type of model.layers.6.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.613.417 [mindspore/train/serialization.py:333] The type of model.layers.6.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.613.900 [mindspore/train/serialization.py:333] The type of model.layers.7.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.614.541 [mindspore/train/serialization.py:333] The type of model.layers.7.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.615.011 [mindspore/train/serialization.py:333] The type of model.layers.8.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.615.640 [mindspore/train/serialization.py:333] The type of model.layers.8.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.616.125 [mindspore/train/serialization.py:333] The type of model.layers.9.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.616.759 [mindspore/train/serialization.py:333] The type of model.layers.9.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.617.234 [mindspore/train/serialization.py:333] The type of model.layers.10.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.617.875 [mindspore/train/serialization.py:333] The type of model.layers.10.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.618.363 [mindspore/train/serialization.py:333] The type of model.layers.11.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.618.995 [mindspore/train/serialization.py:333] The type of model.layers.11.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.619.464 [mindspore/train/serialization.py:333] The type of model.layers.12.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.620.105 [mindspore/train/serialization.py:333] The type of model.layers.12.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.620.579 [mindspore/train/serialization.py:333] The type of model.layers.13.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.621.206 [mindspore/train/serialization.py:333] The type of model.layers.13.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.621.691 [mindspore/train/serialization.py:333] The type of model.layers.14.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.622.325 [mindspore/train/serialization.py:333] The type of model.layers.14.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.622.820 [mindspore/train/serialization.py:333] The type of model.layers.15.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.623.455 [mindspore/train/serialization.py:333] The type of model.layers.15.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.623.929 [mindspore/train/serialization.py:333] The type of model.layers.16.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.624.554 [mindspore/train/serialization.py:333] The type of model.layers.16.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.625.036 [mindspore/train/serialization.py:333] The type of model.layers.17.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.625.677 [mindspore/train/serialization.py:333] The type of model.layers.17.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.626.153 [mindspore/train/serialization.py:333] The type of model.layers.18.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.626.797 [mindspore/train/serialization.py:333] The type of model.layers.18.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.627.270 [mindspore/train/serialization.py:333] The type of model.layers.19.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.627.895 [mindspore/train/serialization.py:333] The type of model.layers.19.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.628.366 [mindspore/train/serialization.py:333] The type of model.layers.20.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. 
May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.629.001 [mindspore/train/serialization.py:333] The type of model.layers.20.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.629.486 [mindspore/train/serialization.py:333] The type of model.layers.21.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.630.131 [mindspore/train/serialization.py:333] The type of model.layers.21.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.630.630 [mindspore/train/serialization.py:333] The type of model.layers.22.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.631.266 [mindspore/train/serialization.py:333] The type of model.layers.22.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.631.742 [mindspore/train/serialization.py:333] The type of model.layers.23.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.632.373 [mindspore/train/serialization.py:333] The type of model.layers.23.ffn_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.632.845 [mindspore/train/serialization.py:333] The type of model.norm_out.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time [WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.635.957 [mindspore/train/serialization.py:1789] For 'load_param_into_net', 48 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint. 
[WARNING] ME(3806939:281473703812912,MainProcess):2025-07-24-11:02:54.636.107 [mindspore/train/serialization.py:1793] ['model.layers.0.attention.paged_attention_mgr.key_cache', 'model.layers.0.attention.paged_attention_mgr.value_cache', 'model.layers.1.attention.paged_attention_mgr.key_cache', 'model.layers.1.attention.paged_attention_mgr.value_cache', 'model.layers.2.attention.paged_attention_mgr.key_cache', 'model.layers.2.attention.paged_attention_mgr.value_cache', 'model.layers.3.attention.paged_attention_mgr.key_cache', 'model.layers.3.attention.paged_attention_mgr.value_cache', 'model.layers.4.attention.paged_attention_mgr.key_cache', 'model.layers.4.attention.paged_attention_mgr.value_cache', 'model.layers.5.attention.paged_attention_mgr.key_cache', 'model.layers.5.attention.paged_attention_mgr.value_cache', 'model.layers.6.attention.paged_attention_mgr.key_cache', 'model.layers.6.attention.paged_attention_mgr.value_cache', 'model.layers.7.attention.paged_attention_mgr.key_cache', 'model.layers.7.attention.paged_attention_mgr.value_cache', 'model.layers.8.attention.paged_attention_mgr.key_cache', 'model.layers.8.attention.paged_attention_mgr.value_cache', 'model.layers.9.attention.paged_attention_mgr.key_cache', 'model.layers.9.attention.paged_attention_mgr.value_cache', 'model.layers.10.attention.paged_attention_mgr.key_cache', 'model.layers.10.attention.paged_attention_mgr.value_cache', 'model.layers.11.attention.paged_attention_mgr.key_cache', 'model.layers.11.attention.paged_attention_mgr.value_cache', 'model.layers.12.attention.paged_attention_mgr.key_cache', 'model.layers.12.attention.paged_attention_mgr.value_cache', 'model.layers.13.attention.paged_attention_mgr.key_cache', 'model.layers.13.attention.paged_attention_mgr.value_cache', 'model.layers.14.attention.paged_attention_mgr.key_cache', 'model.layers.14.attention.paged_attention_mgr.value_cache', 'model.layers.15.attention.paged_attention_mgr.key_cache', 'model.layers.15.attention.paged_attention_mgr.value_cache', 'model.layers.16.attention.paged_attention_mgr.key_cache', 'model.layers.16.attention.paged_attention_mgr.value_cache', 'model.layers.17.attention.paged_attention_mgr.key_cache', 'model.layers.17.attention.paged_attention_mgr.value_cache', 'model.layers.18.attention.paged_attention_mgr.key_cache', 'model.layers.18.attention.paged_attention_mgr.value_cache', 'model.layers.19.attention.paged_attention_mgr.key_cache', 'model.layers.19.attention.paged_attention_mgr.value_cache', 'model.layers.20.attention.paged_attention_mgr.key_cache', 'model.layers.20.attention.paged_attention_mgr.value_cache', 'model.layers.21.attention.paged_attention_mgr.key_cache', 'model.layers.21.attention.paged_attention_mgr.value_cache', 'model.layers.22.attention.paged_attention_mgr.key_cache', 'model.layers.22.attention.paged_attention_mgr.value_cache', 'model.layers.23.attention.paged_attention_mgr.key_cache', 'model.layers.23.attention.paged_attention_mgr.value_cache'] are not loaded. 
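All 48 not-loaded parameters listed above are paged-attention key/value caches, which are runtime buffers the model allocates itself rather than trained weights, so their absence from the checkpoint is expected. A hedged sketch of how a harness can separate the expected misses from real ones, assuming a recent MindSpore where load_param_into_net returns both lists (the cache-name filter is illustrative):

import mindspore as ms

def load_and_check(net, ckpt_path):
    params = ms.load_checkpoint(ckpt_path)
    param_not_load, ckpt_not_load = ms.load_param_into_net(net, params)
    print(f"param_not_load: {param_not_load}, ckpt_not_load: {ckpt_not_load}")
    # KV caches are created at runtime; anything else missing is a real mismatch.
    return [p for p in param_not_load if "paged_attention_mgr" not in p]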
param_not_load: ['model.layers.0.attention.paged_attention_mgr.key_cache', 'model.layers.0.attention.paged_attention_mgr.value_cache', 'model.layers.1.attention.paged_attention_mgr.key_cache', 'model.layers.1.attention.paged_attention_mgr.value_cache', 'model.layers.2.attention.paged_attention_mgr.key_cache', 'model.layers.2.attention.paged_attention_mgr.value_cache', 'model.layers.3.attention.paged_attention_mgr.key_cache', 'model.layers.3.attention.paged_attention_mgr.value_cache', 'model.layers.4.attention.paged_attention_mgr.key_cache', 'model.layers.4.attention.paged_attention_mgr.value_cache', 'model.layers.5.attention.paged_attention_mgr.key_cache', 'model.layers.5.attention.paged_attention_mgr.value_cache', 'model.layers.6.attention.paged_attention_mgr.key_cache', 'model.layers.6.attention.paged_attention_mgr.value_cache', 'model.layers.7.attention.paged_attention_mgr.key_cache', 'model.layers.7.attention.paged_attention_mgr.value_cache', 'model.layers.8.attention.paged_attention_mgr.key_cache', 'model.layers.8.attention.paged_attention_mgr.value_cache', 'model.layers.9.attention.paged_attention_mgr.key_cache', 'model.layers.9.attention.paged_attention_mgr.value_cache', 'model.layers.10.attention.paged_attention_mgr.key_cache', 'model.layers.10.attention.paged_attention_mgr.value_cache', 'model.layers.11.attention.paged_attention_mgr.key_cache', 'model.layers.11.attention.paged_attention_mgr.value_cache', 'model.layers.12.attention.paged_attention_mgr.key_cache', 'model.layers.12.attention.paged_attention_mgr.value_cache', 'model.layers.13.attention.paged_attention_mgr.key_cache', 'model.layers.13.attention.paged_attention_mgr.value_cache', 'model.layers.14.attention.paged_attention_mgr.key_cache', 'model.layers.14.attention.paged_attention_mgr.value_cache', 'model.layers.15.attention.paged_attention_mgr.key_cache', 'model.layers.15.attention.paged_attention_mgr.value_cache', 'model.layers.16.attention.paged_attention_mgr.key_cache', 'model.layers.16.attention.paged_attention_mgr.value_cache', 'model.layers.17.attention.paged_attention_mgr.key_cache', 'model.layers.17.attention.paged_attention_mgr.value_cache', 'model.layers.18.attention.paged_attention_mgr.key_cache', 'model.layers.18.attention.paged_attention_mgr.value_cache', 'model.layers.19.attention.paged_attention_mgr.key_cache', 'model.layers.19.attention.paged_attention_mgr.value_cache', 'model.layers.20.attention.paged_attention_mgr.key_cache', 'model.layers.20.attention.paged_attention_mgr.value_cache', 'model.layers.21.attention.paged_attention_mgr.key_cache', 'model.layers.21.attention.paged_attention_mgr.value_cache', 'model.layers.22.attention.paged_attention_mgr.key_cache', 'model.layers.22.attention.paged_attention_mgr.value_cache', 'model.layers.23.attention.paged_attention_mgr.key_cache', 'model.layers.23.attention.paged_attention_mgr.value_cache'], ckpt_not_load: [] 2025-07-24 11:02:55,296 - mindformers./output/log[mindformers/generation/text_generator.py:726] - WARNING - batch size {batch} can not be divisible by data_parallel {data_parallel}, and would not split. 
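The Generation Config logged just below has num_beams=1 and do_sample=False, which is why the generator resolves the mode to GREEDY_SEARCH. A minimal illustration of that HF-style selection rule (the mode names are assumptions, not mindformers' exact internals):

def generation_mode(cfg):
    if cfg.get("num_beams", 1) > 1:
        return "BEAM_SEARCH"      # multiple beams take precedence
    if cfg.get("do_sample", False):
        return "SAMPLE"           # stochastic decoding
    return "GREEDY_SEARCH"        # deterministic argmax decoding

print(generation_mode({"num_beams": 1, "do_sample": False}))  # GREEDY_SEARCH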
2025-07-24 11:02:55,298 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:02:55,299 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:02:55,299 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:02:55,300 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. TotalTime = 13.846, [24] [bootstrap]: 0.0119969 [type_inference]: 8.5284 [auto_monad]: 0.128124 [graph_reusing]: 0.00509365 [inline]: 4.64998e-06 [add_attr]: 0.114048, [1] [add_attr_with_inline]: 0.113996, [1] [Cycle 1]: 0.0400704, [2] [tag_attr]: 0.0265064 [meta_addattr_fg_expand]: 0.0133586 [parallel-infer-symbol]: 8.09995e-06 [pre_auto_parallel]: 0.032877 [insert-virtual-dataset]: 1.04799e-05 [parallel-infer-symbol-second]: 2.92994e-06 [dataset_repeat_opt]: 3.48e-06 [pipeline_split]: 2.96009e-06 [optimize]: 3.55012, [53] [py_interpret_to_execute]: 0.0352691 [rewriter_before_opt_a]: 0.138407 [opt_a]: 3.0939, [3] [Cycle 1]: 2.46722, [45] [expand_dump_flag]: 0.00215444 [switch_simplify]: 0.0531037 [loop_unroll]: 0.0348769 [a_1]: 1.21823 [invalid_dout_check]: 0.00663384 [recompute_prepare]: 0.00664931 [updatestate_depend_eliminate]: 0.0257839 [updatestate_assign_eliminate]: 0.00420793 [updatestate_loads_eliminate]: 0.0124621 [parameter_eliminate]: 6.40005e-06 [a_2]: 0.104157 [accelerated_algorithm]: 0.00689175 [shard]: 2.83006e-06 [meta_shard_fg_expand]: 0.00277822 [shard_inline]: 0.00330599 [merge_send_recv]: 0.00295634 [auto_parallel]: 0.00303619 [parallel]: 1.31e-05 [flash_sp]: 0.00178604 [merge_comm]: 0.00292975 [allreduce_fusion]: 0.00292404 [matmul_add_comm_reduction]: 0.00379851 [allreduce_slice_to_reducescatter]: 1.32003e-06 [virtual_shard_identity]: 0.00319632 [virtual_dataset]: 0.00311945 [get_grad_eliminate_]: 0.00319217 [virtual_output]: 0.00311552 [merge_forward]: 0.00285315 [offload_activation]: 0.00401714 [cell_reuse_recompute_pass]: 3.01993e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00542353 [merge_recompute_call_nodes]: 2.58e-06 [before_grad]: 0.00527187 [set_forward_comm_id_for_comm_node_pass]: 0.00311131 [meta_fg_expand]: 0.00609067 [flash_sp_send_recv_attached]: 6.59004e-06 [receive_attached]: 1.61401e-05 [after_resolve]: 0.00354214 [a_after_grad]: 0.00491516 [renormalize]: 0.854508 [add_forward_monad_depend]: 2.6301e-05 [auto_monad_grad]: 3.92995e-06 [auto_monad_eliminator]: 0.0288577 [cse]: 0.0154933 [a_3]: 0.0207571 [Cycle 2]: 0.392934, [45] [expand_dump_flag]: 4.13996e-06 [switch_simplify]: 0.00280667 [loop_unroll]: 0.00281771 [a_1]: 0.0716335 [invalid_dout_check]: 0.00264377 [recompute_prepare]: 0.0026937 [updatestate_depend_eliminate]: 0.00256338 [updatestate_assign_eliminate]: 0.00258674 [updatestate_loads_eliminate]: 0.00258681 [parameter_eliminate]: 7.09004e-06 [a_2]: 0.0455977 
[accelerated_algorithm]: 0.00359759 [shard]: 3.40003e-06 [meta_shard_fg_expand]: 0.00136975 [shard_inline]: 0.00289618 [merge_send_recv]: 0.00273603 [auto_parallel]: 0.00275538 [parallel]: 1.16001e-05 [flash_sp]: 5.54998e-06 [merge_comm]: 0.0027366 [allreduce_fusion]: 0.00271784 [matmul_add_comm_reduction]: 0.00342553 [allreduce_slice_to_reducescatter]: 1.15996e-06 [virtual_shard_identity]: 0.00281748 [virtual_dataset]: 0.00280161 [get_grad_eliminate_]: 0.00296991 [virtual_output]: 0.00279497 [merge_forward]: 0.00269387 [offload_activation]: 0.00349686 [cell_reuse_recompute_pass]: 3.90992e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00474846 [merge_recompute_call_nodes]: 2.75997e-06 [before_grad]: 0.00472731 [set_forward_comm_id_for_comm_node_pass]: 0.00289821 [meta_fg_expand]: 0.0034542 [flash_sp_send_recv_attached]: 2.94996e-06 [receive_attached]: 3.59991e-06 [after_resolve]: 0.00322512 [a_after_grad]: 0.00435538 [renormalize]: 0.154643 [add_forward_monad_depend]: 1.78601e-05 [auto_monad_grad]: 3.72995e-06 [auto_monad_eliminator]: 0.00497831 [cse]: 0.0122298 [a_3]: 0.0211413 [Cycle 3]: 0.233703, [45] [expand_dump_flag]: 4.05998e-06 [switch_simplify]: 0.00282115 [loop_unroll]: 0.00284455 [a_1]: 0.0713699 [invalid_dout_check]: 0.00222578 [recompute_prepare]: 0.0027315 [updatestate_depend_eliminate]: 0.00253679 [updatestate_assign_eliminate]: 0.00255114 [updatestate_loads_eliminate]: 0.0025802 [parameter_eliminate]: 4.98001e-06 [a_2]: 0.0464696 [accelerated_algorithm]: 0.00359169 [shard]: 3.50992e-06 [meta_shard_fg_expand]: 0.00126769 [shard_inline]: 0.00291594 [merge_send_recv]: 0.00266846 [auto_parallel]: 0.00270651 [parallel]: 1.12901e-05 [flash_sp]: 3.21004e-06 [merge_comm]: 0.00266587 [allreduce_fusion]: 0.00266345 [matmul_add_comm_reduction]: 0.00337427 [allreduce_slice_to_reducescatter]: 1.31002e-06 [virtual_shard_identity]: 0.00285915 [virtual_dataset]: 0.0028431 [get_grad_eliminate_]: 0.00294476 [virtual_output]: 0.00282359 [merge_forward]: 0.00265261 [offload_activation]: 0.00348649 [cell_reuse_recompute_pass]: 3.83996e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00478442 [merge_recompute_call_nodes]: 2.34006e-06 [before_grad]: 0.00474531 [set_forward_comm_id_for_comm_node_pass]: 0.00290058 [meta_fg_expand]: 0.00352899 [flash_sp_send_recv_attached]: 3.00992e-06 [receive_attached]: 4.10003e-06 [after_resolve]: 0.00324938 [a_after_grad]: 0.00440389 [renormalize]: 1.39931e-07 [add_forward_monad_depend]: 4.39002e-06 [auto_monad_grad]: 2.70992e-06 [auto_monad_eliminator]: 0.00441417 [cse]: 0.00813072 [a_3]: 0.0212632 [py_interpret_to_execute_after_opt_a]: 0.00402609 [slice_cell_reuse_recomputed_activation]: 3.40003e-06 [rewriter_after_opt_a]: 0.0362689 [convert_after_rewriter]: 0.00265722 [order_py_execute_after_rewriter]: 0.00226739 [opt_b]: 0.0925965, [1] [Cycle 1]: 0.0925868, [7] [b_1]: 0.0736733 [b_2]: 0.00293123 [updatestate_depend_eliminate]: 0.00254889 [updatestate_assign_eliminate]: 0.00256357 [updatestate_loads_eliminate]: 0.00258817 [renormalize]: 9.60077e-07 [cse]: 0.00816142 [optimize_parallel_all_gather_comm]: 0.00477499 [overlap_param_gather]: 9.36002e-06 [cconv]: 0.00139789 [loop_unroll]: 0.00616787 [opt_after_cconv]: 0.0295374, [1] [Cycle 1]: 0.029529, [7] [c_1]: 0.0133604 [parameter_eliminate]: 4.45999e-06 [updatestate_depend_eliminate]: 0.00290488 [updatestate_assign_eliminate]: 0.00255566 [updatestate_loads_eliminate]: 0.00256555 [cse]: 0.00803355 [renormalize]: 7.89994e-07 [remove_dup_value]: 0.0149032 [tuple_transform]: 0.0180014, [1] [Cycle 1]: 
0.0179922, [2] [d_1]: 0.0179644 [renormalize]: 5.00004e-07 [partial_unused_args_eliminate]: 4.89003e-06 [add_cache_embedding]: 0.00290194 [add_recomputation]: 0.0160136 [cse_after_recomputation]: 0.00479929, [1] [Cycle 1]: 0.00478871, [1] [cse]: 0.00476764 [environ_conv]: 0.00171488 [swap_dp_allreduce_reducescatter]: 0.00268749 [bias_add_comm_swap]: 1.814e-05 [label_micro_interleaved_index]: 7.75e-06 [label_fine_grained_interleaved_index]: 3.32994e-06 [merge_cast_opt]: 1.89e-06 [slice_recompute_activation]: 2.02004e-06 [micro_interleaved_order_control]: 3.29001e-06 [assign_add_opt]: 1.769e-05 [ForceFp32Comm]: 1.14995e-06 [remove_cast_before_assign_add]: 1.40991e-06 [full_micro_interleaved_order_control]: 2.68e-06 [reorder_send_recv_between_fp_bp]: 2.55997e-06 [comm_op_add_attrs]: 1.14006e-06 [add_comm_op_reuse_tag]: 1.04005e-06 [interleave_split_concat_branches]: 1.61002e-06 [interleave_parallel_branches]: 1.23994e-06 [overlap_opt_shard_in_pipeline]: 3.584e-05 [overlap_opt_shard_grad_in_pipeline]: 2.41993e-06 [control_data_broadcast_order]: 0.00466478 [grouped_pairwise_exchange_alltoall]: 1.87999e-06 [offloading_packed_experts]: 0.0010346 [overlap_recompute_and_grad_model_parallel]: 0.00105924 [overlap_grad_matmul_and_grad_allreduce]: 3.70992e-06 [overlap_recompute_allgather_and_fa_grad]: 1.18e-05 [overlap_recompute_comm]: 2.85998e-06 [overlap_grad_ring_attention]: 0.00103148 [overlap_grad_flash_sp]: 0.0050923 [begin_end_overlap_inline]: 1.15007e-06 [split_matmul_comm_elemetwise]: 3.06999e-06 [split_layernorm_comm]: 2.51993e-06 [handle_group_info]: 1.20001e-06 [symbol_engine_optimizer]: 0.0280841, [1] [Cycle 1]: 0.028076, [6] [build]: 0.013166 [elim_shapecalc]: 0.00252589 [elim_not_effective]: 0.00635199 [opt_reshape]: 0.00189026 [fold_const_symbol]: 0.00403246 [renormalize]: 3.7998e-07 [detach_backward]: 3.66999e-06 [pipeline_parallel_scheduler]: 1.96008e-06 [auto_monad_reorder]: 0.00419715 [get_jit_bprop_graph]: 3.11004e-06 [rewriter_after_jit_bprop_graph]: 6.44999e-06 [opt_after_jit_grad]: 0.00686389 [distribtued_split]: 0.00463749 [validate]: 0.00350645 [backend_pass]: 2.62994e-06 [task_emit]: 1.45198 [execute]: 1.073e-05 Sums bootstrap : 0.011997s : 0.09% type_inference : 8.528398s : 61.96% auto_monad : 0.128124s : 0.93% graph_reusing : 0.005094s : 0.04% inline : 0.000005s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.026506s : 0.19% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.013359s : 0.10% parallel-infer-symbol : 0.000008s : 0.00% pre_auto_parallel : 0.032877s : 0.24% insert-virtual-dataset : 0.000010s : 0.00% parallel-infer-symbol-second : 0.000003s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.035269s : 0.26% optimize.rewriter_before_opt_a : 0.138407s : 1.01% optimize.opt_a.expand_dump_flag : 0.002163s : 0.02% optimize.opt_a.switch_simplify : 0.058732s : 0.43% optimize.opt_a.loop_unroll : 0.040539s : 0.29% optimize.opt_a.a_1 : 1.361233s : 9.89% optimize.opt_a.invalid_dout_check : 0.011503s : 0.08% optimize.opt_a.recompute_prepare : 0.012075s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.030884s : 0.22% optimize.opt_a.updatestate_assign_eliminate : 0.009346s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.017629s : 0.13% optimize.opt_a.parameter_eliminate : 0.000018s : 0.00% optimize.opt_a.a_2 : 0.196224s : 1.43% optimize.opt_a.accelerated_algorithm : 0.014081s : 0.10% optimize.opt_a.shard : 0.000010s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.005416s : 0.04% 
[compile pass-timing profile condensed — first compilation round] Tail of the per-pass "Sums" breakdown (notably optimize.opt_a.renormalize : 1.009152s : 7.33% and task_emit : 1.451977s : 10.55%; every other listed pass is below 0.6%) and the "Time group info" groups: substitution 0.331706s / 67729 hits, type_inference 8.495503s (infer 7.337559s, specialize 1.157944s), replace 0.089342s / 8070, match 0.260446s / 8070, predicate 0.310460s / 1635139, func_graph_cloner_run 0.673862s / 7279, others 20.493578s / 253 (dominated by type_inference 8.528466s / 41.62%, optimize 3.550137s / 17.32%, opt_a 3.093916s / 15.10%, task_emit 1.452012s / 7.09%).
[WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:11 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty (identical warning repeated 4 times)
TotalTime = 13.2507, [24] — second compilation round, top-level stages: [bootstrap]: 0.00894477, [type_inference]: 8.4377, [auto_monad]: 0.127967, [add_attr]: 0.119603, [optimize]: 3.52798, [task_emit]: 0.966969, [execute]: 1.213e-05; the remaining top-level stages are each below 0.04s, and the per-cycle opt_a detail ([Cycle 1]: 2.36984 with [renormalize]: 0.9039, [Cycle 2]: 0.419387, [Cycle 3]: 0.252202) is condensed.
[compile pass-timing profile condensed — second compilation round] Per-pass "Sums": type_inference 8.437704s (64.10%), optimize.opt_a.a_1 1.226465s (9.32%), optimize.opt_a.renormalize 1.068820s (8.12%), task_emit 0.966969s (7.35%), all other passes below 1.5% each. "Time group info" groups: substitution 0.302871s / 64790, type_inference 8.404708s (infer 7.152209s, specialize 1.252499s), replace 0.073387s / 7390, match 0.234452s / 7390, predicate 0.299035s / 1525361, func_graph_cloner_run 0.729199s / 6967, others 19.808329s / 253 (type_inference 8.437777s / 42.60%, optimize 3.528000s / 17.81%, opt_a 3.041490s / 15.35%, task_emit 0.967003s / 4.88%).
[WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:26 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty (identical warning repeated 5 times)
2025-07-24 11:03:26,510 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 31.20920157432556 s; generated tokens: 8 tokens; generate speed: 0.25633465761524793 tokens/s
2025-07-24 11:03:26,511 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0020928382873535156 s; prefill predict time: 16.69329285621643 s; prefill post time: 0.06165051460266113 s; decode prepare time: 0.0011292185102190291 s; decode predict time: 0.005893349647521973 s; decode post time: 0.005744048527308873 s
2025-07-24 11:03:26,516 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines.
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.271 seconds.
Prefix dict has been built successfully.
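The generate-speed figures reported by text_generator.py are simply generated tokens divided by total wall-clock time (e.g. 8 tokens / 31.20920157432556 s ≈ 0.2563 tokens/s). Below is a minimal sketch (regex over the log file) of how those summary lines could be cross-checked from a captured worker log; the log path and function name are illustrative assumptions, not part of the test code:

```python
import re

# Shape assumed from the INFO lines above:
# "total time: <float> s; generated tokens: <int> tokens; generate speed: <float> tokens/s"
SPEED_RE = re.compile(
    r'total time: ([\d.]+) s; generated tokens: (\d+) tokens; '
    r'generate speed: ([\d.]+) tokens/s')


def check_generate_speed(log_file_path, rel_tol=1e-3):
    """Recompute tokens/s for every generation summary line and flag mismatches."""
    checks = []
    with open(log_file_path, 'r') as file:
        for line in file:
            match = SPEED_RE.search(line)
            if not match:
                continue
            total_time = float(match.group(1))
            tokens = int(match.group(2))
            reported = float(match.group(3))
            recomputed = tokens / total_time
            # The reported speed should equal tokens/time up to float formatting.
            checks.append((tokens, total_time, reported,
                           abs(recomputed - reported) <= rel_tol * reported))
    return checks


if __name__ == '__main__':
    # Hypothetical path to the worker log quoted in this dump.
    for check in check_generate_speed('parallel_qwen2_0_5b_predict_dp2_mp2/worker_3.log'):
        print(check)
```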
parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> calculate sim is:1.0 2025-07-24 11:03:27,799 - mindformers./output/log[mindformers/generation/text_generator.py:735] - INFO - The batch is: 4, and the split_size is: 2, and the global_rank_id is: 2, and the dp_rank_id is: 1 and start is: 2, and stop is: 4 2025-07-24 11:03:27,802 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:27,803 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:27,803 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:27,804 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:27.810.018 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:27.810.192 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:27.810.699 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:27.817.460 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:27.830.441 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:27.830.603 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:27.830.974 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env 
INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:27.831.061 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:27.831.236 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:28,764 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 0.9593923091888428 s; generated tokens: 208 tokens; generate speed: 216.8039059807161 tokens/s 2025-07-24 11:03:28,764 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0016512870788574219 s; prefill predict time: 0.011078357696533203 s; prefill post time: 0.008404016494750977 s; decode prepare time: 0.0008294767546422273 s; decode predict time: 0.004994121252321729 s; decode post time: 0.003184698160412242 s 2025-07-24 11:03:28,769 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] return quick_sort(left) + middle + quick_sort(right) # 示例输入 arr = [ parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr else: pivot = arr[0] left = [x for x in arr[1:] if x < pivot] right = [x for x in arr[1:] if x >= pivot] return quick_sort(left) + [pivot] + quick_sort(right) # 示例输入 arr = [3,6,8,1 calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] return quick_sort(left) + middle + quick_sort(right) # 示例输入 arr = [ parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr else: pivot = arr[0] left = [x for x in arr[1:] if x < pivot] right = [x for x in arr[1:] if x >= pivot] return quick_sort(left) + [pivot] + quick_sort(right) # 示例输入 arr = [3,6,8,1 calculate sim is:1.0 2025-07-24 11:03:28,785 - mindformers./output/log[mindformers/generation/text_generator.py:735] - INFO - The batch is: 8, and the split_size is: 4, and the global_rank_id is: 2, and the dp_rank_id is: 1 and start is: 4, and stop is: 8 2025-07-24 11:03:28,786 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 
'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:28,787 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:28,787 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:28,788 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:28.792.680 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:28.792.873 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:28.793.424 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:28.801.860 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:28.812.588 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:28.812.766 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:28.813.208 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:28.813.321 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806939,fffdb0ff9060,python):2025-07-24-11:03:28.813.536 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:30,031 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.242950439453125 s; generated tokens: 408 tokens; generate speed: 328.25122148837437 tokens/s 2025-07-24 11:03:30,032 - 
2025-07-24 11:03:30,032 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.001247406005859375 s; prefill predict time: 0.012296199798583984 s; prefill post time: 0.006251096725463867 s; decode prepare time: 0.0009136648461370185 s; decode predict time: 0.005651011466979981 s; decode post time: 0.0054396780410615525 s
2025-07-24 11:03:30,037 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines.
parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to
parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to
calculate sim is:1.0
[the identical output_text/answer pair and its "calculate sim is:1.0" result are printed three more times]
large_models/parallel_qwen2_0_5b_predict_dp2_mp2/worker_3.log0000644000175100017500000040233115040321202024710 0ustar jenkinsHwHiAiUser
/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero.
  setattr(self, word, getattr(machar, word).flat[0])
/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero.
  return self._float_to_str(self.smallest_subnormal)
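Stepping back to the "calculate sim is:1.0" checks above: they report a text-similarity score between each generated output and its reference answer. The exact metric used by the test harness is not shown in this log; a minimal stand-in, assuming a plain sequence-similarity ratio, would be:

from difflib import SequenceMatcher

def calculate_sim(output_text: str, answer: str) -> float:
    """Hypothetical stand-in for the harness's similarity check.
    Identical strings score 1.0, as every pair does in the log above."""
    return SequenceMatcher(None, output_text, answer).ratio()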
[the same pair of numpy getlimits UserWarnings is printed once more]
2025-07-24 11:02:48,490 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config moe_config is empty.
2025-07-24 11:02:48,524 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:48.525.932 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead.
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:48.526.757 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead.
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:48.527.214 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead.
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:48.527.328 [mindspore/context.py:1655] For 'context.set_context', 'enable_graph_kernel' parameter is deprecated, and will be removed in the next version. Please use jit_config={'jit_level': 'O1'} instead.
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:48.527.461 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead.
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:48.527.610 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead.
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:48.527.731 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead.
2025-07-24 11:02:48,527 - mindformers./output/log[mindformers/core/context/parallel.py:73] - INFO - full_batch is set to False for non-parallel modes
[WARNING] DISTRIBUTED(3806956,ffff1ded0060,python):2025-07-24-11:02:48.529.908 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:59684 to 127.0.0.1:8240 is successfully created. System errno: Success
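Each deprecation warning above names its replacement API. A minimal sketch of the migration they suggest (the replacement function names are taken from the warnings themselves; the argument values and keyword names are illustrative assumptions that may differ across MindSpore versions):

import mindspore as ms

# Replacements named in the warnings above; arguments are illustrative only.
ms.set_device("Ascend")                  # replaces set_context(device_target=...)
ms.runtime.set_memory(max_size="58GB")   # replaces set_context(max_device_memory=...)
ms.set_recursion_limit(10000)            # replaces set_context(max_call_depth=...)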
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:48.529.893 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 20 source: 127.0.0.1:59684, destination: 127.0.0.1:8240
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:48.530.076 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:59686, destination: 127.0.0.1:8240
[WARNING] DISTRIBUTED(3806956,ffff1eed2060,python):2025-07-24-11:02:48.530.105 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:59686 to 127.0.0.1:8240 is successfully created. System errno: Success
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:48.530.116 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8240 to be connected...Retry number: 1
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:49.030.613 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200).
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:49.530.711 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200).
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:50.030.800 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/1200).
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:50.530.906 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized.
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:50.530.937 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 3 rank id: 3
[MS_ALLOC_CONF]Runtime config: enable_vmm:False
[WARNING] DEVICE(3806956,ffffad902f30,python):2025-07-24-11:02:50.751.520 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:155] Initialize] Reserved memory size for other components(2101346304) is less than recommend size(4068252416), It may lead to Out Of Memory in HCCL or other components, Please double check context key 'variable_memory_max_size'/'max_device_memory'
[WARNING] DEVICE(3806956,ffffad902f30,python):2025-07-24-11:02:51.982.606 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:84] Initialize] Loading LCCL because env MS_ENABLE_LCCL is set to on. Pay attention that LCCL only supports communication group within single node in KernelByKernel for now.
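The three BuildCluster retries above arrive at roughly 0.5 s intervals with a cap of 1200 attempts before the cluster comes up on the fourth poll. A generic sketch of that poll-until-ready pattern (the function and its names are illustrative, not MindSpore API):

import time

def wait_until_ready(check, interval_s=0.5, max_retries=1200):
    """Poll check() until it succeeds, mirroring the retry(n/1200) lines above."""
    for attempt in range(1, max_retries + 1):
        if check():
            return attempt  # e.g. the cluster here initialized on the 4th poll
        time.sleep(interval_s)
    raise TimeoutError("Topology build timed out")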
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:51.986.353 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3}, async: 1, submit_now: 1
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:51.986.598 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group
[WARNING] DEVICE(3806956,fffec5a04060,python):2025-07-24-11:02:51.986.804 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:8240, node_rank:2130706433, total_rank_size:4, local_rank_size4
[WARNING] HCCL_ADPT(3806956,fffec5a04060,python):2025-07-24-11:02:51.986.885 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo
[WARNING] HCCL_ADPT(3806956,fffec5a04060,python):2025-07-24-11:02:51.986.909 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:632] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package.
[WARNING] DEVICE(3806956,fffec5a04060,python):2025-07-24-11:02:51.986.926 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group
[WARNING] DISTRIBUTED(3806956,fffec5a04060,python):2025-07-24-11:02:51.987.563 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group
2025-07-24 11:02:51,988 - mindformers./output/log[mindformers/tools/utils.py:185] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_3.ckpt'
2025-07-24 11:02:51,990 - mindformers./output/log[mindformers/core/context/build_context.py:383] - INFO - cann workqueue cpus: [0, 1, 2, ..., 191]
2025-07-24 11:02:51,990 - mindformers./output/log[mindformers/core/context/build_context.py:387] - WARNING - CANN use cpus: [0, 1, 2, ..., 191], model get empty cpu list, disable binding cores
2025-07-24 11:02:51,990 - mindformers./output/log[mindformers/core/context/build_context.py:395] - INFO - cpu_affinity, rank_id: 3, device_num: 4
2025-07-24 11:02:51,991 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': None, 'n_group': None, 'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True}
2025-07-24 11:02:51,991 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1}
[WARNING] DEVICE(3806956,fffec5203060,python):2025-07-24-11:02:52.004.511 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0
tp_group is:True
dp_group is:True
2025-07-24 11:02:52,506 - mindformers./output/log[mindformers/parallel_core/inference/parallel_state.py:358] - INFO - expert_model_parallel_size(1) is not equal to world_size(4), so we will use 4 as the MOE_tensor_parallel_size.
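With the parallel_config above (data_parallel: 2, model_parallel: 2 over 4 ranks), each rank's tensor-parallel and data-parallel peers follow directly from the layout. A small sketch of the usual innermost-tensor-parallel arrangement; this is an assumed convention, but it reproduces the dp-1-3 and tp-2-3 groups this rank (rank 3) creates next in the log:

def peer_groups(rank: int, world_size: int = 4, tp: int = 2):
    """Innermost-tp layout: tp groups are contiguous rank blocks,
    dp groups stride across them."""
    tp_group = [(rank // tp) * tp + i for i in range(tp)]
    dp_group = [i * tp + rank % tp for i in range(world_size // tp)]
    return dp_group, tp_group

print(peer_groups(3))  # -> ([1, 3], [2, 3]), i.e. dp-1-3 and tp-2-3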
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:52.508.768 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: dp-1-3 [const vector]{1, 3}, async: 0, submit_now: 1
[WARNING] DEVICE(3806956,fffec5203060,python):2025-07-24-11:02:52.563.315 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group
[WARNING] DISTRIBUTED(3806956,fffec5203060,python):2025-07-24-11:02:52.563.453 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: hccl_world_group
[WARNING] DISTRIBUTED(3806956,fffec5203060,python):2025-07-24-11:02:52.563.482 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: hccl_world_group end.
[WARNING] DISTRIBUTED(3806956,fffec5a04060,python):2025-07-24-11:02:52.563.569 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: hccl_world_group
[WARNING] DISTRIBUTED(3806956,fffec5a04060,python):2025-07-24-11:02:52.563.884 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: dp-1-3
[WARNING] DEVICE(3806956,fffec5203060,python):2025-07-24-11:02:52.575.964 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for dp-1-3, hcclBufferSize is 200 MB, hcclDeterministic is 0
[WARNING] DEVICE(3806956,fffec5203060,python):2025-07-24-11:02:52.811.056 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for dp-1-3
[WARNING] DISTRIBUTED(3806956,fffec5203060,python):2025-07-24-11:02:52.811.157 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: dp-1-3
[WARNING] DISTRIBUTED(3806956,fffec5203060,python):2025-07-24-11:02:52.811.183 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: dp-1-3 end.
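On the Python side, named groups like dp-1-3 are created from a rank list. A minimal sketch using MindSpore's communication API (the group names and rank lists mirror the log; the surrounding setup is an assumption, not the test suite's actual code):

from mindspore.communication import init, create_group, get_rank

init()  # HCCL backend on Ascend
rank = get_rank()
dp_ranks, tp_ranks = [1, 3], [2, 3]  # the groups rank 3 joins in this log
if rank in dp_ranks:
    create_group("dp-1-3", dp_ranks)
if rank in tp_ranks:
    create_group("tp-2-3", tp_ranks)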
[WARNING] DISTRIBUTED(3806956,fffec5a04060,python):2025-07-24-11:02:52.811.244 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: dp-1-3
[WARNING] DISTRIBUTED(3806956,ffffad902f30,python):2025-07-24-11:02:52.811.424 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: tp-2-3 [const vector]{2, 3}, async: 0, submit_now: 1
[WARNING] DISTRIBUTED(3806956,fffec5a04060,python):2025-07-24-11:02:52.811.807 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: tp-2-3
[WARNING] DEVICE(3806956,fffec5203060,python):2025-07-24-11:02:52.823.533 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for tp-2-3, hcclBufferSize is 200 MB, hcclDeterministic is 0
[WARNING] DEVICE(3806956,fffec5203060,python):2025-07-24-11:02:53.058.886 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for tp-2-3
[WARNING] DISTRIBUTED(3806956,fffec5203060,python):2025-07-24-11:02:53.059.067 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: tp-2-3
[WARNING] DISTRIBUTED(3806956,fffec5203060,python):2025-07-24-11:02:53.059.097 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: tp-2-3 end.
[WARNING] DISTRIBUTED(3806956,fffec5a04060,python):2025-07-24-11:02:53.059.190 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: tp-2-3
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:53.628.76 [mindspore/ops/primitive.py:220] The in_strategy/in_layout of the operator in your network will not take effect in stand_alone mode. This means the shard function called in the network is ignored. If you want to enable it, please use semi auto or auto parallel mode by context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL) or context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL)
data_parallel_group:dp-1-3
tensor_model_parallel_group:tp-2-3
2025-07-24 11:02:53,727 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None.
2025-07-24 11:02:53,728 - mindformers./output/log[/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:178] - INFO - ----------------Transform and load checkpoint----------------
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:54.833.081 [mindspore/train/serialization.py:333] The type of model.layers.0.attention_norm.weight:BFloat16 in 'parameter_dict' is different from the type of it in 'net':Float32, then the type convert from BFloat16 to Float32 in the network. May consume additional memory and time
[the same BFloat16-to-Float32 conversion warning is printed for attention_norm.weight and ffn_norm.weight of model.layers.0 through model.layers.23, and for model.norm_out.weight]
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:54.862.301 [mindspore/train/serialization.py:1789] For 'load_param_into_net', 48 parameters in the 'net' are not loaded, because they are not in the 'parameter_dict', please check whether the network structure is consistent when training and loading checkpoint.
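The conversion warnings above come from loading bf16 checkpoint tensors into an fp32-typed network, which load_param_into_net upcasts one by one. Upcasting before loading would avoid the per-parameter warnings; a minimal sketch (the checkpoint path and the upcast-everything policy are assumptions for illustration):

import mindspore as ms

param_dict = ms.load_checkpoint("qwen2_5_0_5b.ckpt")  # illustrative path
for name, param in param_dict.items():
    if param.dtype == ms.bfloat16:
        # Upcast bf16 tensors (the norm weights in this log) to match the fp32 net.
        param_dict[name] = ms.Parameter(param.astype(ms.float32), name=name)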
[WARNING] ME(3806956:281473593650992,MainProcess):2025-07-24-11:02:54.862.437 [mindspore/train/serialization.py:1793] ['model.layers.N.attention.paged_attention_mgr.key_cache' and 'model.layers.N.attention.paged_attention_mgr.value_cache' for N in 0..23] are not loaded.
param_not_load: [the same 48 paged_attention_mgr key_cache/value_cache parameters listed above], ckpt_not_load: []
2025-07-24 11:02:55,257 - mindformers./output/log[mindformers/generation/text_generator.py:726] - WARNING - batch size {batch} can not be divisible by data_parallel {data_parallel}, and would not split.
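All 48 skipped parameters are paged-attention KV-cache buffers, which are allocated at runtime rather than stored in the checkpoint, so ckpt_not_load being empty means the load itself was complete. A small helper a test could use to assert that nothing else failed to load (a hypothetical helper, not from this test suite):

CACHE_SUFFIXES = ("paged_attention_mgr.key_cache", "paged_attention_mgr.value_cache")

def only_kv_cache_skipped(param_not_load):
    """True if every unloaded parameter is a runtime KV-cache buffer."""
    return all(name.endswith(CACHE_SUFFIXES) for name in param_not_load)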
2025-07-24 11:02:55,259 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:02:55,260 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:02:55,260 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:02:55,261 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. TotalTime = 14.4926, [24] [bootstrap]: 0.0118745 [type_inference]: 8.81516 [auto_monad]: 0.144852 [graph_reusing]: 0.00649425 [inline]: 7.111e-06 [add_attr]: 0.130451, [1] [add_attr_with_inline]: 0.130429, [1] [Cycle 1]: 0.0530556, [2] [tag_attr]: 0.0353082 [meta_addattr_fg_expand]: 0.0175708 [parallel-infer-symbol]: 6.98003e-06 [pre_auto_parallel]: 0.04376 [insert-virtual-dataset]: 1.053e-05 [parallel-infer-symbol-second]: 3.09001e-06 [dataset_repeat_opt]: 3.10002e-06 [pipeline_split]: 2.24996e-06 [optimize]: 3.83924, [53] [py_interpret_to_execute]: 0.0459896 [rewriter_before_opt_a]: 0.187338 [opt_a]: 3.30152, [3] [Cycle 1]: 2.64041, [45] [expand_dump_flag]: 0.00265232 [switch_simplify]: 0.055028 [loop_unroll]: 0.0374457 [a_1]: 1.28661 [invalid_dout_check]: 0.00815853 [recompute_prepare]: 0.00763363 [updatestate_depend_eliminate]: 0.0298818 [updatestate_assign_eliminate]: 0.00571489 [updatestate_loads_eliminate]: 0.0140643 [parameter_eliminate]: 1.144e-05 [a_2]: 0.106428 [accelerated_algorithm]: 0.00725669 [shard]: 3.44007e-06 [meta_shard_fg_expand]: 0.00301262 [shard_inline]: 0.00334455 [merge_send_recv]: 0.00339658 [auto_parallel]: 0.00331145 [parallel]: 3.8601e-05 [flash_sp]: 0.00189944 [merge_comm]: 0.00331369 [allreduce_fusion]: 0.00330393 [matmul_add_comm_reduction]: 0.00413807 [allreduce_slice_to_reducescatter]: 1.39e-06 [virtual_shard_identity]: 0.00334452 [virtual_dataset]: 0.00327962 [get_grad_eliminate_]: 0.00331064 [virtual_output]: 0.00323913 [merge_forward]: 0.0033836 [offload_activation]: 0.00447133 [cell_reuse_recompute_pass]: 5.00993e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00556979 [merge_recompute_call_nodes]: 3.00002e-06 [before_grad]: 0.00557263 [set_forward_comm_id_for_comm_node_pass]: 0.00372511 [meta_fg_expand]: 0.00909856 [flash_sp_send_recv_attached]: 1.039e-05 [receive_attached]: 1.81199e-05 [after_resolve]: 0.00371991 [a_after_grad]: 0.00515938 [renormalize]: 0.924491 [add_forward_monad_depend]: 2.069e-05 [auto_monad_grad]: 3.73996e-06 [auto_monad_eliminator]: 0.0287914 [cse]: 0.0224135 [a_3]: 0.0226589 [Cycle 2]: 0.415479, [45] [expand_dump_flag]: 5.61995e-06 [switch_simplify]: 0.00292966 [loop_unroll]: 0.00289199 [a_1]: 0.0711955 [invalid_dout_check]: 0.00374521 [recompute_prepare]: 0.00280388 [updatestate_depend_eliminate]: 0.00253089 [updatestate_assign_eliminate]: 0.00229435 [updatestate_loads_eliminate]: 0.00226621 [parameter_eliminate]: 8.15e-06 [a_2]: 0.048201 
[accelerated_algorithm]: 0.00375399 [shard]: 3.56999e-06 [meta_shard_fg_expand]: 0.00155204 [shard_inline]: 0.00302404 [merge_send_recv]: 0.00317443 [auto_parallel]: 0.00289153 [parallel]: 1.37601e-05 [flash_sp]: 2.205e-05 [merge_comm]: 0.00296266 [allreduce_fusion]: 0.00309566 [matmul_add_comm_reduction]: 0.0036808 [allreduce_slice_to_reducescatter]: 1.30001e-06 [virtual_shard_identity]: 0.00301489 [virtual_dataset]: 0.00302113 [get_grad_eliminate_]: 0.00296758 [virtual_output]: 0.00290804 [merge_forward]: 0.00309924 [offload_activation]: 0.00367811 [cell_reuse_recompute_pass]: 4.80004e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00497554 [merge_recompute_call_nodes]: 3.21993e-06 [before_grad]: 0.00506171 [set_forward_comm_id_for_comm_node_pass]: 0.00337846 [meta_fg_expand]: 0.00407923 [flash_sp_send_recv_attached]: 4.50993e-06 [receive_attached]: 3.94997e-06 [after_resolve]: 0.00336497 [a_after_grad]: 0.00445496 [renormalize]: 0.165102 [add_forward_monad_depend]: 1.57601e-05 [auto_monad_grad]: 3.74997e-06 [auto_monad_eliminator]: 0.00632746 [cse]: 0.0137127 [a_3]: 0.0221602 [Cycle 3]: 0.245591, [45] [expand_dump_flag]: 5.47001e-06 [switch_simplify]: 0.00290616 [loop_unroll]: 0.00290222 [a_1]: 0.0707252 [invalid_dout_check]: 0.00347916 [recompute_prepare]: 0.00305297 [updatestate_depend_eliminate]: 0.00292857 [updatestate_assign_eliminate]: 0.00295761 [updatestate_loads_eliminate]: 0.00300271 [parameter_eliminate]: 9.66096e-06 [a_2]: 0.0478227 [accelerated_algorithm]: 0.00372645 [shard]: 3.73996e-06 [meta_shard_fg_expand]: 0.00155314 [shard_inline]: 0.00298058 [merge_send_recv]: 0.00321075 [auto_parallel]: 0.00320859 [parallel]: 1.51399e-05 [flash_sp]: 2.84996e-06 [merge_comm]: 0.00305082 [allreduce_fusion]: 0.00298664 [matmul_add_comm_reduction]: 0.00363351 [allreduce_slice_to_reducescatter]: 1.06997e-06 [virtual_shard_identity]: 0.00299888 [virtual_dataset]: 0.00299801 [get_grad_eliminate_]: 0.00297347 [virtual_output]: 0.00291687 [merge_forward]: 0.00306405 [offload_activation]: 0.00376228 [cell_reuse_recompute_pass]: 5.63997e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00494701 [merge_recompute_call_nodes]: 3.39001e-06 [before_grad]: 0.00489046 [set_forward_comm_id_for_comm_node_pass]: 0.00338915 [meta_fg_expand]: 0.00445828 [flash_sp_send_recv_attached]: 4.12005e-06 [receive_attached]: 3.66999e-06 [after_resolve]: 0.00335591 [a_after_grad]: 0.00444312 [renormalize]: 4.10015e-07 [add_forward_monad_depend]: 8.85001e-06 [auto_monad_grad]: 3.54997e-06 [auto_monad_eliminator]: 0.00513885 [cse]: 0.00944802 [a_3]: 0.0216101 [py_interpret_to_execute_after_opt_a]: 0.00498387 [slice_cell_reuse_recomputed_activation]: 3.93996e-06 [rewriter_after_opt_a]: 0.0396916 [convert_after_rewriter]: 0.00360546 [order_py_execute_after_rewriter]: 0.00289053 [opt_b]: 0.0991085, [1] [Cycle 1]: 0.0990901, [7] [b_1]: 0.0767877 [b_2]: 0.00299894 [updatestate_depend_eliminate]: 0.00306282 [updatestate_assign_eliminate]: 0.00310557 [updatestate_loads_eliminate]: 0.00291917 [renormalize]: 1.70001e-06 [cse]: 0.0099927 [optimize_parallel_all_gather_comm]: 0.00515095 [overlap_param_gather]: 1.12699e-05 [cconv]: 0.00155613 [loop_unroll]: 0.0040359 [opt_after_cconv]: 0.0326599, [1] [Cycle 1]: 0.0326439, [7] [c_1]: 0.0132729 [parameter_eliminate]: 8.30006e-06 [updatestate_depend_eliminate]: 0.00349854 [updatestate_assign_eliminate]: 0.00289922 [updatestate_loads_eliminate]: 0.00282287 [cse]: 0.0099647 [renormalize]: 1.36998e-06 [remove_dup_value]: 0.0164786 [tuple_transform]: 0.0185941, [1] [Cycle 1]: 
0.018577, [2] [d_1]: 0.0185119 [renormalize]: 1.25996e-06 [partial_unused_args_eliminate]: 8.97003e-06 [add_cache_embedding]: 0.00342157 [add_recomputation]: 0.0177943 [cse_after_recomputation]: 0.00555287, [1] [Cycle 1]: 0.00553485, [1] [cse]: 0.00549441 [environ_conv]: 0.00221462 [swap_dp_allreduce_reducescatter]: 0.00294526 [bias_add_comm_swap]: 4.15999e-06 [label_micro_interleaved_index]: 1.106e-05 [label_fine_grained_interleaved_index]: 2.01099e-05 [merge_cast_opt]: 1.71002e-06 [slice_recompute_activation]: 2.37999e-06 [micro_interleaved_order_control]: 3.24997e-06 [assign_add_opt]: 2.53899e-05 [ForceFp32Comm]: 9.2003e-07 [remove_cast_before_assign_add]: 1.042e-05 [full_micro_interleaved_order_control]: 2.93995e-06 [reorder_send_recv_between_fp_bp]: 2.58e-06 [comm_op_add_attrs]: 1.14995e-06 [add_comm_op_reuse_tag]: 1.03994e-06 [interleave_split_concat_branches]: 1.54995e-06 [interleave_parallel_branches]: 1.24006e-06 [overlap_opt_shard_in_pipeline]: 1.125e-05 [overlap_opt_shard_grad_in_pipeline]: 2.53005e-06 [control_data_broadcast_order]: 0.00508306 [grouped_pairwise_exchange_alltoall]: 1.73005e-06 [offloading_packed_experts]: 0.00112491 [overlap_recompute_and_grad_model_parallel]: 0.00112404 [overlap_grad_matmul_and_grad_allreduce]: 3.29001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.75007e-06 [overlap_recompute_comm]: 2.46998e-06 [overlap_grad_ring_attention]: 0.00113928 [overlap_grad_flash_sp]: 0.00520774 [begin_end_overlap_inline]: 1.23004e-06 [split_matmul_comm_elemetwise]: 2.89991e-06 [split_layernorm_comm]: 2.17999e-06 [handle_group_info]: 1.30001e-06 [symbol_engine_optimizer]: 0.0290717, [1] [Cycle 1]: 0.029055, [6] [build]: 0.0132118 [elim_shapecalc]: 0.00265666 [elim_not_effective]: 0.00687012 [opt_reshape]: 0.001908 [fold_const_symbol]: 0.00421971 [renormalize]: 1.05996e-06 [detach_backward]: 4.08001e-06 [pipeline_parallel_scheduler]: 3.50003e-06 [auto_monad_reorder]: 0.00468233 [get_jit_bprop_graph]: 3.17011e-06 [rewriter_after_jit_bprop_graph]: 1.14799e-05 [opt_after_jit_grad]: 0.00691974 [distribtued_split]: 0.00507995 [validate]: 0.0036957 [backend_pass]: 3.42994e-06 [task_emit]: 1.47572 [execute]: 1.122e-05 Sums bootstrap : 0.011875s : 0.08% type_inference : 8.815158s : 61.19% auto_monad : 0.144852s : 1.01% graph_reusing : 0.006494s : 0.05% inline : 0.000007s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.035308s : 0.25% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.017571s : 0.12% parallel-infer-symbol : 0.000007s : 0.00% pre_auto_parallel : 0.043760s : 0.30% insert-virtual-dataset : 0.000011s : 0.00% parallel-infer-symbol-second : 0.000003s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.045990s : 0.32% optimize.rewriter_before_opt_a : 0.187338s : 1.30% optimize.opt_a.expand_dump_flag : 0.002663s : 0.02% optimize.opt_a.switch_simplify : 0.060864s : 0.42% optimize.opt_a.loop_unroll : 0.043240s : 0.30% optimize.opt_a.a_1 : 1.428528s : 9.92% optimize.opt_a.invalid_dout_check : 0.015383s : 0.11% optimize.opt_a.recompute_prepare : 0.013490s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.035341s : 0.25% optimize.opt_a.updatestate_assign_eliminate : 0.010967s : 0.08% optimize.opt_a.updatestate_loads_eliminate : 0.019333s : 0.13% optimize.opt_a.parameter_eliminate : 0.000029s : 0.00% optimize.opt_a.a_2 : 0.202452s : 1.41% optimize.opt_a.accelerated_algorithm : 0.014737s : 0.10% optimize.opt_a.shard : 0.000011s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.006118s : 0.04% 
optimize.opt_a.shard_inline : 0.009349s : 0.06% optimize.opt_a.merge_send_recv : 0.009782s : 0.07% optimize.opt_a.auto_parallel : 0.009412s : 0.07% optimize.opt_a.parallel : 0.000068s : 0.00% optimize.opt_a.flash_sp : 0.001924s : 0.01% optimize.opt_a.merge_comm : 0.009327s : 0.06% optimize.opt_a.allreduce_fusion : 0.009386s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.011452s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.009358s : 0.06% optimize.opt_a.virtual_dataset : 0.009299s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.009252s : 0.06% optimize.opt_a.virtual_output : 0.009064s : 0.06% optimize.opt_a.merge_forward : 0.009547s : 0.07% optimize.opt_a.offload_activation : 0.011912s : 0.08% optimize.opt_a.cell_reuse_recompute_pass : 0.000015s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.015492s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000010s : 0.00% optimize.opt_a.before_grad : 0.015525s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.010493s : 0.07% optimize.opt_a.meta_fg_expand : 0.017636s : 0.12% optimize.opt_a.flash_sp_send_recv_attached : 0.000019s : 0.00% optimize.opt_a.receive_attached : 0.000026s : 0.00% optimize.opt_a.after_resolve : 0.010441s : 0.07% optimize.opt_a.a_after_grad : 0.014057s : 0.10% optimize.opt_a.renormalize : 1.089593s : 7.56% optimize.opt_a.add_forward_monad_depend : 0.000045s : 0.00% optimize.opt_a.auto_monad_grad : 0.000011s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.040258s : 0.28% optimize.opt_a.cse : 0.045574s : 0.32% optimize.opt_a.a_3 : 0.066429s : 0.46% optimize.py_interpret_to_execute_after_opt_a : 0.004984s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.00% optimize.rewriter_after_opt_a : 0.039692s : 0.28% optimize.convert_after_rewriter : 0.003605s : 0.03% optimize.order_py_execute_after_rewriter : 0.002891s : 0.02% optimize.opt_b.b_1 : 0.076788s : 0.53% optimize.opt_b.b_2 : 0.002999s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.003063s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.003106s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.002919s : 0.02% optimize.opt_b.renormalize : 0.000002s : 0.00% optimize.opt_b.cse : 0.009993s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.005151s : 0.04% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.001556s : 0.01% optimize.loop_unroll : 0.004036s : 0.03% optimize.opt_after_cconv.c_1 : 0.013273s : 0.09% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.003499s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.002899s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.002823s : 0.02% optimize.opt_after_cconv.cse : 0.009965s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.016479s : 0.11% optimize.tuple_transform.d_1 : 0.018512s : 0.13% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.partial_unused_args_eliminate : 0.000009s : 0.00% optimize.add_cache_embedding : 0.003422s : 0.02% optimize.add_recomputation : 0.017794s : 0.12% optimize.cse_after_recomputation.cse : 0.005494s : 0.04% optimize.environ_conv : 0.002215s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.002945s : 0.02% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000011s : 0.00% 
optimize.label_fine_grained_interleaved_index : 0.000020s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000025s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000011s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.005083s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.001125s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.001124s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000003s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.001139s : 0.01% optimize.overlap_grad_flash_sp : 0.005208s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.013212s : 0.09% optimize.symbol_engine_optimizer.elim_shapecalc : 0.002657s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.006870s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.001908s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.004220s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000004s : 0.00% auto_monad_reorder : 0.004682s : 0.03% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000011s : 0.00% opt_after_jit_grad : 0.006920s : 0.05% distribtued_split : 0.005080s : 0.04% validate : 0.003696s : 0.03% backend_pass : 0.000003s : 0.00% task_emit : 1.475722s : 10.24% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 
0.399448 67729 1.12% : 0.004486s : 1231: substitution.arithmetic_simplify 1.96% : 0.007814s : 1800: substitution.cast_eliminate 0.05% : 0.000195s : 124: substitution.depend_value_elim 0.28% : 0.001122s : 1449: substitution.elim_not_effective 0.30% : 0.001217s : 1234: substitution.float_tuple_getitem_switch 0.36% : 0.001450s : 1135: substitution.fold_const_symbol 0.36% : 0.001421s : 1963: substitution.graph_param_transform 79.48% : 0.317499s : 5675: substitution.inline 0.70% : 0.002781s : 5144: substitution.j_node_and_user_rematch 0.96% : 0.003821s : 1354: substitution.less_batch_normalization 0.36% : 0.001450s : 2708: substitution.load_eliminater 0.42% : 0.001681s : 1473: substitution.minmaximum_grad 0.04% : 0.000153s : 196: substitution.reduce_all_const_elim 0.68% : 0.002725s : 5144: substitution.remove_not_recompute_node 0.14% : 0.000542s : 1029: substitution.replace_old_param 1.10% : 0.004400s : 1449: substitution.reshape_eliminate 0.37% : 0.001466s : 596: substitution.switch_simplify 0.01% : 0.000032s : 6: substitution.transpose_eliminate 1.14% : 0.004534s : 1909: substitution.tuple_list_convert_item_index_to_positive 0.60% : 0.002415s : 2029: substitution.tuple_list_get_item_const_eliminator 1.13% : 0.004502s : 2029: substitution.tuple_list_get_item_depend_reorder 2.22% : 0.008876s : 3684: substitution.tuple_list_get_item_eliminator 0.81% : 0.003248s : 2029: substitution.tuple_list_get_set_item_eliminator 2.36% : 0.009410s : 11069: substitution.updatestate_pure_node_eliminater 3.05% : 0.012186s : 11269: substitution.updatestate_useless_node_eliminater 0.01% : 0.000022s : 1: substitution.value_based_eliminate ------[type_inference.] 8.777474 2 86.61% : 7.601853s : 1: type_inference.infer 13.39% : 1.175621s : 1: type_inference.specialize ------[replace.] 0.101353 8070 7.12% : 0.007217s : 676: replace.cast_eliminate 0.24% : 0.000248s : 24: replace.depend_value_elim 1.91% : 0.001932s : 169: replace.elim_not_effective 69.62% : 0.070563s : 5675: replace.inline 1.87% : 0.001897s : 170: replace.reshape_eliminate 9.39% : 0.009512s : 596: replace.switch_simplify 1.47% : 0.001494s : 120: replace.tuple_list_get_item_depend_reorder 8.33% : 0.008444s : 639: replace.tuple_list_get_item_eliminator 0.04% : 0.000045s : 1: replace.updatestate_pure_node_eliminater ------[match.] 0.322556 8070 1.05% : 0.003398s : 676: match.cast_eliminate 0.00% : 0.000014s : 24: match.depend_value_elim 0.10% : 0.000308s : 169: match.elim_not_effective 97.27% : 0.313766s : 5675: match.inline 0.26% : 0.000846s : 170: match.reshape_eliminate 0.35% : 0.001144s : 596: match.switch_simplify 0.35% : 0.001130s : 120: match.tuple_list_get_item_depend_reorder 0.60% : 0.001946s : 639: match.tuple_list_get_item_eliminator 0.00% : 0.000005s : 1: match.updatestate_pure_node_eliminater ------[predicate.] 
0.3284751635139 1.08% : 0.003554s : 24391: predicate.accumulaten_eliminater 0.11% : 0.000369s : 1479: predicate.ad_related_special_op_eliminate 0.89% : 0.002934s : 8454: predicate.addn_check_dump 1.20% : 0.003955s : 24391: predicate.addn_zero_filter 1.07% : 0.003505s : 24391: predicate.adjust_all_reduce_mul_add 2.40% : 0.007873s : 32845: predicate.arithmetic_simplify 1.25% : 0.004098s : 25237: predicate.cast_eliminate 0.71% : 0.002320s : 5893: predicate.check_bprop_eliminate 0.90% : 0.002967s : 8454: predicate.compare_switch_simplify 0.04% : 0.000143s : 1964: predicate.const_output_eliminate 0.86% : 0.002827s : 8479: predicate.depend_value_elim 1.15% : 0.003762s : 25237: predicate.dict_get_item_const_eliminator 1.25% : 0.004105s : 25237: predicate.dict_get_item_eliminator 1.14% : 0.003745s : 25237: predicate.dict_set_item_eliminator 0.28% : 0.000905s : 3443: predicate.dumpgradient_eliminate 0.04% : 0.000140s : 1793: predicate.elim_not_effective 0.15% : 0.000493s : 1963: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.004132s : 27201: predicate.environ_add_const_eliminate 1.32% : 0.004337s : 27201: predicate.environ_get_add_eliminate 1.30% : 0.004280s : 27201: predicate.environ_get_depend_swap 2.29% : 0.007520s : 35655: predicate.environ_get_eliminate 1.29% : 0.004239s : 27201: predicate.environ_get_set_eliminate 1.41% : 0.004623s : 31672: predicate.exchange_switch_depend_value 1.79% : 0.005879s : 31672: predicate.float_depend_g_call 0.86% : 0.002841s : 8454: predicate.float_environ_get_switch 1.02% : 0.003336s : 10418: predicate.float_tuple_getitem_switch 0.04% : 0.000115s : 1479: predicate.fold_const_symbol 0.66% : 0.002161s : 6179: predicate.get_grad_eliminate 0.06% : 0.000182s : 1963: predicate.graph_param_transform 0.76% : 0.002501s : 8454: predicate.incorporate_call 0.76% : 0.002504s : 8454: predicate.incorporate_call_switch 5.49% : 0.018025s : 74624: predicate.inline 0.77% : 0.002516s : 6179: predicate.inline_without_move 0.14% : 0.000468s : 6179: predicate.j_node_and_user_rematch 0.74% : 0.002442s : 6186: predicate.less_batch_normalization 1.53% : 0.005027s : 29923: predicate.list_to_tuple_eliminator_ 2.52% : 0.008266s : 54315: predicate.load_eliminater 0.21% : 0.000702s : 1964: predicate.loop_unroll_after_grad 3.44% : 0.011301s : 31204: predicate.loop_unroll_before_grad 1.39% : 0.004575s : 29285: predicate.make_slice_get_slice_eliminator 0.91% : 0.002974s : 8454: predicate.merge_addn 0.61% : 0.002018s : 5893: predicate.micro_step_allgather_replace 0.62% : 0.002030s : 5893: predicate.mini_step_allgather_replace 1.05% : 0.003465s : 24391: predicate.minmaximum_grad 0.11% : 0.000349s : 1479: predicate.mutable_eliminate 0.11% : 0.000360s : 1479: predicate.opt_reshape 0.23% : 0.000745s : 1964: predicate.parallel_virtual_node 4.29% : 0.014107s : 31672: predicate.partial_defer_inline 1.32% : 0.004337s : 27960: predicate.partial_eliminate 1.05% : 0.003451s : 24391: predicate.print_const_string_wrapper 0.88% : 0.002889s : 8430: predicate.reduce_all_const_elim 1.36% : 0.004459s : 24391: predicate.reduce_eliminate 2.49% : 0.008189s : 54315: predicate.redundant_stop_gradient_eliminater 0.14% : 0.000452s : 6179: predicate.remove_not_recompute_node 0.91% : 0.002977s : 31889: predicate.replace_applicator 0.14% : 0.000456s : 6179: predicate.replace_old_param 0.04% : 0.000142s : 1964: predicate.reset_defer_inline 1.22% : 0.003999s : 24561: predicate.reshape_eliminate 0.62% : 0.002033s : 5893: predicate.row_tensor_add_zeros_like 0.23% : 0.000756s : 1964: predicate.row_tensor_eliminate 0.72% : 
0.002377s : 5893: predicate.same_eliminate 0.21% : 0.000686s : 8875: predicate.set_cell_output_no_recompute 0.66% : 0.002154s : 6179: predicate.shard_identity_eliminate 0.28% : 0.000930s : 3443: predicate.special_op_eliminate 0.95% : 0.003120s : 8454: predicate.specialize_transform 0.67% : 0.002199s : 5893: predicate.split_environ_get_set_with_tuple_value 0.30% : 0.001001s : 6179: predicate.stack_unstack_eliminate 0.09% : 0.000286s : 1964: predicate.switch_call_monad_eliminater 1.56% : 0.005115s : 31672: predicate.switch_defer_inline 2.18% : 0.007170s : 37565: predicate.switch_layer_defer_inline 5.82% : 0.019129s : 72522: predicate.switch_simplify 1.06% : 0.003472s : 24391: predicate.tile_eliminate 1.09% : 0.003565s : 24391: predicate.transpose_eliminate 1.49% : 0.004878s : 29164: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.005395s : 29284: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.004827s : 29284: predicate.tuple_list_get_item_depend_reorder 2.61% : 0.008572s : 38377: predicate.tuple_list_get_item_eliminator 1.49% : 0.004880s : 29284: predicate.tuple_list_get_set_item_eliminator 2.51% : 0.008239s : 37738: predicate.tuple_list_set_item_eliminator 1.55% : 0.005089s : 29923: predicate.tuple_to_list_eliminator_ 2.59% : 0.008498s : 54316: predicate.updatestate_pure_node_eliminater 3.44% : 0.011300s : 62770: predicate.updatestate_useless_node_eliminater 0.23% : 0.000741s : 1964: predicate.value_based_eliminate 0.65% : 0.002141s : 6179: predicate.virtual_dataset_eliminate 0.66% : 0.002159s : 6179: predicate.virtual_output_eliminate 0.21% : 0.000697s : 1964: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.737662 7279 62.51% : 0.461113s : 2038: func_graph_cloner_run.FuncGraphClonerGraph 5.41% : 0.039913s : 304: func_graph_cloner_run.FuncGraphClonerNode 32.08% : 0.236636s : 4937: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 
21.617965 253 0.00% : 0.000004s : 1: ForceFp32Comm 0.60% : 0.130470s : 1: add_attr 0.60% : 0.130438s : 1: add_attr_with_inline 0.02% : 0.003440s : 1: add_cache_embedding 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.017832s : 1: add_recomputation 0.00% : 0.000032s : 1: assign_add_opt 0.67% : 0.144912s : 1: auto_monad 0.02% : 0.004752s : 1: auto_monad_reorder 0.00% : 0.000020s : 1: backend_pass 0.00% : 0.000009s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.06% : 0.011944s : 1: bootstrap 0.01% : 0.001573s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.005103s : 1: control_data_broadcast_order 0.02% : 0.003636s : 1: convert_after_rewriter 0.03% : 0.005561s : 1: cse_after_recomputation 0.00% : 0.000010s : 1: dataset_repeat_opt 0.00% : 0.000011s : 1: detach_backward 0.02% : 0.005134s : 1: distribtued_split 0.01% : 0.002237s : 1: environ_conv 0.00% : 0.000024s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000013s : 1: get_jit_bprop_graph 0.03% : 0.006533s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000015s : 1: inline 0.00% : 0.000020s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000026s : 1: label_fine_grained_interleaved_index 0.00% : 0.000015s : 1: label_micro_interleaved_index 0.02% : 0.004054s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.01% : 0.001135s : 1: offloading_packed_experts 0.01% : 0.003023s : 1: opt.transform.loop_unroll_optimizer 8.93% : 1.930632s : 134: opt.transform.opt_a 0.06% : 0.013267s : 1: opt.transform.opt_after_cconv 0.03% : 0.006100s : 2: opt.transform.opt_after_jit_grad 0.37% : 0.079585s : 28: opt.transform.opt_b 0.09% : 0.018500s : 1: opt.transform.opt_trans_graph 0.07% : 0.015632s : 4: opt.transform.symbol_engine_opt 15.27% : 3.301530s : 1: opt_a 0.15% : 0.032670s : 1: opt_after_cconv 0.03% : 0.006946s : 1: opt_after_jit_grad 0.46% : 0.099116s : 1: opt_b 17.76% : 3.839263s : 1: optimize 0.02% : 0.005176s : 1: optimize_parallel_all_gather_comm 0.01% : 0.002913s : 1: order_py_execute_after_rewriter 0.02% : 0.005229s : 1: overlap_grad_flash_sp 0.00% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.001151s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000015s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000019s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.001134s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000016s : 1: parallel-infer-symbol 0.00% : 0.000009s : 1: parallel-infer-symbol-second 0.00% : 0.000013s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.20% : 0.043821s : 1: pre_auto_parallel 0.21% : 0.046033s : 1: py_interpret_to_execute 0.02% : 0.005009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.08% : 0.016519s : 1: remove_dup_value 2.36% : 0.511138s : 2: renormalize.infer 2.67% : 0.578116s : 2: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000018s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.039718s : 1: rewriter_after_opt_a 0.87% : 0.187382s : 1: rewriter_before_opt_a 0.00% : 
0.000010s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.002965s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.029080s : 1: symbol_engine_optimizer 6.83% : 1.475765s : 1: task_emit 0.09% : 0.018604s : 1: tuple_transform 40.78% : 8.815226s : 1: type_inference 0.03% : 0.007525s : 1: validate [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:11.964.313 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:11.965.817 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:11.969.022 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:11.994.880 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty TotalTime = 13.3425, [24] [bootstrap]: 0.00908343 [type_inference]: 8.31388 [auto_monad]: 0.129161 [graph_reusing]: 0.00217467 [inline]: 3.74008e-06 [add_attr]: 0.121141, [1] [add_attr_with_inline]: 0.121116, [1] [Cycle 1]: 0.0485448, [2] [tag_attr]: 0.032003 [meta_addattr_fg_expand]: 0.0164082 [parallel-infer-symbol]: 5.69003e-06 [pre_auto_parallel]: 0.0395667 [insert-virtual-dataset]: 9.80997e-06 [parallel-infer-symbol-second]: 3.38e-06 [dataset_repeat_opt]: 3.63996e-06 [pipeline_split]: 2.41003e-06 [optimize]: 3.63124, [53] [py_interpret_to_execute]: 0.0411202 [rewriter_before_opt_a]: 0.148497 [opt_a]: 3.14067, [3] [Cycle 1]: 2.49354, [45] [expand_dump_flag]: 0.00218483 [switch_simplify]: 0.0523425 [loop_unroll]: 0.0359814 [a_1]: 1.08667 [invalid_dout_check]: 0.00837657 [recompute_prepare]: 0.00687066 [updatestate_depend_eliminate]: 0.0285405 [updatestate_assign_eliminate]: 0.0044666 [updatestate_loads_eliminate]: 0.0148248 [parameter_eliminate]: 8.69005e-06 [a_2]: 0.102893 [accelerated_algorithm]: 0.00663 [shard]: 3.12994e-06 [meta_shard_fg_expand]: 0.00336071 [shard_inline]: 0.00314028 [merge_send_recv]: 0.0029569 [auto_parallel]: 0.00294273 [parallel]: 1.41e-05 [flash_sp]: 0.00168068 [merge_comm]: 0.00294894 [allreduce_fusion]: 0.00293611 [matmul_add_comm_reduction]: 0.00390093 [allreduce_slice_to_reducescatter]: 1.09e-06 [virtual_shard_identity]: 0.0030302 [virtual_dataset]: 0.00298412 [get_grad_eliminate_]: 0.0030559 [virtual_output]: 0.00298665 [merge_forward]: 0.00295101 [offload_activation]: 0.00401794 [cell_reuse_recompute_pass]: 3.29001e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00524554 [merge_recompute_call_nodes]: 2.89001e-06 [before_grad]: 0.005132 [set_forward_comm_id_for_comm_node_pass]: 0.00325774 [meta_fg_expand]: 0.00742287 [flash_sp_send_recv_attached]: 7.19097e-06 [receive_attached]: 2.96009e-06 [after_resolve]: 0.00342441 [a_after_grad]: 0.00469997 [renormalize]: 1.00006 [add_forward_monad_depend]: 1.999e-05 [auto_monad_grad]: 3.70003e-06 [auto_monad_eliminator]: 
0.0298501 [cse]: 0.0192209 [a_3]: 0.0214836 [Cycle 2]: 0.40325, [45] [expand_dump_flag]: 4.49002e-06 [switch_simplify]: 0.00285415 [loop_unroll]: 0.00288821 [a_1]: 0.0707433 [invalid_dout_check]: 0.00308399 [recompute_prepare]: 0.00286682 [updatestate_depend_eliminate]: 0.0027969 [updatestate_assign_eliminate]: 0.00269695 [updatestate_loads_eliminate]: 0.00269412 [parameter_eliminate]: 7.69994e-06 [a_2]: 0.0474483 [accelerated_algorithm]: 0.00369793 [shard]: 3.8899e-06 [meta_shard_fg_expand]: 0.00155524 [shard_inline]: 0.0029777 [merge_send_recv]: 0.00305316 [auto_parallel]: 0.00294916 [parallel]: 1.235e-05 [flash_sp]: 5.10004e-06 [merge_comm]: 0.00294854 [allreduce_fusion]: 0.00291697 [matmul_add_comm_reduction]: 0.00364238 [allreduce_slice_to_reducescatter]: 9.49949e-07 [virtual_shard_identity]: 0.00289213 [virtual_dataset]: 0.00290413 [get_grad_eliminate_]: 0.00291771 [virtual_output]: 0.00289169 [merge_forward]: 0.00290341 [offload_activation]: 0.00371409 [cell_reuse_recompute_pass]: 4.1601e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00498921 [merge_recompute_call_nodes]: 2.36998e-06 [before_grad]: 0.00491113 [set_forward_comm_id_for_comm_node_pass]: 0.00320793 [meta_fg_expand]: 0.00412256 [flash_sp_send_recv_attached]: 3.13995e-06 [receive_attached]: 3.19991e-06 [after_resolve]: 0.00329416 [a_after_grad]: 0.00443367 [renormalize]: 0.159228 [add_forward_monad_depend]: 1.63601e-05 [auto_monad_grad]: 3.68094e-06 [auto_monad_eliminator]: 0.00554442 [cse]: 0.0116139 [a_3]: 0.0210089 [Cycle 3]: 0.243844, [45] [expand_dump_flag]: 4.39002e-06 [switch_simplify]: 0.00281768 [loop_unroll]: 0.00554346 [a_1]: 0.0704984 [invalid_dout_check]: 0.00259989 [recompute_prepare]: 0.00279362 [updatestate_depend_eliminate]: 0.0027485 [updatestate_assign_eliminate]: 0.00271369 [updatestate_loads_eliminate]: 0.00269165 [parameter_eliminate]: 8.12009e-06 [a_2]: 0.0471425 [accelerated_algorithm]: 0.00367178 [shard]: 3.40003e-06 [meta_shard_fg_expand]: 0.00150079 [shard_inline]: 0.00295616 [merge_send_recv]: 0.00301249 [auto_parallel]: 0.00291641 [parallel]: 1.221e-05 [flash_sp]: 3.11004e-06 [merge_comm]: 0.00333149 [allreduce_fusion]: 0.00311502 [matmul_add_comm_reduction]: 0.00400891 [allreduce_slice_to_reducescatter]: 9.50065e-07 [virtual_shard_identity]: 0.00288345 [virtual_dataset]: 0.00289703 [get_grad_eliminate_]: 0.00301788 [virtual_output]: 0.00286793 [merge_forward]: 0.00302464 [offload_activation]: 0.00394595 [cell_reuse_recompute_pass]: 3.69002e-06 [cell_reuse_handle_not_recompute_node_pass]: 0.00496939 [merge_recompute_call_nodes]: 2.29001e-06 [before_grad]: 0.00487751 [set_forward_comm_id_for_comm_node_pass]: 0.00328116 [meta_fg_expand]: 0.00398127 [flash_sp_send_recv_attached]: 3.18e-06 [receive_attached]: 2.83006e-06 [after_resolve]: 0.00336858 [a_after_grad]: 0.00442325 [renormalize]: 2.40048e-07 [add_forward_monad_depend]: 6.99994e-06 [auto_monad_grad]: 3.36999e-06 [auto_monad_eliminator]: 0.00502301 [cse]: 0.00907002 [a_3]: 0.0212946 [py_interpret_to_execute_after_opt_a]: 0.00436641 [slice_cell_reuse_recomputed_activation]: 3.90003e-06 [rewriter_after_opt_a]: 0.0369469 [convert_after_rewriter]: 0.0029318 [order_py_execute_after_rewriter]: 0.0024742 [opt_b]: 0.0962987, [1] [Cycle 1]: 0.096284, [7] [b_1]: 0.0756927 [b_2]: 0.00296598 [updatestate_depend_eliminate]: 0.00281353 [updatestate_assign_eliminate]: 0.00279639 [updatestate_loads_eliminate]: 0.00276915 [renormalize]: 1.50001e-06 [cse]: 0.00909805 [optimize_parallel_all_gather_comm]: 0.0051481 [overlap_param_gather]: 1.095e-05 
[cconv]: 0.00158561 [loop_unroll]: 0.00408973 [opt_after_cconv]: 0.0310437, [1] [Cycle 1]: 0.0310323, [7] [c_1]: 0.0133292 [parameter_eliminate]: 5.8501e-06 [updatestate_depend_eliminate]: 0.00316704 [updatestate_assign_eliminate]: 0.00275662 [updatestate_loads_eliminate]: 0.00275393 [cse]: 0.00890147 [renormalize]: 9.30042e-07 [remove_dup_value]: 0.0141065 [tuple_transform]: 0.0179868, [1] [Cycle 1]: 0.0179737, [2] [d_1]: 0.0179385 [renormalize]: 7.00005e-07 [partial_unused_args_eliminate]: 6.47001e-06 [add_cache_embedding]: 0.002901 [add_recomputation]: 0.0247612 [cse_after_recomputation]: 0.00565571, [1] [Cycle 1]: 0.00554143, [1] [cse]: 0.00550825 [environ_conv]: 0.0017916 [swap_dp_allreduce_reducescatter]: 0.00329153 [bias_add_comm_swap]: 3.40107e-06 [label_micro_interleaved_index]: 8.36002e-06 [label_fine_grained_interleaved_index]: 3.02005e-06 [merge_cast_opt]: 4.35009e-06 [slice_recompute_activation]: 2.44007e-06 [micro_interleaved_order_control]: 2.94996e-06 [assign_add_opt]: 2.048e-05 [ForceFp32Comm]: 1.11992e-06 [remove_cast_before_assign_add]: 1.46008e-06 [full_micro_interleaved_order_control]: 2.72994e-06 [reorder_send_recv_between_fp_bp]: 2.39001e-06 [comm_op_add_attrs]: 1.05007e-06 [add_comm_op_reuse_tag]: 1.01002e-06 [interleave_split_concat_branches]: 1.52993e-06 [interleave_parallel_branches]: 1.20001e-06 [overlap_opt_shard_in_pipeline]: 4.01509e-05 [overlap_opt_shard_grad_in_pipeline]: 2.56998e-06 [control_data_broadcast_order]: 0.00536845 [grouped_pairwise_exchange_alltoall]: 1.67999e-06 [offloading_packed_experts]: 0.001101 [overlap_recompute_and_grad_model_parallel]: 0.0011098 [overlap_grad_matmul_and_grad_allreduce]: 3.50003e-06 [overlap_recompute_allgather_and_fa_grad]: 1.63994e-06 [overlap_recompute_comm]: 2.39001e-06 [overlap_grad_ring_attention]: 0.00112583 [overlap_grad_flash_sp]: 0.00549125 [begin_end_overlap_inline]: 1.03004e-06 [split_matmul_comm_elemetwise]: 3.47989e-06 [split_layernorm_comm]: 2.16998e-06 [handle_group_info]: 1.41992e-06 [symbol_engine_optimizer]: 0.0303402, [1] [Cycle 1]: 0.0303249, [6] [build]: 0.0141628 [elim_shapecalc]: 0.00264422 [elim_not_effective]: 0.00702297 [opt_reshape]: 0.00195433 [fold_const_symbol]: 0.00434005 [renormalize]: 1.22096e-06 [detach_backward]: 4.1601e-06 [pipeline_parallel_scheduler]: 2.14996e-06 [auto_monad_reorder]: 0.00576036 [get_jit_bprop_graph]: 8.69995e-06 [rewriter_after_jit_bprop_graph]: 1.582e-05 [opt_after_jit_grad]: 0.00693127 [distribtued_split]: 0.0053213 [validate]: 0.00403172 [backend_pass]: 4.18001e-06 [task_emit]: 1.06903 [execute]: 1.17201e-05 Sums bootstrap : 0.009083s : 0.07% type_inference : 8.313885s : 62.70% auto_monad : 0.129161s : 0.97% graph_reusing : 0.002175s : 0.02% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.032003s : 0.24% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.016408s : 0.12% parallel-infer-symbol : 0.000006s : 0.00% pre_auto_parallel : 0.039567s : 0.30% insert-virtual-dataset : 0.000010s : 0.00% parallel-infer-symbol-second : 0.000003s : 0.00% dataset_repeat_opt : 0.000004s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.041120s : 0.31% optimize.rewriter_before_opt_a : 0.148497s : 1.12% optimize.opt_a.expand_dump_flag : 0.002194s : 0.02% optimize.opt_a.switch_simplify : 0.058014s : 0.44% optimize.opt_a.loop_unroll : 0.044413s : 0.33% optimize.opt_a.a_1 : 1.227913s : 9.26% optimize.opt_a.invalid_dout_check : 0.014060s : 0.11% optimize.opt_a.recompute_prepare : 0.012531s : 0.09% 
optimize.opt_a.updatestate_depend_eliminate : 0.034086s : 0.26% optimize.opt_a.updatestate_assign_eliminate : 0.009877s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.020211s : 0.15% optimize.opt_a.parameter_eliminate : 0.000025s : 0.00% optimize.opt_a.a_2 : 0.197484s : 1.49% optimize.opt_a.accelerated_algorithm : 0.014000s : 0.11% optimize.opt_a.shard : 0.000010s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.006417s : 0.05% optimize.opt_a.shard_inline : 0.009074s : 0.07% optimize.opt_a.merge_send_recv : 0.009023s : 0.07% optimize.opt_a.auto_parallel : 0.008808s : 0.07% optimize.opt_a.parallel : 0.000039s : 0.00% optimize.opt_a.flash_sp : 0.001689s : 0.01% optimize.opt_a.merge_comm : 0.009229s : 0.07% optimize.opt_a.allreduce_fusion : 0.008968s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.011552s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.008806s : 0.07% optimize.opt_a.virtual_dataset : 0.008785s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.008991s : 0.07% optimize.opt_a.virtual_output : 0.008746s : 0.07% optimize.opt_a.merge_forward : 0.008879s : 0.07% optimize.opt_a.offload_activation : 0.011678s : 0.09% optimize.opt_a.cell_reuse_recompute_pass : 0.000011s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.015204s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000008s : 0.00% optimize.opt_a.before_grad : 0.014921s : 0.11% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.009747s : 0.07% optimize.opt_a.meta_fg_expand : 0.015527s : 0.12% optimize.opt_a.flash_sp_send_recv_attached : 0.000014s : 0.00% optimize.opt_a.receive_attached : 0.000009s : 0.00% optimize.opt_a.after_resolve : 0.010087s : 0.08% optimize.opt_a.a_after_grad : 0.013557s : 0.10% optimize.opt_a.renormalize : 1.159287s : 8.74% optimize.opt_a.add_forward_monad_depend : 0.000043s : 0.00% optimize.opt_a.auto_monad_grad : 0.000011s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.040418s : 0.30% optimize.opt_a.cse : 0.039905s : 0.30% optimize.opt_a.a_3 : 0.063787s : 0.48% optimize.py_interpret_to_execute_after_opt_a : 0.004366s : 0.03% optimize.slice_cell_reuse_recomputed_activation : 0.000004s : 0.00% optimize.rewriter_after_opt_a : 0.036947s : 0.28% optimize.convert_after_rewriter : 0.002932s : 0.02% optimize.order_py_execute_after_rewriter : 0.002474s : 0.02% optimize.opt_b.b_1 : 0.075693s : 0.57% optimize.opt_b.b_2 : 0.002966s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.002814s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.002796s : 0.02% optimize.opt_b.updatestate_loads_eliminate : 0.002769s : 0.02% optimize.opt_b.renormalize : 0.000002s : 0.00% optimize.opt_b.cse : 0.009098s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.005148s : 0.04% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.001586s : 0.01% optimize.loop_unroll : 0.004090s : 0.03% optimize.opt_after_cconv.c_1 : 0.013329s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.003167s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.002757s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.002754s : 0.02% optimize.opt_after_cconv.cse : 0.008901s : 0.07% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.014106s : 0.11% optimize.tuple_transform.d_1 : 0.017938s : 0.14% optimize.tuple_transform.renormalize : 0.000001s : 0.00% 
optimize.partial_unused_args_eliminate : 0.000006s : 0.00% optimize.add_cache_embedding : 0.002901s : 0.02% optimize.add_recomputation : 0.024761s : 0.19% optimize.cse_after_recomputation.cse : 0.005508s : 0.04% optimize.environ_conv : 0.001792s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.003292s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000020s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000040s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.005368s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.001101s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.001110s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000004s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.001126s : 0.01% optimize.overlap_grad_flash_sp : 0.005491s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.014163s : 0.11% optimize.symbol_engine_optimizer.elim_shapecalc : 0.002644s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.007023s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.001954s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.004340s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000004s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.005760s : 0.04% get_jit_bprop_graph : 0.000009s : 0.00% rewriter_after_jit_bprop_graph : 0.000016s : 0.00% opt_after_jit_grad : 0.006931s : 0.05% distribtued_split : 0.005321s : 0.04% validate : 0.004032s : 0.03% backend_pass : 0.000004s : 0.00% task_emit : 1.069027s : 8.06% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 
0.319661 64790 1.13% : 0.003626s : 1231: substitution.arithmetic_simplify 1.52% : 0.004855s : 1027: substitution.cast_eliminate 0.06% : 0.000188s : 124: substitution.depend_value_elim 0.38% : 0.001205s : 1468: substitution.elim_not_effective 0.35% : 0.001117s : 1114: substitution.float_tuple_getitem_switch 0.48% : 0.001529s : 1154: substitution.fold_const_symbol 0.44% : 0.001391s : 1984: substitution.graph_param_transform 77.26% : 0.246980s : 5025: substitution.inline 0.87% : 0.002776s : 5110: substitution.j_node_and_user_rematch 1.09% : 0.003478s : 1354: substitution.less_batch_normalization 0.49% : 0.001563s : 3090: substitution.load_eliminater 0.43% : 0.001390s : 1305: substitution.minmaximum_grad 0.05% : 0.000148s : 196: substitution.reduce_all_const_elim 0.85% : 0.002705s : 5110: substitution.remove_not_recompute_node 0.17% : 0.000537s : 1035: substitution.replace_old_param 1.17% : 0.003739s : 1449: substitution.reshape_eliminate 0.45% : 0.001430s : 588: substitution.switch_simplify 0.01% : 0.000032s : 6: substitution.transpose_eliminate 1.22% : 0.003912s : 1693: substitution.tuple_list_convert_item_index_to_positive 0.65% : 0.002073s : 1741: substitution.tuple_list_get_item_const_eliminator 1.02% : 0.003263s : 1741: substitution.tuple_list_get_item_depend_reorder 2.45% : 0.007840s : 3349: substitution.tuple_list_get_item_eliminator 0.87% : 0.002783s : 1741: substitution.tuple_list_get_set_item_eliminator 2.77% : 0.008862s : 10976: substitution.updatestate_pure_node_eliminater 3.82% : 0.012221s : 11178: substitution.updatestate_useless_node_eliminater 0.01% : 0.000018s : 1: substitution.value_based_eliminate ------[type_inference.] 8.279567 2 86.29% : 7.144051s : 1: type_inference.infer 13.71% : 1.135517s : 1: type_inference.specialize ------[replace.] 0.077907 7390 8.47% : 0.006599s : 676: replace.cast_eliminate 0.39% : 0.000303s : 24: replace.depend_value_elim 2.59% : 0.002017s : 169: replace.elim_not_effective 64.64% : 0.050357s : 5025: replace.inline 2.20% : 0.001712s : 170: replace.reshape_eliminate 10.21% : 0.007955s : 588: replace.switch_simplify 0.65% : 0.000508s : 48: replace.tuple_list_get_item_depend_reorder 10.81% : 0.008425s : 688: replace.tuple_list_get_item_eliminator 0.04% : 0.000032s : 2: replace.updatestate_pure_node_eliminater ------[match.] 0.250773 7390 1.14% : 0.002850s : 676: match.cast_eliminate 0.01% : 0.000014s : 24: match.depend_value_elim 0.13% : 0.000315s : 169: match.elim_not_effective 97.16% : 0.243661s : 5025: match.inline 0.22% : 0.000560s : 170: match.reshape_eliminate 0.42% : 0.001054s : 588: match.switch_simplify 0.15% : 0.000388s : 48: match.tuple_list_get_item_depend_reorder 0.77% : 0.001924s : 688: match.tuple_list_get_item_eliminator 0.00% : 0.000005s : 2: match.updatestate_pure_node_eliminater ------[predicate.] 
0.3088351525361 1.05% : 0.003254s : 22122: predicate.accumulaten_eliminater 0.12% : 0.000382s : 1500: predicate.ad_related_special_op_eliminate 2.03% : 0.006274s : 8356: predicate.addn_check_dump 1.01% : 0.003114s : 22122: predicate.addn_zero_filter 1.04% : 0.003224s : 22122: predicate.adjust_all_reduce_mul_add 2.27% : 0.007021s : 30478: predicate.arithmetic_simplify 1.19% : 0.003663s : 22968: predicate.cast_eliminate 0.66% : 0.002040s : 5957: predicate.check_bprop_eliminate 0.90% : 0.002787s : 8356: predicate.compare_switch_simplify 0.05% : 0.000145s : 1985: predicate.const_output_eliminate 0.91% : 0.002809s : 8426: predicate.depend_value_elim 1.14% : 0.003529s : 22968: predicate.dict_get_item_const_eliminator 1.29% : 0.003982s : 22968: predicate.dict_get_item_eliminator 1.08% : 0.003336s : 22968: predicate.dict_set_item_eliminator 0.29% : 0.000908s : 3485: predicate.dumpgradient_eliminate 0.05% : 0.000145s : 1814: predicate.elim_not_effective 0.17% : 0.000523s : 1984: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.003913s : 24953: predicate.environ_add_const_eliminate 1.36% : 0.004185s : 24953: predicate.environ_get_add_eliminate 1.29% : 0.003989s : 24953: predicate.environ_get_depend_swap 2.20% : 0.006787s : 33309: predicate.environ_get_eliminate 1.28% : 0.003938s : 24953: predicate.environ_get_set_eliminate 1.33% : 0.004106s : 28731: predicate.exchange_switch_depend_value 1.71% : 0.005284s : 28731: predicate.float_depend_g_call 0.90% : 0.002780s : 8356: predicate.float_environ_get_switch 1.08% : 0.003328s : 10341: predicate.float_tuple_getitem_switch 0.04% : 0.000117s : 1500: predicate.fold_const_symbol 0.69% : 0.002137s : 6151: predicate.get_grad_eliminate 0.06% : 0.000184s : 1984: predicate.graph_param_transform 0.81% : 0.002493s : 8356: predicate.incorporate_call 0.80% : 0.002475s : 8356: predicate.incorporate_call_switch 5.51% : 0.017029s : 69330: predicate.inline 0.80% : 0.002468s : 6151: predicate.inline_without_move 0.15% : 0.000455s : 6151: predicate.j_node_and_user_rematch 0.77% : 0.002384s : 6160: predicate.less_batch_normalization 1.49% : 0.004593s : 27673: predicate.list_to_tuple_eliminator_ 2.47% : 0.007613s : 49796: predicate.load_eliminater 0.24% : 0.000726s : 1985: predicate.loop_unroll_after_grad 3.62% : 0.011168s : 31312: predicate.loop_unroll_before_grad 1.38% : 0.004271s : 26986: predicate.make_slice_get_slice_eliminator 0.91% : 0.002798s : 8356: predicate.merge_addn 0.66% : 0.002035s : 5957: predicate.micro_step_allgather_replace 0.65% : 0.002016s : 5957: predicate.mini_step_allgather_replace 1.08% : 0.003332s : 22122: predicate.minmaximum_grad 0.12% : 0.000361s : 1500: predicate.mutable_eliminate 0.12% : 0.000379s : 1500: predicate.opt_reshape 0.24% : 0.000749s : 1985: predicate.parallel_virtual_node 3.52% : 0.010869s : 28731: predicate.partial_defer_inline 1.29% : 0.003977s : 25689: predicate.partial_eliminate 1.01% : 0.003122s : 22122: predicate.print_const_string_wrapper 0.93% : 0.002867s : 8332: predicate.reduce_all_const_elim 1.33% : 0.004097s : 22122: predicate.reduce_eliminate 2.42% : 0.007476s : 49796: predicate.redundant_stop_gradient_eliminater 0.14% : 0.000447s : 6151: predicate.remove_not_recompute_node 0.93% : 0.002869s : 29661: predicate.replace_applicator 0.15% : 0.000454s : 6151: predicate.replace_old_param 0.05% : 0.000154s : 1985: predicate.reset_defer_inline 1.15% : 0.003559s : 22292: predicate.reshape_eliminate 0.66% : 0.002025s : 5957: predicate.row_tensor_add_zeros_like 0.25% : 0.000768s : 1985: predicate.row_tensor_eliminate 0.70% : 
0.002166s : 5957: predicate.same_eliminate 0.22% : 0.000688s : 8821: predicate.set_cell_output_no_recompute 0.69% : 0.002133s : 6151: predicate.shard_identity_eliminate 0.30% : 0.000936s : 3485: predicate.special_op_eliminate 1.01% : 0.003110s : 8356: predicate.specialize_transform 0.70% : 0.002159s : 5957: predicate.split_environ_get_set_with_tuple_value 0.32% : 0.000987s : 6151: predicate.stack_unstack_eliminate 0.09% : 0.000276s : 1985: predicate.switch_call_monad_eliminater 1.43% : 0.004410s : 28731: predicate.switch_defer_inline 2.07% : 0.006400s : 34688: predicate.switch_layer_defer_inline 5.98% : 0.018477s : 69575: predicate.switch_simplify 1.03% : 0.003184s : 22122: predicate.tile_eliminate 1.04% : 0.003216s : 22122: predicate.transpose_eliminate 1.48% : 0.004565s : 26937: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.004803s : 26985: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.004415s : 26985: predicate.tuple_list_get_item_depend_reorder 2.64% : 0.008154s : 36029: predicate.tuple_list_get_item_eliminator 1.42% : 0.004398s : 26985: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.007683s : 35341: predicate.tuple_list_set_item_eliminator 1.40% : 0.004318s : 27673: predicate.tuple_to_list_eliminator_ 2.62% : 0.008081s : 49798: predicate.updatestate_pure_node_eliminater 3.45% : 0.010663s : 58154: predicate.updatestate_useless_node_eliminater 0.24% : 0.000746s : 1985: predicate.value_based_eliminate 0.69% : 0.002136s : 6151: predicate.virtual_dataset_eliminate 0.69% : 0.002145s : 6151: predicate.virtual_output_eliminate 0.22% : 0.000674s : 1985: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.745121 6967 66.09% : 0.492471s : 1979: func_graph_cloner_run.FuncGraphClonerGraph 0.90% : 0.006692s : 116: func_graph_cloner_run.FuncGraphClonerNode 33.01% : 0.245958s : 4872: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 
20.103730 253 0.00% : 0.000005s : 1: ForceFp32Comm 0.60% : 0.121160s : 1: add_attr 0.60% : 0.121122s : 1: add_attr_with_inline 0.01% : 0.002914s : 1: add_cache_embedding 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.024788s : 1: add_recomputation 0.00% : 0.000025s : 1: assign_add_opt 0.64% : 0.129230s : 1: auto_monad 0.03% : 0.005814s : 1: auto_monad_reorder 0.00% : 0.000024s : 1: backend_pass 0.00% : 0.000008s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.05% : 0.009133s : 1: bootstrap 0.01% : 0.001599s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.005384s : 1: control_data_broadcast_order 0.01% : 0.002950s : 1: convert_after_rewriter 0.03% : 0.005664s : 1: cse_after_recomputation 0.00% : 0.000010s : 1: dataset_repeat_opt 0.00% : 0.000012s : 1: detach_backward 0.03% : 0.005357s : 1: distribtued_split 0.01% : 0.001806s : 1: environ_conv 0.00% : 0.000021s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000018s : 1: get_jit_bprop_graph 0.01% : 0.002200s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000019s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000012s : 1: label_micro_interleaved_index 0.02% : 0.004104s : 1: loop_unroll 0.00% : 0.000009s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.01% : 0.001110s : 1: offloading_packed_experts 0.01% : 0.002919s : 1: opt.transform.loop_unroll_optimizer 8.53% : 1.715621s : 134: opt.transform.opt_a 0.07% : 0.013325s : 1: opt.transform.opt_after_cconv 0.03% : 0.005909s : 2: opt.transform.opt_after_jit_grad 0.39% : 0.078510s : 28: opt.transform.opt_b 0.09% : 0.017933s : 1: opt.transform.opt_trans_graph 0.08% : 0.015936s : 4: opt.transform.symbol_engine_opt 15.62% : 3.140681s : 1: opt_a 0.15% : 0.031050s : 1: opt_after_cconv 0.03% : 0.006960s : 1: opt_after_jit_grad 0.48% : 0.096305s : 1: opt_b 18.06% : 3.631270s : 1: optimize 0.03% : 0.005164s : 1: optimize_parallel_all_gather_comm 0.01% : 0.002489s : 1: order_py_execute_after_rewriter 0.03% : 0.005514s : 1: overlap_grad_flash_sp 0.00% : 0.000008s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.001136s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000047s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000018s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.001120s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000016s : 1: parallel-infer-symbol 0.00% : 0.000009s : 1: parallel-infer-symbol-second 0.00% : 0.000011s : 1: partial_unused_args_eliminate 0.00% : 0.000008s : 1: pipeline_parallel_scheduler 0.00% : 0.000008s : 1: pipeline_split 0.20% : 0.039626s : 1: pre_auto_parallel 0.20% : 0.041163s : 1: py_interpret_to_execute 0.02% : 0.004382s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.07% : 0.014335s : 1: remove_dup_value 2.75% : 0.551892s : 2: renormalize.infer 3.02% : 0.607082s : 2: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000022s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.036964s : 1: rewriter_after_opt_a 0.74% : 0.148543s : 1: rewriter_before_opt_a 0.00% : 
0.000010s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.02% : 0.003305s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.030348s : 1: symbol_engine_optimizer 5.32% : 1.069066s : 1: task_emit 0.09% : 0.017994s : 1: tuple_transform 41.36% : 8.313953s : 1: type_inference 0.04% : 0.008462s : 1: validate [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:25.996.582 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:25.996.898 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:25.997.863 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:25.998.033 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:25.998.717 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:26,509 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 31.247695446014404 s; generated tokens: 8 tokens; generate speed: 0.2560188802985914 tokens/s 2025-07-24 11:03:26,510 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.002022266387939453 s; prefill predict time: 16.738914489746094 s; prefill post time: 0.05533266067504883 s; decode prepare time: 0.0010406289781842912 s; decode predict time: 0.0066778262456258135 s; decode post time: 0.005154609680175781 s 2025-07-24 11:03:26,515 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. Building prefix dict from the default dictionary ... DEBUG:jieba:Building prefix dict from the default dictionary ... Loading model from cache /tmp/jieba.cache DEBUG:jieba:Loading model from cache /tmp/jieba.cache Loading model cost 1.265 seconds. DEBUG:jieba:Loading model cost 1.265 seconds. Prefix dict has been built successfully. DEBUG:jieba:Prefix dict has been built successfully. 
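The generate-speed figures reported by text_generator.py in the log above are plain ratios: generated tokens divided by total wall-clock time. A minimal sketch reproducing the first measurement, using values copied verbatim from the log (the long first-call time is dominated by one-off compile/warm-up cost; note the logged prefill predict time of ~16.7 s):

```python
# Minimal sketch: reproduce the "generate speed" value logged above.
# Inputs are copied verbatim from the log; the formula is tokens / time.
total_time_s = 31.247695446014404  # "total time" of the first generate() call
generated_tokens = 8               # "generated tokens"

speed = generated_tokens / total_time_s
print(f"generate speed: {speed} tokens/s")  # ~0.256019, matching the log
```

The later calls in this log report 214.46 and 328.09 tokens/s for 208 and 408 tokens respectively, consistent with the same formula once the warm-up cost is no longer included.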
parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> calculate sim is:1.0 2025-07-24 11:03:27,790 - mindformers./output/log[mindformers/generation/text_generator.py:735] - INFO - The batch is: 4, and the split_size is: 2, and the global_rank_id is: 3, and the dp_rank_id is: 1 and start is: 2, and stop is: 4 2025-07-24 11:03:27,792 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:27,793 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:27,793 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:27,794 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:27.798.882 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:27.799.053 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:27.799.568 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:27.806.687 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:27.826.649 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:27.826.798 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:27.827.161 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env 
INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:27.827.250 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:27.827.451 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:28,764 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 0.9698691368103027 s; generated tokens: 208 tokens; generate speed: 214.4619228570038 tokens/s 2025-07-24 11:03:28,765 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0013117790222167969 s; prefill predict time: 0.02124929428100586 s; prefill post time: 0.005386829376220703 s; decode prepare time: 0.000852976030516393 s; decode predict time: 0.004929540204066856 s; decode post time: 0.003230787017970409 s 2025-07-24 11:03:28,769 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] return quick_sort(left) + middle + quick_sort(right) # 示例输入 arr = [ parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr else: pivot = arr[0] left = [x for x in arr[1:] if x < pivot] right = [x for x in arr[1:] if x >= pivot] return quick_sort(left) + [pivot] + quick_sort(right) # 示例输入 arr = [3,6,8,1 calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] return quick_sort(left) + middle + quick_sort(right) # 示例输入 arr = [ parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr else: pivot = arr[0] left = [x for x in arr[1:] if x < pivot] right = [x for x in arr[1:] if x >= pivot] return quick_sort(left) + [pivot] + quick_sort(right) # 示例输入 arr = [3,6,8,1 calculate sim is:1.0 2025-07-24 11:03:28,785 - mindformers./output/log[mindformers/generation/text_generator.py:735] - INFO - The batch is: 8, and the split_size is: 4, and the global_rank_id is: 3, and the dp_rank_id is: 1 and start is: 4, and stop is: 8 2025-07-24 11:03:28,786 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 
'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:28,787 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:28,787 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:28,788 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:28.792.504 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:28.792.669 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:28.793.139 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:28.799.294 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:28.812.191 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:28.812.342 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:28.812.704 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:28.812.795 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806956,fffd9b7fe060,python):2025-07-24-11:03:28.812.973 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:30,032 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.243579626083374 s; generated tokens: 408 tokens; generate speed: 328.0851434378889 tokens/s 2025-07-24 11:03:30,032 - 
mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0013360977172851562 s; prefill predict time: 0.011849641799926758 s; prefill post time: 0.00650477409362793 s; decode prepare time: 0.0009570310611536007 s; decode predict time: 0.006123967170715332 s; decode post time: 0.004934752341544274 s 2025-07-24 11:03:30,036 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. 
Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 large_models/parallel_qwen2_0_5b_predict_dp2_mp2/worker_0.log0000644000175100017500000040364615040321202024716 0ustar jenkinsHwHiAiUser/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. 
return self._float_to_str(self.smallest_subnormal) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) 2025-07-24 11:02:48,491 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config moe_config is empty. 2025-07-24 11:02:48,525 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on [WARNING] ME(3806906:281473143385904,MainProcess):2025-07-24-11:02:48.526.887 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(3806906:281473143385904,MainProcess):2025-07-24-11:02:48.527.680 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead. [WARNING] ME(3806906:281473143385904,MainProcess):2025-07-24-11:02:48.528.135 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead. [WARNING] ME(3806906:281473143385904,MainProcess):2025-07-24-11:02:48.528.252 [mindspore/context.py:1655] For 'context.set_context', 'enable_graph_kernel' parameter is deprecated, and will be removed in the next version. Please use jit_config={'jit_level': 'O1'} instead. [WARNING] ME(3806906:281473143385904,MainProcess):2025-07-24-11:02:48.528.390 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. [WARNING] ME(3806906:281473143385904,MainProcess):2025-07-24-11:02:48.528.539 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(3806906:281473143385904,MainProcess):2025-07-24-11:02:48.528.657 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. 2025-07-24 11:02:48,528 - mindformers./output/log[mindformers/core/context/parallel.py:73] - INFO - full_batch is set to False for non-parallel modes [WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:48.530.951 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 20 source: 127.0.0.1:59688, destination: 127.0.0.1:8240 [WARNING] DISTRIBUTED(3806906,ffff0317b060,python):2025-07-24-11:02:48.530.951 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:59688 to 127.0.0.1:8240 is successfully created. 
System errno: Success [WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:48.531.036 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8240 to be connected...Retry number: 1 [WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:49.031.299 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:485] Connect] Connection 21 source: 127.0.0.1:59698, destination: 127.0.0.1:8240 [WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:49.031.338 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:494] Connect] Waiting for the state of the connection to 127.0.0.1:8240 to be connected...Retry number: 2 [WARNING] DISTRIBUTED(3806906,ffff0417d060,python):2025-07-24-11:02:49.031.329 [mindspore/ccsrc/distributed/rpc/tcp/tcp_comm.cc:79] ConnectedEventHandler] Connection from 127.0.0.1:59698 to 127.0.0.1:8240 is successfully created. System errno: Success [WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:49.531.750 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200). [WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:50.031.864 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:50.031.894 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:355] PostProcess] This node 0 rank id: 0 [MS_ALLOC_CONF]Runtime config: enable_vmm:False [WARNING] DEVICE(3806906,ffff92b9af30,python):2025-07-24-11:02:50.250.162 [mindspore/ccsrc/plugin/res_manager/ascend/mem_manager/ascend_memory_adapter.cc:155] Initialize] Reserved memory size for other components(2101346304) is less than recommend size(4068206080), It may lead to Out Of Memory in HCCL or other components, Please double check context key 'variable_memory_max_size'/'max_device_memory' [WARNING] DEVICE(3806906,ffff92b9af30,python):2025-07-24-11:02:51.483.175 [mindspore/ccsrc/plugin/res_manager/ascend/collective/multi_ascend_collective_comm_lib.cc:84] Initialize] Loading LCCL because env MS_ENABLE_LCCL is set to on. Pay attention that LCCL only supports communication group within single node in KernelByKernel for now. 
[WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:51.486.898 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: hccl_world_group [const vector]{0, 1, 2, 3}, async: 1, submit_now: 1 [WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:51.487.101 [mindspore/ccsrc/distributed/collective/collective_manager.cc:393] CreateCommunicationGroup] This group's communicator is async created hccl_world_group [WARNING] DEVICE(3806906,fffe6effd060,python):2025-07-24-11:02:51.487.277 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:254] SetGlobalCommInfo] Start to SetGlobalCommInfo for hccl_world_group, master_ip:2130706433, master_port:8240, node_rank:2130706433, total_rank_size:4, local_rank_size4 [WARNING] HCCL_ADPT(3806906,fffe6effd060,python):2025-07-24-11:02:51.487.348 [mindspore/ccsrc/utils/dlopen_macro.h:165] DlsymAscend] Dynamically load symbol HcclSetGlobalCommInfo failed, result = /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/communication/../lib/plugin/ascend/libhccl_plugin.so: undefined symbol: HcclSetGlobalCommInfo [WARNING] HCCL_ADPT(3806906,fffe6effd060,python):2025-07-24-11:02:51.487.368 [mindspore/ccsrc/plugin/res_manager/ascend/hccl_adapter/hccl_adapter.cc:632] HcclSetGlobalCommInfo] Func HcclSetGlobalCommInfo is not supported in CANN package. [WARNING] DEVICE(3806906,fffe6effd060,python):2025-07-24-11:02:51.487.382 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:265] SetGlobalCommInfo] End to SetGlobalCommInfo for hccl_world_group 2025-07-24 11:02:51,488 - mindformers./output/log[mindformers/tools/utils.py:185] - INFO - set strategy path to './output/strategy/ckpt_strategy_rank_0.ckpt' 2025-07-24 11:02:51,490 - mindformers./output/log[mindformers/core/context/build_context.py:383] - INFO - cann workqueue cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191] 2025-07-24 11:02:51,490 - mindformers./output/log[mindformers/core/context/build_context.py:387] - WARNING - CANN use cpus: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 
147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191], model get empty cpu list, disable binding cores 2025-07-24 11:02:51,491 - mindformers./output/log[mindformers/core/context/build_context.py:395] - INFO - cpu_affinity, rank_id: 0, device_num: 4 [WARNING] DISTRIBUTED(3806906,fffe6effd060,python):2025-07-24-11:02:51.491.783 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: hccl_world_group 2025-07-24 11:02:51,492 - mindformers./output/log[mindformers/core/parallel_config.py:41] - INFO - initial moe_config from dict: {'expert_num': 1, 'capacity_factor': 1.1, 'aux_loss_factor': 0.05, 'num_experts_chosen': 1, 'expert_group_size': None, 'group_wise_a2a': False, 'comp_comm_parallel': False, 'comp_comm_parallel_degree': 2, 'save_token_distribution': False, 'cur_layer': 0, 'enable_cold_hot_expert': False, 'update_step': 10000, 'hot_expert_num': 0, 'cold_token_percent': 1.0, 'moe_module_name': '', 'routing_policy': 'TopkRouterV1', 'norm_topk_prob': True, 'enable_sdrop': False, 'use_fused_ops_topkrouter': False, 'router_dense_type': 'float32', 'shared_expert_num': 0, 'use_shared_expert_gating': False, 'max_router_load': 131072, 'topk_method': 'greedy', 'topk_group': None, 'n_group': None, 'first_k_dense_replace': True, 'moe_intermediate_size': 1407, 'routed_scaling_factor': 1.0, 'aux_loss_types': None, 'aux_loss_factors': None, 'z_loss_factor': 0.0, 'balance_via_topk_bias': False, 'topk_bias_update_rate': 0.0, 'use_allgather_dispatcher': False, 'moe_shared_expert_overlap': False, 'expert_model_parallel': None, 'use_gating_sigmoid': False, 'enable_deredundency': False, 'npu_nums_per_device': 1, 'use_gmm': False, 'enable_gmm_safe_tokens': False, 'use_fused_ops_permute': False, 'callback_moe_droprate': False, 'dispatch_global_max_bs': 0, 'ep_extend_tp': True} 2025-07-24 11:02:51,492 - mindformers./output/log[mindformers/core/parallel_config.py:61] - INFO - initial parallel_config from dict: {'data_parallel': 2, 'model_parallel': 2, 'context_parallel': 1, 'expert_parallel': 1, 'pipeline_stage': 1, 'micro_batch_num': 1, 'seq_split_num': 1, 'use_seq_parallel': False, 'optimizer_shard': None, 'gradient_aggregation_group': 4, 'vocab_emb_dp': False, 'context_parallel_algo': 'colossalai_cp', 'ulysses_degree_in_cp': 1, 'mem_coeff': 0.1} [WARNING] DEVICE(3806906,fffe6cff9060,python):2025-07-24-11:02:52.005.101 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group, hcclBufferSize is 200 MB, hcclDeterministic is 0 tp_group is:True dp_group is:True 2025-07-24 11:02:52,070 - mindformers./output/log[mindformers/parallel_core/inference/parallel_state.py:358] - INFO - expert_model_parallel_size(1) is not equal to world_size(4), so we will use 4 as the MOE_tensor_parallel_size. 
[WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:52.072.918 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: dp-0-2 [const vector]{0, 2}, async: 0, submit_now: 1 [WARNING] DEVICE(3806906,fffe6cff9060,python):2025-07-24-11:02:52.391.216 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for hccl_world_group [WARNING] DISTRIBUTED(3806906,fffe6cff9060,python):2025-07-24-11:02:52.391.376 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: hccl_world_group [WARNING] DISTRIBUTED(3806906,fffe6cff9060,python):2025-07-24-11:02:52.391.403 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: hccl_world_group end. [WARNING] DISTRIBUTED(3806906,fffe6effd060,python):2025-07-24-11:02:52.391.496 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: hccl_world_group [WARNING] DISTRIBUTED(3806906,fffe6effd060,python):2025-07-24-11:02:52.392.111 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: dp-0-2 [WARNING] DEVICE(3806906,fffe4ce7d060,python):2025-07-24-11:02:52.565.323 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for dp-0-2, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806906,fffe4ce7d060,python):2025-07-24-11:02:52.618.892 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for dp-0-2 [WARNING] DISTRIBUTED(3806906,fffe4ce7d060,python):2025-07-24-11:02:52.618.987 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: dp-0-2 [WARNING] DISTRIBUTED(3806906,fffe4ce7d060,python):2025-07-24-11:02:52.619.009 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: dp-0-2 end. 
[WARNING] DISTRIBUTED(3806906,fffe6effd060,python):2025-07-24-11:02:52.619.070 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: dp-0-2 [WARNING] DISTRIBUTED(3806906,ffff92b9af30,python):2025-07-24-11:02:52.619.312 [mindspore/ccsrc/distributed/collective/collective_manager.cc:341] CreateCommunicationGroup] Start to create communication group: tp-0-1 [const vector]{0, 1}, async: 0, submit_now: 1 [WARNING] DISTRIBUTED(3806906,fffe6effd060,python):2025-07-24-11:02:52.619.949 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1021] CreateDeviceCommunicator] Begin initialize communication group on the device side: tp-0-1 [WARNING] DEVICE(3806906,fffe430bf060,python):2025-07-24-11:02:52.647.854 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:169] InitByRootInfoConfig] Start to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1, hcclBufferSize is 200 MB, hcclDeterministic is 0 [WARNING] DEVICE(3806906,fffe430bf060,python):2025-07-24-11:02:52.699.623 [mindspore/ccsrc/plugin/res_manager/ascend/collective/ascend_communication_group.cc:184] InitByRootInfoConfig] End to initialize communicator by HcclCommInitRootInfoConfig for tp-0-1 [WARNING] DISTRIBUTED(3806906,fffe430bf060,python):2025-07-24-11:02:52.699.768 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1226] CacheInitedGroups] Cache inited group: tp-0-1 [WARNING] DISTRIBUTED(3806906,fffe430bf060,python):2025-07-24-11:02:52.699.797 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1229] CacheInitedGroups] Cache inited group: tp-0-1 end. [WARNING] DISTRIBUTED(3806906,fffe6effd060,python):2025-07-24-11:02:52.699.868 [mindspore/ccsrc/distributed/collective/collective_manager.cc:1032] CreateDeviceCommunicator] End initialize communication group on the device side: tp-0-1 [WARNING] ME(3806906:281473143385904,MainProcess):2025-07-24-11:02:52.706.161 [mindspore/ops/primitive.py:220] The in_strategy/in_layout of the operator in your network will not take effect in stand_alone mode. This means the the shard function called in the network is ignored. If you want to enable it, please use semi auto or auto parallel mode by context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL or context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL) data_parallel_group:dp-0-2 tensor_model_parallel_group:tp-0-1 2025-07-24 11:02:53,421 - mindformers./output/log[mindformers/models/modeling_utils.py:1517] - INFO - model built, but weights is unloaded, since the config has no checkpoint_name_or_path attribute or checkpoint_name_or_path is None. 
2025-07-24 11:02:53,423 - mindformers./output/log[/home/jenkins/mindspore/testcases/testcases/tests/st/networks/large_models/run_parallel.py:178] - INFO - ----------------Transform and load checkpoint---------------- Weight loading: 0%| | 0/24 [00:00system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 你好!<|im_end|> <|im_start|>assistant 你好!有什么可以帮助你的吗?<|im_end|> calculate sim is:1.0 2025-07-24 11:03:28,300 - mindformers./output/log[mindformers/generation/text_generator.py:735] - INFO - The batch is: 4, and the split_size is: 2, and the global_rank_id is: 0, and the dp_rank_id is: 0 and start is: 0, and stop is: 2 2025-07-24 11:03:28,301 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:28,302 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:28,302 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:28,303 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. 
[WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:28.308.420 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:28.308.576 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:28.309.021 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:28.315.730 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:28.352.848 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:28.352.994 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:28.353.356 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:28.353.442 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:28.353.602 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:29,678 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.3750135898590088 s; generated tokens: 208 tokens; generate speed: 151.27123217838735 tokens/s 2025-07-24 11:03:29,679 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0015180110931396484 s; prefill predict time: 0.037731170654296875 s; prefill post time: 0.005625009536743164 s; decode prepare time: 0.0008118013733799018 s; decode predict time: 0.00613552215052586 s; decode post time: 0.005840921864926236 s 2025-07-24 11:03:29,683 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. 
parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] return quick_sort(left) + middle + quick_sort(right) # 示例输入 arr = [ parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr else: pivot = arr[0] left = [x for x in arr[1:] if x < pivot] right = [x for x in arr[1:] if x >= pivot] return quick_sort(left) + [pivot] + quick_sort(right) # 示例输入 arr = [3,6,8,1 calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr pivot = arr[len(arr) // 2] left = [x for x in arr if x < pivot] middle = [x for x in arr if x == pivot] right = [x for x in arr if x > pivot] return quick_sort(left) + middle + quick_sort(right) # 示例输入 arr = [ parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user 用python编写快速排序<|im_end|> <|im_start|>assistant 以下是一个使用Python实现的快速排序算法: ```python def quick_sort(arr): if len(arr) <= 1: return arr else: pivot = arr[0] left = [x for x in arr[1:] if x < pivot] right = [x for x in arr[1:] if x >= pivot] return quick_sort(left) + [pivot] + quick_sort(right) # 示例输入 arr = [3,6,8,1 calculate sim is:1.0 2025-07-24 11:03:29,700 - mindformers./output/log[mindformers/generation/text_generator.py:735] - INFO - The batch is: 8, and the split_size is: 4, and the global_rank_id is: 0, and the dp_rank_id is: 0 and start is: 0, and stop is: 4 2025-07-24 11:03:29,701 - mindformers./output/log[mindformers/generation/text_generator.py:892] - INFO - Generation Config is: {'max_length': 128, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'num_beams': 1, 'do_sample': False, 'use_past': True, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8, 'repetition_penalty': 1.1, 'encoder_repetition_penalty': 1.0, 'renormalize_logits': False, 'return_dict_in_generate': False, 'output_scores': False, 'output_logits': False, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151643, 151645], 'parallel_decoding': False, 'window_size': 5, 'level': 5, 'guess_set_size': 3, '_from_model_config': True} 2025-07-24 11:03:29,702 - mindformers./output/log[mindformers/generation/text_generator.py:950] - INFO - The generation mode will be **GREEDY_SEARCH**. 2025-07-24 11:03:29,702 - mindformers./output/log[mindformers/modules/block_tables.py:63] - INFO - init cache engine success. 2025-07-24 11:03:29,703 - mindformers./output/log[mindformers/research/qwen2_5/infer/qwen2_5.py:188] - INFO - Set dynamic input for llama. 
[WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:29.707.890 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:29.708.043 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:29.708.466 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:29.714.461 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:29.727.848 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:29.727.993 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:29.728.300 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:29.728.393 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty [WARNING] INTERNAL_KERNEL(3806906,fffd7e7fc060,python):2025-07-24-11:03:29.728.555 [/home/jenkins/agent-working-dir/workspace/executor0/ms_kernels_internal/src/ops/matmul_common/tiling_utils.h:38] getMatMulTilingFromEnv] Env INTERNAL_MATMUL_TILING is not set or empty 2025-07-24 11:03:30,809 - mindformers./output/log[mindformers/generation/text_generator.py:1159] - INFO - total time: 1.1057765483856201 s; generated tokens: 408 tokens; generate speed: 368.9714713118669 tokens/s 2025-07-24 11:03:30,810 - mindformers./output/log[mindformers/tools/debug_info.py:93] - INFO - prefill prepare time: 0.0012700557708740234 s; prefill predict time: 0.010054349899291992 s; prefill post time: 0.008684635162353516 s; decode prepare time: 0.0008821086128159325 s; decode predict time: 0.00499650239944458 s; decode post time: 0.004758544487528282 s 2025-07-24 11:03:30,814 - mindformers./output/log[mindformers/modules/block_tables.py:126] - INFO - Clear block table cache engines. parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. 
Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. 
Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 parallel_qwen2_0_5b_predict_dp2_mp2, output_text: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to parallel_qwen2_0_5b_predict_dp2_mp2, answer: <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user I believe the meaning of life is<|im_end|> <|im_start|>assistant The meaning of life is a philosophical question that has been debated for centuries, and there is no one definitive answer to it. Some people believe that the meaning of life is to find happiness and fulfillment in their lives, while others believe that it is to achieve success or recognition. Others may argue that the meaning of life is to make a positive impact on the world, to help others, and to contribute to society as a whole. Others may believe that the meaning of life is to pursue knowledge and understanding, to calculate sim is:1.0 large_models/parallel_qwen2_0_5b_predict_dp2_mp2/scheduler.log0000644000175100017500000003016615040321203025136 0ustar jenkinsHwHiAiUser/home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) 2025-07-24 11:02:48,491 - mindformers./output/log[mindformers/tools/register/template.py:84] - WARNING - The input config moe_config is empty. 
2025-07-24 11:02:48,525 - mindformers./output/log[mindformers/core/context/build_context.py:168] - INFO - Predict context config, jit_level: O0, infer_boost: on [WARNING] ME(3806904:281473103445808,MainProcess):2025-07-24-11:02:48.526.202 [mindspore/context.py:1412] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(3806904:281473103445808,MainProcess):2025-07-24-11:02:48.526.390 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_device_memory' will be deprecated and removed in a future version. Please use the api mindspore.runtime.set_memory() instead. [WARNING] ME(3806904:281473103445808,MainProcess):2025-07-24-11:02:48.526.483 [mindspore/context.py:1346] For 'context.set_context', when set the argument 'max_device_memory', the argument 'device_target' only supports devices in '['Ascend', 'GPU']', but got 'CPU', ignore it. [WARNING] ME(3806904:281473103445808,MainProcess):2025-07-24-11:02:48.526.556 [mindspore/context.py:1412] For 'context.set_context', the parameter 'max_call_depth' will be deprecated and removed in a future version. Please use the api mindspore.set_recursion_limit() instead. [WARNING] ME(3806904:281473103445808,MainProcess):2025-07-24-11:02:48.526.642 [mindspore/context.py:1655] For 'context.set_context', 'enable_graph_kernel' parameter is deprecated, and will be removed in the next version. Please use jit_config={'jit_level': 'O1'} instead. [WARNING] ME(3806904:281473103445808,MainProcess):2025-07-24-11:02:48.526.765 [mindspore/context.py:1412] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. [WARNING] ME(3806904:281473103445808,MainProcess):2025-07-24-11:02:48.526.842 [mindspore/context.py:1346] For 'context.set_context', when set the argument 'ascend_config', the argument 'device_target' only supports devices in '['Ascend']', but got 'CPU', ignore it. [WARNING] ME(3806904:281473103445808,MainProcess):2025-07-24-11:02:48.526.913 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(3806904:281473103445808,MainProcess):2025-07-24-11:02:48.527.023 [mindspore/context.py:1412] For 'context.set_context', the parameter 'save_graphs_path' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS_PATH instead. 2025-07-24 11:02:48,527 - mindformers./output/log[mindformers/core/context/parallel.py:73] - INFO - full_batch is set to False for non-parallel modes [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:02:48.528.356 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(1/1200). [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:02:49.028.468 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(2/1200). 
[WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:02:49.030.403 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 3(role: MS_WORKER), rank id: 3, device id: 3, hostname: devserver-dfb8-1, ip: 127.0.0.1 is registered successfully. Currently registered node number: 1, expected node number: 4 [WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:02:49.044.978 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 1(role: MS_WORKER), rank id: 1, device id: 1, hostname: devserver-dfb8-1, ip: 127.0.0.1 is registered successfully. Currently registered node number: 2, expected node number: 4 [WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:02:49.195.529 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 2(role: MS_WORKER), rank id: 2, device id: 2, hostname: devserver-dfb8-1, ip: 127.0.0.1 is registered successfully. Currently registered node number: 3, expected node number: 4 [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:02:49.528.556 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:246] BuildCluster] Topology build timed out., retry(3/1200). [WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:02:49.531.539 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:768] ReassignNodeRank] Rank ids are already set by numeric node ids. No need to reassign them. [WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:02:49.531.582 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:254] ProcessRegister] The new node: 0(role: MS_WORKER), rank id: 0, device id: 0, hostname: devserver-dfb8-1, ip: 127.0.0.1 is registered successfully. Currently registered node number: 4, expected node number: 4 [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:02:50.028.668 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:249] BuildCluster] Cluster is successfully initialized. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:02:50.028.727 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 4 alive nodes. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:02:50.028.741 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:02:55.028.896 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 4 alive nodes. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:02:55.028.958 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:00.029.100 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 4 alive nodes. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:00.029.157 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... 
[WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:05.029.332 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 4 alive nodes. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:05.029.394 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:10.029.548 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 4 alive nodes. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:10.029.616 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:15.029.776 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 4 alive nodes. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:15.029.839 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:20.029.981 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 4 alive nodes. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:20.030.039 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:25.030.186 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 4 alive nodes. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:25.030.256 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:30.030.394 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:98] Finalize] The meta server node can not be finalized because there are still 4 alive nodes. [WARNING] DISTRIBUTED(3806904,ffff90583f30,python):2025-07-24-11:03:30.030.461 [mindspore/ccsrc/distributed/cluster/cluster_context.cc:154] Finalize] This log means the cluster is successfully created. Retry to finalize the node and exit cluster... [WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:03:30.231.583 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:323] ProcessUnregister] Node 2 has unregistered. 
[WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:03:30.231.978 [mindspore/ccsrc/distributed/rpc/tcp/connection.cc:79] SocketEventHandler] Event value fd: 23, events: 8193, state: 4, errcode: 11, errno: 11 Resource temporarily unavailable, remote peer: 127.0.0.1:59696, type:1, remote: 1, count: 1, this peer: 127.0.0.1:8240, please check remote peer address: 127.0.0.1:59696 in worker log to find out which worker disconnected. [WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:03:30.565.767 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:323] ProcessUnregister] Node 3 has unregistered. [WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:03:31.054.631 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:323] ProcessUnregister] Node 0 has unregistered. [WARNING] DISTRIBUTED(3806904,ffff01b62060,python):2025-07-24-11:03:31.067.739 [mindspore/ccsrc/distributed/cluster/topology/meta_server_node.cc:323] ProcessUnregister] Node 1 has unregistered. large_models/qwen2_weight_processor.py0000644000175100017500000003555015040315702020770 0ustar jenkinsHwHiAiUser# Copyright 2025 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ """ transform huggingface model to mindspore safetensor. """ import os import json import gc import numpy as np from tqdm import tqdm from safetensors import safe_open import mindspore as ms from mindspore.communication.management import get_rank from weight_processor import BaseWeightProcessor class Qwen2WeightProcessor(BaseWeightProcessor): r""" Provide Qwen2 Model weight load and shards. Args: config (Qwen2Config): The config of Qwen2 model. network (InferenceQwen2ForCausalLM): The network of Qwen2. 
""" def __init__(self, config, network, is_quant): # pylint: disable=useless-super-delegation super().__init__(config, network, is_quant) def infer_convert_outer_weight(self, src_hf_dir, hf_weight_map): """convert weight not in model""" embed_tokens_hf_name = "model.embed_tokens.weight" embed_tokens_ms_name = self.convert_weight_name(embed_tokens_hf_name) if self.config.parallel_config.vocab_emb_dp: np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) else: np_data, _ = self.get_safetensor_from_file_split_tp_group(embed_tokens_hf_name, src_hf_dir, hf_weight_map, split_axis=0) self.parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), name=embed_tokens_ms_name, requires_grad=False) norm_hf_name = "model.norm.weight" norm_ms_name = self.convert_weight_name(norm_hf_name) np_data, _ = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[norm_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), name=norm_ms_name, requires_grad=False) lm_head_hf_name = "lm_head.weight" lm_head_ms_name = self.convert_weight_name(lm_head_hf_name) if not self.config.model.model_config.tie_word_embeddings: if not self.config.parallel_config.vocab_emb_dp: np_data, _ = self.get_safetensor_from_file_split_tp_group(lm_head_hf_name, src_hf_dir, hf_weight_map, split_axis=0) else: np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), name=lm_head_ms_name, requires_grad=False) def convert_weight_name(self, weight_name: str): """replace weight name""" weight_name = weight_name.replace('embed_tokens.weight', 'tok_embeddings.embedding_weight') weight_name = weight_name.replace('self_attn.q_proj.', 'attention.wq.') weight_name = weight_name.replace('self_attn.k_proj.', 'attention.wk.') weight_name = weight_name.replace('self_attn.v_proj.', 'attention.wv.') weight_name = weight_name.replace('self_attn.o_proj.', 'attention.wo.') weight_name = weight_name.replace('mlp.gate_proj.', 'feed_forward.w1.') weight_name = weight_name.replace('mlp.down_proj.', 'feed_forward.w2.') weight_name = weight_name.replace('mlp.up_proj.', 'feed_forward.w3.') weight_name = weight_name.replace('.input_layernorm.', '.attention_norm.') weight_name = weight_name.replace('.post_attention_layernorm.', '.ffn_norm.') weight_name = weight_name.replace('model.norm.weight', 'model.norm_out.weight') return weight_name def infer_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process dense ffn weight""" ffn_concat = self.config.model.model_config.qkv_concat w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" w_gate_hidden_param = np.concatenate((w1_ms_param, 
w3_ms_param), axis=0) self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, requires_grad=False) else: self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.bfloat16), name=w1_ms_name, requires_grad=False) self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.bfloat16), name=w3_ms_name, requires_grad=False) self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.bfloat16), name=w2_ms_name, requires_grad=False) def infer_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process attention weight""" qkv_concat = self.config.model.model_config.qkv_concat # wq wq_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.weight" wq_ms_name = self.convert_weight_name(wq_hf_name) wq_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wq_hf_name, src_hf_dir, hf_weight_map, split_axis=0) # wq bias wq_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.bias" wq_bias_ms_name = self.convert_weight_name(wq_bias_hf_name) wq_bias_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wq_bias_hf_name, src_hf_dir, hf_weight_map, split_axis=0) # wk wk_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.weight" wk_ms_name = self.convert_weight_name(wk_hf_name) wk_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wk_hf_name, src_hf_dir, hf_weight_map, split_axis=0) # wk bias wk_bias_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.bias" wk_bias_ms_name = self.convert_weight_name(wk_bias_hf_name) wk_bias_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wk_bias_hf_name, src_hf_dir, hf_weight_map, split_axis=0) # wv wv_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.weight" wv_ms_name = self.convert_weight_name(wv_hf_name) wv_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wv_hf_name, src_hf_dir, hf_weight_map, split_axis=0) # wv bias wv_bias_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.bias" wv_bias_ms_name = self.convert_weight_name(wv_bias_hf_name) wv_bias_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wv_bias_hf_name, src_hf_dir, hf_weight_map, split_axis=0) if qkv_concat: w_qkv_name = f"model.layers.{layer_id}.attention.w_qkv.weight" w_qkv_param = np.concatenate((wq_ms_param, wk_ms_param, wv_ms_param), axis=0) w_qkv_param = ms.from_numpy(w_qkv_param).astype(ms.bfloat16) self.parameter_dict[w_qkv_name] = ms.Parameter(w_qkv_param, name=w_qkv_name, requires_grad=False) w_qkv_bias_name = f"model.layers.{layer_id}.attention.w_qkv.bias" w_qkv_bias_param = np.concatenate((wq_bias_ms_param, wk_bias_ms_param, wv_bias_ms_param), axis=0) w_qkv_bias_param = ms.from_numpy(w_qkv_bias_param).astype(ms.bfloat16) self.parameter_dict[w_qkv_bias_name] = ms.Parameter(w_qkv_bias_param, name=w_qkv_bias_name, requires_grad=False) else: self.parameter_dict[wq_ms_name] = ms.Parameter(ms.from_numpy(wq_ms_param).astype(ms.bfloat16), name=wq_ms_name, requires_grad=False) self.parameter_dict[wk_ms_name] = ms.Parameter(ms.from_numpy(wk_ms_param).astype(ms.bfloat16), name=wk_ms_name, requires_grad=False) self.parameter_dict[wv_ms_name] = ms.Parameter(ms.from_numpy(wv_ms_param).astype(ms.bfloat16), name=wv_ms_name, requires_grad=False) self.parameter_dict[wq_bias_ms_name] = ms.Parameter( ms.from_numpy(wq_bias_ms_param).astype(ms.bfloat16), name=wq_bias_ms_name, requires_grad=False) self.parameter_dict[wk_bias_ms_name] = ms.Parameter( ms.from_numpy(wk_bias_ms_param).astype(ms.bfloat16), name=wk_bias_ms_name, 
requires_grad=False) self.parameter_dict[wv_bias_ms_name] = ms.Parameter( ms.from_numpy(wv_bias_ms_param).astype(ms.bfloat16), name=wv_bias_ms_name, requires_grad=False) # wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.convert_weight_name(wo_hf_name) wo_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wo_hf_name, src_hf_dir, hf_weight_map, split_axis=1) self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.bfloat16), name=wo_ms_name, requires_grad=False) def infer_process_norm_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process attention weight""" # attention_norm attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) attention_norm_ms_param, _ = self.get_safetensor_from_file(attention_norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[attention_norm_ms_name] = ms.Parameter( ms.from_numpy(attention_norm_ms_param).astype(ms.bfloat16), name=attention_norm_ms_name, requires_grad=False) # ffn_norm ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) ffn_norm_ms_param, _ = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[ffn_norm_ms_name] = ms.Parameter( ms.from_numpy(ffn_norm_ms_param).astype(ms.bfloat16), name=ffn_norm_ms_name, requires_grad=False) def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer convert layer weight""" self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) def load_safetensors_shard(self, src_hf_dir): """qwen load safetensors and shard """ rank_id = get_rank() param_json_path = "" for file in os.listdir(src_hf_dir): if file.endswith('index.json'): param_json_path = os.path.join(src_hf_dir, file) break hf_weight_map = {} if os.path.exists(param_json_path): with open(param_json_path, "r") as fp: hf_weight_map = json.load(fp)['weight_map'] else: # only one safetensor, create a hf_weight_map safetensor_file = "model.safetensors" with safe_open(f"{src_hf_dir}/{safetensor_file}", framework="np") as sf_file: all_keys = sf_file.keys() for key in all_keys: hf_weight_map[str(key).strip()] = safetensor_file self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) num_layers = self.config.model.model_config.num_layers enable_tqdm = rank_id == 0 for layer_id in tqdm(range(num_layers), desc="Weight loading", disable=not enable_tqdm): self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, self.parameter_dict) print("param_not_load: %s, ckpt_not_load: %s" % (str(param_not_load), str(ckpt_not_load))) del self.parameter_dict gc.collect()
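
# ----------------------------------------------------------------------------
# Illustrative sketch (added; not used by the loader above). The split_axis
# convention assumed by get_safetensor_from_file_split_tp_group appears to be
# the Megatron-style tensor-parallel split for [out, in] weights: split_axis=0
# shards the output dimension (q/k/v, gate/up projections), split_axis=1 the
# input dimension (o_proj, down_proj). `_demo_shard` is a hypothetical numpy
# stand-in showing the expected slicing:
#
#     >>> w = np.arange(24).reshape(4, 6)
#     >>> _demo_shard(w, tp_rank=0, tp_size=2, split_axis=0).shape
#     (2, 6)
#     >>> _demo_shard(w, tp_rank=1, tp_size=2, split_axis=1).shape
#     (4, 3)
def _demo_shard(weight, tp_rank, tp_size, split_axis):
    """Return this rank's slice of `weight` along `split_axis` (numpy only)."""
    return np.split(weight, tp_size, axis=split_axis)[tp_rank]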
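
# Minimal self-check (added sketch, not part of the original test flow):
# convert_weight_name is a pure string mapping that never reads instance
# state, so the HuggingFace -> MindSpore renaming can be exercised without
# building a network, a config, or a distributed context.
if __name__ == "__main__":
    assert Qwen2WeightProcessor.convert_weight_name(
        None, "model.layers.0.self_attn.q_proj.weight") == "model.layers.0.attention.wq.weight"
    assert Qwen2WeightProcessor.convert_weight_name(
        None, "model.embed_tokens.weight") == "model.tok_embeddings.embedding_weight"
    assert Qwen2WeightProcessor.convert_weight_name(
        None, "model.norm.weight") == "model.norm_out.weight"
    print("convert_weight_name mapping self-check passed.")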