Diff Coverage

Diff: origin/master...HEAD, staged and unstaged changes

| Source File | Diff Coverage (%) | Missing Lines |
| --- | --- | --- |
| hyper_parallel/auto_parallel/sapp_nd/memory_estimation/_backbone.py | 30.0% | 676-677,679-683 |
| hyper_parallel/auto_parallel/sapp_nd/memory_estimation/evaluators/body.py | 100% |  |
| hyper_parallel/auto_parallel/sapp_nd/memory_estimation/evaluators/comm.py | 100% |  |
| hyper_parallel/auto_parallel/sapp_nd/memory_estimation/evaluators/layer_block.py | 100% |  |
| hyper_parallel/auto_parallel/sapp_nd/nd/common/_cost_model_variables.py | 100% |  |
| hyper_parallel/auto_parallel/sapp_nd/nd/common/cost_model_preprocess.py | 100% |  |
| hyper_parallel/auto_parallel/sapp_nd/nd/common/cp_types.py | 100% |  |
| hyper_parallel/auto_parallel/sapp_nd/nd/common/framework_parsers/_cost_model_parser.py | 100% |  |
| hyper_parallel/auto_parallel/sapp_nd/nd/common/hardware.py | 100% |  |
| hyper_parallel/auto_parallel/sapp_nd/nd/dimensions.py | 95.7% | 232,262,389,478 |
| hyper_parallel/auto_parallel/sapp_nd/nd/parallelize.py | 100% |  |
| hyper_parallel/auto_parallel/sapp_nd/perf_estimation/comm_time.py | 82.5% | 63,131,495-496,504,559-563 |
hyper_parallel/auto_parallel/sapp_nd/memory_estimation/_backbone.py
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
            ins["AllGather Comm"] = self.mb(stage_accu[MemType.AG_COMM])
            ins["All2All Comm"] = self.mb(stage_accu[MemType.A2A_COMM])

            if self._ccfg.cp > 1:
                cp_memory = EvalBody.act_cp_layer(self._ccfg, self._ctx)
                cp_comm_buffer = EvalLayerComm.cp_comm_buffer(self._ccfg, self._ctx)

                ins["CP KV Cache"] = self.mb(cp_memory.kv_cache_memory)
                ins["CP Attn Scores"] = self.mb(cp_memory.attention_scores_memory)
                ins["CP Softmax"] = self.mb(cp_memory.softmax_outputs_memory)
                ins["CP Comm Buffer"] = self.mb(cp_comm_buffer)
                ins["CP Reduction"] = self.mb(cp_memory.total_reduction)

            ins["Node Log"] = sm["logs"][stage_id].node_compute_log
            # VERBOSE
            if verbose and spec_stage_id in (-1, stage_id):
hyper_parallel/auto_parallel/sapp_nd/nd/dimensions.py
228
229
230
231
232
233
234
235
236
    @staticmethod
    def _check_mbn_pp(dims_val, all_dims):
        """Return True if MBN/PP combination is valid."""
        if MBN not in dims_val or PP not in all_dims:
            return True
        valid = dims_val[MBN] >= dims_val[PP]
        valid = valid and not (dims_val[PP] == 1 and dims_val[MBN] > 1)
        if not valid:
            logger.warning("PP and MBN were deemed not suitable")
258
259
260
261
262
263
264
265
266
            if self.dims_val[SP] and self.dims_val[CP] > 1:
                logger.warning("SP & CP cannot coexist")
                return False
        if OP in self.all_dims and not self._check_power_of_two(OP, self.dims_val[OP]):
            return False
        return True

    def val(self, dim):
        """Get Dimension value"""
385
386
387
388
389
390
391
392
393
        )

    attn_upper = p.attention_type_str.upper()
    if attn_upper == "MLA":
        recommended_cp_max = 16
    elif attn_upper == "GQA":
        recommended_cp_max = 8
    else:
        recommended_cp_max = 4
474
475
476
477
478
479
480
481
482
        cp_algo, attention_heads, sp_enabled, num_kv_heads,
    )

    if p.cp_degree <= 1:
        return _cp_ok_result()

    if p.sp_enabled:
        return _cp_ok_result(
            is_valid=False,
hyper_parallel/auto_parallel/sapp_nd/perf_estimation/comm_time.py
59
60
61
62
63
64
65
66
67
    intra_ranks = min(int(cp), int(device_per_node))
    if cp <= device_per_node:
        return "intra-node", bw_intra
    if intra_ranks == 1:
        return "cross-node", bw_inter
    intra_fraction = (intra_ranks - 1) / (cp - 1)
    cross_fraction = 1.0 - intra_fraction
    bw = intra_fraction * bw_intra + cross_fraction * bw_inter
    return "mixed", bw
127
128
129
130
131
132
133
134
135
    cp = ccfg.cp
    t = max(1, ccfg.t)

    if ccfg.a <= 0:
        raise ValueError(f"Number of attention heads must be positive, got {ccfg.a}")

    kv_dim = compute_kv_dim(ccfg)
    attention_type = detect_attention_type(ccfg)
    cp_algo = _resolve_cp_algo(ccfg)
491
492
493
494
495
496
497
498
499
                comm[Dim.EP] += EvalLayerComm.ep_comm_layer(
                    param["cfg"], param["ctx"], 1
                )  # * param["cfg"].ep
                if param["cfg"].cp > 1:
                    cp_comm_details = cp_comm_layer_detailed(param["cfg"], param["ctx"])
                    comm[Dim.CP] += cp_comm_details.comm_volume
                # min(device_type.level_bound_number[0], param["cfg"].ep)
                # comm_cp += EvalLayerComm.cp_comm_layer
                # (param["cfg"], param["ctx"])
500
501
502
503
504
505
506
507
508



        if param["ccfg"].ttype == PerformanceType.TIME:
            for dim, ov in zip([Dim.DP, Dim.TP, Dim.CP], [0.0, 0, 0.0]):
                comm[dim] = estimate_comm_score(
                    param["cfg"],
                    comm[dim],
                    dim,
555
556
557
558
559
560
561
562
563
564
565
566
567
        param["debugger"].info[PerfParts.EP_COMM] = comms[Dim.EP]
        param["debugger"].info[PerfParts.CP_COMM] = comms[Dim.CP]

        if param["cfg"].cp > 1:
            cp_comm_details = cp_comm_layer_detailed(param["cfg"], param["ctx"])
            param["debugger"].info["CP_KV_VOLUME"] = cp_comm_details.total_kv_volume
            param["debugger"].info["CP_EXPOSED_TIME"] = cp_comm_details.exposed_comm_time
            param["debugger"].info["CP_TOPOLOGY"] = cp_comm_details.topology
            param["debugger"].info["CP_BANDWIDTH"] = cp_comm_details.effective_bandwidth

    res = []
    for i, c in enumerate(comms[Dim.TP]):
        res += [c + comms[Dim.DP][i] + comms[Dim.EP][i] + comms[Dim.CP][i]]