Diff Coverage

Source File	Diff Coverage (%)	Missing Lines
hyper_parallel/core/shard/ops/parallel_ms_flash_attention_score.py	52.0%	336-337,343,350,354,358,821-822,942-948,952,955-956,965,987,995,1004,1028-1032,1034-1036,1038-1042,1044-1045,1047-1048,1052-1055,1057,1062,1071-1072,1089
hyper_parallel/core/shard/ops/parallel_npu_flash_attention_score.py	60.9%	110,157-159,161,166,168-170,172-173,366-367,373,380,384,388,545,591,611,906-907,1013,1121-1122
hyper_parallel/core/shard/ops/parallel_outer.py	96.3%	32
hyper_parallel/core/shard/ops/parallel_scaled_dot_product_attention.py	82.9%	36,70-72,74-75,79,88,90,95,345,550
hyper_parallel/core/shard/ops/parallel_scatter.py	100%
hyper_parallel/core/shard/ops/parallel_transpose.py	91.2%	68,114,130

hyper_parallel/core/shard/ops/parallel_ms_flash_attention_score.py

        #
        # kv_seq_split_num > 1 is blocked by a guard in
        # _compute_adjusted_sparse_params before reaching this function,
        # so local_kv_len == global_kv_len is guaranteed here.
        local_q_len = query.shape[seq_dim_idx]
        local_kv_len = key.shape[seq_dim_idx]

        if sparse_mode in (SPARSE_DEFAULT_MASK, SPARSE_BAND):
            new_pre_tokens = pre_tokens
            new_next_tokens = next_tokens

        if sparse_mode in (SPARSE_DEFAULT_MASK, SPARSE_BAND):
            new_pre_tokens = pre_tokens
            new_next_tokens = next_tokens
        else:
            new_pre_tokens = local_kv_len
            new_next_tokens = 0

        new_sparse_mode = SPARSE_BAND if sparse_mode != SPARSE_DEFAULT_MASK else sparse_mode
        update_mode = SPARSE_MODE_UPDATE_MAP[sparse_mode]

        new_sparse_mode = SPARSE_BAND if sparse_mode != SPARSE_DEFAULT_MASK else sparse_mode
        update_mode = SPARSE_MODE_UPDATE_MAP[sparse_mode]

        if update_mode == LEFT_UP_TO_LEFT_UP:
            offset = -split_id * local_q_len
            new_pre_tokens = new_pre_tokens + offset
            new_next_tokens = new_next_tokens - offset
        elif update_mode == LEFT_UP_TO_RIGHT_DOWN:
            offset = local_kv_len - (split_id + 1) * local_q_len
            new_pre_tokens = new_pre_tokens + offset
            new_next_tokens = new_next_tokens - offset
        elif update_mode == RIGHT_DOWN_TO_RIGHT_DOWN:
            offset = (split_num - split_id - 1) * local_q_len
            new_pre_tokens = new_pre_tokens + offset
            new_next_tokens = new_next_tokens - offset

        return new_sparse_mode, new_pre_tokens, new_next_tokens

        is_dynamic: bool,
    ) -> Tuple[int, int, int]:
        """Compute adjusted sparse parameters based on dynamic or static shape."""
        if is_dynamic:
            if kv_seq_split_num > 1:
                raise NotImplementedError(
                    f"For {self.op_name}, dynamic shape with KV sequence sharding "
                    f"(kv_seq_split_num={kv_seq_split_num}) is not yet supported. "
                    f"The dynamic path currently uses local KV length directly, "
                    f"while the static path multiplies by kv_seq_split_num to obtain "

            # Scalar parameters from pyboost may arrive as Tensor objects.
            # Arithmetic and comparison operations on such Tensor scalars
            # would trigger device kernel calls, so convert them to Python
            # native types early.
            head_num = int(self._to_python_scalar(p_head_num))
            keep_prob = self._to_python_scalar(p_keep_prob)
            scale_value = self._to_python_scalar(p_scale_value)
            pre_tokens = int(self._to_python_scalar(p_pre_tokens))
            next_tokens = int(self._to_python_scalar(p_next_tokens))
            inner_precise = int(self._to_python_scalar(p_inner_precise))
            sparse_mode = int(self._to_python_scalar(p_sparse_mode))

            # Ensure the runtime input_layout matches the cached value used
            # for sharding derivation and validation.
            runtime_input_layout = _resolve_input_layout(
                self._to_python_scalar(p_input_layout)
            )
            if runtime_input_layout != input_layout:
                raise ValueError(
                    f"For {self.op_name}, runtime input_layout {runtime_input_layout!r} "
                    f"does not match the cached input_layout {input_layout!r} "
                    f"used for sharding inference. This may indicate an incorrect "
                    f"dispatcher cache key."

                )

            is_varlen = input_layout == "TND" and actual_seq_qlen is not None
            self._validate_attn_mask(attn_mask, sparse_mode, input_layout, is_varlen)
            FlashAttentionScoreDistributedOp._validate_real_shift_configuration(
                real_shift, sparse_mode)

            split_info = self._get_split_info(query_layout, input_layout)
            head_split_num = split_info["head"]

                return FlashAttentionScoreDistributedOp._truncate_result(result)

            adjusted_head_num = self._adjust_head_num(head_num, head_split_num)

            (adjusted_sparse_mode, adjusted_pre_tokens, adjusted_next_tokens,
             adjusted_actual_seq_qlen, adjusted_actual_seq_kvlen) = self._apply_seq_split_adjustments(
                query, key, query_layout, key_layout, input_layout,
                sparse_mode, pre_tokens, next_tokens,
                actual_seq_qlen, actual_seq_kvlen,

                actual_seq_qlen, actual_seq_kvlen,
                seq_split_num, lb_split_id, lb_split_num,
            )

            result = func(
                query, key, value,
                real_shift, drop_mask, padding_mask, attn_mask, prefix,
                adjusted_actual_seq_qlen, adjusted_actual_seq_kvlen,
                int(adjusted_head_num), keep_prob, scale_value,

                int(adjusted_pre_tokens), int(adjusted_next_tokens), inner_precise,
                p_input_layout, int(adjusted_sparse_mode),
            )

            return FlashAttentionScoreDistributedOp._truncate_result(result)

        return _expanded_impl

    def _apply_seq_split_adjustments(  # pylint: disable=too-many-arguments,too-many-locals

        Returns:
            Tuple of (adjusted_sparse_mode, adjusted_pre_tokens, adjusted_next_tokens,
                      adjusted_actual_seq_qlen, adjusted_actual_seq_kvlen).
        """
        adjusted_sparse_mode = sparse_mode
        adjusted_pre_tokens = pre_tokens
        adjusted_next_tokens = next_tokens
        adjusted_actual_seq_qlen = actual_seq_qlen
        adjusted_actual_seq_kvlen = actual_seq_kvlen

        if seq_split_num > 1 or lb_split_id is not None:
            dynamic_info = self._get_dynamic_shape_info(query, key, input_layout)
            is_dynamic = dynamic_info.get('is_dynamic', False)

            if lb_split_id is not None:
                if lb_split_num is None:
                    raise ValueError("lb_split_num must not be None when lb_split_id is set")
                split_id = lb_split_id
                seq_split_num = lb_split_num
            else:
                split_id = self._get_split_id(query_layout, input_layout)
            seq_dim_idx = self._get_seq_dim_idx(self._layout_dims.get(input_layout, {}))

            if seq_dim_idx is None:
                raise ValueError(
                    f"Cannot infer seq/total dim for input_layout={input_layout}"
                )

            kv_seq_split_num = 1
            if key_layout is not None:
                kv_split_info = self._get_split_info(key_layout, input_layout)
                kv_seq_split_num = kv_split_info["seq"]

            self._check_seq_sharding_compatibility(
                query_layout, key_layout, input_layout,
                seq_dim_idx, seq_split_num, kv_seq_split_num
            )

            (adjusted_sparse_mode,
             adjusted_pre_tokens,
             adjusted_next_tokens) = self._compute_adjusted_sparse_params(
                query, key,
                sparse_mode, pre_tokens, next_tokens,

                split_id, seq_split_num, seq_dim_idx,
                kv_seq_split_num, is_dynamic,
            )

            if input_layout == "TND":
                (adjusted_sparse_mode,
                 adjusted_pre_tokens,
                 adjusted_next_tokens,
                 adjusted_actual_seq_qlen,
                 adjusted_actual_seq_kvlen) = self._adjust_tnd_layout_params(

                        kv_seq_split_num=kv_seq_split_num, is_dynamic=is_dynamic,
                    ),
                )

        return (adjusted_sparse_mode, adjusted_pre_tokens, adjusted_next_tokens,
                adjusted_actual_seq_qlen, adjusted_actual_seq_kvlen)

    def _get_seq_dim_idx(self, dims: dict) -> Optional[int]:
        """Get the sequence dimension index."""

hyper_parallel/core/shard/ops/parallel_npu_flash_attention_score.py


    Returns:
        tuple: (positional_args_tuple, empty_kwargs_dict)
    """
    return (
        query, key, value, head_num, input_layout,
        pse, padding_mask, atten_mask,
        scale, keep_prob, pre_tockens, next_tockens,
        inner_precise, prefix, actual_seq_qlen, actual_seq_kvlen,


        Returns:
            tuple: (local_args, local_kwargs, cache_values)
        """
        args, kwargs = _normalize_npu_fusion_attention_args(*args, **kwargs)
        query, key, value = args[0], args[1], args[2]
        input_layout = args[4]

        local_args = (
            query.to_local() if hasattr(query, '_layout') else query,
            key.to_local() if hasattr(key, '_layout') else key,
            value.to_local() if hasattr(value, '_layout') else value,
        ) + args[3:]
        local_kwargs = {}

        query_layout = query.layout if hasattr(query, "_layout") else None
        key_layout = key.layout if hasattr(key, "_layout") else None
        value_layout = value.layout if hasattr(value, "_layout") else None

        cache_values = [query_layout, key_layout, value_layout, input_layout]
        return local_args, local_kwargs, cache_values

    def _is_dynamic_shape(self, tensor: Tensor, dim: int) -> bool:
        """Check if tensor has dynamic shape at given dimension."""
        try:

        #
        # kv_seq_split_num > 1 is blocked by a guard in
        # _compute_adjusted_sparse_params before reaching this function,
        # so local_kv_len == global_kv_len is guaranteed here.
        local_q_len = query.shape[seq_dim_idx]
        local_kv_len = key.shape[seq_dim_idx]

        if sparse_mode in (SPARSE_DEFAULT_MASK, SPARSE_BAND):
            new_pre_tockens = pre_tockens
            new_next_tockens = next_tockens

        if sparse_mode in (SPARSE_DEFAULT_MASK, SPARSE_BAND):
            new_pre_tockens = pre_tockens
            new_next_tockens = next_tockens
        else:
            new_pre_tockens = local_kv_len
            new_next_tockens = 0

        new_sparse_mode = SPARSE_BAND if sparse_mode != SPARSE_DEFAULT_MASK else sparse_mode
        update_mode = SPARSE_MODE_UPDATE_MAP[sparse_mode]

        new_sparse_mode = SPARSE_BAND if sparse_mode != SPARSE_DEFAULT_MASK else sparse_mode
        update_mode = SPARSE_MODE_UPDATE_MAP[sparse_mode]

        if update_mode == LEFT_UP_TO_LEFT_UP:
            offset = -split_id * local_q_len
            new_pre_tockens = new_pre_tockens + offset
            new_next_tockens = new_next_tockens - offset
        elif update_mode == LEFT_UP_TO_RIGHT_DOWN:
            offset = local_kv_len - (split_id + 1) * local_q_len
            new_pre_tockens = new_pre_tockens + offset
            new_next_tockens = new_next_tockens - offset
        elif update_mode == RIGHT_DOWN_TO_RIGHT_DOWN:
            offset = (split_num - split_id - 1) * local_q_len
            new_pre_tockens = new_pre_tockens + offset
            new_next_tockens = new_next_tockens - offset

        return new_sparse_mode, new_pre_tockens, new_next_tockens

        Raises:
            ValueError: If any validation rule is violated.
        """
        if query_layout is None:
            raise ValueError(
                f"For {op_name}, query layout cannot be None"
            )

        NPUFlashAttentionScoreDistributedOp._validate_sharding_consistency(

        value_layout = cache_values[2]
        input_layout_str = cache_values[3]

        if not isinstance(input_layout_str, str):
            raise ValueError(
                f"For {self.op_name}, input_layout should be a string, "
                f"but got {type(input_layout_str)}"
            )

        )

        attention_out_layout = copy.deepcopy(query_layout)
        if attention_out_layout.placements is None and attention_out_layout.tensor_map is not None:
            attention_out_layout.tensor_map_to_placement()

        softmax_layout = self._infer_softmax_layout_by_input_layout(
            query_layout, input_layout_str, ""
        )

        is_dynamic: bool,
    ) -> Tuple[int, int, int]:
        """Compute adjusted sparse parameters based on dynamic or static shape."""
        if is_dynamic:
            if kv_seq_split_num > 1:
                raise NotImplementedError(
                    f"For {self.op_name}, dynamic shape with KV sequence sharding "
                    f"(kv_seq_split_num={kv_seq_split_num}) is not yet supported. "
                    f"The dynamic path currently uses local KV length directly, "
                    f"while the static path multiplies by kv_seq_split_num to obtain "

        query_layout = cache_values[0]
        key_layout = cache_values[1]

        if query_layout is None:
            return None

        def _expanded_impl(  # pylint: disable=R0913
            query,
            key,

            dynamic_info = self._get_dynamic_shape_info(query, key, input_layout)
            is_dynamic = dynamic_info.get('is_dynamic', False)

            if lb_split_id is not None:
                if lb_split_num is None:
                    raise ValueError(
                        "lb_split_num must not be None when lb_split_id is set"
                    )
                split_id = lb_split_id
                seq_split_num = lb_split_num

hyper_parallel/core/shard/ops/parallel_outer.py


def _get_alias_shard_set(dim_alias):
    if isinstance(dim_alias, str):
        return {dim_alias} if dim_alias != "None" else set()
    return set(dim_alias)


class OuterDistributedOp(DistributedOp):
    """Distributed implementation for torch.outer."""

hyper_parallel/core/shard/ops/parallel_scaled_dot_product_attention.py



def _normalize_sdpa_args(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None,
                         enable_gqa=False):
    return (query, key, value, attn_mask, dropout_p, is_causal, scale), {'enable_gqa': enable_gqa}


class ScaledDotProductAttentionDistributedOp(DistributedOp):
    """Distributed operator for torch.nn.functional.scaled_dot_product_attention.

        Returns:
            tuple: (local_args, local_kwargs, cache_values) where local_args contains
                local tensors and runtime scalars, and cache_values contains Layout objects.
        """
        args, kwargs = _normalize_sdpa_args(*args, **kwargs)
        query, key, value, attn_mask, dropout_p, is_causal, scale = args
        enable_gqa = kwargs['enable_gqa']

        if hasattr(attn_mask, '_layout'):
            raise NotImplementedError(
                f"For {self.op_name}, DTensor attn_mask is not supported yet."
            )

        local_args = (
            query.to_local() if hasattr(query, '_layout') else query,
            key.to_local() if hasattr(key, '_layout') else key,
            value.to_local() if hasattr(value, '_layout') else value,
            attn_mask,

            dropout_p,
            is_causal,
            scale,
        )
        local_kwargs = {'enable_gqa': enable_gqa}

        cache_values = [
            query.layout if hasattr(query, '_layout') else None,
            key.layout if hasattr(key, '_layout') else None,
            value.layout if hasattr(value, '_layout') else None,
        ]
        return local_args, local_kwargs, cache_values

    @staticmethod
    def _normalize_dim_map(dim_map):
        """Normalize dim_map to string representation."""

                f"Query ndim: {query_ndim}\n"
                f"Key ndim: {len(key_layout.alias_tensor_map)}"
            )
        if value_layout is not None and len(value_layout.alias_tensor_map) != query_ndim:
            raise ValueError(
                f"For {op_name}, Query, Key and Value must have the same rank.\n"
                f"Query ndim: {query_ndim}\n"
                f"Value ndim: {len(value_layout.alias_tensor_map)}"
            )

                if lb_split_id is not None:
                    if lb_split_num is None:
                        raise ValueError("lb_split_num must not be None when lb_split_id is set")
                    split_id = lb_split_id
                    seq_split_num = lb_split_num
                else:
                    split_id = self._get_split_id(query_layout, dims)
                local_q_len = query.shape[dims["seq"]]
                global_kv_len = key.shape[dims["seq"]]