Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/trainer/utils/loss.py

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ============================================================================

15"""Token-weighted global loss normalization utilities.

17"""

18from typing import Any, Dict

20from hyper_parallel import get_platform

22platform = get_platform()

def count_loss_token(batch: Dict[str, Any], *, ignore_index: int = -100) -> int:
    """Count the label positions in a micro-batch that are not ignored.

    A position is counted when its label differs from ``ignore_index``.
    The default of -100 is the conventional ignore index used by
    cross-entropy loss; pass a different value when the loss is configured
    with another ignore index so that the count stays consistent with it.

    Args:
        batch: Mapping that may contain a ``"labels"`` entry. The labels
            object must support elementwise ``!=`` against an integer and
            expose ``.sum().item()`` on the result (as tensors and NumPy
            arrays do).
        ignore_index: Label value that marks a position as ignored.
            Keyword-only; defaults to -100.

    Returns:
        Number of positions where ``labels != ignore_index`` as a plain
        ``int``, or 0 when the batch has no ``"labels"`` entry (or the
        entry is ``None``).
    """
    labels = batch.get("labels")
    if labels is None:
        # Without labels there is nothing to count.
        return 0
    # ``.item()`` pulls the reduced scalar out of the array/tensor; ``int``
    # normalises the result to a builtin integer.
    return int((labels != ignore_index).sum().item())

def mean_global_loss(
    loss: Any,
    micro_batch_tokens: int,
    total_tokens: int,
    fsdp_size: int,
) -> Any:
    """Scale a micro-batch loss by its share of the global token count.

    The raw loss is weighted by the fraction of all tokens that belong to
    this micro-batch and then multiplied by ``fsdp_size``. The extra factor
    is intended to compensate for gradient averaging across data-parallel
    ranks, given that the token counts are per-rank rather than global.

    Formula::

        normalised_loss = raw_loss * (micro_tokens / global_tokens) * fsdp_size

    Args:
        loss: Raw loss value produced by the model; anything that supports
            multiplication by a float and an int.
        micro_batch_tokens: Non-padding token count of this micro-batch.
        total_tokens: Non-padding token count over every micro-batch and
            every data-parallel rank in the current global step.
        fsdp_size: Number of data-parallel (FSDP) ranks that take part in
            gradient reduction.

    Returns:
        The rescaled loss. When ``total_tokens`` is zero or negative the
        input ``loss`` is handed back untouched so no division by zero can
        occur.

    Raises:
        ValueError: If ``fsdp_size`` is zero or negative.
    """
    # Validate the rank count first so a bad configuration is reported even
    # when there are no tokens to normalise by.
    if fsdp_size <= 0:
        message = f"fsdp_size must be a positive integer, got {fsdp_size}"
        raise ValueError(message)

    # Nothing to normalise against: pass the loss through unchanged.
    if total_tokens <= 0:
        return loss

    token_share = micro_batch_tokens / total_tokens
    weighted_loss = loss * token_share
    return weighted_loss * fsdp_size

Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/trainer/utils/loss.py: 100%

14 statements