Coverage for /home/jenkins/.local/lib/python3.10/site-packages/hyper_parallel/platform/torch/activation_checkpoint/activation

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

15# Adapted from

16# https://github.com/pytorch/pytorch/blob/release/2.6/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py

17# enhanced with activation swap functionality.

18# ============================================================================

19"""Activation Swap implementation for PyTorch."""

20# pylint: disable=W0212, W0613

22from abc import ABC, abstractmethod

23from collections.abc import Iterator

24from typing import Optional, Callable, Any, Union

25import types

26import warnings

27import torch

28from torch import nn

29from torch.distributed.utils import _replace_by_prefix

30from hyper_parallel.core.activation_checkpoint.activation_checkpoint import CheckpointPolicy

31from hyper_parallel.core.activation_checkpoint.swap import SwapManager, SwapTensor, Storage

# Name of the attribute under which a wrapper stores the module it wraps.
_SWAP_WRAPPED_MODULE = "_swap_wrapped_module"
# Key prefix that attribute introduces in parameter names and state_dict keys;
# it is stripped / re-added by the wrapper so checkpoints stay interchangeable.
_SWAP_PREFIX = _SWAP_WRAPPED_MODULE + "."

class FuncModule(nn.Module):
    """
    Parameter-free :class:`~torch.nn.Module` shim around an arbitrary callable.

    Lets a plain Python function (or any other callable that is not a
    Module) be handed to :func:`swap_wrapper` and
    :func:`~hyper_parallel.core.activation_checkpoint.checkpoint_wrapper`
    wherever an :class:`~torch.nn.Module` is expected.
    The callable is kept on the instance as ``_fn`` and is what
    :meth:`forward` executes; the module registers no parameters of its own.

    Args:
        fn (callable): The function to wrap.

    Example:
        >>> wrapped = swap_wrapper(lambda x: x * 2)
    """

    def __init__(self, fn: Callable):
        super().__init__()
        self._fn = fn

    def forward(self, *args, **kwargs):
        """Call the stored callable, passing every argument through unchanged."""
        target = self._fn
        return target(*args, **kwargs)

65def _is_callable_exempt_from_overlap_check(callable_obj: Callable) -> bool:

66 """Return True for callables that cannot be reliably overlap-tracked by object marks."""

67 return isinstance(callable_obj, (types.FunctionType, types.BuiltinFunctionType, types.MethodType))

def _iter_wrappable_callable_attrs(module: nn.Module) -> Iterator[tuple[str, Callable]]:
    """Iterate over ``(name, value)`` for public callable instance attributes of ``module``.

    Only entries of ``vars(module)`` are considered. An entry is skipped when:

    * its name starts with an underscore,
    * its value is itself an ``nn.Module``,
    * its value is a plain function, builtin or bound method. Those are
      typically stateless helpers shared by reference between many modules
      (e.g. ``self.act = F.gelu`` in every layer); tagging one with
      ``_is_wrapped`` would mutate a shared object and make every sibling
      that references it look like an overlapping wrap,
    * its value is not callable.

    What remains are per-instance callables, the only ones that take part in
    overlap tracking.
    """
    for name, value in vars(module).items():
        is_private = name.startswith("_")
        if is_private or isinstance(value, nn.Module):
            continue
        if not _is_callable_exempt_from_overlap_check(value) and callable(value):
            yield name, value

90def _mark_wrapped(obj: Any) -> None:

91 try:

92 obj._is_wrapped = True # pylint: disable=W0212

93 except (AttributeError, TypeError):

94 pass

def _get_wrapped_callable(module: nn.Module) -> Optional[Callable]:
    """Return the raw callable behind ``module`` when it is backed by a ``FuncModule``.

    The module stored under the wrapper attribute is inspected first, then
    ``module`` itself. ``None`` is returned when neither is a ``FuncModule``.
    """
    inner = getattr(module, _SWAP_WRAPPED_MODULE, None)
    for candidate in (inner, module):
        if isinstance(candidate, FuncModule):
            return getattr(candidate, "_fn", None)
    return None

104

105

106def _raise_callable_already_wrapped(callable_obj: Callable) -> None:

107 warnings.warn(

108 f"Callable '{callable_obj.__class__.__name__}' is already wrapped. "

109 "Wrapping overlapping module regions is not allowed."

110 )

111

112

def _check_callable_attr_not_wrapped(owner: nn.Module, attr_name: str, attr_value: Callable) -> None:
    """Report ``attr_value`` if it already carries a truthy ``_is_wrapped`` mark.

    ``owner`` and ``attr_name`` are accepted but not used.
    """
    del owner, attr_name
    already_marked = getattr(attr_value, '_is_wrapped', False)
    if not already_marked:
        return
    _raise_callable_already_wrapped(attr_value)

117

118

def _check_and_mark_callable(callable_obj: Callable) -> None:
    """Warn if ``callable_obj`` is already marked as wrapped, then mark it.

    Plain functions, builtins and bound methods are exempt: they are neither
    checked nor marked.
    """
    if not _is_callable_exempt_from_overlap_check(callable_obj):
        if getattr(callable_obj, '_is_wrapped', False):
            type_name = callable_obj.__class__.__name__
            warnings.warn(
                f"Callable '{type_name}' or one of its ancestors is already wrapped. "
                "Wrapping overlapping module regions is not allowed."
            )
        _mark_wrapped(callable_obj)

128

129

def _check_and_mark_wrapped(module: nn.Module) -> None:
    """Warn about overlapping wrap regions, then mark ``module`` and all descendants as wrapped.

    Overlap is only reported through ``warnings.warn``; nothing is raised and
    the marking step always runs afterwards. The function works in three passes
    over the module tree:

    1. warn for ``module`` itself and for every descendant that already carries
       a truthy ``_is_wrapped`` mark (descendants backed by an exempt callable
       are ignored),
    2. warn for every public per-instance callable attribute that is already
       marked,
    3. mark every module in the tree and each of those callable attributes.

    All checks finish before anything is marked, so marks set by this call
    cannot trigger its own warnings.

    Args:
        module: Root of the region that is about to be wrapped.
    """
    root_name = module.__class__.__name__
    if getattr(module, '_is_wrapped', False):
        warnings.warn(
            f"Module '{root_name}' or one of its ancestors is already wrapped. "
            "Wrapping overlapping module regions is not allowed."
        )

    # Take one snapshot of the tree; marking only sets a plain attribute and
    # does not alter the traversal, so all three passes can share it.
    members = list(module.modules())

    for submodule in members:
        if submodule is module:
            continue
        wrapped_callable = _get_wrapped_callable(submodule)
        if wrapped_callable is not None and _is_callable_exempt_from_overlap_check(wrapped_callable):
            continue
        if not getattr(submodule, '_is_wrapped', False):
            continue
        if wrapped_callable is not None:
            # The helper only warns, so stop here: falling through would emit a
            # second, redundant warning for the very same overlap.
            _raise_callable_already_wrapped(wrapped_callable)
            continue
        warnings.warn(
            f"Submodule '{getattr(submodule, '_swap_wrapped_module', submodule).__class__.__name__}' of "
            f"'{root_name}' is already wrapped. "
            "Wrapping overlapping module regions is not allowed."
        )

    for submodule in members:
        for attr_name, attr_value in _iter_wrappable_callable_attrs(submodule):
            _check_callable_attr_not_wrapped(submodule, attr_name, attr_value)

    for submodule in members:
        _mark_wrapped(submodule)
        for _, attr_value in _iter_wrappable_callable_attrs(submodule):
            _mark_wrapped(attr_value)

158

159

def base_check_fn(tensor) -> bool:
    """
    Decide whether ``tensor`` may be registered for offloading.

    Returns ``False`` when:
    - the tensor is a Parameter, or its ``_base`` is a Parameter;
    - the tensor's untyped storage has size zero.
    Returns ``True`` otherwise.
    """
    param_cls = torch.nn.parameter.Parameter
    if isinstance(tensor._base, param_cls) or isinstance(tensor, param_cls):  # pylint: disable=W0212
        return False
    return tensor.untyped_storage().size() != 0

171

172

class AsyncSaveOnCpu(torch.autograd.graph.saved_tensors_hooks):
    """
    Saved-tensor hook context that registers forward activations for swapping.

    While the context is active, every tensor autograd saves for backward goes
    through ``pack_to_cpu``. Eligible tensors are wrapped in a ``SwapTensor``
    and appended to this context's ``Storage``, which is attached to the
    current swap group of ``SwapManager`` the first time a tensor is
    registered. The hooks always hand the original tensor back to autograd.
    NOTE(review): the actual device/host transfer is presumably driven by
    ``SwapManager`` / ``SwapTensor`` -- confirm there.

    Args:
        policy_fn: Optional callable mapping a tensor to a ``CheckpointPolicy``.
            ``MUST_SAVE`` leaves the tensor alone, ``MUST_SWAP`` registers it,
            and any other value raises ``RuntimeError``. When ``None``, every
            eligible tensor is registered.
        group_swap: Forwarded unchanged to each ``SwapTensor``.
    """

    def __init__(self, policy_fn=None, group_swap: bool = False) -> None:
        self.add_to_storage = False  # whether self.storage was handed to the manager yet
        self.storage = Storage()
        self.count_idx = 0  # next slot index inside self.storage
        self.policy_fn = policy_fn

        # Cache per-context-manager state once to avoid per-tensor singleton lookups.
        swap_manager = SwapManager()

        def pack_to_cpu(tensor: torch.Tensor):
            if not base_check_fn(tensor):
                return tensor
            if policy_fn is not None:
                # Query the policy exactly once per tensor: the callback may be
                # costly or stateful, and a single value keeps both comparisons
                # and the error message consistent with each other.
                policy = policy_fn(tensor)
                if policy == CheckpointPolicy.MUST_SAVE:
                    return tensor
                if policy != CheckpointPolicy.MUST_SWAP:
                    raise RuntimeError(f"Swap :set an invalid policy {policy}")
            group_name = swap_manager.get_current_group_name()
            if not group_name:
                # No active swap group: nothing to register against.
                return tensor
            if not self.add_to_storage:
                swap_manager.add_storage(group_name, self.storage)
                self.add_to_storage = True
            funcname = f"{group_name}::{tensor.shape}"
            self.storage[self.count_idx].append(
                SwapTensor(tensor, funcname, group_swap=group_swap)
            )
            self.count_idx += 1
            return tensor

        def unpack_from_cpu(tensor) -> torch.Tensor:
            # The first unpack clears and drops this context's storage; later
            # unpacks only return the tensor.
            if self.storage is not None:
                self.storage.clear()
                self.storage = None
            return tensor

        super().__init__(pack_to_cpu, unpack_from_cpu)

214

215

class ActivationWrapper(torch.nn.Module, ABC):
    """
    Base class for Activation Swap.

    Holds the wrapped module under ``_swap_wrapped_module`` and hides that
    extra nesting level from parameter names, module traversal and
    ``state_dict`` keys. Subclasses supply :meth:`forward`.

    Not meant to be instantiated directly.

    Args:
        module (Union[nn.Module, Callable]): Module to wrap. A callable that
            is not an ``nn.Module`` is first wrapped in a ``FuncModule``.
    """

    def __init__(self, module: Union[nn.Module, Callable]):
        # Overlap checks and marking run before nn.Module.__init__; they only
        # touch the wrapped object, not this wrapper.
        if callable(module) and not isinstance(module, nn.Module):
            _check_and_mark_callable(module)
            module = FuncModule(module)
            _mark_wrapped(module)
        else:
            _check_and_mark_wrapped(module)
        super().__init__()
        self._swap_wrapped_module = module
        self._is_wrapped = True
        # state_dict post hook to remove prefix to allow loading into a
        # non-swap wrapped module.
        self._register_state_dict_hook(self._post_state_dict_hook)
        # load_state_dict pre-hook to allow loading back into
        # swap-wrapped module.
        self.register_load_state_dict_pre_hook(self._pre_load_state_dict_hook)

    @property
    def _wrapped_module(self):
        """Return the module stored under ``_swap_wrapped_module``."""
        return self._swap_wrapped_module

    @abstractmethod
    def forward(self, *args, **kwargs):
        """Run the wrapped module's forward pass with activation swapping. Must be implemented by subclasses."""
        raise ValueError("Subclasses should implement forward().")

    def __getattr__(self, name: str) -> Any:
        """Forward missing attributes to wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            # Not a parameter/buffer/submodule of the wrapper: look it up on the wrapped module.
            return getattr(self._swap_wrapped_module, name)

    def __getitem__(self, key: int) -> Any:
        """Forward indexing calls in case the module is a nn.Sequential."""
        return self._swap_wrapped_module.__getitem__(key)  # type: ignore[operator]

    def named_modules(
        self,
        memo: Optional[set[nn.Module]] = None,
        prefix: str = "",
        remove_duplicate: bool = True,
    ) -> Iterator[tuple[str, nn.Module]]:
        """
        Yield wrapped-module children without exposing the internal wrapper prefix.

        PyTorch parent modules implement ``named_parameters(recurse=True)`` by
        iterating ``named_modules()`` and reading each module's direct
        ``_parameters``. They do not call child modules' ``named_parameters()``
        overrides. Exposing the wrapped module under the wrapper's own prefix
        keeps root-module traversals aligned with ``state_dict()`` keys.

        The wrapper is yielded first (unless already in ``memo``), then the
        wrapped module's own traversal is yielded with the same ``prefix``, so
        the wrapper and the wrapped module share one name.

        Args:
            memo (Optional[set[nn.Module]], optional): A memo set to avoid infinite recursion. Default: ``None``.
            prefix (str, optional): A prefix to prepend to all module names. Default: ``""``.
            remove_duplicate (bool, optional): Whether to remove duplicate modules. Default: ``True``.

        Returns:
            Iterator[tuple[str, nn.Module]]: An iterator of (name, module) pairs.
        """
        if memo is None:
            memo = set()
        if self not in memo:
            memo.add(self)
            yield prefix, self
        yield from self._swap_wrapped_module.named_modules(
            memo=memo,
            prefix=prefix,
            remove_duplicate=remove_duplicate,
        )

    def named_parameters(
        self,
        *args,
        **kwargs,
    ) -> Iterator[tuple[str, torch.nn.Parameter]]:
        """
        Override :meth:`named_parameters()` to intercept parameter names.

        Every occurrence of ``_SWAP_PREFIX`` is removed from each yielded
        name; the parameters themselves are passed through unchanged.
        """
        for param_name, param in super().named_parameters(*args, **kwargs):
            yield param_name.replace(_SWAP_PREFIX, ""), param

    @staticmethod
    def _post_state_dict_hook(
        module: nn.Module,  # pylint: disable=W0613
        state_dict: dict[str, Any],
        prefix: str,
        *args: Any,  # pylint: disable=W0613
    ) -> dict[str, Any]:
        """
        Hook registered via ``_register_state_dict_hook`` to rewrite this wrapper's ``state_dict()`` keys.

        For ``swap_wrapper``, it strips the swap-wrapped module prefix so that
        the resulting state_dict can be loaded into non-swapped modules.
        It can still be loaded into swap-wrapped modules, because this class
        adds the prefix back before loading the state_dict.

        Args:
            module (nn.Module): Unused.
            state_dict (dict[str, Any]): State dict to rewrite in place.
            prefix (str): Key prefix of this wrapper inside ``state_dict``.
            *args (Any): Unused.

        Returns:
            dict[str, Any]: The same ``state_dict`` object.
        """
        _replace_by_prefix(state_dict, f"{prefix}{_SWAP_PREFIX}", prefix)
        return state_dict

    @staticmethod
    def _pre_load_state_dict_hook(
        module: nn.Module,
        state_dict: dict[str, Any],
        prefix: str,
        *args: Any,
    ) -> None:
        """
        Hook registered via ``register_load_state_dict_pre_hook`` to rewrite keys before loading.

        For ``swap_wrapper``, it adds back the swap-wrapped module prefix so
        that state_dicts of non-swapped modules can be loaded into
        ``swap_wrapper`` modules properly.

        Args:
            module (nn.Module): Unused.
            state_dict (dict[str, Any]): State dict to rewrite in place.
            prefix (str): Key prefix of this wrapper inside ``state_dict``.
            *args (Any): Unused.
        """
        _replace_by_prefix(state_dict, prefix, prefix + f"{_SWAP_PREFIX}")

340

341

class SwapWrapper(ActivationWrapper):
    """
    ``nn.Module`` wrapper that runs the wrapped module under an ``AsyncSaveOnCpu`` context.

    Args:
        mod (Union[nn.Module, Callable]): Module or callable to wrap.
        policy_fn (Optional[Callable]): Passed to ``AsyncSaveOnCpu`` on every forward call.
        group_swap (bool): Passed to ``AsyncSaveOnCpu`` on every forward call.
    """

    def __init__(
        self,
        mod: Union[nn.Module, Callable],
        policy_fn: Optional[Callable] = None,
        group_swap: bool = False,
    ):
        super().__init__(mod)
        self.policy_fn, self.group_swap = policy_fn, group_swap

    def forward(self, *args, **kwargs):
        """Call the wrapped module while a fresh AsyncSaveOnCpu context is active."""
        swap_ctx = AsyncSaveOnCpu(policy_fn=self.policy_fn, group_swap=self.group_swap)
        with swap_ctx:
            return self._swap_wrapped_module(*args, **kwargs)

360

361

def swap_wrapper(
    module: Union[nn.Module, Callable],
    policy_fn: Optional[Callable] = None,
    group_swap: bool = False,
) -> SwapWrapper:
    """Build a :class:`SwapWrapper` around ``module``.

    Args:
        module: Module or callable to wrap.
        policy_fn: Optional policy callable handed to the wrapper.
        group_swap: Flag handed to the wrapper.

    Returns:
        The new ``SwapWrapper`` instance.
    """
    return SwapWrapper(mod=module, policy_fn=policy_fn, group_swap=group_swap)

369

370

def swap_tensor_wrapper(target, tag: Optional[str] = None, group_swap: bool = False):
    """Register the tensors found in ``target`` with the current swap group.

    Meant to be called inside a forward path that already takes part in the
    swap scheduling managed by ``SwapManager``. The structure of ``target``
    is preserved and the original tensors are returned.

    Args:
        target: Arbitrary pytree; non-tensor leaves are passed through.
        tag: Optional label used in each registered tensor's name. Falls back
            to ``"<group>_swap_tensor"`` when empty.
        group_swap: Forwarded unchanged to each ``SwapTensor``.

    Returns:
        ``target`` itself when there is no current group (a warning is
        emitted) or the current group is the last one; otherwise the result
        of mapping the identity over ``target``.
    """
    manager = SwapManager()
    current_group = manager.get_current_group_name()
    if not current_group:
        warnings.warn(
            f"Tensor {tag} cannot be swapped, for its group is unregistered."
        )
        return target
    if manager.is_last_group(current_group):
        return target

    storage = Storage()
    registered = 0
    # Same label for every leaf, so compute it once up front.
    label = tag or f"{current_group}_swap_tensor"

    def _visit(leaf):
        nonlocal registered
        if not isinstance(leaf, torch.Tensor) or not base_check_fn(leaf):
            return leaf
        funcname = f"{label}::{tuple(leaf.shape)}"
        storage[registered].append(SwapTensor(leaf, funcname, group_swap=group_swap))
        registered += 1
        return leaf

    wrapped = torch.utils._pytree.tree_map(_visit, target)  # pylint: disable=protected-access
    # Attach the storage only if at least one tensor was actually registered.
    if registered > 0:
        manager.add_storage(current_group, storage)
    return wrapped

Coverage for / home / jenkins / .local / lib / python3.10 / site-packages / hyper_parallel / platform / torch / activation_checkpoint / activation_swap.py: 85%

183 statements