"""tilelli.utils.runtime — small training-loop niceties. Two helpers, both designed to be no-ops on hardware that doesn't need them: - ``ThermalGuard``: poll Linux thermal zones and sleep when the chip gets too hot. Lets you train overnight on a fanless laptop without cooking the silicon. On a host with no readable thermal zones this becomes a free no-op. - ``polite_training``: yield the CPU to the rest of the system once per training step. Keeps the machine usable while a long training run is in flight. On CUDA hosts it does almost nothing (a malloc trim and a gc); on CPU hosts it adds a sub-millisecond sleep. Neither helper is required to run Tilelli — they exist so that small hosts (cheap laptop, single-board computer, $200 mini-PC) can run a training session without becoming unusable for everything else. """ from __future__ import annotations import gc import time from dataclasses import dataclass from pathlib import Path import torch def _read_thermal_max_celsius() -> float | None: """Return the hottest readable thermal zone in °C, or None if no /sys/class/thermal/* zones are present (most non-Linux hosts).""" try: zones = sorted(Path("/sys/class/thermal").glob("thermal_zone*/temp")) except OSError: return None if not zones: return None temps: list[float] = [] for z in zones: try: temps.append(int(z.read_text().strip()) / 1000.0) except (OSError, ValueError): continue return max(temps) if temps else None @dataclass class ThermalGuard: """Polls the hottest thermal zone and sleeps when it crosses a cap. Usage in a training loop:: guard = ThermalGuard(high_c=80.0, resume_c=72.0) for step in range(steps): guard.maybe_throttle(step) train_step(...) Parameters ---------- high_c : float Start throttling at or above this temperature. resume_c : float Stop throttling only once temperature falls back below this. Must be lower than ``high_c`` to avoid threshold sawtooth. cool_down_s : float How long to sleep per throttle cycle before re-reading. check_every : int Poll every N training steps (avoid reading /sys every step; thermal changes are slow relative to a training step). """ high_c: float = 85.0 resume_c: float = 75.0 cool_down_s: float = 2.0 check_every: int = 20 _throttling: bool = False _total_throttle_s: float = 0.0 _throttle_events: int = 0 _last_temp_c: float | None = None _available: bool | None = None def __post_init__(self) -> None: if self.resume_c >= self.high_c: raise ValueError( f"resume_c ({self.resume_c}) must be < high_c ({self.high_c})" ) @property def available(self) -> bool: if self._available is None: self._available = _read_thermal_max_celsius() is not None return self._available def maybe_throttle(self, step: int) -> None: if not self.available: return if step % self.check_every != 0 and not self._throttling: return t = _read_thermal_max_celsius() if t is None: return self._last_temp_c = t if not self._throttling and t >= self.high_c: self._throttling = True self._throttle_events += 1 while self._throttling: time.sleep(self.cool_down_s) self._total_throttle_s += self.cool_down_s t2 = _read_thermal_max_celsius() if t2 is None or t2 < self.resume_c: self._throttling = False self._last_temp_c = t2 break def polite_training() -> None: """Yield the CPU briefly and trim allocators. Cheap nicety so a long CPU run doesn't make the machine unusable for everything else.""" gc.collect() if not torch.cuda.is_available(): try: import ctypes libc = ctypes.CDLL("libc.so.6") libc.malloc_trim(0) except Exception: pass time.sleep(0.001)