Tilelli-llm / src /tilelli /utils /runtime.py
TilelliLab's picture
Mirror small files (code, paper, results)
f86dc09 verified
Raw
History Blame Contribute Delete
4.21 kB
"""tilelli.utils.runtime — small training-loop niceties.
Two helpers, both designed to be no-ops on hardware that doesn't need them:
- ``ThermalGuard``: poll Linux thermal zones and sleep when the chip
gets too hot. Lets you train overnight on a fanless laptop without
cooking the silicon. On a host with no readable thermal zones this
becomes a free no-op.
- ``polite_training``: yield the CPU to the rest of the system once
per training step. Keeps the machine usable while a long training
run is in flight. On CUDA hosts it does almost nothing beyond a gc
pass; on CPU hosts it also trims the malloc arena and adds a brief
(about one millisecond) sleep.
Neither helper is required to run Tilelli — they exist so that small
hosts (cheap laptop, single-board computer, $200 mini-PC) can run a
training session without becoming unusable for everything else.
"""
from __future__ import annotations
import gc
import time
from dataclasses import dataclass
from pathlib import Path
import torch
def _read_thermal_max_celsius(
    root: str | Path = "/sys/class/thermal",
) -> float | None:
    """Return the hottest readable thermal zone in °C.

    Every ``thermal_zone*/temp`` file under ``root`` is read as an integer
    and divided by 1000 (the files are treated as millidegrees Celsius).
    Zones that cannot be read or parsed are skipped.

    Parameters
    ----------
    root : str or Path
        Directory holding the ``thermal_zone*`` entries. Defaults to the
        Linux sysfs location; overriding it is mainly useful for tests.

    Returns
    -------
    float or None
        The maximum temperature across all readable zones, or ``None``
        when no zone is present or none yields a usable value (most
        non-Linux hosts).
    """
    try:
        # Materialize inside the try: directory enumeration is lazy, so an
        # OSError would otherwise surface later, in the loop below.
        temp_files = list(Path(root).glob("thermal_zone*/temp"))
    except OSError:
        return None

    readings: list[float] = []
    for temp_file in temp_files:
        try:
            readings.append(int(temp_file.read_text().strip()) / 1000.0)
        except (OSError, ValueError):
            # Unreadable sensor or non-numeric content: ignore this zone.
            continue
    return max(readings, default=None)
@dataclass
class ThermalGuard:
    """Poll the hottest thermal zone and sleep while it is above a cap.

    Usage in a training loop::

        guard = ThermalGuard(high_c=80.0, resume_c=72.0)
        for step in range(steps):
            guard.maybe_throttle(step)
            train_step(...)

    Parameters
    ----------
    high_c : float
        Start throttling at or above this temperature.
    resume_c : float
        Stop throttling only once temperature falls back below this.
        Must be lower than ``high_c`` to avoid threshold sawtooth.
    cool_down_s : float
        How long to sleep per throttle cycle before re-reading.
        Must not be negative.
    check_every : int
        Poll every N training steps (avoid reading /sys every step;
        thermal changes are slow relative to a training step).
        Must be at least 1.

    Raises
    ------
    ValueError
        If ``resume_c >= high_c``, ``check_every < 1`` or
        ``cool_down_s < 0``.
    """

    high_c: float = 85.0
    resume_c: float = 75.0
    cool_down_s: float = 2.0
    check_every: int = 20
    # Internal bookkeeping. These are ordinary dataclass fields, so the
    # generated __init__ accepts them too; callers normally leave them alone.
    _throttling: bool = False  # True while a cool-down loop is in progress
    _total_throttle_s: float = 0.0  # sum of requested sleep durations
    _throttle_events: int = 0  # number of times high_c was crossed
    _last_temp_c: float | None = None  # most recent reading, if any
    _available: bool | None = None  # cached probe result; None = not probed

    def __post_init__(self) -> None:
        """Validate the configuration so bad values fail at construction."""
        if self.resume_c >= self.high_c:
            raise ValueError(
                f"resume_c ({self.resume_c}) must be < high_c ({self.high_c})"
            )
        # check_every == 0 would raise ZeroDivisionError in maybe_throttle.
        if self.check_every < 1:
            raise ValueError(
                f"check_every ({self.check_every}) must be >= 1"
            )
        # time.sleep rejects negative durations; surface that up front
        # instead of in the middle of a training run.
        if self.cool_down_s < 0:
            raise ValueError(
                f"cool_down_s ({self.cool_down_s}) must be >= 0"
            )

    @property
    def available(self) -> bool:
        """Whether a thermal reading could be obtained on this host.

        The probe runs once, on first access, and the result is cached.
        """
        if self._available is None:
            self._available = _read_thermal_max_celsius() is not None
        return self._available

    def maybe_throttle(self, step: int) -> None:
        """Sleep in ``cool_down_s`` slices while the host is too hot.

        Returns immediately when no thermal zone is readable, or when
        ``step`` is not a multiple of ``check_every`` and no throttle is
        pending. Otherwise blocks until the temperature drops below
        ``resume_c`` or stops being readable.

        Parameters
        ----------
        step : int
            Current training step, used only to decide whether to poll.
        """
        if not self.available:
            return
        if step % self.check_every != 0 and not self._throttling:
            return

        temp = _read_thermal_max_celsius()
        if temp is None:
            return
        self._last_temp_c = temp

        if not self._throttling and temp >= self.high_c:
            self._throttling = True
            self._throttle_events += 1

        while self._throttling:
            time.sleep(self.cool_down_s)
            self._total_throttle_s += self.cool_down_s
            temp = _read_thermal_max_celsius()
            # Record every reading so the last-seen value stays current
            # even if the loop is interrupted part-way through.
            self._last_temp_c = temp
            # A vanished sensor ends the throttle rather than hanging.
            if temp is None or temp < self.resume_c:
                self._throttling = False
_TRIM_UNRESOLVED = object()
# Cached glibc ``malloc_trim`` callable; resolved lazily on first use.
# Holds None when the symbol is unavailable on this host.
_malloc_trim = _TRIM_UNRESOLVED


def _resolve_malloc_trim():
    """Return glibc's ``malloc_trim`` callable, or None if unavailable.

    The lookup runs once and is cached, so the shared library is not
    re-opened on every training step.
    """
    global _malloc_trim
    if _malloc_trim is _TRIM_UNRESOLVED:
        try:
            import ctypes

            trim = ctypes.CDLL("libc.so.6").malloc_trim
            # Declare the C prototype so the pad argument is passed as a
            # size_t instead of ctypes' default C int.
            trim.argtypes = [ctypes.c_size_t]
            trim.restype = ctypes.c_int
        except (ImportError, OSError, AttributeError):
            # No ctypes, no libc.so.6, or a libc without malloc_trim.
            trim = None
        _malloc_trim = trim
    return _malloc_trim


def polite_training() -> None:
    """Yield the CPU briefly and trim allocators.

    Cheap nicety so a long CPU run doesn't make the machine unusable for
    everything else. Always runs a garbage-collection pass. When CUDA is
    not available it additionally calls ``malloc_trim(0)`` (if the host's
    libc provides it) and sleeps for one millisecond.
    """
    gc.collect()
    if torch.cuda.is_available():
        return
    trim = _resolve_malloc_trim()
    if trim is not None:
        trim(0)
    time.sleep(0.001)