TilelliLab
/

Tilelli-llm

Text Generation

small-language-model

mixture-of-experts

negative-results

reproducibility

Model card Files Files and versions

Tilelli-llm / src /tilelli /utils /runtime.py

TilelliLab's picture

Mirror small files (code, paper, results)

f86dc09 verified 3 days ago

History Blame Contribute Delete

4.21 kB

	"""tilelli.utils.runtime — small training-loop niceties.

	Two helpers, both designed to be no-ops on hardware that doesn't need them:

	- ``ThermalGuard``: poll Linux thermal zones and sleep when the chip
	gets too hot. Lets you train overnight on a fanless laptop without
	cooking the silicon. On a host with no readable thermal zones this
	becomes a free no-op.

	- ``polite_training``: yield the CPU to the rest of the system once
	per training step. Keeps the machine usable while a long training
	run is in flight. On CUDA hosts it does almost nothing (just a gc);
	on CPU hosts it also attempts a malloc trim and adds a ~1 ms sleep.

	Neither helper is required to run Tilelli — they exist so that small
	hosts (cheap laptop, single-board computer, $200 mini-PC) can run a
	training session without becoming unusable for everything else.
	"""
	from __future__ import annotations

	import gc
	import time
	from dataclasses import dataclass
	from pathlib import Path

	import torch


	def _read_thermal_max_celsius() -> float | None:
	    """Report the highest temperature across the host's thermal zones.

	    Every ``/sys/class/thermal/thermal_zone*/temp`` file is parsed as an
	    integer and divided by 1000 to give °C. Zones that cannot be read or
	    parsed are skipped rather than treated as errors.

	    Returns
	    -------
	    float | None
	        The largest reading in °C, or ``None`` when no zone file exists
	        (most non-Linux hosts) or none of them yields a usable value.
	    """
	    try:
	        zone_files = sorted(Path("/sys/class/thermal").glob("thermal_zone*/temp"))
	    except OSError:
	        return None

	    hottest: float | None = None
	    for zone_file in zone_files:
	        try:
	            celsius = int(zone_file.read_text().strip()) / 1000.0
	        except (OSError, ValueError):
	            # Unreadable or non-numeric zone: ignore it and keep scanning.
	            continue
	        if hottest is None or celsius > hottest:
	            hottest = celsius
	    return hottest


	@dataclass
	class ThermalGuard:
	    """Polls the hottest thermal zone and sleeps when it crosses a cap.

	    Usage in a training loop::

	        guard = ThermalGuard(high_c=80.0, resume_c=72.0)
	        for step in range(steps):
	            guard.maybe_throttle(step)
	            train_step(...)

	    Parameters
	    ----------
	    high_c : float
	        Start throttling at or above this temperature.
	    resume_c : float
	        Stop throttling only once temperature falls back below this.
	        Must be lower than ``high_c`` to avoid threshold sawtooth.
	    cool_down_s : float
	        How long to sleep per throttle cycle before re-reading.
	    check_every : int
	        Poll every N training steps (avoid reading /sys every step;
	        thermal changes are slow relative to a training step).
	        Must be at least 1.

	    Raises
	    ------
	    ValueError
	        If ``resume_c >= high_c`` or ``check_every < 1``.
	    """

	    high_c: float = 85.0
	    resume_c: float = 75.0
	    cool_down_s: float = 2.0
	    check_every: int = 20
	    # Internal bookkeeping; callers normally leave these at their defaults.
	    _throttling: bool = False  # True while a cool-down is in progress
	    _total_throttle_s: float = 0.0  # sum of cool_down_s over all sleeps
	    _throttle_events: int = 0  # number of times high_c was crossed
	    _last_temp_c: float | None = None  # most recently recorded reading
	    _available: bool | None = None  # cached probe result; None = not probed

	    def __post_init__(self) -> None:
	        """Validate the thresholds and the polling interval."""
	        if self.resume_c >= self.high_c:
	            raise ValueError(
	                f"resume_c ({self.resume_c}) must be < high_c ({self.high_c})"
	            )
	        # A zero interval would otherwise surface later as a
	        # ZeroDivisionError inside maybe_throttle, and only on hosts that
	        # actually expose thermal zones. Fail early and uniformly instead.
	        if self.check_every < 1:
	            raise ValueError(
	                f"check_every ({self.check_every}) must be >= 1"
	            )

	    @property
	    def available(self) -> bool:
	        """Whether a thermal reading could be obtained (probed once, then cached)."""
	        if self._available is None:
	            self._available = _read_thermal_max_celsius() is not None
	        return self._available

	    def maybe_throttle(self, step: int) -> None:
	        """Sleep in ``cool_down_s`` slices while the host is too hot.

	        Returns immediately when no thermal zone is readable, or when
	        ``step`` is not a multiple of ``check_every`` and no cool-down is
	        pending. Otherwise the temperature is read; at or above ``high_c``
	        the call blocks, re-reading after every sleep, until the reading
	        drops below ``resume_c`` or becomes unavailable.

	        Parameters
	        ----------
	        step : int
	            Current training step, used only to decide whether to poll.
	        """
	        if not self.available:
	            return
	        if step % self.check_every != 0 and not self._throttling:
	            return

	        current = _read_thermal_max_celsius()
	        if current is None:
	            return
	        self._last_temp_c = current

	        if not self._throttling and current >= self.high_c:
	            self._throttling = True
	            self._throttle_events += 1

	        while self._throttling:
	            time.sleep(self.cool_down_s)
	            self._total_throttle_s += self.cool_down_s
	            reading = _read_thermal_max_celsius()
	            # A vanished sensor is treated like "cool enough" so the
	            # training loop can never be held hostage by a failed read.
	            if reading is None or reading < self.resume_c:
	                self._throttling = False
	                self._last_temp_c = reading
	                break


	# Sentinel meaning "the malloc_trim lookup has not been attempted yet".
	_MALLOC_TRIM_UNRESOLVED = object()
	# Cached lookup result: the sentinel, a callable, or None when unavailable.
	_malloc_trim_fn = _MALLOC_TRIM_UNRESOLVED


	def _resolve_malloc_trim():
	    """Return ``malloc_trim`` from ``libc.so.6``, or None if it cannot be loaded."""
	    try:
	        import ctypes

	        return ctypes.CDLL("libc.so.6").malloc_trim
	    except (ImportError, OSError, AttributeError):
	        # ctypes missing, library not found, or symbol not exported:
	        # trimming is best-effort, so report it as unavailable.
	        return None


	def polite_training() -> None:
	    """Yield the CPU briefly and trim allocators.

	    Cheap nicety so a long CPU run doesn't make the machine unusable for
	    everything else. Intended to be called once per training step.

	    A garbage collection always runs. When CUDA is not available the
	    function additionally calls ``malloc_trim(0)`` (if it could be
	    resolved) and sleeps for 1 ms. The ``malloc_trim`` lookup is done on
	    the first CPU-side call only and its outcome is cached, so later steps
	    do not re-open the C library or repeat a failing lookup.
	    """
	    global _malloc_trim_fn

	    gc.collect()
	    if torch.cuda.is_available():
	        return

	    if _malloc_trim_fn is _MALLOC_TRIM_UNRESOLVED:
	        _malloc_trim_fn = _resolve_malloc_trim()
	    if _malloc_trim_fn is not None:
	        _malloc_trim_fn(0)
	    time.sleep(0.001)