tiny-epstein-100m
A small transformer model (~100M parameters) trained on the teyler/epstein-files-20k dataset. The architecture is a modified Tiny Aya design, intended for efficient on-device inference.
Model Details
- Architecture: Decoder-only transformer with parallel blocks, Grouped Query Attention (GQA), SwiGLU activation, and bias‑free LayerNorm.
- Sliding Window Attention: 3:1 local:global ratio (first 75% of layers use sliding window with RoPE; remaining layers use full attention with NoPE).
- Parameters: ~100 million
- Context Length: 1024 tokens (configurable)
- Tokenizer: GPT‑2 (same as used during training)
- Training Data: teyler/epstein-files-20k – 20,000 documents related to the Epstein files.
Intended Use
This model is primarily for research and experimentation. It can generate continuations of text given a prompt, especially on topics related to the Epstein files.
How to Use
Installation
Make sure you have torch and transformers installed.
If you want to run inference, install the required packages:
pip install torch transformers huggingface_hub
Loading the Model and Tokenizer
import torch
from transformers import GPT2TokenizerFast
from huggingface_hub import snapshot_download

# Download the model from the Hugging Face Hub
model_path = snapshot_download(repo_id="liminerity/tiny-epstein-100m")

# Load the tokenizer (GPT-2 vocabulary, the same tokenizer used during training).
# Note: GPT2TokenizerFast is the class imported above; the checkpoint directory
# may not ship an AutoTokenizer config, so load the concrete class directly.
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
# ------------------------------------------------------------------------------
# Configuration (~100M-parameter model, sized to train on a single L4 GPU)
# ------------------------------------------------------------------------------
class ModelConfig:
    """Static hyperparameters for the tiny-epstein transformer."""
    vocab_size = 50257             # GPT-2 vocabulary; overwritten from the tokenizer at load time
    emb_dim = 768                  # model / embedding width
    hidden_dim = 2048              # FFN intermediate width (SwiGLU)
    num_layers = 12                # transformer depth
    num_heads = 12                 # query heads
    num_kv_heads = 4               # key/value heads (GQA: 3 query heads per KV head)
    max_seq_len = 1024             # maximum context length
    window_size = 1024             # sliding-window span (matches max_seq_len)
    sliding_window_ratio = 0.75    # fraction of early layers using sliding-window attention
    rope_theta = 10000.0           # RoPE frequency base
    dtype = torch.float16          # mixed-precision compute dtype
    bias = False                   # linear layers carry no bias terms
    dropout = 0.0                  # dropout disabled
    gradient_checkpointing = True  # trade compute for activation memory
# ------------------------------------------------------------------------------
# Helper modules
# ------------------------------------------------------------------------------
class CohereLayerNorm(nn.Module):
    """Bias-free LayerNorm (learned scale only), computed in float32 for stability."""

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):
        orig_dtype = x.dtype
        # Normalize in float32 so float16 inputs do not lose precision.
        x32 = x.to(torch.float32)
        centered = x32 - x32.mean(dim=-1, keepdim=True)
        var = centered.pow(2).mean(dim=-1, keepdim=True)
        normed = centered * torch.rsqrt(var + self.eps)
        return (self.weight.to(torch.float32) * normed).to(orig_dtype)
class FeedForward(nn.Module):
    """SwiGLU feed-forward network: fc3(silu(fc1(x)) * fc2(x))."""

    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)  # gate branch
        self.fc2 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)  # value branch
        self.fc3 = nn.Linear(config.hidden_dim, config.emb_dim, bias=config.bias)  # down-projection

    def forward(self, x):
        gate = F.silu(self.fc1(x))
        return self.fc3(gate * self.fc2(x))
def precompute_rope_freqs(dim, max_seq_len, theta=10000.0, dtype=torch.float32):
    """Return (sin, cos) rotary-embedding tables, each of shape (max_seq_len, dim)."""
    assert dim % 2 == 0, "dim must be even"
    exponents = torch.arange(0, dim, 2, dtype=dtype)[: dim // 2]
    inv_freq = 1.0 / (theta ** (exponents / dim))
    positions = torch.arange(max_seq_len, dtype=dtype)
    angles = torch.outer(positions, inv_freq)       # (max_seq_len, dim//2)
    angles = torch.cat((angles, angles), dim=-1)    # duplicate to cover the full head dim
    return angles.sin(), angles.cos()
def rotate_half(x):
    """Map the two halves (a, b) of the last dimension to (-b, a)."""
    first, second = x.chunk(2, dim=-1)
    return torch.cat((-second, first), dim=-1)
def apply_rotary_emb(x, cos, sin):
    """Apply rotary position embeddings to *x*.

    x:        (batch, seq_len, num_heads, head_dim)
    cos, sin: (seq_len, head_dim) tables from precompute_rope_freqs
    """
    # Broadcast the tables across the batch and head axes.
    cos_b = cos[None, :, None, :]
    sin_b = sin[None, :, None, :]
    # Inlined rotate-half: (a, b) -> (-b, a) along the last dimension.
    a, b = x.chunk(2, dim=-1)
    rotated = torch.cat((-b, a), dim=-1)
    return x * cos_b + rotated * sin_b
class GroupedQueryAttention(nn.Module):
    """Multi-head attention with grouped KV heads (GQA) and an optional
    causal sliding-window mask.

    Layers whose ``layer_id`` falls in the first ``sliding_window_ratio``
    fraction of the stack use sliding-window attention with RoPE; the
    remaining layers use full causal attention with no positional
    encoding (NoPE).
    """

    def __init__(self, config, layer_id):
        super().__init__()
        self.num_heads = config.num_heads
        self.num_kv_heads = config.num_kv_heads
        self.head_dim = config.emb_dim // config.num_heads
        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
        self.wq = nn.Linear(config.emb_dim, config.num_heads * self.head_dim, bias=config.bias)
        self.wk = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wv = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wo = nn.Linear(config.num_heads * self.head_dim, config.emb_dim, bias=config.bias)
        # First `sliding_window_ratio` fraction of layers use the sliding window.
        num_sliding = int(config.num_layers * config.sliding_window_ratio)
        self.use_sliding = (layer_id < num_sliding)
        self.window_size = config.window_size
        self.max_seq_len = config.max_seq_len
        self.rope_theta = config.rope_theta
        self.rope_sin, self.rope_cos = None, None

    def init_rope(self, max_seq_len, device):
        """Lazily (re)build the RoPE tables when the required length grows."""
        if self.rope_sin is not None and self.rope_sin.shape[0] >= max_seq_len:
            return
        sin, cos = precompute_rope_freqs(
            self.head_dim, max_seq_len, theta=self.rope_theta, dtype=torch.float32
        )
        self.rope_sin = sin.to(device)
        self.rope_cos = cos.to(device)

    def forward(self, x, mask=None):
        """Attend over *x* of shape (batch, seq_len, emb_dim); returns the same shape.

        If *mask* is None, an additive causal mask (optionally with a sliding
        window) is constructed on the fly.
        """
        batch, seq_len, _ = x.shape
        device = x.device
        if self.use_sliding:
            self.init_rope(seq_len, device)
        xq = self.wq(x).view(batch, seq_len, self.num_heads, self.head_dim)
        xk = self.wk(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
        xv = self.wv(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
        if self.use_sliding:
            # RoPE only on sliding-window layers; full-attention layers are NoPE.
            xq = apply_rotary_emb(xq, self.rope_cos[:seq_len], self.rope_sin[:seq_len])
            xk = apply_rotary_emb(xk, self.rope_cos[:seq_len], self.rope_sin[:seq_len])
        # Expand KV heads so each group of query heads shares one KV head.
        xk = xk.repeat_interleave(self.num_queries_per_kv, dim=2)
        xv = xv.repeat_interleave(self.num_queries_per_kv, dim=2)
        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)
        scores = torch.matmul(xq, xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is None:
            # Vectorized mask construction (the original built the sliding band
            # with a Python loop over every row on every forward pass).
            neg_inf = torch.full((seq_len, seq_len), float('-inf'), device=device)
            # Causal part: disallow attending to future positions (j > i).
            mask = torch.triu(neg_inf, diagonal=1)
            if self.use_sliding:
                # Sliding-window part: also disallow positions more than
                # window_size - 1 steps back, i.e. columns j <= i - window_size.
                mask = mask + torch.tril(neg_inf, diagonal=-self.window_size)
        scores = scores + mask
        # Softmax in float32 for numerical stability, then cast back.
        probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(xq.dtype)
        out = torch.matmul(probs, xv)
        out = out.transpose(1, 2).contiguous().view(batch, seq_len, -1)
        return self.wo(out)
class ParallelTransformerBlock(nn.Module):
    """Decoder block in the parallel layout: attention and MLP both read the
    same normalized input, and their outputs are summed with the residual."""

    def __init__(self, config, layer_id):
        super().__init__()
        self.norm = CohereLayerNorm(config.emb_dim)
        self.attn = GroupedQueryAttention(config, layer_id)
        self.mlp = FeedForward(config)

    def forward(self, x, mask=None):
        # One shared norm feeds both branches; the residual is added once.
        normed = self.norm(x)
        return x + self.attn(normed, mask=mask) + self.mlp(normed)
class TinyAya(nn.Module):
    """Tiny Aya-style decoder-only language model (~100M parameters).

    The input embedding and LM head share weights (weight tying), and the
    final projection has no bias.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config.vocab_size, config.emb_dim)
        self.layers = nn.ModuleList([
            ParallelTransformerBlock(config, i) for i in range(config.num_layers)
        ])
        self.norm = CohereLayerNorm(config.emb_dim)
        self.lm_head = nn.Linear(config.emb_dim, config.vocab_size, bias=False)
        # Weight tying: the output projection reuses the embedding matrix.
        self.lm_head.weight = self.token_embedding.weight
        if config.gradient_checkpointing:
            self.gradient_checkpointing_enable()

    def gradient_checkpointing_enable(self):
        """Recompute layer activations during backward to save memory."""
        self._gradient_checkpointing = True

    def forward(self, input_ids, mask=None):
        """Return logits of shape (batch, seq_len, vocab_size)."""
        x = self.token_embedding(input_ids)
        for layer in self.layers:
            # Checkpointing only applies in training mode.
            if self.training and getattr(self, '_gradient_checkpointing', False):
                # use_reentrant=False is the recommended, more robust variant.
                x = torch.utils.checkpoint.checkpoint(layer, x, mask, use_reentrant=False)
            else:
                x = layer(x, mask=mask)
        x = self.norm(x)
        logits = self.lm_head(x)
        return logits

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=50, temperature=1.0, do_sample=True):
        """Autoregressively extend *input_ids* by up to *max_new_tokens* tokens.

        do_sample=True (default, the original behavior) samples from the
        temperature-scaled distribution; do_sample=False takes the argmax.
        Accepting ``do_sample`` also makes the usage example in this card work.
        """
        self.eval()
        for _ in range(max_new_tokens):
            # Crop the context to the model's maximum sequence length.
            logits = self(input_ids[:, -self.config.max_seq_len:])
            next_token_logits = logits[:, -1, :] / temperature
            if do_sample:
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids
import os

# Load the tokenizer from the downloaded checkpoint directory.
# (The original referenced `repo_id`, which is undefined here — `model_path`
# is the local directory returned by snapshot_download.)
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

model = TinyAya(ModelConfig())
# weights_only=True avoids unpickling arbitrary objects from the checkpoint.
state_dict = torch.load(
    os.path.join(model_path, "pytorch_model.bin"),
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(state_dict)
model.eval()
Text Generation Example
# Generate a sampled continuation of a prompt.
prompt = "The Epstein files reveal"
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():  # generate() already disables grad; this is just explicit
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=50,
        temperature=0.8,  # sampling temperature; this generate() samples by default
        # NOTE: `do_sample=True` was removed — the custom generate() defined
        # above samples by default and the original call raised a TypeError.
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
Training Details
The model was trained for one epoch on the full dataset using an L4 GPU in Google Colab. Optimizer: AdamW (lr=1e-4) with gradient clipping (max norm=1.0). Mixed precision (float16) was used.
Limitations
- The model is small and was trained on a limited dataset; it may produce repetitive or nonsensical outputs.
- It has not undergone any safety fine-tuning; use with caution.
License
MIT
- Downloads last month
- 144