tiny-epstein-100m

A small transformer model (~100M parameters) trained on the teyler/epstein-files-20k dataset. The architecture is inspired by Tiny Aya and is designed for efficient on-device inference.

Model Details

  • Architecture: Decoder-only transformer with parallel blocks, Grouped Query Attention (GQA), SwiGLU activation, and bias‑free LayerNorm.
  • Sliding Window Attention: 3:1 local:global ratio (first 75% of layers use sliding window with RoPE; remaining layers use full attention with NoPE).
  • Parameters: ~100 million
  • Context Length: 1024 tokens (configurable)
  • Tokenizer: GPT‑2 (same as used during training)
  • Training Data: teyler/epstein-files-20k – 20,000 documents related to the Epstein files.

Intended Use

This model is primarily for research and experimentation. It can generate continuations of text given a prompt, especially on topics related to the Epstein files.

How to Use

Installation

Make sure you have torch and transformers installed. If you want to run inference, install the required packages:

pip install torch transformers

Loading the Model and Tokenizer

import torch
from transformers import GPT2TokenizerFast
from huggingface_hub import snapshot_download

# Download the model weights and tokenizer files from the Hugging Face Hub.
model_path = snapshot_download(repo_id="liminerity/tiny-epstein-100m")

# Load tokenizer.
# Fix: the original called AutoTokenizer.from_pretrained(...), but only
# GPT2TokenizerFast was imported above, so that line raised NameError.
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)

# ------------------------------------------------------------------------------
# Configuration (~100M parameters, sized to train on a single L4 GPU)
# ------------------------------------------------------------------------------
class ModelConfig:
    """Hyperparameters for the ~100M-parameter TinyAya model."""
    vocab_size: int = 50257             # GPT-2 vocab; updated from the tokenizer at load time
    emb_dim: int = 768                  # model / embedding width
    hidden_dim: int = 2048              # FFN intermediate size
    num_layers: int = 12                # number of transformer blocks
    num_heads: int = 12                 # query heads
    num_kv_heads: int = 4               # key/value heads shared across queries (GQA)
    max_seq_len: int = 1024             # maximum context length
    window_size: int = 1024             # sliding-window span (equal to max_seq_len here)
    sliding_window_ratio: float = 0.75  # fraction of layers using the local window
    rope_theta: float = 10000.0         # RoPE frequency base
    dtype = torch.float16               # mixed-precision parameter/activation dtype
    bias: bool = False                  # linear layers carry no bias
    dropout: float = 0.0                # no dropout was used in training
    gradient_checkpointing: bool = True # recompute activations to save memory

# ------------------------------------------------------------------------------
# Helper modules
# ------------------------------------------------------------------------------
class CohereLayerNorm(nn.Module):
    """Bias-free LayerNorm: normalize in float32, apply a learned scale only."""

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):
        # Compute statistics in float32 for stability, then cast back.
        orig_dtype = x.dtype
        xf = x.to(torch.float32)
        mu = xf.mean(dim=-1, keepdim=True)
        var = (xf - mu).pow(2).mean(dim=-1, keepdim=True)
        normed = (xf - mu) * torch.rsqrt(var + self.eps)
        return (self.weight.to(torch.float32) * normed).to(orig_dtype)


class FeedForward(nn.Module):
    """SwiGLU feed-forward: silu(fc1(x)) * fc2(x), projected back by fc3."""

    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
        self.fc2 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
        self.fc3 = nn.Linear(config.hidden_dim, config.emb_dim, bias=config.bias)

    def forward(self, x):
        gate = F.silu(self.fc1(x))
        value = self.fc2(x)
        return self.fc3(gate * value)


def precompute_rope_freqs(dim, max_seq_len, theta=10000.0, dtype=torch.float32):
    """Return (sin, cos) rotary-embedding tables, each shaped (max_seq_len, dim)."""
    assert dim % 2 == 0, "dim must be even"
    half_dims = torch.arange(0, dim, 2, dtype=dtype)[: dim // 2]
    inv_freq = 1.0 / (theta ** (half_dims / dim))
    positions = torch.arange(max_seq_len, dtype=dtype)
    angles = torch.outer(positions, inv_freq)    # (max_seq_len, dim // 2)
    table = torch.cat((angles, angles), dim=-1)  # (max_seq_len, dim)
    return table.sin(), table.cos()


def rotate_half(x):
    """Rotate the two halves of the last dimension: (a, b) -> (-b, a)."""
    first, second = x.chunk(2, dim=-1)
    return torch.cat((second.neg(), first), dim=-1)


def apply_rotary_emb(x, cos, sin):
    """
    Apply rotary position embeddings to a projection tensor.
    x: (batch, seq_len, num_heads, head_dim); cos, sin: (seq_len, head_dim).
    """
    # Insert broadcast axes for batch (dim 0) and heads (dim 2).
    cos_b = cos[None, :, None, :]
    sin_b = sin[None, :, None, :]
    return x * cos_b + rotate_half(x) * sin_b


class GroupedQueryAttention(nn.Module):
    """Multi-head attention with grouped-query KV sharing and an optional
    causal sliding-window mask.

    Layers whose ``layer_id`` falls in the first ``sliding_window_ratio``
    fraction of the stack use a sliding-window mask plus RoPE; the remaining
    layers use full causal attention without positional encoding (NoPE).
    """

    def __init__(self, config, layer_id):
        super().__init__()
        self.num_heads = config.num_heads
        self.num_kv_heads = config.num_kv_heads
        self.head_dim = config.emb_dim // config.num_heads
        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        self.wq = nn.Linear(config.emb_dim, config.num_heads * self.head_dim, bias=config.bias)
        self.wk = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wv = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wo = nn.Linear(config.num_heads * self.head_dim, config.emb_dim, bias=config.bias)

        # First `sliding_window_ratio` fraction of layers use the local window.
        num_sliding = int(config.num_layers * config.sliding_window_ratio)
        self.use_sliding = (layer_id < num_sliding)

        self.window_size = config.window_size
        self.max_seq_len = config.max_seq_len
        self.rope_theta = config.rope_theta
        # NOTE(review): plain attributes, not registered buffers, so they do
        # not follow model.to(device) / state_dict; init_rope re-places them
        # on the right device lazily.
        self.rope_sin, self.rope_cos = None, None

    def init_rope(self, max_seq_len, device):
        """Lazily (re)build the RoPE tables up to `max_seq_len` on `device`."""
        if self.rope_sin is not None and self.rope_sin.shape[0] >= max_seq_len:
            return
        sin, cos = precompute_rope_freqs(
            self.head_dim, max_seq_len, theta=self.rope_theta, dtype=torch.float32
        )
        self.rope_sin = sin.to(device)
        self.rope_cos = cos.to(device)

    def _causal_mask(self, seq_len, device):
        """Build an additive mask: 0 where attention is allowed, -inf elsewhere.

        Vectorized replacement for the original per-row Python loop (which did
        O(seq_len^2) Python-level work every forward): `triu` masks the future
        (j > i) and, on sliding layers, `tril` masks positions older than the
        window (j <= i - window_size) — exactly the rows/columns the loop set.
        """
        neg_inf = torch.full((seq_len, seq_len), float('-inf'), device=device)
        mask = torch.triu(neg_inf, diagonal=1)
        if self.use_sliding:
            mask = mask + torch.tril(neg_inf, diagonal=-self.window_size)
        return mask

    def forward(self, x, mask=None):
        batch, seq_len, _ = x.shape
        device = x.device

        if self.use_sliding:
            self.init_rope(seq_len, device)

        xq = self.wq(x).view(batch, seq_len, self.num_heads, self.head_dim)
        xk = self.wk(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
        xv = self.wv(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)

        # RoPE only on sliding-window layers; full-attention layers are NoPE.
        if self.use_sliding:
            xq = apply_rotary_emb(xq, self.rope_cos[:seq_len], self.rope_sin[:seq_len])
            xk = apply_rotary_emb(xk, self.rope_cos[:seq_len], self.rope_sin[:seq_len])

        # Expand KV heads so every query head has a matching KV head (GQA).
        xk = xk.repeat_interleave(self.num_queries_per_kv, dim=2)
        xv = xv.repeat_interleave(self.num_queries_per_kv, dim=2)

        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        scores = torch.matmul(xq, xk.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is None:
            mask = self._causal_mask(seq_len, device)
        scores = scores + mask

        # Softmax in float32 for numerical stability, then cast back.
        probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(xq.dtype)
        out = torch.matmul(probs, xv)
        out = out.transpose(1, 2).contiguous().view(batch, seq_len, -1)
        return self.wo(out)


class ParallelTransformerBlock(nn.Module):
    """Decoder block in parallel form: attention and MLP both read the same
    normalized input, and their outputs are summed into the residual stream."""

    def __init__(self, config, layer_id):
        super().__init__()
        self.norm = CohereLayerNorm(config.emb_dim)
        self.attn = GroupedQueryAttention(config, layer_id)
        self.mlp = FeedForward(config)

    def forward(self, x, mask=None):
        normed = self.norm(x)
        return x + self.attn(normed, mask=mask) + self.mlp(normed)


class TinyAya(nn.Module):
    """Tiny Aya-style decoder-only LM (~100M parameters).

    Token embedding -> stack of parallel transformer blocks -> final
    LayerNorm -> LM head with weights tied to the embedding.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config.vocab_size, config.emb_dim)
        self.layers = nn.ModuleList([
            ParallelTransformerBlock(config, i) for i in range(config.num_layers)
        ])
        self.norm = CohereLayerNorm(config.emb_dim)
        self.lm_head = nn.Linear(config.emb_dim, config.vocab_size, bias=False)
        # Weight tying: the LM head shares the embedding matrix.
        self.lm_head.weight = self.token_embedding.weight

        if config.gradient_checkpointing:
            self.gradient_checkpointing_enable()

    def gradient_checkpointing_enable(self):
        """Recompute activations during backward to trade compute for memory."""
        self._gradient_checkpointing = True

    def forward(self, input_ids, mask=None):
        """Return logits of shape (batch, seq_len, vocab_size)."""
        x = self.token_embedding(input_ids)
        for layer in self.layers:
            if self.training and getattr(self, '_gradient_checkpointing', False):
                # use_reentrant=False is the maintained checkpoint variant and
                # handles non-tensor args (mask=None) cleanly.
                x = torch.utils.checkpoint.checkpoint(
                    layer, x, mask, use_reentrant=False
                )
            else:
                x = layer(x, mask=mask)
        x = self.norm(x)
        logits = self.lm_head(x)
        return logits

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=50, temperature=1.0, do_sample=True):
        """Autoregressively extend `input_ids` by `max_new_tokens` tokens.

        Args:
            input_ids: (batch, seq) token ids.
            max_new_tokens: number of tokens to append.
            temperature: softmax temperature (used when sampling).
            do_sample: sample from the distribution when True (the original
                behavior, kept as the default); greedy argmax when False.
                Added because the usage example passes `do_sample=True`,
                which previously raised TypeError.

        Returns:
            (batch, seq + max_new_tokens) tensor of token ids.
        """
        self.eval()
        for _ in range(max_new_tokens):
            # Condition only on the last max_seq_len tokens of the context.
            logits = self(input_ids[:, -self.config.max_seq_len:])
            next_token_logits = logits[:, -1, :] / temperature
            if do_sample:
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids
import os

# Load the GPT-2 tokenizer that ships with the downloaded checkpoint.
# Fix: the original passed an undefined `repo_id` variable (NameError) and
# used `os` without importing it.
tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

model = TinyAya(ModelConfig())
state_dict = torch.load(
    os.path.join(model_path, "pytorch_model.bin"), map_location="cpu"
)
model.load_state_dict(state_dict)
model.eval()

Text Generation Example

prompt = "The Epstein files reveal"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    # Fix: TinyAya.generate() does not accept Hugging Face's `do_sample`
    # keyword (it always samples), so that argument is omitted here —
    # the original call raised TypeError.
    outputs = model.generate(
        inputs.input_ids,
        max_new_tokens=50,
        temperature=0.8,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Training Details

The model was trained for one epoch on the full dataset using an L4 GPU in Google Colab. Optimizer: AdamW (lr=1e-4) with gradient clipping (max norm=1.0). Mixed precision (float16) was used.

Limitations

  • The model is small and was trained on a limited dataset; it may produce repetitive or nonsensical outputs.
  • It has not undergone any safety fine‑tuning; use with caution.

License

MIT

Downloads last month
144
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support