fromziro/jetoncount_corpus
Updated • 1
Task: regression
Total training time: 111 minutes
Params: 7009
Final MAE: 192
Framework: PyTorch
Authors: Paul Courneya, Jonathon Ly
JetonCount is a 7k-parameter MLP regression model trained to predict the number of tokens a piece of text might contain using only six input features.
chars, words, avg_chars_per_word, longest_word_chars, symbol_ratio, punctuation_ratio, vocab_size
22M rows. 28 tokenizers. 9 sources.
Tokenizers: tokenizers_used.txt
Datasets: datasets_used.txt
(0.9, 0.95)
float32
Train:
Eval:
Test:
| Actual Tokens | Model Prediction |
|---|---|
| 197 | 239 |
| 1333 | 1395 |
| 5973 | 6609 |
| 18569 | 20423 |
Note: Rounded to nearest integer.
Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in engineering, mathematics and computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.[1]
2560
Out:
{
"actual_token_count": 139,
"prediction": "190.5435028076172",
"model_latency_ms": "0.24457614858252544",
"tokenizer_latency_ms": 0.3174110000009023
}
We came across a dilemma: why build this model if a tokenizer is more accurate anyway?
The answer is speed.
In our tests, especially on long texts, the model is significantly faster than a tokenizer.
| Tokens | Model Latency (ms) | Tokenizer Latency (ms) |
|---|---|---|
| 197 | 0.2429 | 0.4134 |
| 1333 | 0.3409 | 1.8775 |
| 5973 | 0.9827 | 7.6504 |
| 18569 | 5.2890 | 28.8244 |
Note: Model latency changes based on hardware.
Before using, distributing, selling, or modifying this software, you must read the license here.
from __future__ import annotations
import json
import re
import time
from dataclasses import dataclass
from typing import Tuple
import torch
from transformers import AutoModel, AutoTokenizer
# Hugging Face Hub repository ids handed to from_pretrained() in main().
MODEL_ID = "fromziro/JetonCount"
TOKENIZER_ID = "fromziro/Er-Tiny-1.3M"

# Optional per-feature normalization statistics. While either one is None,
# standardize_features() returns its input unchanged.
FEATURE_MEAN = None
FEATURE_STD = None

# NOTE(review): not referenced anywhere in the visible code; presumably an
# offset for the regression target -- confirm before relying on it.
TARGET_OFFSET = 0.0

# Value main() supplies as the vocab_size feature and echoes in its report.
DEFAULT_VOCAB_SIZE = 2564

# Sample input benchmarked by main(); replace with the text to measure.
TEXT = "Put your text here."

# Number of timed iterations main() requests from each benchmark.
TOKENIZER_ROUNDS = 100
MODEL_ROUNDS = 1000

# Characters counted toward punctuation_ratio in compute_text_stats().
PUNCTUATION_CHARS = set(r""".,!?;:'"`~@#$%^&*()-_=+[]{}<>/\|""")
# Characters counted toward symbol_ratio. Every character listed here also
# appears in PUNCTUATION_CHARS, so symbols contribute to both ratios.
SYMBOL_CHARS = set(r"""@#$%^&*()-_=+[]{}<>/\|~`""")
@dataclass
class TextStats:
    """Surface statistics of a text plus the vocabulary size used as a feature.

    All fields are annotated as float; compute_text_stats() casts every value
    with float() before constructing an instance. The field order matches the
    order of the first seven entries produced by build_feature_tensor().
    """

    # Total number of characters in the text (len(text)).
    chars: float
    # Number of regex-matched words in the text.
    words: float
    # Mean length of the matched words; 0.0 when there are none.
    avg_chars_per_word: float
    # Fraction of characters that are in PUNCTUATION_CHARS.
    punctuation_ratio: float
    # Fraction of characters that are in SYMBOL_CHARS.
    symbol_ratio: float
    # Length of the longest matched word; 0 when there are none.
    longest_word_chars: float
    # Vocabulary size passed in by the caller, stored as a float.
    vocab_size: float
def compute_text_stats(text: str, vocab_size: int) -> TextStats:
    """Measure the surface statistics of *text* used as model features.

    Words are the matches of the regex ``\\b\\w+\\b``. The punctuation and
    symbol ratios are the share of characters found in PUNCTUATION_CHARS and
    SYMBOL_CHARS respectively. Empty text yields 0.0 for both ratios, and text
    without any word yields 0.0 for the average and longest word length.

    Args:
        text: The text to analyse.
        vocab_size: Vocabulary size to carry along as a feature.

    Returns:
        A TextStats whose fields have all been converted to float.
    """
    total_chars = len(text)
    word_lengths = [
        len(match) for match in re.findall(r"\b\w+\b", text, flags=re.UNICODE)
    ]
    word_count = len(word_lengths)

    mean_word_len = sum(word_lengths) / word_count if word_count else 0.0
    max_word_len = max(word_lengths, default=0)

    # Default to zero so empty text never divides by zero.
    punct_share = 0.0
    symbol_share = 0.0
    if total_chars:
        # Summing booleans counts the characters that belong to each set.
        punct_share = sum(ch in PUNCTUATION_CHARS for ch in text) / total_chars
        symbol_share = sum(ch in SYMBOL_CHARS for ch in text) / total_chars

    return TextStats(
        chars=float(total_chars),
        words=float(word_count),
        avg_chars_per_word=float(mean_word_len),
        punctuation_ratio=float(punct_share),
        symbol_ratio=float(symbol_share),
        longest_word_chars=float(max_word_len),
        vocab_size=float(vocab_size),
    )
def build_feature_tensor(stats: TextStats) -> torch.Tensor:
    """Expand *stats* into the 19-element float32 feature vector.

    Layout of the result:
        0-6   the raw statistics, in TextStats field order
        7-8   chars per word and words per char (denominators floored at 1)
        9-11  log1p of chars, words and vocab_size (inputs clamped at 0)
        12-18 pairwise interaction terms between the raw statistics

    Args:
        stats: Object exposing the seven TextStats attributes.

    Returns:
        A 1-D float32 tensor with 19 elements.
    """
    raw = torch.tensor(
        [
            stats.chars,
            stats.words,
            stats.avg_chars_per_word,
            stats.punctuation_ratio,
            stats.symbol_ratio,
            stats.longest_word_chars,
            stats.vocab_size,
        ],
        dtype=torch.float32,
    )
    # Iterating a 1-D tensor yields 0-d float32 tensors, so all arithmetic
    # below is carried out in float32.
    n_chars, n_words, mean_len, punct, sym, max_len, vocab = raw
    tiny = 1e-6

    derived = torch.stack(
        [
            n_chars / max(n_words.item(), 1.0),
            n_words / max(n_chars.item(), 1.0),
            n_chars.clamp(min=0.0).log1p(),
            n_words.clamp(min=0.0).log1p(),
            vocab.clamp(min=0.0).log1p(),
            n_chars * punct,
            n_chars * sym,
            n_words * mean_len,
            n_words * punct,
            max_len * punct,
            (mean_len + max_len) * (1.0 + punct + sym),
            (n_chars + tiny) * (punct + sym + tiny),
        ]
    )
    return torch.cat((raw, derived), dim=0)
def standardize_features(x: torch.Tensor) -> torch.Tensor:
    """Normalize *x* with the module-level FEATURE_MEAN / FEATURE_STD.

    When either statistic is None the input is returned untouched. Otherwise
    non-finite means are treated as 0 and non-finite or zero standard
    deviations as 1, so the affected features pass through unscaled instead
    of producing NaN or inf.

    Args:
        x: Feature tensor to normalize.

    Returns:
        ``(x - mean) / std`` computed with the sanitized statistics, or *x*
        itself when no statistics are configured.
    """
    stats_configured = FEATURE_MEAN is not None and FEATURE_STD is not None
    if not stats_configured:
        return x

    # Build the statistics with the same dtype and device as the input.
    like_x = {"dtype": x.dtype, "device": x.device}
    center = torch.tensor(FEATURE_MEAN, **like_x)
    scale = torch.tensor(FEATURE_STD, **like_x)

    scale_usable = torch.isfinite(scale) & (scale != 0)
    scale = torch.where(scale_usable, scale, torch.ones_like(scale))
    center = torch.where(torch.isfinite(center), center, torch.zeros_like(center))
    return (x - center) / scale
def benchmark_tokenizer(tokenizer, text: str, rounds: int = 100) -> Tuple[int, float]:
    """Count the tokens in *text* and time the tokenizer call.

    Args:
        tokenizer: Callable accepting ``(text, add_special_tokens=...)`` and
            returning an object with an ``input_ids`` attribute.
        text: The text to tokenize.
        rounds: Number of timed calls; must be at least 1.

    Returns:
        ``(token_count, mean_latency_ms)`` where the count excludes special
        tokens and the latency is the mean wall-clock time of one call.

    Raises:
        ValueError: If *rounds* is smaller than 1. Previously this surfaced as
            a ZeroDivisionError (rounds == 0) or as a zero token count with a
            meaningless latency (rounds < 0).
    """
    if rounds < 1:
        raise ValueError(f"rounds must be a positive integer, got {rounds!r}")

    # Warm up with the same arguments as the timed calls, so the code path
    # being measured is the one that was primed. The count comes from this
    # call, which keeps the timed loop free of bookkeeping.
    actual_count = len(tokenizer(text, add_special_tokens=False).input_ids)

    start = time.perf_counter()
    for _ in range(rounds):
        tokenizer(text, add_special_tokens=False)
    elapsed_ms = (time.perf_counter() - start) * 1000.0 / rounds
    return actual_count, elapsed_ms
@torch.inference_mode()
def benchmark_model(model, feature_tensor: torch.Tensor, rounds: int = 1000) -> Tuple[float, float]:
    """Run the regression model on one feature vector and time the call.

    The features are passed through standardize_features(), given a leading
    dimension with ``unsqueeze(0)`` and handed to the model as the
    ``input_features`` keyword argument. One untimed warm-up call precedes
    the timed loop.

    Args:
        model: Callable accepting ``input_features=`` and returning an object
            whose ``logits`` squeeze to a single scalar.
        feature_tensor: Feature tensor as produced by build_feature_tensor().
        rounds: Number of timed forward passes; must be at least 1.

    Returns:
        ``(prediction, mean_latency_ms)``: the scalar output of the last
        forward pass and the mean wall-clock time of one pass.

    Raises:
        ValueError: If *rounds* is smaller than 1. Previously this surfaced as
            a ZeroDivisionError (rounds == 0) or as a 0.0 prediction with a
            meaningless latency (rounds < 0).
    """
    if rounds < 1:
        raise ValueError(f"rounds must be a positive integer, got {rounds!r}")

    x = standardize_features(feature_tensor).unsqueeze(0)
    _ = model(input_features=x)  # warm-up, excluded from the timing

    start = time.perf_counter()
    # The guard above guarantees at least one iteration, so pred is always set.
    for _ in range(rounds):
        out = model(input_features=x)
        # .item() stays inside the timed loop: reading the value back is part
        # of the cost of obtaining a prediction.
        pred = float(out.logits.squeeze().item())
    elapsed_ms = (time.perf_counter() - start) * 1000.0 / rounds
    return pred, elapsed_ms
def main() -> None:
    """Benchmark the JetonCount model against the real tokenizer on TEXT.

    Loads the tokenizer and the model from the Hugging Face Hub, computes the
    features of TEXT, times both approaches and prints a JSON report with the
    actual token count, the predicted count, both latencies and the features.
    """
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID, use_fast=True)
    model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
    model.eval()

    stats = compute_text_stats(TEXT, DEFAULT_VOCAB_SIZE)
    features = build_feature_tensor(stats)

    token_count, tok_ms = benchmark_tokenizer(tokenizer, TEXT, rounds=TOKENIZER_ROUNDS)
    predicted, model_ms = benchmark_model(model, features, rounds=MODEL_ROUNDS)

    # Report the raw statistics in TextStats field order.
    reported_fields = (
        "chars",
        "words",
        "avg_chars_per_word",
        "punctuation_ratio",
        "symbol_ratio",
        "longest_word_chars",
        "vocab_size",
    )
    report = {
        "actual_token_count": token_count,
        "prediction": predicted,
        "model_latency_ms": model_ms,
        "tokenizer_latency_ms": tok_ms,
        "model_id": MODEL_ID,
        "tokenizer_id": TOKENIZER_ID,
        "vocab_size": DEFAULT_VOCAB_SIZE,
        "features": {name: getattr(stats, name) for name in reported_fields},
    }
    print(json.dumps(report, indent=2, ensure_ascii=False))
# Run the benchmark only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()
Copyright (c) 2026 FromZero
Copyright (c) 2026 Paul Courneya
Copyright (c) 2026 Jonathon LY
@misc{jetoncount,
title = {JetonCount},
organization = {FromZero},
author = {Paul Courneya and Jonathon LY},
year = {2026},
url = {https://huggingface.co/fromziro/JetonCount}
}