HPC-Quantize / hexstate_quantize.c

Update hexstate_quantize.c

a034f4d verified 7 days ago

245 kB

	/* ═══════════════════════════════════════════════════════════════════════════
	* hexstate_quantize.c — HexState GGUF Quantizer
	*
	* ╔═══════════════════════════════════════════════════════════════╗
	* ║ HPC-Optimized GGUF Quantization Engine ║
	* ║ ║
	* ║ Architecture: HPCGraph Sensitivity Propagation ║
	* ║ Optimization: Complex Amplitude BP + MCMC Scale Search ║
	* ║ Enhancements: MSE Grid Search, Importance Matrix Weighting ║
	* ║ Output: GGUF v3 (Q2_K) ║
	* ║ ║
	* ║ "The weight and the quantized are opposite faces." ║
	* ╚═══════════════════════════════════════════════════════════════╝
	*
	* This tool adapts the HExState HPC Ouroboros factoring engine for
	* LLM weight quantization. The core mathematical machinery is reused:
	*
	* Factoring Domain → Quantization Domain
	* ─────────────────────────────────────────────────
	* HPCGraph + CZ edges → Block sensitivity graph
	* Complex Amplitude BP → Importance propagation
	* MCMC period sampler → Optimal scale search
	* try_period() validation → Error bound checking
	* LLL lattice reduction → (future) Adaptive bit allocation
	*
	* Additional techniques ported from llm-compressor:
	* MSE grid search → Optimal min/max range shrinking
	* Importance matrix (imatrix) → Per-channel error weighting
	*
	* Build:
	* make -f Makefile.quantize
	*
	* Usage:
	* ./hexstate_quantize <input> <output.gguf> [options]
	*
	* Input can be:
	* - A single .safetensors file
	* - A model directory containing sharded .safetensors files
	*
	* Options:
	* --optimizer hpc|mse|hybrid Scale optimization strategy (default: hybrid)
	* --imatrix <file> Importance matrix for weighted quantization
	* --verbose Per-block diagnostics
	* ═══════════════════════════════════════════════════════════════════════════ */

	#include <stdio.h>
	#ifdef _OPENMP
	#include <omp.h>
	#endif
	#include <stdlib.h>
	#include <string.h>
	#include <math.h>
	#include <time.h>
	#include <sys/stat.h>
	#include <mpfr.h>

	/* HExState headers — reused from the factoring engine */
	#include "quhit_triality.h"
	#include "hpc_graph.h"
	#include "hpc_mobius.h"
	#include "s6_exotic.h"

	/* Quantization-specific headers */
	#include "gguf_format.h"
	#include "safetensors_reader.h"
	#include "tokenizer_reader.h"
	#include "imatrix_reader.h"

	#define D 6 /* Preserved from HExState — the triality dimension */

	/* ═══════════════════════════════════════════════════════════════════════════
	* OPTIMIZER MODE
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Scale-optimization strategy for the quantizer. The usage text in the file
	 * header lists the matching --optimizer values as hpc, mse and hybrid, with
	 * hybrid as the default. */
	typedef enum {
	OPT_HPC, /* HExState BP only (no MSE grid search) */
	OPT_MSE, /* MSE grid search only (no HPC sensitivity pass) */
	OPT_HYBRID /* HPC sensitivity combined with the MSE grid search */
	} OptimizerMode;

	/* ═══════════════════════════════════════════════════════════════════════════
	* MODEL ARCHITECTURE AUTO-DETECTION
	*
	* Infers model architecture metadata from tensor names and shapes.
	* Supports: LLaMA, Mistral, Qwen2, Phi-3, Gemma, GPT-NeoX, Falcon, DeepSeek
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Architecture metadata for the model being converted.
	 * detect_architecture() zeroes the whole struct, applies defaults, and then
	 * fills it from config.json and/or from tensor names and shapes. */
	typedef struct {
	char architecture[64]; /* GGUF architecture string: "llama", "phi3", "gemma", etc. */
	char name[256]; /* Human-readable model name (defaults to "HExState-quantized") */
	uint32_t block_count; /* Number of transformer layers */
	uint32_t embedding_length; /* Hidden dimension */
	uint32_t head_count; /* Number of attention heads */
	uint32_t head_count_kv; /* Number of KV heads (GQA); falls back to head_count */
	uint32_t vocab_size; /* Vocabulary size */
	uint32_t context_length; /* Max context length (defaults to 4096) */
	float rope_freq_base; /* RoPE frequency base (defaults to 10000.0) */
	uint32_t feed_forward_length; /* FFN intermediate size */
	float rms_norm_eps; /* RMS norm epsilon (defaults to 1e-5) */
	int has_bias; /* Nonzero when a layer-0 q_proj bias tensor exists */
	int tie_word_embeddings; /* Whether output = embed_tokens (copied from config.json) */
	} ModelArchitecture;

	/* Count tensor names matching a pattern prefix */
	static int count_tensors_with_prefix(const STMultiFile mf, const char prefix)
	{
	int count = 0;
	int prefix_len = strlen(prefix);
	for (int i = 0; i < mf->n_tensors; i++) {
	if (strncmp(mf->tensor_map[i].name, prefix, prefix_len) == 0)
	count++;
	}
	return count;
	}

	/* Find max layer index from tensor names like "model.layers.N.xxx" */
	static int find_max_layer_index(const STMultiFile mf, const char layer_prefix)
	{
	int max_idx = -1;
	int prefix_len = strlen(layer_prefix);
	for (int i = 0; i < mf->n_tensors; i++) {
	if (strncmp(mf->tensor_map[i].name, layer_prefix, prefix_len) == 0) {
	int idx = atoi(mf->tensor_map[i].name + prefix_len);
	if (idx > max_idx) max_idx = idx;
	}
	}
	return max_idx;
	}

	/* ── Config.json reader for definitive architecture parameters ── */

	/* Values extracted from a HuggingFace-style config.json by
	 * parse_config_json(). The struct is zero-filled first, so a field that is 0
	 * (or an empty model_type) means the key was not found. */
	typedef struct {
	int valid; /* 1 once the file was opened and at least one byte was read */
	uint32_t hidden_size; /* "hidden_size" */
	uint32_t intermediate_size; /* "intermediate_size" */
	uint32_t num_attention_heads; /* "num_attention_heads" */
	uint32_t num_key_value_heads; /* "num_key_value_heads" */
	uint32_t num_hidden_layers; /* "num_hidden_layers" */
	uint32_t vocab_size; /* "vocab_size" */
	uint32_t max_position_embeddings; /* "max_position_embeddings" */
	float rope_theta; /* "rope_theta" (top level, or under "rope_parameters") */
	float rms_norm_eps; /* "rms_norm_eps" */
	char model_type[64]; /* "model_type" string, NUL-terminated */
	int tie_word_embeddings; /* 1 when "tie_word_embeddings" starts with "true" */
	} ConfigJson;

	/* Read the integer value of `key` (searched from `scope`) into *out.
	 * *out is left untouched when tok_find_key() reports the key as missing. */
	static void config_json_read_u32(const char *scope, const char *key, uint32_t *out)
	{
	    const char *p = tok_find_key(scope, key);
	    if (p) *out = (uint32_t)strtol(p, NULL, 10);
	}

	/* Read the floating-point value of `key` (searched from `scope`) into *out. */
	static void config_json_read_float(const char *scope, const char *key, float *out)
	{
	    const char *p = tok_find_key(scope, key);
	    if (p) *out = (float)strtod(p, NULL);
	}

	/* Read every field that may appear either at the top level of config.json or
	 * inside a nested "text_config" object. rope_theta is not read here because
	 * the two layouts store it in different places. */
	static void config_json_read_common(const char *scope, ConfigJson *cfg)
	{
	    const char *p;

	    config_json_read_u32(scope, "hidden_size", &cfg->hidden_size);
	    config_json_read_u32(scope, "intermediate_size", &cfg->intermediate_size);
	    config_json_read_u32(scope, "num_attention_heads", &cfg->num_attention_heads);
	    config_json_read_u32(scope, "num_key_value_heads", &cfg->num_key_value_heads);
	    config_json_read_u32(scope, "num_hidden_layers", &cfg->num_hidden_layers);
	    config_json_read_u32(scope, "vocab_size", &cfg->vocab_size);
	    config_json_read_u32(scope, "max_position_embeddings", &cfg->max_position_embeddings);
	    config_json_read_float(scope, "rms_norm_eps", &cfg->rms_norm_eps);

	    /* Only accept model_type when the value is a quoted string. */
	    p = tok_find_key(scope, "model_type");
	    if (p && *p == '"') {
	        char buf[64] = {0};
	        tok_extract_string(p, buf, sizeof(buf));
	        buf[sizeof(buf) - 1] = '\0'; /* never trust the helper to terminate */
	        snprintf(cfg->model_type, sizeof(cfg->model_type), "%s", buf);
	    }

	    p = tok_find_key(scope, "tie_word_embeddings");
	    if (p) cfg->tie_word_embeddings = (strncmp(p, "true", 4) == 0);
	}

	/* Parse the architecture parameters out of a HuggingFace config.json.
	 *
	 * path - file to read.
	 *
	 * Returns a ConfigJson by value. It is all-zero (valid == 0) when the file
	 * cannot be opened, is empty, cannot be sized/read, or memory runs out.
	 * Otherwise valid == 1 and every key that was found is filled in; keys that
	 * were not found stay 0.
	 *
	 * Lookup relies on tok_find_key(), which presumably returns a pointer to
	 * the start of the value text for a key, or NULL (defined in another
	 * header; confirm there). No real JSON parsing is done here. */
	static ConfigJson parse_config_json(const char *path)
	{
	    ConfigJson cfg;
	    memset(&cfg, 0, sizeof(cfg));

	    FILE *f = fopen(path, "rb");
	    if (!f) return cfg;

	    /* Size the file; treat any seek/tell failure like an empty file. */
	    long size = -1;
	    if (fseek(f, 0, SEEK_END) == 0)
	        size = ftell(f);
	    if (size <= 0 || fseek(f, 0, SEEK_SET) != 0) {
	        fclose(f);
	        return cfg;
	    }

	    char *json = malloc((size_t)size + 1);
	    if (!json) { fclose(f); return cfg; }

	    size_t nread = fread(json, 1, (size_t)size, f);
	    json[nread] = '\0';
	    fclose(f);
	    if (nread == 0) { free(json); return cfg; }

	    cfg.valid = 1;

	    /* Top-level keys (the classic flat config.json layout). */
	    config_json_read_common(json, &cfg);
	    config_json_read_float(json, "rope_theta", &cfg.rope_theta);

	    /* ── Qwen 3.5/3.6: parameters are nested inside "text_config" ── */
	    if (cfg.hidden_size == 0) {
	        const char *tc = strstr(json, "\"text_config\"");
	        const char *tc_brace = tc ? strchr(tc, '{') : NULL;
	        if (tc_brace) {
	            config_json_read_common(tc_brace, &cfg);

	            /* Qwen3.6 rope_theta is nested in rope_parameters */
	            const char *rp = strstr(tc_brace, "\"rope_parameters\"");
	            if (rp)
	                config_json_read_float(rp, "rope_theta", &cfg.rope_theta);
	        }
	    }

	    free(json);
	    return cfg;
	}

	/* Fill `arch` with the model's architecture metadata.
	 *
	 * mf               - multi-file SafeTensors index used for name/shape probing.
	 * arch             - output; zeroed first, then populated.
	 * config_json_path - optional path to config.json (may be NULL).
	 *
	 * Sources, in priority order:
	 *   1. config.json values (when the file could be parsed),
	 *   2. tensor names and shapes in `mf`,
	 *   3. hard-coded defaults (llama, 32 heads, 4096 hidden, 32000 vocab, ...).
	 *
	 * NOTE(review): the strings written for GPT-NeoX ("gpt_neox") and for
	 * gemma2 ("gemma") are kept exactly as before. Confirm they match the
	 * architecture names the GGUF loader expects. */
	static void detect_architecture(const STMultiFile *mf, ModelArchitecture *arch,
	                                const char *config_json_path)
	{
	    /* config.json "model_type" → GGUF architecture name */
	    static const struct {
	        const char *model_type;
	        const char *gguf_arch;
	    } type_map[] = {
	        {"llama",        "llama"},
	        {"mistral",      "llama"},
	        {"qwen2",        "qwen2"},
	        {"qwen2_moe",    "qwen2moe"},
	        {"qwen3_5",      "qwen2"},   /* GGUF arch: qwen2 compat */
	        {"qwen3_5_text", "qwen2"},
	        {"qwen3_5_moe",  "qwen2"},
	        {"phi3",         "phi3"},
	        {"phi",          "phi3"},
	        {"gemma",        "gemma"},
	        {"gemma2",       "gemma"},
	        {"deepseek_v2",  "llama"},
	        {"gpt_neox",     "gpt_neox"},
	        {"falcon",       "falcon"},
	    };

	    memset(arch, 0, sizeof(*arch));

	    /* Default values */
	    strcpy(arch->architecture, "llama");
	    strcpy(arch->name, "HExState-quantized");
	    arch->context_length = 4096;
	    arch->rope_freq_base = 10000.0f;
	    arch->rms_norm_eps = 1e-5f;

	    /* ── Try config.json for definitive parameters ── */
	    ConfigJson cfg;
	    memset(&cfg, 0, sizeof(cfg));
	    if (config_json_path)
	        cfg = parse_config_json(config_json_path);

	    if (cfg.valid) {
	        /* Unknown or empty model types keep the "llama" default set above. */
	        for (size_t k = 0; k < sizeof(type_map) / sizeof(type_map[0]); k++) {
	            if (strcmp(cfg.model_type, type_map[k].model_type) == 0) {
	                strcpy(arch->architecture, type_map[k].gguf_arch);
	                break;
	            }
	        }

	        /* A zero field means "key not present": keep default / detect later. */
	        if (cfg.hidden_size) arch->embedding_length = cfg.hidden_size;
	        if (cfg.intermediate_size) arch->feed_forward_length = cfg.intermediate_size;
	        if (cfg.num_attention_heads) arch->head_count = cfg.num_attention_heads;
	        if (cfg.num_key_value_heads) arch->head_count_kv = cfg.num_key_value_heads;
	        if (cfg.num_hidden_layers) arch->block_count = cfg.num_hidden_layers;
	        if (cfg.vocab_size) arch->vocab_size = cfg.vocab_size;
	        if (cfg.max_position_embeddings) arch->context_length = cfg.max_position_embeddings;
	        if (cfg.rope_theta > 0) arch->rope_freq_base = cfg.rope_theta;
	        if (cfg.rms_norm_eps > 0) arch->rms_norm_eps = cfg.rms_norm_eps;
	        arch->tie_word_embeddings = cfg.tie_word_embeddings;

	        printf(" Architecture determined from config.json: %s\n", cfg.model_type);
	    }

	    /* ── Fall back to tensor name pattern detection ── */
	    int has_model_layers = count_tensors_with_prefix(mf, "model.layers.");
	    int has_gpt_neox = count_tensors_with_prefix(mf, "gpt_neox.");
	    int has_transformer = count_tensors_with_prefix(mf, "transformer.");

	    /* Architecture-specific marker tensors */
	    int has_qkv_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.qkv_proj");
	    int has_kv_a_proj = count_tensors_with_prefix(mf, "model.layers.0.self_attn.kv_a_proj_with_mqa");
	    int has_final_norm = (st_multi_find_tensor(mf, "model.final_norm.weight") >= 0);

	    /* Name-based guesses only apply when config.json gave us nothing. */
	    if (!cfg.valid) {
	        if (has_qkv_proj > 0) {
	            strcpy(arch->architecture, "phi3");
	        } else if (has_kv_a_proj > 0) {
	            strcpy(arch->architecture, "llama"); /* DeepSeek uses llama arch */
	        } else if (has_final_norm) {
	            strcpy(arch->architecture, "gemma");
	        }
	    }

	    if (has_model_layers > 0 && arch->block_count == 0) {
	        arch->block_count = (uint32_t)(find_max_layer_index(mf, "model.layers.") + 1);
	    }

	    /* Infer dimensions from tensor shapes if not from config.json */
	    if (arch->embedding_length == 0 || arch->head_count == 0) {
	        int qproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.weight");
	        int kproj_idx = st_multi_find_tensor(mf, "model.layers.0.self_attn.k_proj.weight");
	        const STTensorInfo *ti = (qproj_idx >= 0) ? st_multi_tensor_info(mf, qproj_idx) : NULL;

	        /* q_proj must be a real 2-D matrix before shape[1] may be read. */
	        if (ti && ti->n_dims >= 2) {
	            int64_t q_out = ti->shape[0];
	            int64_t hidden = ti->shape[1];
	            if (arch->embedding_length == 0) arch->embedding_length = (uint32_t)hidden;

	            /* Try common head dimensions: 128, then 96, then 64 (128 if none divide) */
	            int head_dim = 128;
	            if (q_out % 128 == 0) head_dim = 128;
	            else if (q_out % 96 == 0) head_dim = 96;
	            else if (q_out % 64 == 0) head_dim = 64;

	            if (arch->head_count == 0) arch->head_count = (uint32_t)(q_out / head_dim);

	            if (kproj_idx >= 0 && arch->head_count_kv == 0) {
	                const STTensorInfo *kt = st_multi_tensor_info(mf, kproj_idx);
	                if (kt && kt->n_dims >= 1)
	                    arch->head_count_kv = (uint32_t)(kt->shape[0] / head_dim);
	            }
	        }
	    }

	    if (arch->vocab_size == 0) {
	        int embed_idx = st_multi_find_tensor(mf, "model.embed_tokens.weight");
	        if (embed_idx >= 0) {
	            const STTensorInfo *ti = st_multi_tensor_info(mf, embed_idx);
	            if (ti && ti->n_dims >= 1)
	                arch->vocab_size = (uint32_t)ti->shape[0];
	        }
	    }

	    if (arch->feed_forward_length == 0) {
	        /* Prefer gate_proj; fall back to up_proj when no gate exists. */
	        int ffn_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.gate_proj.weight");
	        if (ffn_idx < 0)
	            ffn_idx = st_multi_find_tensor(mf, "model.layers.0.mlp.up_proj.weight");
	        if (ffn_idx >= 0) {
	            const STTensorInfo *ti = st_multi_tensor_info(mf, ffn_idx);
	            if (ti && ti->n_dims >= 1)
	                arch->feed_forward_length = (uint32_t)ti->shape[0];
	        }
	    }

	    /* Check for attention bias */
	    arch->has_bias = (st_multi_find_tensor(mf, "model.layers.0.self_attn.q_proj.bias") >= 0);

	    /* Non-"model.layers." naming schemes: only used when nothing above
	     * established a layer count. */
	    if (has_gpt_neox > 0 && arch->block_count == 0) {
	        strcpy(arch->architecture, "gpt_neox");
	        arch->block_count = (uint32_t)(find_max_layer_index(mf, "gpt_neox.layers.") + 1);
	    }
	    if (has_transformer > 0 && arch->block_count == 0) {
	        strcpy(arch->architecture, "falcon");
	        arch->block_count = (uint32_t)(find_max_layer_index(mf, "transformer.h.") + 1);
	    }

	    /* Fill in defaults for anything we couldn't detect */
	    if (arch->head_count == 0) arch->head_count = 32;
	    if (arch->head_count_kv == 0) arch->head_count_kv = arch->head_count;
	    if (arch->embedding_length == 0) arch->embedding_length = 4096;
	    if (arch->vocab_size == 0) arch->vocab_size = 32000;
	    if (arch->feed_forward_length == 0)
	        arch->feed_forward_length = (arch->embedding_length * 8) / 3; /* SwiGLU default */
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* TENSOR NAME MAPPING: HuggingFace → GGUF Standard
	*
	* Maps SafeTensors tensor names to the standardized GGUF naming
	* convention used by llama.cpp for model loading.
	*
	* Enhanced with mappings for Phi-3, Gemma, DeepSeek, MoE, and bias tensors.
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Decide whether a HuggingFace tensor is left out of the GGUF output.
	 * Returns 1 to skip the tensor, 0 to keep it. */
	static int should_skip_tensor(const char *hf_name)
	{
	    /* Skipped when the fragment occurs anywhere in the name:
	     *  - rotary embedding tables (computed at runtime, not stored)
	     *  - MTP (multi-token prediction) layers, not needed for inference */
	    static const char *const skip_fragments[] = {
	        "rotary_emb.inv_freq",
	        "rotary_emb.cos_cached",
	        "rotary_emb.sin_cached",
	        "model.language_model.mtp_",
	    };
	    /* Skipped when the name starts with the prefix:
	     *  - Qwen 3.6 vision encoder (all visual.* tensors) */
	    static const char *const skip_prefixes[] = {
	        "model.visual.",
	        "visual.",
	    };

	    size_t k;
	    for (k = 0; k < sizeof skip_fragments / sizeof skip_fragments[0]; ++k) {
	        if (strstr(hf_name, skip_fragments[k]))
	            return 1;
	    }
	    for (k = 0; k < sizeof skip_prefixes / sizeof skip_prefixes[0]; ++k) {
	        if (strncmp(hf_name, skip_prefixes[k], strlen(skip_prefixes[k])) == 0)
	            return 1;
	    }
	    return 0;
	}

	/* Translate a HuggingFace tensor name into its GGUF name.
	 *
	 * hf_name   - source tensor name (NUL-terminated).
	 * gguf_name - output buffer; always NUL-terminated on return when
	 *             buflen > 0 (names that do not fit are truncated).
	 * buflen    - size of gguf_name in bytes; nothing is written when <= 0.
	 *
	 * Mapping rules, first match wins:
	 *   1. exact top-level names (embeddings, final norm, lm_head);
	 *   2. "model.layers.N.<sub>" or "model.language_model.layers.N.<sub>":
	 *        a. known <sub>               → "blk.N.<mapped>"
	 *        b. "mlp.experts.E.<proj>"    → "blk.N.<mapped proj>.E"
	 *        c. anything else             → "blk.N.<sub>"
	 *   3. otherwise the name is copied unchanged. */
	static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
	{
	    typedef struct { const char *from; const char *to; } NameMap;

	    /* Scratch size for the sub-path; matches the %255s width used below. */
	    enum { SUBPATH_CAP = 256 };

	    /* Top-level mappings (common to all architectures) */
	    static const NameMap top_level[] = {
	        {"model.embed_tokens.weight", "token_embd.weight"},
	        {"model.language_model.embed_tokens.weight", "token_embd.weight"}, /* Qwen 3.6 */
	        {"model.norm.weight", "output_norm.weight"},
	        {"model.language_model.norm.weight", "output_norm.weight"}, /* Qwen 3.6 */
	        {"model.final_norm.weight", "output_norm.weight"}, /* Gemma */
	        {"lm_head.weight", "output.weight"},
	        {"model.embed_tokens.bias", "token_embd.bias"},
	        {"model.norm.bias", "output_norm.bias"},
	        {NULL, NULL}
	    };

	    /* Per-layer sub-path mappings */
	    static const NameMap layer_maps[] = {
	        /* Standard attention projections */
	        {"self_attn.q_proj.weight", "attn_q.weight"},
	        {"self_attn.k_proj.weight", "attn_k.weight"},
	        {"self_attn.v_proj.weight", "attn_v.weight"},
	        {"self_attn.o_proj.weight", "attn_output.weight"},
	        /* Attention biases */
	        {"self_attn.q_proj.bias", "attn_q.bias"},
	        {"self_attn.k_proj.bias", "attn_k.bias"},
	        {"self_attn.v_proj.bias", "attn_v.bias"},
	        {"self_attn.o_proj.bias", "attn_output.bias"},
	        /* Phi-3 fused QKV */
	        {"self_attn.qkv_proj.weight", "attn_qkv.weight"},
	        {"self_attn.qkv_proj.bias", "attn_qkv.bias"},
	        /* DeepSeek MLA */
	        {"self_attn.kv_a_proj_with_mqa.weight", "attn_kv_a_mqa.weight"},
	        {"self_attn.kv_b_proj.weight", "attn_kv_b.weight"},
	        /* Standard FFN (SwiGLU) */
	        {"mlp.gate_proj.weight", "ffn_gate.weight"},
	        {"mlp.up_proj.weight", "ffn_up.weight"},
	        {"mlp.down_proj.weight", "ffn_down.weight"},
	        /* FFN biases */
	        {"mlp.gate_proj.bias", "ffn_gate.bias"},
	        {"mlp.up_proj.bias", "ffn_up.bias"},
	        {"mlp.down_proj.bias", "ffn_down.bias"},
	        /* MoE gate */
	        {"mlp.gate.weight", "ffn_gate_inp.weight"},
	        /* MoE expert weights */
	        {"mlp.experts.gate_proj.weight", "ffn_gate_exps.weight"},
	        {"mlp.experts.up_proj.weight", "ffn_up_exps.weight"},
	        {"mlp.experts.down_proj.weight", "ffn_down_exps.weight"},
	        /* Norm layers */
	        {"input_layernorm.weight", "attn_norm.weight"},
	        {"post_attention_layernorm.weight", "ffn_norm.weight"},
	        {"input_layernorm.bias", "attn_norm.bias"},
	        {"post_attention_layernorm.bias", "ffn_norm.bias"},
	        /* Gemma pre/post feedforward norm */
	        {"pre_feedforward_layernorm.weight", "ffn_norm.weight"},
	        {"post_feedforward_layernorm.weight", "ffn_post_norm.weight"},
	        /* Qwen 3.6 full attention QK norms */
	        {"self_attn.q_norm.weight", "attn_q_norm.weight"},
	        {"self_attn.k_norm.weight", "attn_k_norm.weight"},
	        /* Qwen 3.6 DeltaNet (Gated Linear Attention) */
	        {"linear_attn.in_proj_qkv.weight", "ssm_in_qkv.weight"},
	        {"linear_attn.in_proj_z.weight", "ssm_in_z.weight"},
	        {"linear_attn.in_proj_a.weight", "ssm_in_a.weight"},
	        {"linear_attn.in_proj_b.weight", "ssm_in_b.weight"},
	        {"linear_attn.out_proj.weight", "ssm_out.weight"},
	        {"linear_attn.conv1d.weight", "ssm_conv1d.weight"},
	        {"linear_attn.norm.weight", "ssm_norm.weight"},
	        {"linear_attn.A_log", "ssm_a"},
	        {"linear_attn.dt_bias", "ssm_dt.bias"},
	        {NULL, NULL}
	    };

	    /* Per-expert projections: model.layers.N.mlp.experts.E.xxx */
	    static const NameMap expert_maps[] = {
	        {"gate_proj.weight", "ffn_gate_exp.weight"},
	        {"up_proj.weight", "ffn_up_exp.weight"},
	        {"down_proj.weight", "ffn_down_exp.weight"},
	        {NULL, NULL}
	    };

	    /* Layer prefixes. Their lengths come from sizeof so they can never drift
	     * from the literal: a hand-counted 27 for the 28-character Qwen prefix
	     * used to leave the cursor on the '.', which made the sscanf below fail
	     * and silently kept the unmapped HuggingFace name. */
	    static const char hf_layers[] = "model.layers.";
	    static const char qwen_layers[] = "model.language_model.layers.";

	    if (!hf_name || !gguf_name || buflen <= 0)
	        return;

	    /* Start with identity mapping (truncating, always terminated) */
	    snprintf(gguf_name, (size_t)buflen, "%s", hf_name);

	    for (int m = 0; top_level[m].from; m++) {
	        if (strcmp(hf_name, top_level[m].from) == 0) {
	            snprintf(gguf_name, (size_t)buflen, "%s", top_level[m].to);
	            return;
	        }
	    }

	    /* Layer mappings → "blk.N.xxx" */
	    const char *after_prefix = NULL;
	    if (strncmp(hf_name, hf_layers, sizeof(hf_layers) - 1) == 0)
	        after_prefix = hf_name + (sizeof(hf_layers) - 1);
	    else if (strncmp(hf_name, qwen_layers, sizeof(qwen_layers) - 1) == 0)
	        after_prefix = hf_name + (sizeof(qwen_layers) - 1);
	    if (!after_prefix)
	        return;

	    int layer_idx = 0;
	    char rest[SUBPATH_CAP];
	    if (sscanf(after_prefix, "%d.%255s", &layer_idx, rest) != 2)
	        return; /* not "<N>.<sub-path>": keep the identity mapping */

	    for (int m = 0; layer_maps[m].from; m++) {
	        if (strcmp(rest, layer_maps[m].from) == 0) {
	            snprintf(gguf_name, (size_t)buflen, "blk.%d.%s",
	                     layer_idx, layer_maps[m].to);
	            return;
	        }
	    }

	    /* MoE expert layer mapping: model.layers.N.mlp.experts.E.xxx */
	    int expert_idx = 0;
	    char expert_rest[SUBPATH_CAP];
	    if (sscanf(rest, "mlp.experts.%d.%255s", &expert_idx, expert_rest) == 2) {
	        for (int m = 0; expert_maps[m].from; m++) {
	            if (strcmp(expert_rest, expert_maps[m].from) == 0) {
	                snprintf(gguf_name, (size_t)buflen, "blk.%d.%s.%d",
	                         layer_idx, expert_maps[m].to, expert_idx);
	                return;
	            }
	        }
	    }

	    /* Fallback: keep original sub-path */
	    snprintf(gguf_name, (size_t)buflen, "blk.%d.%s", layer_idx, rest);
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* SHOULD THIS TENSOR BE QUANTIZED?
	*
	* Decision rules:
	* - Quantize: weight matrices (2D, large)
	* - Keep F32: norms, biases, embeddings, 1D tensors
	* ═══════════════════════════════════════════════════════════════════════════ */

	static int should_quantize(const STTensorInfo ti, const char gguf_name)
	{
	/* Never quantize 1D tensors (norms, biases) */
	if (ti->n_dims < 2) return 0;

	/* Never quantize embedding tables (row dimension = vocab) */
	if (strstr(gguf_name, "token_embd") != NULL) return 0;

	/* Never quantize LM head output — use exact match, not substring,
	* to avoid matching "attn_output.weight" */
	if (strcmp(gguf_name, "output.weight") == 0) return 0;

	/* Never quantize norm weights */
	if (strstr(gguf_name, "norm") != NULL) return 0;

	/* Never quantize bias tensors */
	if (strstr(gguf_name, ".bias") != NULL) return 0;

	/* Never quantize MoE gate routing weights */
	if (strstr(gguf_name, "ffn_gate_inp") != NULL) return 0;

	/* Never quantize DeltaNet state-space parameters (1D or small) */
	if (strstr(gguf_name, "ssm_a") != NULL) return 0; /* A_log */
	if (strstr(gguf_name, "ssm_dt") != NULL) return 0; /* dt_bias */
	if (strstr(gguf_name, "ssm_conv1d") != NULL) return 0; /* conv kernel */

	/* Quantize everything else (attention projections, FFN weights, SSM projections) */
	return 1;
	}

	/* Report whether a tensor is an attention-class projection (Q/K/V/O, fused
	 * QKV, or one of the DeltaNet SSM projections treated the same way).
	 * These are the tensors most sensitive to quantization: attention-score
	 * errors cascade through the entire sequence, so they are promoted to Q4_0
	 * (~4.5bpw).
	 *
	 * Returns 1 when `gguf_name` contains any known projection name, else 0. */
	static int is_attention_tensor(const char *gguf_name)
	{
	    static const char *const projection_names[] = {
	        /* Gemma / LLaMA style GGUF names: blk.N.attn_q/k/v/output.weight */
	        "attn_q.weight",
	        "attn_k.weight",
	        "attn_v.weight",
	        "attn_output.weight",
	        "attn_qkv.weight",
	        /* Qwen 3.6 DeltaNet SSM projections — attention-class (Q4_0) */
	        "ssm_in_qkv.weight",
	        "ssm_in_z.weight",
	        "ssm_out.weight",
	        /* HuggingFace style (names that fell through unmapped) */
	        "self_attn.q_proj.weight",
	        "self_attn.k_proj.weight",
	        "self_attn.v_proj.weight",
	        "self_attn.o_proj.weight",
	    };
	    const size_t n_names = sizeof projection_names / sizeof projection_names[0];

	    size_t k = 0;
	    while (k < n_names) {
	        if (strstr(gguf_name, projection_names[k]))
	            return 1;
	        ++k;
	    }
	    return 0;
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* HPC SENSITIVITY GRAPH BUILDER
	*
	* Creates an HPCGraph where each node represents a weight block.
	* For Q2_K: 256-weight superblocks.
	*
	* The 6 values per site correspond to 6 candidate scale factors:
	* v=0: scale * 0.85 (aggressive, high compression)
	* v=1: scale * 0.90
	* v=2: scale * 0.95
	* v=3: scale * 1.00 (standard)
	* v=4: scale * 1.05
	* v=5: scale * 1.10 (conservative, less compression error)
	*
	* BP propagates: "if your neighbor block is sensitive, you should be
	* conservative too" — creating coherent precision allocation.
	* ═══════════════════════════════════════════════════════════════════════════ */


	/* ── Multi-quhit expanded scale table ──
	* Search grid: 24×24 = 576 (d, dmin) candidates
	* Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
	* Beam search: operates on all 576 candidates directly
	*
	* TOTAL_SCALE_CANDIDATES is the product of the two axis sizes, so changing
	* either N_CAND_* value resizes every table declared with it. */
	#define QUHITS_PER_BLOCK 2 /* graph sites allocated per weight block */
	#define N_CAND_D 24 /* d multiplier candidates (expanded) */
	#define N_CAND_M 24 /* dmin multiplier candidates (expanded) */
	#define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M) /* 24 * 24 = 576 */

	/* ════════════════════════════════════════════════════════════════════════
	* EXPERIMENTAL / CURRENTLY-UNUSED CODE PATHS
	*
	* Nothing in the live pipeline calls the legacy BP sensitivity graph
	* (build_sensitivity_graph + compute_block_error_q2k + SCALE_TABLE) or the
	* llm-compressor MSE grid search (mse_grid_search_q2k_subblock); the Shor /
	* Viterbi path superseded them. They are preserved behind this flag instead
	* of silently shipping as dead code that still costs an init pass.
	* ════════════════════════════════════════════════════════════════════════ */
	#ifdef HEXSTATE_ENABLE_EXPERIMENTAL

	#define SCALE_FACTOR_COUNT 6
	/* Six hand-picked multipliers around 1.0. */
	static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
	    0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f
	};

	/* Lazily-filled table of candidate scale multipliers; see init_scale_table(). */
	static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
	static int scale_table_initialized = 0;

	/* Fill SCALE_TABLE once with evenly spaced multipliers running from 0.50
	 * (first entry) to 1.50 (last entry). Later calls return immediately. */
	static void init_scale_table(void)
	{
	    if (!scale_table_initialized) {
	        /* Distance between neighbouring candidates. */
	        const float step = 1.00f / (float)(TOTAL_SCALE_CANDIDATES - 1);

	        for (int slot = 0; slot < TOTAL_SCALE_CANDIDATES; ++slot)
	            SCALE_TABLE[slot] = 0.50f + (float)slot * step;

	        scale_table_initialized = 1;
	    }
	}
	#endif /* HEXSTATE_ENABLE_EXPERIMENTAL */

	/* ═══════════════════════════════════════════════════════════════════════════
	* THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles
	*
	* The sub-block Shor measurement uses a 16-node linear-chain graph that
	* is identical in topology every time. Instead of hpc_create()/hpc_destroy()
	* inside the OMP hot loop, we reset the same graph to a clean state.
	*
	* This function resets an existing HPCGraph with n_sites nodes to its
	* initial state: clears all edges, resets adjacency lists, reinitializes
	* locals. Zero allocations.
	* ═══════════════════════════════════════════════════════════════════════════ */
	/* Return an already-allocated graph to a clean state without allocating.
	 *
	 * g       - graph to reset in place.
	 * n_sites - number of sites whose adjacency count and local state are reset.
	 *
	 * Adjacency buffers stay allocated; only their counts are zeroed. */
	static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
	{
	    uint64_t site;

	    /* Edge bookkeeping */
	    g->n_edges = 0;
	    g->cz_edges = 0;
	    g->phase_edges = 0;
	    g->syntheme_edges = 0;
	    g->n_log = 0;

	    /* Fidelity statistics start at their ideal value */
	    g->min_fidelity = 1.0;
	    g->avg_fidelity = 1.0;

	    /* Evaluation / measurement counters */
	    g->amp_evals = 0;
	    g->prob_evals = 0;
	    g->measurements = 0;

	    /* Pass 1: drop every adjacency list by zeroing its length */
	    for (site = 0; site != n_sites; ++site)
	        g->adj[site].count = 0;

	    /* Pass 2: put each local quhit back into its initial state */
	    for (site = 0; site != n_sites; ++site)
	        triality_init(&g->locals[site]);
	}

	#ifdef HEXSTATE_ENABLE_EXPERIMENTAL
	/* ═══════════════════════════════════════════════════════════════════════════
	* FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
	*
	* powf() costs ~50-100 cycles. Use log2f+exp2f (~25 cycles) for the
	* exact x^2.4 = x^2 × 2^(0.4·log2(x)) computation instead.
	* ═══════════════════════════════════════════════════════════════════════════ */
	static inline float fast_pow_2_4(float x)
	{
	/* x^2.4 = x^2 × 2^(0.4 × log2(x)). log2f+exp2f ≈ 25 cycles total vs
	* 50-100 for powf, and produces the exact ^2.4 norm the grid search needs. */
	float x2 = x * x;
	return x2 * exp2f(0.4f * log2f(x)); /* x^2 × x^0.4 = x^2.4 */
	}

	/* Weighted squared reconstruction error of one block when it is quantized
	 * to four levels (0..3) over its own value range scaled by `scale_mult`.
	 *
	 * weights    - block_size values to quantize.
	 * scale_mult - multiplier applied to both the range and the (non-positive)
	 *              minimum.
	 * importance - optional per-element error weights; NULL means weight 1.0.
	 * imp_offset - index into `importance` of this block's first element.
	 *
	 * Returns the summed (weighted) squared error, or 0 when the scaled range
	 * is smaller than 1e-15. */
	static float compute_block_error_q2k(const float *weights, int block_size,
	                                     float scale_mult,
	                                     const float *importance, int imp_offset)
	{
	    /* Value range of the block; the minimum is clamped so it is never > 0. */
	    float lo = weights[0];
	    float hi = weights[0];
	    for (int k = 1; k < block_size; ++k) {
	        const float v = weights[k];
	        if (v < lo) lo = v;
	        if (v > hi) hi = v;
	    }
	    if (lo > 0) lo = 0;

	    const float range = (hi - lo) * scale_mult;
	    if (range < 1e-15f)
	        return 0.0f;

	    const float inv_range = 3.0f / range; /* maps the range onto levels 0..3 */
	    const float base = lo * scale_mult;   /* value represented by level 0 */

	    float total = 0.0f;
	    for (int k = 0; k < block_size; ++k) {
	        const float x = weights[k];

	        /* Round to the nearest level and clamp into [0, 3]. */
	        int level = (int)((x - base) * inv_range + 0.5f);
	        if (level < 0)
	            level = 0;
	        else if (level > 3)
	            level = 3;

	        const float deq = base + (float)level * range / 3.0f;
	        const float diff = x - deq;
	        const float w = importance ? importance[imp_offset + k] : 1.0f;
	        total += diff * diff * w;
	    }
	    return total;
	}

	/* Scale six real amplitudes to unit L2 norm. Vectors whose squared norm is
	 * not above 1e-30 are left untouched. */
	static void sensitivity_normalize6(double amp[6])
	{
	    double norm = 0.0;
	    for (int v = 0; v < 6; v++)
	        norm += amp[v] * amp[v];
	    if (norm > 1e-30) {
	        double inv = 1.0 / sqrt(norm);
	        for (int v = 0; v < 6; v++)
	            amp[v] *= inv;
	    }
	}

	/* Store six real amplitudes (imaginary parts zero) as the edge-view state of
	 * graph site `site`, mark the other views dirty and refresh the site mask. */
	static void sensitivity_write_site(HPCGraph *graph, int64_t site, const double amp[6])
	{
	    for (int v = 0; v < 6; v++) {
	        graph->locals[site].edge_re[v] = amp[v];
	        graph->locals[site].edge_im[v] = 0.0;
	    }
	    graph->locals[site].primary = VIEW_EDGE;
	    graph->locals[site].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
	    graph->locals[site].delta_valid = 0;
	    triality_update_mask(&graph->locals[site]);
	}

	/* Build multi-quhit HPC sensitivity graph.
	 * 2 quhits per block → 576 scale candidates per block.
	 *
	 * Graph layout: sites [0..2*n-1] where:
	 *   site 2*i     = coarse quhit for block i
	 *   site 2*i + 1 = fine quhit for block i
	 *
	 * Edges:
	 *   Intra-block: CZ(2i, 2i+1)        — coarse↔fine coupling
	 *   Inter-block: CZ(2i, 2(i+1))      — coarse↔coarse neighbor
	 *                CZ(2i+1, 2(i+1)+1)  — fine↔fine neighbor
	 *
	 * weights     - tensor values, n_elements long.
	 * n_elements  - number of values in `weights`.
	 * block_size  - values per block (must be > 0).
	 * temperature - divisor in the amplitude exp(-(err - min_err) / (2T)).
	 * importance  - optional per-element error weights (may be NULL).
	 *
	 * At most 8192 blocks are represented; larger tensors are sampled with a
	 * fixed stride of n_blocks / 8192.
	 *
	 * Returns a new graph from hpc_create() owned by the caller, or NULL when
	 * the input is unusable, there are fewer than 2 blocks, or creation fails.
	 *
	 * NOTE(review): all TOTAL_SCALE_CANDIDATES errors are evaluated, but the
	 * 6x6 marginalisation below only reads indices 0..35 (v0*6 + v1). The
	 * header comment mentions binning 24 → 6; confirm whether that binning is
	 * missing here. */
	static HPCGraph *build_sensitivity_graph(const float *weights,
	                                         int64_t n_elements,
	                                         int block_size,
	                                         float temperature,
	                                         const float *importance)
	{
	    /* Guard the division below and the pointer arithmetic on `weights`. */
	    if (!weights || block_size <= 0) return NULL;

	    int64_t n_blocks = n_elements / block_size;
	    if (n_blocks < 2) return NULL;

	    init_scale_table();

	    int64_t graph_blocks = (n_blocks > 8192) ? 8192 : n_blocks;
	    int64_t stride = n_blocks / graph_blocks;
	    int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK;

	    HPCGraph *graph = hpc_create(n_sites);
	    if (!graph) return NULL;

	    for (int64_t i = 0; i < n_sites; i++)
	        triality_dft(&graph->locals[i]);

	    /* Compute errors for all candidates per block,
	     * then project onto coarse (quhit 0) and fine (quhit 1) marginals */
	    for (int64_t i = 0; i < graph_blocks; i++) {
	        int64_t block_idx = i * stride;
	        const float *block_weights = weights + block_idx * block_size;

	        /* Evaluate all candidates */
	        float errors[TOTAL_SCALE_CANDIDATES];
	        float min_err = 1e30f;
	        for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
	            errors[c] = compute_block_error_q2k(block_weights, block_size,
	                                                SCALE_TABLE[c],
	                                                importance,
	                                                (int)(block_idx * block_size));
	            if (errors[c] < min_err) min_err = errors[c];
	        }

	        /* Marginals of the Boltzmann-like weight over the 6x6 index grid:
	         *   amp_coarse[v0] = Σ_{v1} exp(-error(v0*6+v1) / 2T)
	         *   amp_fine[v1]   = Σ_{v0} exp(-error(v0*6+v1) / 2T)
	         * (errors are shifted by min_err so the best candidate maps to 1). */
	        double coarse_re[6] = {0.0};
	        double fine_re[6] = {0.0};
	        for (int v0 = 0; v0 < 6; v0++) {
	            for (int v1 = 0; v1 < 6; v1++) {
	                int idx = v0 * 6 + v1;
	                double a = exp(-(double)(errors[idx] - min_err) /
	                               (2.0 * (double)temperature));
	                coarse_re[v0] += a;
	                fine_re[v1] += a;
	            }
	        }
	        sensitivity_normalize6(coarse_re);
	        sensitivity_normalize6(fine_re);

	        /* Write coarse quhit (site 2i) and fine quhit (site 2i + 1) */
	        sensitivity_write_site(graph, 2 * i, coarse_re);
	        sensitivity_write_site(graph, 2 * i + 1, fine_re);
	    }

	    /* ── Build edges ── */
	    for (int64_t i = 0; i < graph_blocks; i++) {
	        /* Intra-block: coarse ↔ fine coupling */
	        hpc_cz(graph, 2 * i, 2 * i + 1);

	        /* Inter-block: neighbor coupling */
	        if (i + 1 < graph_blocks) {
	            hpc_cz(graph, 2 * i, 2 * (i + 1)); /* coarse ↔ coarse */
	            hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); /* fine ↔ fine */
	        }
	    }

	    return graph;
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* MSE GRID SEARCH (ported from llm-compressor observers/mse.py)
	*
	* For a Q2_K sub-block, progressively shrink the min/max range to find
	* the candidate that minimizes weighted reconstruction error.
	*
	* for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] down to (1 - maxshrink):
	* candidate_min = p * min
	* candidate_max = p * max
	* error = ||x - quantize(x, candidate_min, candidate_max)||^norm
	* if error < best: update best
	* else: patience--; if patience == 0: break
	*
	* This is a direct C port of llm-compressor's _grid_search_mse.
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Tunables of the shrinking-range MSE grid search. */
	typedef struct {
	    /* Maximum shrink factor (0.0 to 1.0). */
	    float maxshrink;
	    /* Number of grid divisions; one search step shrinks the range by 1/grid. */
	    int grid;
	    /* Early-stopping patience. */
	    int patience;
	    /* Error norm exponent (2.0 = MSE; the default below uses 2.4). */
	    float norm;
	} MSEGridConfig;

	/* Default search parameters, listed in field order:
	 * maxshrink, grid, patience, norm. */
	static const MSEGridConfig MSE_DEFAULT_CONFIG = { 0.20f, 200, 8, 2.4f };

	/* Grid search for the (scale, min) pair of one Q2_K sub-block.
	 *
	 * The value range [min, max] of x (min clamped to <= 0) is shrunk by the
	 * factor p = 1 - step/grid for step = 0 .. maxshrink*grid. Every candidate
	 * range is quantized to the levels 0..nmax and scored with
	 *     err = sum_i importance[i] * |x[i] - deq[i]|^norm.
	 * The search stops after cfg->patience consecutive steps that do not
	 * improve on the best error. The winner is then polished by up to five
	 * alternating least-squares updates of scale and min; that polish is
	 * unweighted (it does not read `importance`).
	 *
	 * x          : n input weights (n >= 1).
	 * n          : number of weights in the sub-block.
	 * nmax       : largest quantization level.
	 * L          : out, n quantized levels.
	 * out_min    : out, the NEGATED minimum (-min) of the chosen range.
	 * importance : per-element error weights; may be NULL for uniform weights.
	 * cfg        : grid-search parameters.
	 *
	 * Returns the chosen scale. When all values are equal and <= 0 the scale
	 * is 0.0f, every level is 0 and the block is represented by -(*out_min). */
	static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
	                                          uint8_t *L, float *out_min,
	                                          const float *importance,
	                                          const MSEGridConfig *cfg)
	{
	    float min_val = x[0], max_val = x[0];
	    for (int i = 1; i < n; i++) {
	        if (x[i] < min_val) min_val = x[i];
	        if (x[i] > max_val) max_val = x[i];
	    }

	    /* Clamp BEFORE the degenerate-range test (same order as
	     * hpc_make_qkx2_quants). Testing first made a constant positive block
	     * return scale 0 together with a negative "min", which breaks the
	     * min <= 0 convention; with the clamp first such a block gets the
	     * range [0, c] and is searched normally. */
	    if (min_val > 0) min_val = 0;
	    if (max_val == min_val) {
	        for (int i = 0; i < n; i++) L[i] = 0;
	        *out_min = -min_val;
	        return 0.0f;
	    }

	    float best_scale = 0.0f;
	    float best_min = -min_val;
	    float best_error = 1e30f;
	    int no_improve = 0;

	    /* A non-positive grid would turn the shrink factor into NaN/inf. */
	    const int grid = (cfg->grid > 0) ? cfg->grid : 1;
	    int shrink_steps = (int)(cfg->maxshrink * grid);
	    if (shrink_steps < 1) shrink_steps = 1;

	    for (int step = 0; step <= shrink_steps; step++) {
	        float p = 1.0f - (float)step / (float)grid;

	        float cand_min = p * min_val;
	        float cand_max = p * max_val;

	        if (cand_max <= cand_min) continue;

	        float iscale = (float)nmax / (cand_max - cand_min);
	        float scale = 1.0f / iscale;

	        /* Score the candidate range. Levels are not stored here; they are
	         * recomputed below only when the candidate wins, so no fixed-size
	         * scratch buffer (and no limit on n) is needed. */
	        float err = 0.0f;
	        for (int i = 0; i < n; i++) {
	            int l = gguf_nearest_int(iscale * (x[i] - cand_min));
	            if (l < 0) l = 0;
	            if (l > nmax) l = nmax;

	            float deq = cand_min + scale * (float)l;
	            float diff = fabsf(x[i] - deq);
	            /* Apply error norm — fast path for default norm=2.4 */
	            float e = diff;
	            if (cfg->norm == 2.4f) {
	                e = fast_pow_2_4(diff);
	            } else if (cfg->norm != 1.0f) {
	                e = powf(diff, cfg->norm);
	            }
	            /* Apply importance weighting */
	            if (importance) e *= importance[i];
	            err += e;
	        }

	        if (err < best_error) {
	            best_error = err;
	            best_scale = scale;
	            best_min = -cand_min;
	            for (int i = 0; i < n; i++) {
	                int l = gguf_nearest_int(iscale * (x[i] - cand_min));
	                if (l < 0) l = 0;
	                if (l > nmax) l = nmax;
	                L[i] = (uint8_t)l;
	            }
	            no_improve = 0;
	        } else {
	            no_improve++;
	            if (no_improve >= cfg->patience) break;
	        }
	    }

	    /* Iterative refinement on the best candidate (from ggml) */
	    float cur_min = -best_min;
	    float cur_scale = best_scale;
	    if (cur_scale > 1e-15f) {
	        float iscale = 1.0f / cur_scale;
	        for (int itry = 0; itry < 5; itry++) {
	            float sumlx = 0;
	            int suml2 = 0;
	            for (int i = 0; i < n; i++) {
	                int l = gguf_nearest_int(iscale * (x[i] - cur_min));
	                if (l < 0) l = 0;
	                if (l > nmax) l = nmax;
	                L[i] = (uint8_t)l;
	                sumlx += (x[i] - cur_min) * l;
	                suml2 += l * l;
	            }
	            /* Least-squares scale for the current levels and min. */
	            if (suml2 > 0) cur_scale = sumlx / suml2;
	            float sum = 0;
	            for (int i = 0; i < n; i++)
	                sum += x[i] - cur_scale * L[i];
	            /* True coordinate-descent optimal: min* = sum/n (no momentum).
	             * Clamp to <= 0 since min must be non-positive by convention. */
	            cur_min = fminf(0.0f, sum / n);
	            if (cur_scale > 1e-15f) iscale = 1.0f / cur_scale;
	        }
	    }

	    *out_min = -cur_min;
	    return cur_scale;
	}
	#endif /* HEXSTATE_ENABLE_EXPERIMENTAL */

	/* ═══════════════════════════════════════════════════════════════════════════
	* HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT
	*
	* Two-phase approach:
	* Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants)
	* This produces per-sub-block (scale, min) with 16-step search.
	* Phase B: HPC BP refines the superblock-level d/dmin rounding.
	* 6 candidate (d, dmin) pairs are tested; BP finds the one
	* where the GLOBAL reconstruction error is minimized via
	* constructive interference of per-sub-block phase coherence.
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Weighted least-squares quantization for a sub-block (ggml make_qkx2_quants).
	 * Finds (scale, min) by trying 16 candidate inverse scales and solving the
	 * weighted least-squares problem for each; candidates are ranked by the
	 * weighted mean ABSOLUTE deviation sum_i w[i] * |scale*l + min - x[i]|.
	 *
	 * n       : number of weights in the sub-block (n >= 1).
	 * nmax    : largest quantization level.
	 * x       : n input weights.
	 * w       : n per-element error weights.
	 * L       : out, n quantized levels of the best candidate.
	 * the_min : out, set to the NEGATIVE of the selected min.
	 * Laux    : scratch buffer of n bytes for the candidate under test.
	 *
	 * Returns the selected scale; 0.0f (with all levels 0) when every value
	 * equals the clamped minimum. */
	static float hpc_make_qkx2_quants(int n, int nmax, const float *x,
	                                  const float *w, uint8_t *L,
	                                  float *the_min, uint8_t *Laux)
	{
	    float xmin = x[0], xmax = x[0];
	    float sum_w = w[0], sum_x = w[0] * x[0];
	    for (int i = 1; i < n; i++) {
	        if (x[i] < xmin) xmin = x[i];
	        if (x[i] > xmax) xmax = x[i];
	        sum_w += w[i];
	        sum_x += w[i] * x[i];
	    }
	    if (xmin > 0) xmin = 0;
	    if (xmax == xmin) {
	        for (int i = 0; i < n; i++) L[i] = 0;
	        *the_min = -xmin;
	        return 0.0f;
	    }

	    /* Baseline: plain min/max quantization, scored with the same metric. */
	    float iscale = (float)nmax / (xmax - xmin);
	    float scale = 1.0f / iscale;
	    float best_mad = 0;
	    for (int i = 0; i < n; i++) {
	        int l = gguf_nearest_int(iscale * (x[i] - xmin));
	        if (l < 0) l = 0;
	        if (l > nmax) l = nmax;
	        L[i] = (uint8_t)l;
	        float diff = scale * (float)l + xmin - x[i];
	        best_mad += w[i] * fabsf(diff);
	    }

	    /* 16 candidate iscale values: the numerator sweeps
	     * nmax - 0.5 .. nmax - 0.5 + 0.1*15 in steps of 0.1.
	     * Note: xmin is overwritten whenever a candidate is accepted, so later
	     * candidates are generated relative to the accepted min. */
	    for (int is = 0; is <= 15; is++) {
	        float try_iscale = (-0.5f + 0.1f * (float)is + (float)nmax) / (xmax - xmin);
	        float sl = 0, sl2 = 0, sxl = 0;
	        for (int i = 0; i < n; i++) {
	            int l = gguf_nearest_int(try_iscale * (x[i] - xmin));
	            if (l < 0) l = 0;
	            if (l > nmax) l = nmax;
	            Laux[i] = (uint8_t)l;
	            sl += w[i] * (float)l;
	            sl2 += w[i] * (float)(l * l);
	            sxl += w[i] * (float)l * x[i];
	        }
	        /* Determinant of the 2x2 weighted normal equations for
	         * x ~ scale*l + min; candidates with det <= 0 are skipped. */
	        float det = sum_w * sl2 - sl * sl;
	        if (det > 0) {
	            float this_scale = (sum_w * sxl - sum_x * sl) / det;
	            float this_min = (sl2 * sum_x - sl * sxl) / det;
	            if (this_min > 0) {
	                /* min may not be positive: pin it to 0 and refit scale only. */
	                this_min = 0;
	                this_scale = sxl / sl2;
	            }
	            float mad = 0;
	            for (int i = 0; i < n; i++) {
	                float diff = this_scale * (float)Laux[i] + this_min - x[i];
	                mad += w[i] * fabsf(diff);
	            }
	            if (mad < best_mad) {
	                for (int i = 0; i < n; i++) L[i] = Laux[i];
	                best_mad = mad;
	                scale = this_scale;
	                xmin = this_min;
	            }
	        }
	    }
	    *the_min = -xmin;
	    return scale;
	}

	/* Quantize the scale/min arrays into 4-bit values: make_qp_quants equivalent.
	 *
	 * n    : number of values.
	 * nmax : largest quantization level.
	 * x    : n values to quantize; negative products are clamped to level 0.
	 * L    : out, n quantized levels.
	 * sw   : n per-value weights of the squared-error objective.
	 *
	 * Returns the optimal d such that x[j] ≈ d × L[j]; 0.0f when the largest
	 * value is below 1e-15 (all levels are then 0). */
	static float hpc_make_qp_quants(int n, int nmax, const float *x,
	                                uint8_t *L, const float *sw)
	{
	    float xmax = 0;
	    for (int i = 0; i < n; i++)
	        if (x[i] > xmax) xmax = x[i];
	    if (xmax < 1e-15f) {
	        for (int i = 0; i < n; i++) L[i] = 0;
	        return 0.0f;
	    }

	    /* Baseline: map the maximum onto nmax. */
	    float iscale = (float)nmax / xmax;
	    for (int i = 0; i < n; i++) {
	        int l = gguf_nearest_int(iscale * x[i]);
	        if (l < 0) l = 0;
	        if (l > nmax) l = nmax;
	        L[i] = (uint8_t)l;
	    }
	    float scale = 1.0f / iscale;
	    float best_mse = 0;
	    for (int i = 0; i < n; i++) {
	        float diff = x[i] - scale * (float)L[i];
	        best_mse += sw[i] * diff * diff;
	    }

	    /* Try eight perturbed inverse scales (nmax ± 0.1 .. 0.4) / xmax. */
	    for (int is = -4; is <= 4; is++) {
	        if (is == 0) continue;
	        float iscale_is = (0.1f * (float)is + (float)nmax) / xmax;
	        float scale_is = 1.0f / iscale_is;
	        float mse = 0;
	        for (int i = 0; i < n; i++) {
	            int l = gguf_nearest_int(iscale_is * x[i]);
	            if (l < 0) l = 0;
	            if (l > nmax) l = nmax;
	            float diff = x[i] - scale_is * (float)l;
	            mse += sw[i] * diff * diff;
	        }
	        if (mse < best_mse) {
	            best_mse = mse;
	            iscale = iscale_is;
	        }
	    }

	    /* Recompute with best iscale + iterative refinement */
	    float sumlx = 0, suml2 = 0;
	    for (int i = 0; i < n; i++) {
	        int l = gguf_nearest_int(iscale * x[i]);
	        if (l < 0) l = 0;
	        if (l > nmax) l = nmax;
	        L[i] = (uint8_t)l;
	        sumlx += sw[i] * x[i] * (float)l;
	        suml2 += sw[i] * (float)(l * l);
	    }

	    /* Iterative greedy refinement: re-round one level at a time against the
	     * scale implied by all the others, and keep the change only when it
	     * raises sumlx² / suml2. */
	    for (int itry = 0; itry < 5; itry++) {
	        int n_changed = 0;
	        for (int i = 0; i < n; i++) {
	            float wi = sw[i];
	            /* Sums with element i removed. */
	            float slx = sumlx - wi * x[i] * (float)L[i];
	            float sl2 = suml2 - wi * (float)(L[i] * L[i]);
	            if (slx > 0 && sl2 > 0) {
	                int new_l = gguf_nearest_int(x[i] * sl2 / slx);
	                if (new_l < 0) new_l = 0;
	                if (new_l > nmax) new_l = nmax;
	                if (new_l != L[i]) {
	                    slx += wi * x[i] * (float)new_l;
	                    sl2 += wi * (float)(new_l * new_l);
	                    if (slx * slx * suml2 > sumlx * sumlx * sl2) {
	                        L[i] = (uint8_t)new_l;
	                        sumlx = slx;
	                        suml2 = sl2;
	                        n_changed++;
	                    }
	                }
	            }
	        }
	        if (!n_changed) break;
	    }
	    return suml2 > 0 ? sumlx / suml2 : 0.0f;
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* SHOR'S GRIFFITHS-NIU SEQUENTIAL MEASUREMENT FOR RMSE OPTIMIZATION
	* (Ported 1:1 from tesseract_factor.c — replaces BP)
	*
	* Instead of iterative message-passing (BP), this uses the EXACT sequential
	* measurement protocol from Shor's algorithm:
	*
	* For each block k (MSB → LSB):
	* 1. Compute feed-forward phase correction from previously measured blocks
	* 2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w)
	* 3. Bake C_k into locals: α(d) *= C_k(d)
	* 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
	* 5. Apply IDFT6 in-place: interference creates peaks at optimal scales
	* 6. Born rule measurement → select optimal scale candidate
	* 7. Collapse site + absorb edge weights into neighbors (back-action)
	*
	* This IS the quantum Fourier transform that creates constructive
	* interference at the optimal RMSE configuration, exactly as Shor's
	* algorithm creates interference at the correct period.
	*
	* Domain mapping:
	* Factoring: oracle phase 2π×d×c_k/N → period r
	* Quantize: error Boltzmann amplitudes → optimal RMSE block
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* ω₆ roots of unity for CZ phase lookup come from hpc_graph.h
	* (HPC_W6_RE / HPC_W6_IM) — the file-local duplicates were unused. */
	/* Normalization factor applied to each output of the 6-point inverse DFT
	 * in the sequential-measurement step. */
	static const double INV_SQRT6 = 0.40824829046386301637; /* 1/√6 */

	/* ── Collapse + Back-Action core (ported from tesseract_factor.c) ──
	 * After sampling an outcome, collapse the target site to |outcome⟩,
	 * absorb all edge weights into neighbor local states (Magic Pointer
	 * disentanglement), and remove dead edges from the graph.
	 *
	 * This is the EXACT same back-action protocol used in Shor's algorithm
	 * for the semi-classical QFT: measurement of one site conditions all
	 * remaining sites through the CZ phase correlations.
	 *
	 * graph       : graph to update in place.
	 * target_site : index of the site being collapsed.
	 * outcome     : measured basis state (0..5) of that site.
	 *
	 * On return the target has an empty adjacency list, every edge that
	 * touched it has fidelity -1.0 and is gone from its partner's adjacency
	 * list, and each former partner's amplitudes carry the edge phase. */
	static void shor_collapse_site(HPCGraph *graph, int target_site, int outcome)
	{
	    /* Step 1: Collapse local state to |outcome⟩ */
	    for (int v = 0; v < 6; v++) {
	        graph->locals[target_site].edge_re[v] = (v == outcome) ? 1.0 : 0.0;
	        graph->locals[target_site].edge_im[v] = 0.0;
	    }
	    graph->locals[target_site].primary = VIEW_EDGE;
	    graph->locals[target_site].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
	    graph->locals[target_site].delta_valid = 0;

	    /* Step 2: Absorb edge weights into neighbor states (back-action).
	     * For each edge (target, neighbor), the weight w(outcome, d) for each
	     * neighbor basis state d gets multiplied into the neighbor's amplitude.
	     * This is the Magic Pointer disentanglement from tesseract_factor.c. */
	    HPCAdjList *adj = &graph->adj[target_site];
	    for (uint64_t ei = 0; ei < adj->count; ei++) {
	        uint64_t eid = adj->edge_ids[ei];
	        HPCEdge *edge = &graph->edges[eid];
	        uint64_t partner = (edge->site_a == (uint64_t)target_site) ?
	                           edge->site_b : edge->site_a;

	        TrialityQuhit *pq = &graph->locals[partner];
	        for (int d = 0; d < 6; d++) {
	            double w_re, w_im;
	            if (edge->type == HPC_EDGE_CZ) {
	                /* CZ edge: phase ω₆^(outcome·d) from the lookup table. */
	                int pidx = (outcome * d) % 6;
	                w_re = HPC_W6_RE[pidx];
	                w_im = HPC_W6_IM[pidx];
	            } else {
	                /* Weighted phase edge: the weight matrix is indexed
	                 * [site_a state][site_b state]. */
	                if (edge->site_a == (uint64_t)target_site) {
	                    w_re = edge->w_re[outcome][d];
	                    w_im = edge->w_im[outcome][d];
	                } else {
	                    w_re = edge->w_re[d][outcome];
	                    w_im = edge->w_im[d][outcome];
	                }
	            }
	            /* Complex multiply: amplitude(d) *= w. */
	            double old_re = pq->edge_re[d], old_im = pq->edge_im[d];
	            pq->edge_re[d] = old_re * w_re - old_im * w_im;
	            pq->edge_im[d] = old_re * w_im + old_im * w_re;
	        }
	        pq->dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
	        pq->delta_valid = 0;
	    }

	    /* Step 3: Remove edges touching this site from the graph.
	     * Mark by setting fidelity to -1 and remove from adj lists. */
	    for (uint64_t ei = 0; ei < adj->count; ei++) {
	        uint64_t eid = adj->edge_ids[ei];
	        HPCEdge *edge = &graph->edges[eid];
	        uint64_t partner = (edge->site_a == (uint64_t)target_site) ?
	                           edge->site_b : edge->site_a;

	        /* Remove this edge from partner's adj list (swap with last). */
	        HPCAdjList *padj = &graph->adj[partner];
	        for (uint64_t pi = 0; pi < padj->count; pi++) {
	            if (padj->edge_ids[pi] == eid) {
	                padj->edge_ids[pi] = padj->edge_ids[--padj->count];
	                break;
	            }
	        }
	        edge->fidelity = -1.0; /* Mark as dead */
	    }
	    adj->count = 0; /* Clear target's adj list */
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* SHOR SEQUENTIAL MEASUREMENT — Griffiths-Niu Protocol for Quantization
	*
	* Ported 1:1 from tesseract_factor.c lines 2343-2500.
	*
	* Measures sites MSB→LSB. For each site k:
	* 1. Compute feed-forward phase correction θ_k from previously measured sites
	* 2. Compute neighbor contribution C_k(d) analytically
	* 3. Bake C_k into locals
	* 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
	* 5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6}
	* 6. Compute |β(v)|² as measurement probabilities
	* 7. Sample/argmax → outcome
	* 8. Collapse + back-action via shor_collapse_site()
	*
	* Returns: marginals are written into marg_out[n_sites][6].
	* measured_out[n_sites] receives the measurement outcomes.
	* ═══════════════════════════════════════════════════════════════════════════ */
	/* Sequentially measure every site of `graph`, from the last site to the
	 * first.
	 *
	 * graph         : graph whose local amplitudes and edges are consumed; each
	 *                 site is collapsed after it has been measured.
	 * n_sites       : number of sites to measure (sites 0 .. n_sites-1).
	 * marg_out      : out, marg_out[k][v] = normalized probability of outcome v
	 *                 at site k.
	 * measured_out  : out, measured_out[k] = selected outcome (0..5) at site k.
	 * deterministic : non-zero selects the argmax outcome. Zero samples the
	 *                 outcome from the probabilities using a function-static
	 *                 LCG state, so that path is not reentrant. */
	static void shor_measure_graph(HPCGraph *graph, int64_t n_sites,
	                               double (*marg_out)[6], int *measured_out,
	                               int deterministic)
	{
	    const double two_pi = 2.0 * 3.14159265358979323846;

	    /* Measure sites from last to first (MSB→LSB, same as Griffiths-Niu) */
	    for (int64_t k = n_sites - 1; k >= 0; k--) {
	        int site_k = (int)k;

	        /* Step 1: Compute feed-forward phase correction from previously
	         * measured sites. The QFT phase is 2π F x / 6^n. For site k,
	         * the fractional phase from previously measured site j (j > k)
	         * is measured_out[j] / 6^{j-k+1}.
	         * Power MUST start at 36.0 (6^2) for the immediately previous site. */
	        double theta_k = 0.0;
	        {
	            double power = 36.0;
	            for (int64_t j = k + 1; j < n_sites; j++) {
	                theta_k += (double)measured_out[j] / power;
	                power *= 6.0;
	            }
	        }

	        /* Step 2: Compute neighbor contribution C_k(d) analytically.
	         * C_k(d) = Π_neighbor Σ_{w=0}^{5} local_neighbor(w) × edge_weight(d, w)
	         * Each neighbor is independent (product state). */
	        double ck_re[6], ck_im[6];
	        for (int d = 0; d < 6; d++) { ck_re[d] = 1.0; ck_im[d] = 0.0; }

	        const HPCAdjList *adj = &graph->adj[site_k];
	        for (uint64_t ei = 0; ei < adj->count; ei++) {
	            uint64_t eid = adj->edge_ids[ei];
	            const HPCEdge *edge = &graph->edges[eid];
	            if (edge->fidelity < 0.0) continue; /* Skip dead edges */
	            uint64_t partner = (edge->site_a == (uint64_t)site_k) ?
	                               edge->site_b : edge->site_a;

	            const TrialityQuhit *pq = &graph->locals[partner];
	            for (int d = 0; d < 6; d++) {
	                /* Σ_w local(w) × weight(d, w) for this neighbor. */
	                double sr = 0, si = 0;
	                for (int w = 0; w < 6; w++) {
	                    double lr = pq->edge_re[w], li = pq->edge_im[w];
	                    double wr, wi;
	                    if (edge->type == HPC_EDGE_CZ) {
	                        int pidx = (d * w) % 6;
	                        wr = HPC_W6_RE[pidx]; wi = HPC_W6_IM[pidx];
	                    } else if (edge->site_a == (uint64_t)site_k) {
	                        wr = edge->w_re[d][w]; wi = edge->w_im[d][w];
	                    } else {
	                        wr = edge->w_re[w][d]; wi = edge->w_im[w][d];
	                    }
	                    sr += lr * wr - li * wi;
	                    si += lr * wi + li * wr;
	                }
	                /* Multiply the running product by this neighbor's sum. */
	                double nr = ck_re[d] * sr - ck_im[d] * si;
	                double ni = ck_re[d] * si + ck_im[d] * sr;
	                ck_re[d] = nr; ck_im[d] = ni;
	            }
	        }

	        /* Step 3: Bake C_k(d) into locals: α(d) *= C_k(d) */
	        for (int d = 0; d < 6; d++) {
	            double re = graph->locals[site_k].edge_re[d];
	            double im = graph->locals[site_k].edge_im[d];
	            graph->locals[site_k].edge_re[d] = re * ck_re[d] - im * ck_im[d];
	            graph->locals[site_k].edge_im[d] = re * ck_im[d] + im * ck_re[d];
	        }

	        /* Step 4: Apply feed-forward phase correction to locals. */
	        for (int d = 0; d < 6; d++) {
	            double angle = -two_pi * d * theta_k;
	            double pr = cos(angle), pi2 = sin(angle);
	            double re = graph->locals[site_k].edge_re[d];
	            double im = graph->locals[site_k].edge_im[d];
	            graph->locals[site_k].edge_re[d] = re * pr - im * pi2;
	            graph->locals[site_k].edge_im[d] = re * pi2 + im * pr;
	        }

	        /* Step 5: Apply IDFT6 in-place: phase basis → computational basis.
	         * β(v) = (1/√6) Σ_{d=0}^{5} α'(d) × e^{2πi d v / 6}
	         * C_k(d) is INSIDE the coherent sum — THIS creates interference
	         * peaks at the optimal RMSE configuration, exactly as Shor's
	         * algorithm creates peaks at the correct period. */
	        {
	            /* Snapshot the inputs: the transform overwrites the locals. */
	            double alpha_re[6], alpha_im[6];
	            for (int d = 0; d < 6; d++) {
	                alpha_re[d] = graph->locals[site_k].edge_re[d];
	                alpha_im[d] = graph->locals[site_k].edge_im[d];
	            }
	            for (int v = 0; v < 6; v++) {
	                double sum_re = 0.0, sum_im = 0.0;
	                for (int d = 0; d < 6; d++) {
	                    double angle = two_pi * d * v / 6.0;
	                    double er = cos(angle), ei = sin(angle);
	                    sum_re += alpha_re[d] * er - alpha_im[d] * ei;
	                    sum_im += alpha_re[d] * ei + alpha_im[d] * er;
	                }
	                graph->locals[site_k].edge_re[v] = sum_re * INV_SQRT6;
	                graph->locals[site_k].edge_im[v] = sum_im * INV_SQRT6;
	            }
	        }

	        /* Step 6: Compute marginals from |local(v)|² */
	        double probs[6];
	        double total = 0.0;
	        for (int v = 0; v < 6; v++) {
	            probs[v] = graph->locals[site_k].edge_re[v] * graph->locals[site_k].edge_re[v] +
	                       graph->locals[site_k].edge_im[v] * graph->locals[site_k].edge_im[v];
	            total += probs[v];
	        }
	        if (total > 1e-30) {
	            for (int v = 0; v < 6; v++) probs[v] /= total;
	        } else {
	            /* Vanishing amplitude: fall back to a uniform distribution. */
	            for (int v = 0; v < 6; v++) probs[v] = 1.0 / 6.0;
	        }

	        /* Store marginals for downstream beam search */
	        for (int v = 0; v < 6; v++)
	            marg_out[k][v] = probs[v];

	        /* Step 7: Select outcome — deterministic argmax for quantization
	         * (unlike factoring which uses Born sampling for probabilistic
	         * period recovery, quantization wants the MAP estimate) */
	        int outcome;
	        if (deterministic) {
	            outcome = 0;
	            double max_p = probs[0];
	            for (int v = 1; v < 6; v++) {
	                if (probs[v] > max_p) { max_p = probs[v]; outcome = v; }
	            }
	        } else {
	            /* Born sampling (for multi-shot refinement) */
	            static unsigned int shor_rng = 271828;
	            shor_rng = shor_rng * 1664525u + 1013904223u;
	            double r01 = (double)(shor_rng >> 8) / 16777216.0;
	            double cumul = 0.0;
	            outcome = 5; /* fallback if rounding keeps cumul below r01 */
	            for (int v = 0; v < 6; v++) {
	                cumul += probs[v];
	                if (r01 <= cumul) { outcome = v; break; }
	            }
	        }

	        measured_out[k] = outcome;

	        /* Step 8: Collapse + back-action — absorb edge weights into
	         * neighbor locals (Magic Pointer disentanglement) */
	        shor_collapse_site(graph, site_k, outcome);
	    }
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* HPC-OPTIMIZED Q4_0 QUANTIZATION (for attention tensors)
	*
	* Same architecture as Q2_K HPC pipeline, but simpler:
	* - One parameter per block (scale d only, no dmin)
	* - Single quhit per block (6 states)
	* - 24 candidate scales → bin to 6 for BP
	* - 48-beam Hensel search for globally optimal configuration
	* - Triality 3-view marginals for robust scoring
	*
	* Q4_0 block: 32 weights, 16 levels (0–15), dequant: w = (q - 8) * d
	* ═══════════════════════════════════════════════════════════════════════════ */

	#define Q4_N_CAND 24 /* expanded scale candidates for Q4_0 */
	#define Q4_N_BEAMS 48 /* expanded beam width */

	/* Tight neighborhood around WLS optimum: scale multipliers, ascending.
	 * Entry 11 is the unperturbed 1.000 multiplier. */
	static const float Q4_NEIGHBOR_MULTS[Q4_N_CAND] = {
	    0.850f, 0.880f, 0.900f, 0.915f,
	    0.930f, 0.945f, 0.955f, 0.965f,
	    0.975f, 0.985f, 0.995f, 1.000f,
	    1.005f, 1.015f, 1.025f, 1.035f,
	    1.050f, 1.070f, 1.100f, 1.130f,
	    1.160f, 1.200f, 1.250f, 1.300f
	};

	/* Candidate index → quhit state (0..5): four consecutive candidates
	 * share one state. */
	static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
	    0, 0, 0, 0,
	    1, 1, 1, 1,
	    2, 2, 2, 2,
	    3, 3, 3, 3,
	    4, 4, 4, 4,
	    5, 5, 5, 5
	};

	/* ── Candidate-selection error metric (shared by Q4_0 and Q2_K) ──
	* Candidates are now scored with the EXACT importance-weighted SSE
	* err = Σ_i w_i · (x_i − deq_i)²
	* which is the same objective the final assembly/polish phases minimise and
	* the same quantity reported as RMSE. The previous 2-point Hadamard form
	* (0.5·vesica + 0.5·wave with pair-AVERAGED weights) is algebraically equal
	* to Σ w̄·(e_i² + e_j²), i.e. it silently replaced per-element importance
	* weights with the pair mean — a systematic mis-weighting whenever an
	* imatrix is supplied. Scoring candidates on a different objective than the
	* one being optimised mis-ranks them; aligning the two strictly lowers the
	* final weighted RMSE (and is bit-identical when no imatrix is used). */

	/* ── Cross-block prior override ratio ──
	* Q2_K and Q4_0 blocks are decoded INDEPENDENTLY by every GGUF runtime:
	* there is no cross-block coupling in the dequantizer, so a smoothness
	* prior that keeps a block on a worse candidate can only raise the true
	* reconstruction RMSE. With 1.00f the per-block argmin over the candidate
	* grid always wins (provably optimal seed for the assembly phase); the HPC
	* graph/Viterbi/Born machinery still shapes ties and seeds the search.
	* Set to e.g. 0.95f to restore the old 5%-hysteresis smoothness prior. */
	#if !defined(HEX_GREEDY_OVERRIDE_RATIO)
	#  define HEX_GREEDY_OVERRIDE_RATIO 1.00f /* overridable at build time */
	#endif /* HEX_GREEDY_OVERRIDE_RATIO */

	/* fp16-ULP radius of the monotone (d, dmin) micro-search in the Phase-4.6
	* polish (move 3). Larger radii let coordinate descent escape shallower
	* local minima at O(radius²) extra cost per polish iteration. */
	#if !defined(HEX_POLISH_ULP)
	#  define HEX_POLISH_ULP 4 /* overridable at build time */
	#endif /* HEX_POLISH_ULP */

	/* ── DC + vesica/wave extended objective (dot-product error cancellation) ──
	*
	* The quantity that matters downstream is the layer-output error
	* ε = Σᵢ eᵢ·aᵢ, E[ε²] = eᵀRe, R = activation second-moment matrix.
	* Modelling R with three components — per-channel power (diagonal, ≈
	* imatrix), a common mean μ (rank-1), and correlation c across the
	* half-block fold (i ↔ i+n/2) — gives EXACTLY:
	*
	* E[ε²] ≈ Σᵢ wᵢeᵢ² + μ²·(Σᵢeᵢ)² + c·Σ_pairs[(eᵢ+eⱼ)² − (eᵢ−eⱼ)²]
	* └── = vesica² − wave² = 4·eᵢeⱼ ──┘
	*
	* The vesica/wave decomposition is therefore the natural basis of the
	* fold-correlation term: in-phase (vesica) error energy COSTS output
	* accuracy, anti-phase (wave) error energy is CREDITED — it cancels in
	* the dot product. (The old 0.5/0.5 scorer ADDED the two, which collapses
	* to plain SSE; the spectrally meaningful combination SUBTRACTS them.)
	* Every selection/acceptance stage scores blocks with
	*
	* E(block) = Σᵢ wᵢeᵢ²
	* + (HEX_DC_LAMBDA / n) · (Σᵢeᵢ)²
	* + (HEX_VW_LAMBDA / n) · Σ_{i<n/2} [(eᵢ+eⱼ)² − (eᵢ−eⱼ)²], j = i+n/2
	*
	* applied CONSISTENTLY to: Q2_K/Q4_0 candidate scoring, the closed-form
	* (d, dmin) refit acceptance, the shaping accept guards, every polish
	* move, and the Phase-4.7 floor — so no stage optimises a different
	* objective than its acceptance test measures. The closed-form solvers
	* incorporate the DC term as a rank-1 augmented observation and act as
	* proposal generators; acceptance always uses the full extended E.
	* λ = 0 on both knobs reduces exactly to the pure weighted-SSE objective.
	* Positive-definiteness: the fold coupling adds ±2λ_vw/n off-diagonal —
	* negligible against any sane wᵢ, so E stays a valid quadratic objective.
	* NOTE: reported RMSE stays pure reconstruction RMSE; with λ > 0 a small
	* RMSE increase is the intended price for lower output error. Per-block
	* terms are a proxy for row-level structure (the API sees a flat stream);
	* the Phase-3.9 rolling-DC pass handles cross-block linkage. */
	#ifndef HEX_DC_LAMBDA
	#define HEX_DC_LAMBDA 1.0f
	#endif
	#ifndef HEX_VW_LAMBDA
	#define HEX_VW_LAMBDA 1.0f
	#endif
	/* Default (1, 1): unit-strength spectral prior. Empirically (synthetic
	 * benchmark, identical inputs): lowers dot-product output error ~0.8-1.4%
	 * on both mean-only and fold-correlated activation models for ~+0.05%
	 * weight RMSE. The theoretically optimal λ grows with the deployment
	 * model's activation mean energy and row length (the per-block term
	 * under-counts cross-block row coupling); the synthetic sweep kept
	 * improving monotonically through λ = 4 at ~+0.1% RMSE. Set both to
	 * 0.0f to recover the exact pure weighted-SSE / minimum-RMSE pipeline. */

	/* Spectral penalty of the extended objective for one block.
	 *
	 *   (HEX_DC_LAMBDA / n) · (Σᵢ eᵢ)²  +  (HEX_VW_LAMBDA / n) · 4 · Σ_{i<n/2} eᵢ·e_{i+n/2}
	 *
	 * e : n residuals of the block.
	 * n : number of residuals; the fold pairs element i with element i + n/2.
	 *
	 * Returns the penalty. Negative values are possible (anti-phase credit).
	 * Returns 0.0f when both λ knobs are zero or when n <= 0. For odd n the
	 * unpaired last element has no fold partner but still counts in the DC
	 * sum, so the DC term always covers all n residuals. */
	static inline float hex_spectral_penalty(const float *e, int n)
	{
	    if (HEX_DC_LAMBDA == 0.0f && HEX_VW_LAMBDA == 0.0f) return 0.0f;
	    if (n <= 0) return 0.0f; /* avoids the 0/0 in the λ/n factors */

	    float dc = 0.0f, cross = 0.0f;
	    int half = n / 2;
	    for (int i = 0; i < half; i++) {
	        dc += e[i] + e[i + half];
	        cross += e[i] * e[i + half];
	    }
	    /* Odd n: element n-1 was not visited by the paired loop. */
	    for (int i = 2 * half; i < n; i++)
	        dc += e[i];

	    /* (eᵢ+eⱼ)² − (eᵢ−eⱼ)² = 4·eᵢ·eⱼ, hence the factor 4 on the cross sum. */
	    return (HEX_DC_LAMBDA / (float)n) * dc * dc
	         + (HEX_VW_LAMBDA / (float)n) * 4.0f * cross;
	}

	static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
	BlockQ4_0 output, float out_total_error,
	const float *imat_importance, int verbose)
	{
	int64_t n_blocks = n_elements / QK4_0;
	float total_err = 0.0f;
	(void)verbose; /* kept for API symmetry with the Q2_K path */
	/* ── Phase 1: Greedy seed — compute scale per block ── */
	float greedy_d = (float )calloc(n_blocks, sizeof(float));

	#pragma omp parallel for schedule(dynamic, 64)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float bw = weights + blk QK4_0;
	float amax = 0.0f;
	for (int j = 0; j < QK4_0; j++) {
	float av = fabsf(bw[j]);
	if (av > amax) amax = av;
	}
	greedy_d[blk] = amax / 7.0f;
	}

	/* ── Phase 2: WLS-Optimal Candidate Generation for Q4_0 ──
	* First find the true optimal d* via 3-iteration WLS,
	* then generate candidates centered on d* with tight spacing. */
	float (cand_errors)[Q4_N_CAND] = (float ()[Q4_N_CAND])
	calloc(n_blocks, sizeof(float[Q4_N_CAND]));
	uint16_t (cand_d16)[Q4_N_CAND] = (uint16_t ()[Q4_N_CAND])
	calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND]));

	#pragma omp parallel for schedule(dynamic, 64)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float bw = weights + blk QK4_0;

	/* ── Step 2a: WLS solve to find optimal d* ── */
	float wls_d = greedy_d[blk];
	uint16_t prev_wls_d16 = 0;
	for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
	if (wls_d < 1e-15f) break;
	float inv_d = 1.0f / wls_d;
	float num = 0.0f, den = 0.0f;
	float dcS = 0.0f, dcQ = 0.0f; /* DC rank-1 augmentation sums */
	for (int j = 0; j < QK4_0; j++) {
	int q = (int)(bw[j] * inv_d + 8.5f);
	if (q < 0) q = 0; if (q > 15) q = 15;
	float qc = (float)q - 8.0f;
	float w = (imat_importance) ?
	imat_importance[blk * QK4_0 + j] : 1.0f;
	num += w * bw[j] * qc;
	den += w * qc * qc;
	dcS += bw[j];
	dcQ += qc;
	}
	/* DC term of the extended objective enters the normal equation
	* as one extra observation (S ~ d·Q) of weight λ_dc/n. The
	* vesica/wave term is handled by extended-E acceptance in the
	* ULP search; the solver is a proposal generator. */
	num += (HEX_DC_LAMBDA / (float)QK4_0) * dcS * dcQ;
	den += (HEX_DC_LAMBDA / (float)QK4_0) * dcQ * dcQ;
	if (den > 1e-15f) {
	float d_new = num / den;
	if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
	wls_d = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_new));
	}
	uint16_t cur_wls_d16 = gguf_fp32_to_fp16(wls_d);
	if (cur_wls_d16 == prev_wls_d16) break; /* converged in FP16 */
	prev_wls_d16 = cur_wls_d16;
	}

	/* ── Step 2b: Generate candidates centered on WLS optimum ── */
	for (int ci = 0; ci < Q4_N_CAND; ci++) {
	float trial_d = wls_d * Q4_NEIGHBOR_MULTS[ci];
	uint16_t d16 = gguf_fp32_to_fp16(trial_d);
	float actual_d = gguf_fp16_to_fp32(d16);
	cand_d16[blk][ci] = d16;

	float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;

	/* ── Extended objective over all QK4_0 elements ──
	* Exact importance-weighted SSE + DC + vesica/wave spectral
	* penalty — the same objective every acceptance stage uses. */
	float err = 0.0f;
	float e_arr[QK4_0];
	for (int j = 0; j < QK4_0; j++) {
	float x = bw[j];
	int q = (int)(x * id + 8.5f);
	if (q < 0) q = 0; if (q > 15) q = 15;
	float deq = ((float)q - 8.0f) * actual_d;
	float e = x - deq;
	e_arr[j] = e;
	float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
	err += e * e * w;
	}
	cand_errors[blk][ci] = err + hex_spectral_penalty(e_arr, QK4_0);
	}
	}

	/* ── Phase 3: HPC graph — single quhit per block ── */
	int best_candidate = (int )malloc(n_blocks * sizeof(int));
	int hpc_ran_q4 = 0;
	for (int64_t i = 0; i < n_blocks; i++)
	best_candidate[i] = 11; /* Q4_NEIGHBOR_MULTS[11] = 1.00 */

	if (n_blocks >= 2) {
	float temperature = 0.5f;
	int64_t graph_blocks = (n_blocks > 200) ? 200 : n_blocks;
	int64_t stride = n_blocks / graph_blocks;
	int64_t n_sites = graph_blocks; /* 1 quhit per block */

	HPCGraph *graph = hpc_create(n_sites);
	if (graph) {
	hpc_ran_q4 = 1;
	for (int64_t i = 0; i < n_sites; i++)
	triality_dft(&graph->locals[i]);

	/* Adaptive temperature from error landscape */
	{
	double err_accum = 0.0;
	int err_count = 0;
	for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
	int64_t blk = gi * stride;
	float max_e = 0.0f;
	for (int c = 0; c < Q4_N_CAND; c++)
	if (cand_errors[blk][c] > max_e)
	max_e = cand_errors[blk][c];
	err_accum += (double)max_e;
	err_count++;
	}
	if (err_count > 0) {
	temperature = (float)(err_accum / err_count) * 0.1f;
	if (temperature < 1e-10f) temperature = 1e-10f;
	}
	}

	/* Encode stride-group AGGREGATED candidate errors as Boltzmann amplitudes */
	for (int64_t i = 0; i < graph_blocks; i++) {
	/* Aggregate errors across stride group */
	float agg_errors[Q4_N_CAND];
	for (int c = 0; c < Q4_N_CAND; c++)
	agg_errors[c] = 0.0f;

	int64_t blk_start = i * stride;
	int64_t blk_end = blk_start + stride;
	if (blk_end > n_blocks) blk_end = n_blocks;
	int64_t group_size = blk_end - blk_start;

	for (int64_t b = blk_start; b < blk_end; b++) {
	for (int c = 0; c < Q4_N_CAND; c++)
	agg_errors[c] += cand_errors[b][c];
	}
	if (group_size > 1) {
	float inv_gs = 1.0f / (float)group_size;
	for (int c = 0; c < Q4_N_CAND; c++)
	agg_errors[c] *= inv_gs;
	}

	float min_err = 1e30f;
	for (int c = 0; c < Q4_N_CAND; c++)
	if (agg_errors[c] < min_err)
	min_err = agg_errors[c];

	double amp_re[6];
	double amp_norm = 0.0;
	for (int qi = 0; qi < 6; qi++) amp_re[qi] = 0.0;
	for (int ci = 0; ci < Q4_N_CAND; ci++) {
	int qi = Q4_CAND_TO_QUHIT[ci];
	amp_re[qi] += exp(-(double)(agg_errors[ci] - min_err) /
	(2.0 * (double)temperature));
	}
	for (int qi = 0; qi < 6; qi++)
	amp_norm += amp_re[qi] * amp_re[qi];
	if (amp_norm > 1e-30) {
	double inv = 1.0 / sqrt(amp_norm);
	for (int v = 0; v < 6; v++) amp_re[v] *= inv;
	}

	for (int v = 0; v < 6; v++) {
	graph->locals[i].edge_re[v] = amp_re[v];
	graph->locals[i].edge_im[v] = 0.0;
	}
	graph->locals[i].primary = VIEW_EDGE;
	graph->locals[i].dirty = DIRTY_VERTEX \| DIRTY_DIAGONAL \| DIRTY_FOLDED;
	graph->locals[i].delta_valid = 0;
	triality_update_mask(&graph->locals[i]);
	}

	/* Neighbor edges */
	for (int64_t i = 0; i < graph_blocks - 1; i++)
	hpc_cz(graph, i, i + 1);

	/* ── Shor's Griffiths-Niu Sequential Measurement ──
	* Replaces BP with exact marginals via IDFT6 + feed-forward +
	* collapse/back-action (ported 1:1 from tesseract_factor.c).
	* Single pass, no iteration, no message damping. */
	double (marg)[6] = (double ()[6])calloc(graph_blocks, sizeof(double[6]));
	int shor_measured = (int )calloc(graph_blocks, sizeof(int));

	shor_measure_graph(graph, graph_blocks, marg, shor_measured, 1);

	free(shor_measured);

	/* Beam search over candidates */
	typedef struct { double acc_error; int history_idx; } Q4Beam;
	typedef struct { int cand_idx; int parent_idx; } Q4BeamHistory;

	Q4Beam beams[Q4_N_BEAMS];
	int active_beams = 1;
	Q4BeamHistory history = (Q4BeamHistory )malloc(n_blocks * Q4_N_BEAMS * sizeof(Q4BeamHistory));

	for (int b = 0; b < Q4_N_BEAMS; b++) {
	beams[b].acc_error = 0.0;
	beams[b].history_idx = -1;
	}

	for (int64_t i = 0; i < graph_blocks; i++) {
	double m_total = 0.0;
	for (int v = 0; v < 6; v++) m_total += marg[i][v];

	double cand_score[Q4_N_CAND];
	int64_t blk = i * stride;
	/* Count candidates per quhit bin for normalization */
	int q4_bin_count[6] = {0};
	for (int ci = 0; ci < Q4_N_CAND; ci++)
	q4_bin_count[Q4_CAND_TO_QUHIT[ci]]++;
	/* Per-block error normalization: divide by block mean error
	* so small-weight blocks don't dominate beam selection */
	float blk_mean_err = 0.0f;
	for (int ci = 0; ci < Q4_N_CAND; ci++)
	blk_mean_err += cand_errors[blk][ci];
	blk_mean_err /= (float)Q4_N_CAND;
	if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f;
	for (int ci = 0; ci < Q4_N_CAND; ci++) {
	int qi = Q4_CAND_TO_QUHIT[ci];
	double p = (m_total > 1e-30) ? marg[i][qi] / m_total : 1.0/6.0;
	p /= (double)q4_bin_count[qi]; /* normalize by bin occupancy */
	cand_score[ci] = p / (cand_errors[blk][ci] / blk_mean_err + 1e-15);
	}

	typedef struct { double score; int beam_idx; int cand_idx; } Q4Ext;
	Q4Ext extensions[Q4_N_BEAMS * Q4_N_CAND];
	int n_ext = 0;
	for (int b = 0; b < active_beams; b++) {
	for (int c = 0; c < Q4_N_CAND; c++) {
	double ext_err = beams[b].acc_error + cand_errors[blk][c];
	extensions[n_ext].score = cand_score[c] / (ext_err + 1e-15);
	extensions[n_ext].beam_idx = b;
	extensions[n_ext].cand_idx = c;
	n_ext++;
	}
	}

	int top_k = (n_ext < Q4_N_BEAMS) ? n_ext : Q4_N_BEAMS;
	int top_indices[Q4_N_BEAMS];
	for (int k = 0; k < top_k; k++) {
	int best = -1; double best_s = -1e30;
	for (int e = 0; e < n_ext; e++) {
	if (extensions[e].score > best_s) {
	best_s = extensions[e].score; best = e;
	}
	}
	top_indices[k] = best;
	extensions[best].score = -2e30;
	}

	Q4Beam new_beams[Q4_N_BEAMS];
	for (int k = 0; k < top_k; k++) {
	int ei = top_indices[k];
	int sb = extensions[ei].beam_idx;
	int cand = extensions[ei].cand_idx;

	int hist_idx = i * Q4_N_BEAMS + k;
	history[hist_idx].cand_idx = cand;
	history[hist_idx].parent_idx = beams[sb].history_idx;

	new_beams[k].history_idx = hist_idx;
	new_beams[k].acc_error = beams[sb].acc_error + cand_errors[blk][cand];
	}
	for (int k = 0; k < top_k; k++) beams[k] = new_beams[k];
	active_beams = top_k;
	}

	int curr_hist = beams[0].history_idx;
	for (int64_t i = graph_blocks - 1; i >= 0; i--) {
	int group_cidx;
	if (curr_hist >= 0) {
	group_cidx = history[curr_hist].cand_idx;
	curr_hist = history[curr_hist].parent_idx;
	} else {
	group_cidx = 11;
	}

	if (stride <= 1) {
	best_candidate[i] = group_cidx;
	} else {
	/* Per-block local optimization within stride group.
	* Beam picks the quhit bin; each block picks its best
	* candidate in that bin from its own error landscape. */
	int target_bin = Q4_CAND_TO_QUHIT[group_cidx];

	for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) {
	float best_err = 1e30f;
	int best_c = group_cidx;
	for (int c = 0; c < Q4_N_CAND; c++) {
	if (Q4_CAND_TO_QUHIT[c] != target_bin) continue;
	if (cand_errors[b][c] < best_err) {
	best_err = cand_errors[b][c];
	best_c = c;
	}
	}
	/* Greedy override if global best is >5% better */
	float global_best = 1e30f;
	int global_best_c = group_cidx;
	for (int c = 0; c < Q4_N_CAND; c++) {
	if (cand_errors[b][c] < global_best) {
	global_best = cand_errors[b][c];
	global_best_c = c;
	}
	}
	if (global_best < best_err * HEX_GREEDY_OVERRIDE_RATIO)
	best_candidate[b] = global_best_c;
	else
	best_candidate[b] = best_c;
	}
	}
	}
	free(history);

	/* ══════════════════════════════════════════════════════════════
	* Phase 3.5: Born-Rule Multi-Shot Scale Refinement
	*
	* The beam search found the MAP candidate sequence. But the
	* triality marginals encode quantum phase-coherent structure
	* that a greedy beam can miss.
	* ══════════════════════════════════════════════════════════════ */
	{
	#define Q4_BORN_SHOTS 128

	/* Build per-block CDFs from triality marginals */
	unsigned int born_rng = 314159;

	/* Compute tail error once (blocks beyond graph coverage) */
	float tail_err_q4 = 0.0f;
	for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
	tail_err_q4 += cand_errors[bi][best_candidate[bi]];

	/* Beam-search baseline over the SAME set of blocks a Born
	* shot covers: stride representatives + tail. The previous
	* code summed the baseline over ALL blocks (including
	* mid-stride blocks the shots never touch), making shot_err
	* systematically smaller than the baseline and letting
	* strictly worse configurations be adopted whenever
	* stride > 1. */
	float beam_total_err = tail_err_q4;
	for (int64_t gi = 0; gi < graph_blocks; gi++) {
	int64_t rep = gi * stride;
	beam_total_err += cand_errors[rep][best_candidate[rep]];
	}

	/* Sparse shot buffer: only track stride-sampled blocks */
	int shot_sparse_q4 = (int )malloc(graph_blocks * sizeof(int));

	for (int shot = 0; shot < Q4_BORN_SHOTS; shot++) {
	float shot_err = tail_err_q4;

	for (int64_t gi = 0; gi < graph_blocks; gi++) {
	/* Normalize marginals to CDF */
	double m_total = 0.0;
	for (int v = 0; v < 6; v++) m_total += marg[gi][v];

	/* Born sample: CDF inversion (same as born_sample) */
	born_rng = born_rng * 1664525u + 1013904223u;
	double rnd = (double)(born_rng >> 8) / 16777216.0;
	double target = rnd * m_total;
	double cum = 0.0;
	int sampled_qi = 5;
	for (int v = 0; v < 6; v++) {
	cum += marg[gi][v];
	if (cum > target) { sampled_qi = v; break; }
	}

	/* Find the best candidate WITHIN this quhit bin */
	int64_t blk = gi * stride;
	float best_bin_err = 1e30f;
	int best_bin_cand = 11; /* default */
	for (int ci = 0; ci < Q4_N_CAND; ci++) {
	if (Q4_CAND_TO_QUHIT[ci] == sampled_qi) {
	if (cand_errors[blk][ci] < best_bin_err) {
	best_bin_err = cand_errors[blk][ci];
	best_bin_cand = ci;
	}
	}
	}

	shot_sparse_q4[gi] = best_bin_cand;
	shot_err += cand_errors[blk][best_bin_cand];
	}

	/* Metropolis acceptance: adopt if better than current best */
	if (shot_err < beam_total_err) {
	for (int64_t gi = 0; gi < graph_blocks; gi++)
	best_candidate[gi * stride] = shot_sparse_q4[gi];
	beam_total_err = shot_err;
	}
	}

	free(shot_sparse_q4);
	}

	/* Born refinement pass: non-stride blocks were set during beam
	* traceback and never revisited by Born shots. For each such block
	* pick the lowest-error candidate within the same quhit bin that
	* the winning Born shot chose for its stride-representative. */
	if (stride > 1) {
	for (int64_t b = 0; b < n_blocks; b++) {
	if (b % stride == 0) continue;
	int64_t rep = (b / stride) * stride;
	int target_bin = Q4_CAND_TO_QUHIT[best_candidate[rep]];
	float best_b_err = 1e30f;
	int best_b_cand = best_candidate[rep];
	for (int ci = 0; ci < Q4_N_CAND; ci++) {
	if (Q4_CAND_TO_QUHIT[ci] != target_bin) continue;
	if (cand_errors[b][ci] < best_b_err) {
	best_b_err = cand_errors[b][ci];
	best_b_cand = ci;
	}
	}
	best_candidate[b] = best_b_cand;
	}
	}

	free(marg);
	hpc_destroy(graph);
	}
	}

	/* Fallback when the HPC graph never ran (single block, or hpc_create
	* failure): pick the per-block argmin over the candidate grid instead
	* of silently leaving every block on the neutral ×1.00 candidate. */
	if (!hpc_ran_q4) {
	#pragma omp parallel for schedule(static)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	float best_e = cand_errors[blk][0];
	int best_c = 0;
	for (int c = 1; c < Q4_N_CAND; c++) {
	if (cand_errors[blk][c] < best_e) {
	best_e = cand_errors[blk][c];
	best_c = c;
	}
	}
	best_candidate[blk] = best_c;
	}
	}

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 4: Assemble blocks via least-squares scale extraction
	* ══════════════════════════════════════════════════════════════════ */

	#pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float bw = weights + blk QK4_0;
	int cidx = best_candidate[blk];

	/* Start from the grid-selected scale (the "assembled frequency") */
	float d_current = gguf_fp16_to_fp32(cand_d16[blk][cidx]);

	/* Analog assembly: iterate to full convergence. */
	for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
	if (d_current < 1e-15f) break;
	float id = 1.0f / d_current;

	int qs_tmp[QK4_0];
	for (int j = 0; j < QK4_0; j++) {
	int q = (int)(bw[j] * id + 8.5f);
	if (q < 0) q = 0; if (q > 15) q = 15;
	qs_tmp[j] = q;
	}

	float num = 0.0f, den = 0.0f;
	float dc4S = 0.0f, dc4Q = 0.0f;
	for (int j = 0; j < QK4_0; j++) {
	float q_centered = (float)qs_tmp[j] - 8.0f;
	float w = (imat_importance) ?
	imat_importance[blk * QK4_0 + j] : 1.0f;
	num += w * bw[j] * q_centered;
	den += w * q_centered * q_centered;
	dc4S += bw[j];
	dc4Q += q_centered;
	}
	num += (HEX_DC_LAMBDA / (float)QK4_0) * dc4S * dc4Q;
	den += (HEX_DC_LAMBDA / (float)QK4_0) * dc4Q * dc4Q;

	if (den > 1e-15f) {
	float d_new = num / den;
	float d_seed = gguf_fp16_to_fp32(cand_d16[blk][cidx]);
	if (fabsf(d_new) < 4.0f * (fabsf(d_seed) + 1e-10f)) {
	uint16_t d16 = gguf_fp32_to_fp16(d_new);
	d_current = gguf_fp16_to_fp32(d16);
	}
	}
	}

	/* ── FP16 ULP neighborhood search + sign-flip exploration ── */
	{
	uint16_t base_d16 = gguf_fp32_to_fp16(d_current);
	uint16_t best_d16 = base_d16;
	float best_ulp_err = 1e30f;

	/* Try ±8 ULP neighborhood + sign flip = up to 34 candidates */
	uint16_t ulp_candidates[35];
	int n_ulp = 0;
	for (int delta = -8; delta <= 8; delta++) {
	int cand16 = (int)base_d16 + delta;
	if (cand16 >= 0 && cand16 <= 0x7BFF)
	ulp_candidates[n_ulp++] = (uint16_t)cand16;
	}
	{
	float neg_d = -d_current;
	uint16_t neg_d16 = gguf_fp32_to_fp16(neg_d);
	for (int delta = -8; delta <= 8; delta++) {
	int cand16 = (int)neg_d16 + delta;
	if (cand16 >= 0 && cand16 <= 0x7BFF)
	ulp_candidates[n_ulp++] = (uint16_t)cand16;
	}
	}

	for (int ui = 0; ui < n_ulp; ui++) {
	float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]);
	float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f;
	float err = 0.0f;
	float e_ulp[QK4_0];
	for (int j = 0; j < QK4_0; j++) {
	int q = (int)(bw[j] * trial_id + 8.5f);
	if (q < 0) q = 0; if (q > 15) q = 15;
	float deq = ((float)q - 8.0f) * trial_d;
	float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
	e_ulp[j] = bw[j] - deq;
	err += e_ulp[j] * e_ulp[j] * w;
	}
	err += hex_spectral_penalty(e_ulp, QK4_0);
	if (err < best_ulp_err) {
	best_ulp_err = err;
	best_d16 = ulp_candidates[ui];
	}
	}
	d_current = gguf_fp16_to_fp32(best_d16);
	}

	output[blk].d = gguf_fp32_to_fp16(d_current);
	float actual_d = d_current;
	float id = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f;

	/* ── D₆ Hadamard Error Shaping with Simulated Annealing ── */
	int q_base[QK4_0], q_shaped[QK4_0];
	float q_cont[QK4_0];
	for (int j = 0; j < QK4_0; j++) {
	q_cont[j] = bw[j] * id + 8.0f;
	q_base[j] = (int)(q_cont[j] + 0.5f);
	if (q_base[j] < 0) q_base[j] = 0;
	if (q_base[j] > 15) q_base[j] = 15;
	}
	memcpy(q_shaped, q_base, QK4_0 * sizeof(int));

	{
	float e_live[QK4_0];
	for (int j = 0; j < QK4_0; j++) {
	float deq = ((float)q_shaped[j] - 8.0f) * actual_d;
	e_live[j] = bw[j] - deq;
	}

	float v_live[QK4_0 / 2];
	float vesica_cur = 0.0f, dc_cur = 0.0f;
	for (int j = 0; j < QK4_0 / 2; j++) {
	v_live[j] = e_live[j] + e_live[j + QK4_0 / 2];
	vesica_cur += v_live[j] * v_live[j];
	}
	for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
	float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;

	/* Deterministic greedy descent: only strict improvements.
	* The previous SA acceptance called rand() inside an OpenMP
	* parallel region (data race in the shared PRNG state, and
	* non-reproducible output). Uphill moves were pointless anyway:
	* the base-vs-shaped MSE guard below discards any shaped result
	* that ends up worse, so accepted uphill excursions could only
	* waste the pass budget or strand the descent. */
	for (int pass = 0; pass < QK4_0; pass++) {
	int best_k = -1;
	int best_q_alt = 0;
	float best_delta = 0.0f; /* strictly positive threshold */

	for (int k = 0; k < QK4_0; k++) {
	int q_cur = q_shaped[k];
	int q_try = (q_cont[k] - (float)q_cur >= 0.0f)
	? q_cur + 1 : q_cur - 1;
	if (q_try < 0 \|\| q_try > 15) continue;

	float deq_try = ((float)q_try - 8.0f) * actual_d;
	float e_new = bw[k] - deq_try;
	float de = e_new - e_live[k];

	int pi = (k < QK4_0 / 2) ? k : k - QK4_0 / 2;
	float v_old = v_live[pi];
	float v_new = v_old + de;

	float vesica_alt = vesica_cur - v_old * v_old + v_new * v_new;
	float dc_alt = dc_cur + de;
	float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt;

	float delta = metric_cur - metric_alt;
	if (delta > best_delta) {
	best_delta = delta;
	best_k = k;
	best_q_alt = q_try;
	}
	}

	if (best_k < 0) break; /* converged — no improving flip */

	q_shaped[best_k] = best_q_alt;
	{
	float deq_commit = ((float)best_q_alt - 8.0f) * actual_d;
	float e_new_commit = bw[best_k] - deq_commit;
	float de_commit = e_new_commit - e_live[best_k];

	int pi_commit = (best_k < QK4_0 / 2) ? best_k : best_k - QK4_0 / 2;
	float v_old_commit = v_live[pi_commit];
	float v_new_commit = v_old_commit + de_commit;

	vesica_cur += v_new_commit * v_new_commit - v_old_commit * v_old_commit;
	dc_cur += de_commit;
	metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;

	v_live[pi_commit] = v_new_commit;
	e_live[best_k] = e_new_commit;
	}
	}
	}

	float err_base = 0.0f, err_shaped = 0.0f;
	float e_gb[QK4_0], e_gs[QK4_0];
	for (int j = 0; j < QK4_0; j++) {
	float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
	float deq_b = ((float)q_base[j] - 8.0f) * actual_d;
	float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d;
	e_gb[j] = bw[j] - deq_b;
	e_gs[j] = bw[j] - deq_s;
	err_base += e_gb[j] * e_gb[j] * w;
	err_shaped += e_gs[j] * e_gs[j] * w;
	}
	err_base += hex_spectral_penalty(e_gb, QK4_0);
	err_shaped += hex_spectral_penalty(e_gs, QK4_0);
	int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;

	for (int j = 0; j < QK4_0 / 2; j++) {
	int q0 = q_final[j];
	int q1 = q_final[j + QK4_0/2];
	output[blk].qs[j] = (uint8_t)(q0 \| (q1 << 4));

	float deq0 = ((float)q0 - 8.0f) * actual_d;
	float deq1 = ((float)q1 - 8.0f) * actual_d;
	total_err += (bw[j] - deq0) * (bw[j] - deq0) + (bw[j + QK4_0/2] - deq1) * (bw[j + QK4_0/2] - deq1);
	}
	}

	*out_total_error = total_err;
	free(greedy_d);
	free(cand_errors);
	free(cand_d16);
	free(best_candidate);
	}

	/* ════════════════════════════════════════════════════════════════════════
	* Q8_0 HPC QUANTIZER — Shor pipeline at 8 bits
	*
	* Same pipeline as Q4_0: WLS scale + tight candidate grid scored on the
	* extended objective (weighted SSE + DC + vesica/wave), triality-quhit
	* graph with Boltzmann-encoded candidate errors, CZ chain entanglement,
	* Shor Griffiths-Niu sequential measurement for bin consensus, greedy
	* override (HEX_GREEDY_OVERRIDE_RATIO), then per-block ULP polish, the
	* vesica/DC error-shaping descent with an extended-objective guard, and
	* the candidate floor. Intended for embedding / LM-head tensors (tied
	* embeddings especially), where 2-4 bit codes destroy logit precision.
	* At 8 bits the candidate grid is tight (±1.5%) — the win over naive
	* amax/127 rounding comes from WLS + ULP + spectral selection, not from
	* coarse scale exploration.
	* ════════════════════════════════════════════════════════════════════════ */

	/* Elements per Q8_0 block; only defined here if nothing earlier did so. */
	#ifndef QK8_0
	#define QK8_0 32
	#endif

	/* One Q8_0 block: a 16-bit scale field followed by QK8_0 signed 8-bit codes.
	 * The quantizer below stores the fp16 bit pattern of the scale in `d`. */
	typedef struct {
	    uint16_t d;
	    int8_t   qs[QK8_0];
	} hex_block_q8_0;

	/* Size of the per-block scale candidate grid. */
	#define Q8_N_CAND 24

	/* Multipliers applied to the WLS scale to form the candidate grid
	 * (0.9850 … 1.0150). Index 11 is the neutral ×1.0000 entry that the
	 * quantizer uses as its initial seed. Laid out one quhit bin per row. */
	static const float Q8_NEIGHBOR_MULTS[Q8_N_CAND] = {
	    0.9850f, 0.9865f, 0.9880f, 0.9895f,   /* bin 0 */
	    0.9910f, 0.9925f, 0.9940f, 0.9952f,   /* bin 1 */
	    0.9964f, 0.9976f, 0.9988f, 1.0000f,   /* bin 2 */
	    1.0010f, 1.0020f, 1.0030f, 1.0040f,   /* bin 3 */
	    1.0052f, 1.0064f, 1.0076f, 1.0088f,   /* bin 4 */
	    1.0100f, 1.0115f, 1.0130f, 1.0150f,   /* bin 5 */
	};

	/* 24 candidates → 6 quhit states (4 consecutive candidates per bin),
	 * same folding as Q4_0. */
	static const int Q8_CAND_TO_QUHIT[Q8_N_CAND] = {
	    0, 0, 0, 0,
	    1, 1, 1, 1,
	    2, 2, 2, 2,
	    3, 3, 3, 3,
	    4, 4, 4, 4,
	    5, 5, 5, 5
	};

	/*
	 * q8_block_ext_err — score one Q8_0 block at a trial scale.
	 *
	 * Each of the QK8_0 weights is multiplied by 1/d, rounded with
	 * gguf_nearest_int() and clamped to [-127, 127]. The return value is
	 *
	 *     sum_j w_j * (bw[j] - q_j * d)^2  +  hex_spectral_penalty(residuals)
	 *
	 * with w_j = iw[j] when `iw` is non-NULL and 1 otherwise.
	 *
	 * bw      QK8_0 source weights (read only).
	 * iw      optional QK8_0 per-element importance weights, or NULL.
	 * d       trial scale. When |d| <= 1e-20 the reciprocal is replaced by 0,
	 *         so every value handed to gguf_nearest_int() is 0.
	 * qs_out  optional destination for the QK8_0 clamped codes, or NULL when
	 *         only the score is wanted.
	 */
	static inline float q8_block_ext_err(const float *bw, const float *iw,
	                                     float d, int8_t *qs_out)
	{
	    float e_arr[QK8_0];
	    float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f;
	    float err = 0.0f;

	    for (int j = 0; j < QK8_0; j++) {
	        int q = gguf_nearest_int(bw[j] * id);
	        if (q < -127) q = -127;
	        if (q > 127) q = 127;
	        if (qs_out) qs_out[j] = (int8_t)q;

	        /* Residual against the dequantized value at this scale. */
	        float e = bw[j] - (float)q * d;
	        e_arr[j] = e;

	        float w = iw ? iw[j] : 1.0f;
	        err += e * e * w;
	    }

	    /* The spectral term is computed on the unweighted residual vector. */
	    return err + hex_spectral_penalty(e_arr, QK8_0);
	}

	static void quantize_tensor_q8_0_hpc(const float *weights, int64_t n_elements,
	hex_block_q8_0 *output,
	float *out_total_error,
	const float *imat_importance, int verbose)
	{
	int64_t n_blocks = n_elements / QK8_0;
	float total_err = 0.0f;
	(void)verbose;

	float (cand_errors)[Q8_N_CAND] = (float ()[Q8_N_CAND])
	calloc(n_blocks, sizeof(float[Q8_N_CAND]));
	uint16_t (cand_d16)[Q8_N_CAND] = (uint16_t ()[Q8_N_CAND])
	calloc(n_blocks, sizeof(uint16_t[Q8_N_CAND]));
	int best_candidate = (int )malloc(n_blocks * sizeof(int));
	if (!cand_errors \|\| !cand_d16 \|\| !best_candidate) {
	free(cand_errors); free(cand_d16); free(best_candidate);
	if (out_total_error) *out_total_error = -1.0f;
	return;
	}

	/* ── Phase 1+2: WLS-refined scale + tight candidate grid ── */
	#pragma omp parallel for schedule(dynamic, 256)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float bw = weights + blk QK8_0;
	const float iw = imat_importance ? imat_importance + blk QK8_0 : NULL;

	float amax = 0.0f;
	for (int j = 0; j < QK8_0; j++) {
	float av = fabsf(bw[j]);
	if (av > amax) amax = av;
	}
	float wls_d = amax / 127.0f;

	/* ggml-style fixed-point WLS with DC rank-1 augmentation */
	for (int it = 0; it < 3 && wls_d > 1e-20f; it++) {
	float inv_d = 1.0f / wls_d;
	float num = 0.0f, den = 0.0f, dcS = 0.0f, dcQ = 0.0f;
	for (int j = 0; j < QK8_0; j++) {
	int q = gguf_nearest_int(bw[j] * inv_d);
	if (q < -127) q = -127; if (q > 127) q = 127;
	float qf = (float)q;
	float w = iw ? iw[j] : 1.0f;
	num += w * bw[j] * qf;
	den += w * qf * qf;
	dcS += bw[j];
	dcQ += qf;
	}
	num += (HEX_DC_LAMBDA / (float)QK8_0) * dcS * dcQ;
	den += (HEX_DC_LAMBDA / (float)QK8_0) * dcQ * dcQ;
	if (den > 1e-15f) {
	float d_new = num / den;
	if (d_new > 1e-20f) wls_d = d_new;
	}
	}

	for (int ci = 0; ci < Q8_N_CAND; ci++) {
	float trial_d = wls_d * Q8_NEIGHBOR_MULTS[ci];
	uint16_t d16 = gguf_fp32_to_fp16(trial_d);
	float actual_d = gguf_fp16_to_fp32(d16);
	cand_d16 [blk][ci] = d16;
	cand_errors[blk][ci] = q8_block_ext_err(bw, iw, actual_d, NULL);
	}
	best_candidate[blk] = 11; /* ×1.0000 neutral seed */
	}

	/* ── Phase 3: Shor graph — triality quhits, CZ chain, GN measurement ── */
	int shor_ran = 0;
	if (n_blocks >= 2) {
	int64_t graph_blocks = (n_blocks > 200) ? 200 : n_blocks;
	int64_t stride = n_blocks / graph_blocks;

	HPCGraph *graph = hpc_create(graph_blocks);
	if (graph) {
	shor_ran = 1;

	/* Adaptive temperature from the candidate-error landscape */
	float temperature = 1e-10f;
	{
	double err_accum = 0.0;
	int err_count = 0;
	for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
	int64_t blk = gi * stride;
	float max_e = 0.0f;
	for (int c = 0; c < Q8_N_CAND; c++)
	if (cand_errors[blk][c] > max_e)
	max_e = cand_errors[blk][c];
	err_accum += (double)max_e;
	err_count++;
	}
	if (err_count > 0) {
	temperature = (float)(err_accum / err_count) * 0.1f;
	if (temperature < 1e-10f) temperature = 1e-10f;
	}
	}

	/* Boltzmann-encode stride-aggregated candidate errors as
	* quhit amplitudes (24 candidates folded into 6 states) */
	for (int64_t i = 0; i < graph_blocks; i++) {
	float agg_errors[Q8_N_CAND];
	for (int c = 0; c < Q8_N_CAND; c++) agg_errors[c] = 0.0f;
	int64_t blk_start = i * stride;
	int64_t blk_end = blk_start + stride;
	if (blk_end > n_blocks) blk_end = n_blocks;
	for (int64_t b = blk_start; b < blk_end; b++)
	for (int c = 0; c < Q8_N_CAND; c++)
	agg_errors[c] += cand_errors[b][c];
	float min_err = 1e30f;
	for (int c = 0; c < Q8_N_CAND; c++)
	if (agg_errors[c] < min_err) min_err = agg_errors[c];

	double amp_re[6] = {0,0,0,0,0,0};
	double amp_norm = 0.0;
	for (int ci = 0; ci < Q8_N_CAND; ci++)
	amp_re[Q8_CAND_TO_QUHIT[ci]] +=
	exp(-(double)(agg_errors[ci] - min_err) /
	(2.0 * (double)temperature));
	for (int v = 0; v < 6; v++) amp_norm += amp_re[v] * amp_re[v];
	if (amp_norm > 1e-30) {
	double inv = 1.0 / sqrt(amp_norm);
	for (int v = 0; v < 6; v++) amp_re[v] *= inv;
	}
	for (int v = 0; v < 6; v++) {
	graph->locals[i].edge_re[v] = amp_re[v];
	graph->locals[i].edge_im[v] = 0.0;
	}
	graph->locals[i].primary = VIEW_EDGE;
	graph->locals[i].dirty = DIRTY_VERTEX \| DIRTY_DIAGONAL \| DIRTY_FOLDED;
	graph->locals[i].delta_valid = 0;
	triality_update_mask(&graph->locals[i]);
	}

	for (int64_t i = 0; i < graph_blocks - 1; i++)
	hpc_cz(graph, i, i + 1);

	double (marg)[6] = (double ()[6])calloc(graph_blocks, sizeof(double[6]));
	int measured = (int )calloc(graph_blocks, sizeof(int));
	if (marg && measured) {
	shor_measure_graph(graph, graph_blocks, marg, measured, 1);

	/* Per-block selection: best candidate inside the Shor-
	* measured bin, then greedy override against the global
	* argmin — identical Step-F semantics to Q2_K/Q4_0. */
	for (int64_t i = 0; i < graph_blocks; i++) {
	int bin = measured[i];
	if (bin < 0 \|\| bin > 5) {
	double bm = -1.0; bin = 0;
	for (int v = 0; v < 6; v++)
	if (marg[i][v] > bm) { bm = marg[i][v]; bin = v; }
	}
	int64_t blk_start = i * stride;
	int64_t blk_end = blk_start + stride;
	if (blk_end > n_blocks) blk_end = n_blocks;
	for (int64_t b = blk_start; b < blk_end; b++) {
	float bin_best = 1e30f; int bin_cand = -1;
	float g_best = 1e30f; int g_cand = 0;
	for (int c = 0; c < Q8_N_CAND; c++) {
	float e = cand_errors[b][c];
	if (e < g_best) { g_best = e; g_cand = c; }
	if (Q8_CAND_TO_QUHIT[c] == bin && e < bin_best) {
	bin_best = e; bin_cand = c;
	}
	}
	int sel = (bin_cand >= 0) ? bin_cand : g_cand;
	if (g_best < cand_errors[b][sel] * HEX_GREEDY_OVERRIDE_RATIO)
	sel = g_cand;
	best_candidate[b] = sel;
	}
	}
	}
	free(marg); free(measured);
	hpc_destroy(graph);
	}
	}
	if (!shor_ran) {
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	float g_best = cand_errors[blk][0]; int g_cand = 0;
	for (int c = 1; c < Q8_N_CAND; c++)
	if (cand_errors[blk][c] < g_best) {
	g_best = cand_errors[blk][c]; g_cand = c;
	}
	best_candidate[blk] = g_cand;
	}
	}

	/* ── Phase 4: ULP polish + vesica/DC shaping guard + floor ── */
	#pragma omp parallel for schedule(dynamic, 256) reduction(+:total_err)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float bw = weights + blk QK8_0;
	const float iw = imat_importance ? imat_importance + blk QK8_0 : NULL;
	int cidx = best_candidate[blk];

	uint16_t best_d16 = cand_d16[blk][cidx];
	float best_err = cand_errors[blk][cidx];

	/* ±8 fp16 ULP joint search on the extended objective */
	for (int du = -8; du <= 8; du++) {
	if (du == 0) continue;
	int c16 = (int)cand_d16[blk][cidx] + du;
	if (c16 <= 0 \|\| c16 > 0x7BFF) continue;
	float td = gguf_fp16_to_fp32((uint16_t)c16);
	float err = q8_block_ext_err(bw, iw, td, NULL);
	if (err < best_err) { best_err = err; best_d16 = (uint16_t)c16; }
	}

	/* Candidate floor: final ≤ best raw grid candidate (by construction
	* the ULP search already starts from it, so this is implicit). */
	float d = gguf_fp16_to_fp32(best_d16);
	int8_t qs[QK8_0];
	(void)q8_block_ext_err(bw, iw, d, qs);

	/* Vesica/DC greedy shaping with extended-objective guard */
	{
	int8_t qs_shaped[QK8_0];
	memcpy(qs_shaped, qs, QK8_0);
	float e_live[QK8_0], v_live[QK8_0 / 2];
	float vesica_cur = 0.0f, dc_cur = 0.0f;
	for (int k = 0; k < QK8_0; k++)
	e_live[k] = bw[k] - (float)qs_shaped[k] * d;
	for (int p = 0; p < QK8_0 / 2; p++) {
	v_live[p] = e_live[p] + e_live[p + QK8_0 / 2];
	vesica_cur += v_live[p] * v_live[p];
	dc_cur += v_live[p];
	}
	float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
	for (int pass = 0; pass < QK8_0; pass++) {
	int best_k = -1, best_q_alt = 0;
	float best_delta = 0.0f;
	for (int k = 0; k < QK8_0; k++) {
	int q_try = (e_live[k] >= 0.0f) ? qs_shaped[k] + 1
	: qs_shaped[k] - 1;
	if (q_try < -127 \|\| q_try > 127) continue;
	float e_new = bw[k] - (float)q_try * d;
	float de = e_new - e_live[k];
	int pi = (k < QK8_0 / 2) ? k : k - QK8_0 / 2;
	float v_new = v_live[pi] + de;
	float ves_a = vesica_cur - v_live[pi] * v_live[pi]
	+ v_new * v_new;
	float dc_a = dc_cur + de;
	float delta = metric_cur - (4.0f * ves_a + dc_a * dc_a);
	if (delta > best_delta) {
	best_delta = delta; best_k = k; best_q_alt = q_try;
	}
	}
	if (best_k < 0) break;
	{
	float e_new = bw[best_k] - (float)best_q_alt * d;
	float de = e_new - e_live[best_k];
	int pi = (best_k < QK8_0 / 2) ? best_k
	: best_k - QK8_0 / 2;
	float v_new = v_live[pi] + de;
	vesica_cur += v_new * v_new - v_live[pi] * v_live[pi];
	dc_cur += de;
	metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
	v_live[pi] = v_new;
	e_live[best_k] = e_new;
	qs_shaped[best_k] = (int8_t)best_q_alt;
	}
	}
	/* Guard on the extended objective vs originals */
	float e_b[QK8_0], e_s[QK8_0];
	float err_b = 0.0f, err_s = 0.0f;
	for (int k = 0; k < QK8_0; k++) {
	float w = iw ? iw[k] : 1.0f;
	e_b[k] = bw[k] - (float)qs[k] * d;
	e_s[k] = bw[k] - (float)qs_shaped[k] * d;
	err_b += e_b[k] * e_b[k] * w;
	err_s += e_s[k] * e_s[k] * w;
	}
	err_b += hex_spectral_penalty(e_b, QK8_0);
	err_s += hex_spectral_penalty(e_s, QK8_0);
	if (err_s < err_b) memcpy(qs, qs_shaped, QK8_0);
	}

	output[blk].d = best_d16;
	for (int k = 0; k < QK8_0; k++) {
	output[blk].qs[k] = qs[k];
	float e = bw[k] - (float)qs[k] * d;
	total_err += e * e; /* pure reconstruction SSE report */
	}
	}

	free(cand_errors);
	free(cand_d16);
	free(best_candidate);
	if (out_total_error) *out_total_error = total_err;
	}


	/* Re-derive the 4-bit sub-scale codes (Ls, Lm) for a candidate (d, dmin)
	 * pair from the Phase-1 float scales/mins. Bit-identical to the Phase-2b
	 * candidate generation, so stored codes are unnecessary.
	 *
	 * scales, mins   16 per-sub-block float scales / mins (read only).
	 * actual_dm      candidate super-scale for `scales`; when it is not above
	 *                1e-15 every Ls[j] is set to 0.
	 * actual_mm      candidate super-scale for `mins`; when it is not above
	 *                1e-15 every Lm[j] is set to 0.
	 * Ls, Lm         outputs, 16 codes each, clamped to [0, 15]. */
	static inline void hex_derive_subscales(const float *scales, const float *mins,
	                                        float actual_dm, float actual_mm,
	                                        uint8_t *Ls, uint8_t *Lm)
	{
	    for (int j = 0; j < 16; j++) {
	        if (actual_dm > 1e-15f) {
	            int ls = gguf_nearest_int(scales[j] / actual_dm);
	            if (ls < 0) ls = 0;
	            if (ls > 15) ls = 15;
	            Ls[j] = (uint8_t)ls;
	        } else {
	            Ls[j] = 0;
	        }

	        if (actual_mm > 1e-15f) {
	            int lm = gguf_nearest_int(mins[j] / actual_mm);
	            if (lm < 0) lm = 0;
	            if (lm > 15) lm = 15;
	            Lm[j] = (uint8_t)lm;
	        } else {
	            Lm[j] = 0;
	        }
	    }
	}

	static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
	BlockQ2K output, float out_total_error,
	OptimizerMode opt_mode,
	const float *imat_importance,
	int verbose)
	{
	int64_t n_blocks = n_elements / QK_K;
	float total_err = 0.0f;
	const int N_SUB = QK_K / 16;

	/* ── Outlier Clamping for WLS Seeds ──
	* Protects the Phase 1 greedy seed from being violently warped by extreme
	* >4.0 sigma outliers, which creates better centering for the grid search. */
	double t_sum_sq = 0.0, t_sum_4 = 0.0;
	for (int64_t i = 0; i < n_elements; i++) {
	double w2 = (double)weights[i] * (double)weights[i];
	t_sum_sq += w2;
	t_sum_4 += w2 * w2;
	}
	float w_sigma = sqrtf((float)(t_sum_sq / (double)n_elements));

	/* ── Adaptive outlier clamp (kurtosis-driven) ──
	* The fixed 3.5σ clamp suppressed the heavy-tail mass that dominates
	* reconstruction error, inflating RMSE on near-Gaussian tensors that did
	* not need clamping at all. Instead, gate the clamp on the tensor's raw
	* kurtosis (Gaussian = 3): leave near-Gaussian tensors untouched and only
	* apply a stabilising clamp to genuinely heavy-tailed tensors, where the
	* final (d, dmin) refit later recovers fidelity against the UNCLIPPED
	* weights anyway. */
	double t_var = t_sum_sq / (double)n_elements;
	double t_kurt = (t_var > 1e-30) ? (t_sum_4 / (double)n_elements) / (t_var * t_var) : 3.0;
	float clamp_sigma;
	if (t_kurt <= 6.0) clamp_sigma = 1.0e9f; /* ~Gaussian: effectively no clamp */
	else if (t_kurt <= 20.0) clamp_sigma = 6.0f; /* moderately heavy tails */
	else clamp_sigma = 4.0f; /* very heavy tails: stabilise seed */
	float clamp_val = w_sigma * clamp_sigma;

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 1: Greedy quantization — produce seed (d, dmin) per block
	* ══════════════════════════════════════════════════════════════════ */

	typedef struct {
	float dm, mm;
	uint16_t d_fp16, dmin_fp16;
	uint8_t Ls[16], Lm[16];
	float scales[16], mins[16], sw[16];
	} BlockSeed;

	BlockSeed seeds = (BlockSeed )calloc(n_blocks, sizeof(BlockSeed));

	#pragma omp parallel for schedule(dynamic, 64)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float block_x = weights + blk QK_K;
	uint8_t L[QK_K], Laux[16];
	float wt[16];

	float sumx2 = 0;
	for (int i = 0; i < QK_K; i++) sumx2 += block_x[i] * block_x[i];
	float sigma2 = sumx2 / (float)QK_K;

	/* Phase 1 WLS uses clamped values to generate stable seeds */
	float sx_clipped[16];
	for (int j = 0; j < N_SUB; j++) {
	const float sx = block_x + 16 j;
	seeds[blk].sw[j] = 0;
	for (int l = 0; l < 16; l++) {
	float imp = (imat_importance) ? imat_importance[blk * QK_K + 16 * j + l] : 1.0f;
	float v = sx[l];
	if (v > clamp_val) v = clamp_val;
	if (v < -clamp_val) v = -clamp_val;
	sx_clipped[l] = v;
	/* Activation-aware weighting: an imatrix entry already encodes
	* E[a^2] for that column, which is the correct weight for
	* minimising output (dot-product) error. Use it directly rather
	* than re-multiplying by the \|w\| magnitude heuristic, which
	* double-counts magnitude. Without an imatrix, fall back to the
	* magnitude-relative heuristic. */
	wt[l] = (imat_importance)
	? imp
	: sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]);
	seeds[blk].sw[j] += wt[l];
	}
	seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
	L + 16 * j, &seeds[blk].mins[j], Laux);
	}

	seeds[blk].dm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].scales,
	seeds[blk].Ls, seeds[blk].sw);
	seeds[blk].mm = hpc_make_qp_quants(N_SUB, 15, seeds[blk].mins,
	seeds[blk].Lm, seeds[blk].sw);
	seeds[blk].d_fp16 = gguf_fp32_to_fp16(seeds[blk].dm);
	seeds[blk].dmin_fp16 = gguf_fp32_to_fp16(seeds[blk].mm);
	}

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 2: WLS-Optimal Candidate Generation
	* ══════════════════════════════════════════════════════════════════ */

	/* Expanded neighborhood around WLS optimum: ±30% with 24 candidates */
	/* d is the sensitive axis, so concentrate resolution near 1.0 while
	* keeping wide tails for blocks whose WLS seed is off. 1.000 stays at
	* index 11 so the neutral-candidate fallback/init remains valid. */
	static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
	0.780f, 0.835f, 0.880f, 0.915f, 0.943f, 0.963f,
	0.978f, 0.988f, 0.994f, 0.997f, 0.999f, 1.000f,
	1.002f, 1.005f, 1.011f, 1.021f, 1.035f, 1.054f,
	1.080f, 1.115f, 1.160f, 1.215f, 1.275f, 1.340f
	};
	static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
	0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
	0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f,
	1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f,
	1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
	};
	/* Map 24 candidates → 6 quhit states for BP encoding */
	static const int CAND_TO_QUHIT[24] = {
	0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
	3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
	};

	float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
	uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
	uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;

	candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
	sizeof(float[TOTAL_SCALE_CANDIDATES]));
	candidate_d = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
	sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
	candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
	sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
	/* NOTE: the per-candidate sub-scale codes (Ls/Lm) are NOT stored.
	* They are a pure function of (seeds[blk].scales/mins, candidate fp16
	* d/dmin) and are re-derived where needed. Storing them cost
	* n_blocks × 576 × 16 × 2 bytes ≈ 18 KB/superblock — multiple GB of
	* peak RSS on large FFN tensors — for data used at exactly one index. */

	/* ── Phase 2 candidate sweep (one superblock per iteration, in parallel) ──
	 *
	 * For every QK_K-weight block:
	 *   2a. refine the seed (d, dmin) with up to five weighted-least-squares
	 *       iterations on a copy of the weights clamped to ±clamp_val;
	 *   2b. score every N_CAND_D × N_CAND_M multiplier neighbour of that
	 *       solution against the ORIGINAL (unclamped) weights and record the
	 *       candidate's fp16 (d, dmin) bits and its error.
	 *
	 * Each iteration writes only row `blk` of candidate_d / candidate_dmin /
	 * candidate_errors, so iterations do not share output locations. */
	#pragma omp parallel for schedule(dynamic, 16)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	    const float *block_x = weights + blk * QK_K;

	    /* ── Step 2a: WLS solve to find optimal (d, dmin) ── */
	    float wls_dm = seeds[blk].dm;
	    float wls_mm = seeds[blk].mm;
	    uint8_t wls_Ls[16], wls_Lm[16];
	    memcpy(wls_Ls, seeds[blk].Ls, 16);
	    memcpy(wls_Lm, seeds[blk].Lm, 16);

	    /* Clamped copy of the block, used only inside the WLS iterations */
	    float clipped_block_x[QK_K];
	    for (int i = 0; i < QK_K; i++) {
	        float v = block_x[i];
	        if (v > clamp_val) v = clamp_val;
	        if (v < -clamp_val) v = -clamp_val;
	        clipped_block_x[i] = v;
	    }

	    for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
	        /* Round-to-nearest 2-bit codes under the current scales */
	        uint8_t L_wls[QK_K];
	        for (int j = 0; j < N_SUB; j++) {
	            float d_sub = wls_dm * (float)wls_Ls[j];
	            float m_sub = wls_mm * (float)wls_Lm[j];
	            if (d_sub < 1e-15f) {
	                for (int k = 0; k < 16; k++) L_wls[16*j+k] = 0;
	                continue;
	            }
	            for (int k = 0; k < 16; k++) {
	                int q = gguf_nearest_int((clipped_block_x[16*j+k] + m_sub) / d_sub);
	                if (q < 0) q = 0;
	                if (q > 3) q = 3;
	                L_wls[16*j+k] = (uint8_t)q;
	            }
	        }

	        /* Normal-equation sums for the model x ≈ d·a − dmin·b with
	         * a = Ls·q and b = Lm, weighted by the importance matrix. */
	        double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
	        for (int j = 0; j < N_SUB; j++) {
	            float ls_f = (float)wls_Ls[j];
	            float lm_f = (float)wls_Lm[j];
	            for (int k = 0; k < 16; k++) {
	                float x = clipped_block_x[16*j+k];
	                float w = (imat_importance) ?
	                    imat_importance[blk * QK_K + 16*j+k] : 1.0f;
	                float a = ls_f * (float)L_wls[16*j+k];
	                float b = lm_f;
	                Saa += w * a * a;
	                Sab += w * a * b;
	                Sbb += w * b * b;
	                Sxa += w * x * a;
	                Sxb += w * x * b;
	            }
	        }

	        /* 2×2 solve; each component is adopted only when positive and
	         * below 4× its seed value, then rounded through fp16. */
	        double det = Saa * Sbb - Sab * Sab;
	        if (fabs(det) > 1e-30) {
	            double d_new  = (Sbb * Sxa - Sab * Sxb) / det;
	            double dm_new = (Sab * Sxa - Saa * Sxb) / det;
	            if (d_new > 0.0 && d_new < 4.0 * (seeds[blk].dm + 1e-10))
	                wls_dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new));
	            if (dm_new > 0.0 && dm_new < 4.0 * (seeds[blk].mm + 1e-10))
	                wls_mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
	        }

	        /* Re-derive the 4-bit sub-block codes from the seed sub-scales */
	        for (int j = 0; j < N_SUB; j++) {
	            if (wls_dm > 1e-15f) {
	                int ls = gguf_nearest_int(seeds[blk].scales[j] / wls_dm);
	                if (ls < 0) ls = 0;
	                if (ls > 15) ls = 15;
	                wls_Ls[j] = (uint8_t)ls;
	            } else {
	                wls_Ls[j] = 0;
	            }
	            if (wls_mm > 1e-15f) {
	                int lm = gguf_nearest_int(seeds[blk].mins[j] / wls_mm);
	                if (lm < 0) lm = 0;
	                if (lm > 15) lm = 15;
	                wls_Lm[j] = (uint8_t)lm;
	            } else {
	                wls_Lm[j] = 0;
	            }
	        }
	    }

	    /* ── Step 2b: Generate Candidates ── */
	    for (int di = 0; di < N_CAND_D; di++) {
	        float trial_dm = wls_dm * NEIGHBOR_MULTS_D[di];
	        uint16_t trial_d16 = gguf_fp32_to_fp16(trial_dm);
	        float actual_dm = gguf_fp16_to_fp32(trial_d16);

	        for (int mi = 0; mi < N_CAND_M; mi++) {
	            int cidx = di * N_CAND_M + mi;
	            float trial_mm = wls_mm * NEIGHBOR_MULTS_M[mi];
	            uint16_t trial_dmin16 = gguf_fp32_to_fp16(trial_mm);
	            float actual_mm = gguf_fp16_to_fp32(trial_dmin16);

	            candidate_d[blk][cidx] = trial_d16;
	            candidate_dmin[blk][cidx] = trial_dmin16;

	            /* Sub-block codes implied by this (d, dmin) pair */
	            uint8_t trial_Ls[16], trial_Lm[16];
	            for (int j = 0; j < N_SUB; j++) {
	                if (actual_dm > 1e-15f) {
	                    int ls = gguf_nearest_int(seeds[blk].scales[j] / actual_dm);
	                    if (ls < 0) ls = 0;
	                    if (ls > 15) ls = 15;
	                    trial_Ls[j] = (uint8_t)ls;
	                } else {
	                    trial_Ls[j] = 0;
	                }
	                if (actual_mm > 1e-15f) {
	                    int lm = gguf_nearest_int(seeds[blk].mins[j] / actual_mm);
	                    if (lm < 0) lm = 0;
	                    if (lm > 15) lm = 15;
	                    trial_Lm[j] = (uint8_t)lm;
	                } else {
	                    trial_Lm[j] = 0;
	                }
	            }

	            /* Error evaluation MUST use the non-clipped original weights.
	             * Exact importance-weighted SSE — the same objective the
	             * assembly/polish phases minimise and the reported RMSE. */
	            float err = 0.0f;
	            float e_arr[QK_K];
	            for (int i = 0; i < QK_K; i++) {
	                int jj = i >> 4;
	                float d = actual_dm * (float)trial_Ls[jj];
	                float m = actual_mm * (float)trial_Lm[jj];
	                float x = block_x[i];
	                float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
	                float e;
	                if (d < 1e-15f) {
	                    /* Decoder semantics: deq = d·ls·q − dmin·lm = −m here */
	                    e = x + m;
	                } else {
	                    int q = gguf_nearest_int((x + m) / d);
	                    if (q < 0) q = 0;
	                    if (q > 3) q = 3;
	                    e = x - (d * (float)q - m);
	                }
	                e_arr[i] = e;
	                err += e * e * w;
	            }
	            candidate_errors[blk][cidx] =
	                err + hex_spectral_penalty(e_arr, QK_K);
	        }
	    }
	}

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 3: HPC Graph — Shor's Griffiths-Niu Measurement
	* ══════════════════════════════════════════════════════════════════ */

	/* best_candidate[blk] = index into the N_CAND_D × N_CAND_M candidate grid
	 * chosen for block blk. Starts at the neutral candidate (multiplier 1.000
	 * on both axes).
	 * NOTE(review): the result of this malloc is not checked; the failure
	 * path of the enclosing function is outside this view. */
	int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
	for (int64_t i = 0; i < n_blocks; i++)
	    best_candidate[i] = 11 * N_CAND_M + 11; /* index 11 = 1.0 multiplier */

	if (opt_mode != OPT_MSE && n_blocks >= 2) {
	    /* At most 2000 graph blocks. The stride is rounded UP so that
	     * graph_blocks * stride >= n_blocks: every block belongs to exactly
	     * one stride group and the last group may be short. A floor stride
	     * would leave the trailing n_blocks % graph_blocks blocks outside
	     * every group, so they would never be visited by the encoding or
	     * the Viterbi steps below. */
	    int64_t max_graph_blocks = (n_blocks > 2000) ? 2000 : n_blocks;
	    int64_t stride = (n_blocks + max_graph_blocks - 1) / max_graph_blocks;
	    int64_t graph_blocks = (n_blocks + stride - 1) / stride;
	    float temperature = 0.5f;
	    int64_t n_sites = graph_blocks * QUHITS_PER_BLOCK;

	    HPCGraph *graph = hpc_create(n_sites);
	    if (graph) {
	        for (int64_t i = 0; i < n_sites; i++)
	            triality_dft(&graph->locals[i]);

	        /* Temperature = 10% of the mean (over up to 100 representative
	         * blocks) of each block's worst candidate error. */
	        {
	            double err_accum = 0.0;
	            int err_count = 0;
	            for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
	                int64_t blk = gi * stride;
	                float max_e = 0.0f;
	                for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	                    if (candidate_errors[blk][c] > max_e)
	                        max_e = candidate_errors[blk][c];
	                err_accum += (double)max_e;
	                err_count++;
	            }
	            if (err_count > 0) {
	                float mean_max_err = (float)(err_accum / err_count);
	                temperature = mean_max_err * 0.1f;
	                if (temperature < 1e-10f) temperature = 1e-10f;
	            }
	        }

	        /* Encode each stride group as two 6-state sites: site 2i carries
	         * the d-axis ("coarse") amplitudes, site 2i+1 the dmin-axis
	         * ("fine") amplitudes. */
	        for (int64_t i = 0; i < graph_blocks; i++) {
	            float agg_errors[TOTAL_SCALE_CANDIDATES];
	            for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) agg_errors[c] = 0.0f;

	            int64_t blk_start = i * stride;
	            int64_t blk_end = blk_start + stride;
	            if (blk_end > n_blocks) blk_end = n_blocks;
	            int64_t group_size = blk_end - blk_start;

	            /* Mean candidate error over the group */
	            for (int64_t b = blk_start; b < blk_end; b++) {
	                for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	                    agg_errors[c] += candidate_errors[b][c];
	            }
	            if (group_size > 1) {
	                float inv_gs = 1.0f / (float)group_size;
	                for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	                    agg_errors[c] *= inv_gs;
	            }

	            float min_err = 1e30f;
	            for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
	                if (agg_errors[c] < min_err)
	                    min_err = agg_errors[c];

	            /* d-axis amplitudes: Boltzmann weights summed per quhit bin,
	             * then L2-normalised. */
	            double coarse_re[6];
	            double coarse_norm = 0.0;
	            for (int qi = 0; qi < 6; qi++) coarse_re[qi] = 0.0;
	            for (int di = 0; di < N_CAND_D; di++) {
	                int qi = CAND_TO_QUHIT[di];
	                for (int mi = 0; mi < N_CAND_M; mi++) {
	                    int cidx = di * N_CAND_M + mi;
	                    coarse_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) /
	                                         (2.0 * (double)temperature));
	                }
	            }
	            for (int qi = 0; qi < 6; qi++) coarse_norm += coarse_re[qi] * coarse_re[qi];
	            if (coarse_norm > 1e-30) {
	                double inv = 1.0 / sqrt(coarse_norm);
	                for (int v = 0; v < 6; v++) coarse_re[v] *= inv;
	            }

	            /* dmin-axis amplitudes, same construction */
	            double fine_re[6];
	            double fine_norm = 0.0;
	            for (int qi = 0; qi < 6; qi++) fine_re[qi] = 0.0;
	            for (int mi = 0; mi < N_CAND_M; mi++) {
	                int qi = CAND_TO_QUHIT[mi];
	                for (int di = 0; di < N_CAND_D; di++) {
	                    int cidx = di * N_CAND_M + mi;
	                    fine_re[qi] += exp(-(double)(agg_errors[cidx] - min_err) /
	                                       (2.0 * (double)temperature));
	                }
	            }
	            for (int qi = 0; qi < 6; qi++) fine_norm += fine_re[qi] * fine_re[qi];
	            if (fine_norm > 1e-30) {
	                double inv = 1.0 / sqrt(fine_norm);
	                for (int v = 0; v < 6; v++) fine_re[v] *= inv;
	            }

	            int64_t s0 = 2 * i, s1 = 2 * i + 1;
	            for (int v = 0; v < 6; v++) {
	                graph->locals[s0].edge_re[v] = coarse_re[v];
	                graph->locals[s0].edge_im[v] = 0.0;
	                graph->locals[s1].edge_re[v] = fine_re[v];
	                graph->locals[s1].edge_im[v] = 0.0;
	            }
	            graph->locals[s0].primary = VIEW_EDGE;
	            graph->locals[s0].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
	            graph->locals[s0].delta_valid = 0;
	            triality_update_mask(&graph->locals[s0]);
	            graph->locals[s1].primary = VIEW_EDGE;
	            graph->locals[s1].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
	            graph->locals[s1].delta_valid = 0;
	            triality_update_mask(&graph->locals[s1]);
	        }

	        /* CZ edges: d↔dmin within a group, d↔d and dmin↔dmin between
	         * consecutive groups. */
	        for (int64_t i = 0; i < graph_blocks; i++) {
	            hpc_cz(graph, 2 * i, 2 * i + 1);
	            if (i + 1 < graph_blocks) {
	                hpc_cz(graph, 2 * i, 2 * (i + 1));
	                hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1);
	            }
	        }

	        /* Per-site marginals from the graph measurement, split into the
	         * per-group d-axis and dmin-axis tables. If any allocation fails
	         * the graph refinement is skipped and only Step F runs. */
	        double (*shor_marg)[6] = (double (*)[6])calloc(n_sites, sizeof(double[6]));
	        int *shor_measured = (int *)calloc(n_sites, sizeof(int));
	        double (*coarse_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
	        double (*fine_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
	        int marg_ok = (shor_marg && shor_measured && coarse_marg && fine_marg);

	        if (marg_ok) {
	            shor_measure_graph(graph, n_sites, shor_marg, shor_measured, 1);

	            for (int64_t i = 0; i < graph_blocks; i++) {
	                for (int v = 0; v < 6; v++) {
	                    coarse_marg[i][v] = shor_marg[2 * i][v];
	                    fine_marg[i][v] = shor_marg[2 * i + 1][v];
	                }
	            }
	        }

	        free(shor_marg);
	        free(shor_measured);

	        /* ══════════════════════════════════════════════════════════════
	         * PHASE 3 — DETERMINISTIC VITERBI DP
	         *
	         * Replaces the probabilistic beam-search + Born-rule Monte-Carlo
	         * shots with an exact, fully-deterministic DP over the 36-state
	         * Shor quhit space (6 coarse bins × 6 fine bins).
	         *
	         * For each graph block i and combined state s = qi_d*6 + qi_m:
	         *
	         *   bin_best_err[i][s]  = min candidate error in that (d,m)-bin
	         *                         over the stride group
	         *   bin_log_prior[i][s] = log P_coarse(qi_d) + log P_fine(qi_m)
	         *                         from Shor marginals → HPC prior bonus
	         *
	         * Local Viterbi cost (lower = better):
	         *   vcost[i][s] = bin_best_err[i][s]
	         *                 − VITERBI_BETA × scale_err × bin_log_prior[i][s]
	         *
	         * Transition cost (cross-block smoothness prior):
	         *   trans(s′→s) = VITERBI_ALPHA × scale_err
	         *                 × (|qi_d − qi_d′| + |qi_m − qi_m′|)
	         *
	         * DP recurrence:
	         *   dp[0][s] = vcost[0][s]
	         *   dp[i][s] = vcost[i][s] + min_{s′}(dp[i-1][s′] + trans(s′→s))
	         *
	         * Traceback yields the globally optimal sequence of bin choices,
	         * which is then mapped to per-block best_candidate[] indices.
	         * A 5%-threshold greedy override rescues blocks where the local
	         * MSE-optimal candidate is meaningfully better than the bin winner.
	         * ══════════════════════════════════════════════════════════════ */

	#define VIT_N_STATES 36 /* 6 coarse × 6 fine quhit bins */
	#define VITERBI_BETA 0.25f /* log-prior bonus weight */
	#define VITERBI_ALPHA 0.08f /* cross-block smoothness penalty weight */

	        {
	            int64_t vit_gi, vit_b;
	            int vit_s, vit_sp;

	            /* Per-graph-block per-state workspace */
	            float (*vit_bin_err)[VIT_N_STATES] =
	                (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
	            int (*vit_bin_cand)[VIT_N_STATES] =
	                (int (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int[VIT_N_STATES]));
	            float (*vit_log_pri)[VIT_N_STATES] =
	                (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
	            float (*vit_dp)[VIT_N_STATES] =
	                (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
	            int (*vit_back)[VIT_N_STATES] =
	                (int (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int[VIT_N_STATES]));
	            int *vit_path = (int *)malloc(graph_blocks * sizeof(int));

	            int vit_ok = (marg_ok && vit_bin_err && vit_bin_cand &&
	                          vit_log_pri && vit_dp && vit_back && vit_path);

	            if (vit_ok) {
	                /* ── Step A: build per-block per-bin statistics ── */
	                for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
	                    double c_tot = 0.0, f_tot = 0.0;

	                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
	                        vit_bin_err[vit_gi][vit_s] = 1e30f;
	                        vit_bin_cand[vit_gi][vit_s] = -1;
	                    }

	                    /* Best candidate per (qi_d, qi_m) bin over stride group */
	                    for (vit_b = vit_gi * stride;
	                         vit_b < (vit_gi + 1) * stride && vit_b < n_blocks;
	                         vit_b++) {
	                        int vit_c;
	                        for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
	                            int qi_d = CAND_TO_QUHIT[vit_c / N_CAND_M];
	                            int qi_m = CAND_TO_QUHIT[vit_c % N_CAND_M];
	                            vit_s = qi_d * 6 + qi_m;
	                            float e = candidate_errors[vit_b][vit_c];
	                            if (e < vit_bin_err[vit_gi][vit_s]) {
	                                vit_bin_err[vit_gi][vit_s] = e;
	                                /* Canonical candidate = stride-rep block's best */
	                                if (vit_b == vit_gi * stride)
	                                    vit_bin_cand[vit_gi][vit_s] = vit_c;
	                            }
	                        }
	                    }

	                    /* HPC log-prior from Shor marginals */
	                    for (int v = 0; v < 6; v++) {
	                        c_tot += coarse_marg[vit_gi][v];
	                        f_tot += fine_marg[vit_gi][v];
	                    }
	                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
	                        int qi_d = vit_s / 6, qi_m = vit_s % 6;
	                        double pc = (c_tot > 1e-30)
	                            ? coarse_marg[vit_gi][qi_d] / c_tot : 1.0/6.0;
	                        double pf = (f_tot > 1e-30)
	                            ? fine_marg[vit_gi][qi_m] / f_tot : 1.0/6.0;
	                        vit_log_pri[vit_gi][vit_s] =
	                            (float)(log(pc + 1e-30) + log(pf + 1e-30));
	                    }
	                }

	                /* ── Step B: scale_err normaliser for transition cost ── */
	                float vit_scale_err = 0.0f;
	                int vit_scale_cnt = 0;
	                for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
	                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
	                        if (vit_bin_err[vit_gi][vit_s] < 1e29f) {
	                            vit_scale_err += vit_bin_err[vit_gi][vit_s];
	                            vit_scale_cnt++;
	                        }
	                    }
	                }
	                vit_scale_err = (vit_scale_cnt > 0)
	                    ? vit_scale_err / (float)vit_scale_cnt : 1e-10f;
	                if (vit_scale_err < 1e-20f) vit_scale_err = 1e-20f;

	                /* ── Step C: Forward Viterbi pass ── */

	                /* Block 0 — no predecessor */
	                for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
	                    float local = (vit_bin_err[0][vit_s] < 1e29f)
	                        ? vit_bin_err[0][vit_s]
	                          - VITERBI_BETA * vit_scale_err * vit_log_pri[0][vit_s]
	                        : 1e30f;
	                    vit_dp[0][vit_s] = local;
	                    vit_back[0][vit_s] = -1;
	                }

	                /* Blocks 1..graph_blocks-1 */
	                for (vit_gi = 1; vit_gi < graph_blocks; vit_gi++) {
	                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
	                        float local;
	                        float best_pred = 1e30f;
	                        int best_sp = 0;
	                        int qi_d = vit_s / 6;
	                        int qi_m = vit_s % 6;

	                        if (vit_bin_err[vit_gi][vit_s] > 1e29f) {
	                            vit_dp[vit_gi][vit_s] = 1e30f;
	                            vit_back[vit_gi][vit_s] = 0;
	                            continue;
	                        }
	                        local = vit_bin_err[vit_gi][vit_s]
	                            - VITERBI_BETA * vit_scale_err * vit_log_pri[vit_gi][vit_s];

	                        /* Min-cost predecessor with Manhattan transition penalty */
	                        for (vit_sp = 0; vit_sp < VIT_N_STATES; vit_sp++) {
	                            float prev = vit_dp[vit_gi - 1][vit_sp];
	                            if (prev > 1e29f) continue;
	                            int td = abs(qi_d - (vit_sp / 6));
	                            int tm = abs(qi_m - (vit_sp % 6));
	                            float trans = VITERBI_ALPHA * vit_scale_err * (float)(td + tm);
	                            float total = prev + trans;
	                            if (total < best_pred) {
	                                best_pred = total;
	                                best_sp = vit_sp;
	                            }
	                        }
	                        vit_dp[vit_gi][vit_s] = (best_pred < 1e29f)
	                            ? best_pred + local : 1e30f;
	                        vit_back[vit_gi][vit_s] = best_sp;
	                    }
	                }

	                /* ── Step D: Traceback ── */
	                {
	                    int best_s = 0;
	                    float best_f = vit_dp[graph_blocks - 1][0];
	                    for (vit_s = 1; vit_s < VIT_N_STATES; vit_s++) {
	                        if (vit_dp[graph_blocks - 1][vit_s] < best_f) {
	                            best_f = vit_dp[graph_blocks - 1][vit_s];
	                            best_s = vit_s;
	                        }
	                    }
	                    vit_path[graph_blocks - 1] = best_s;
	                    for (vit_gi = graph_blocks - 2; vit_gi >= 0; vit_gi--)
	                        vit_path[vit_gi] = vit_back[vit_gi + 1][vit_path[vit_gi + 1]];
	                }

	                /* ── Step E: Map Viterbi path → best_candidate[] ── */
	                for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
	                    vit_s = vit_path[vit_gi];
	                    int qi_d = vit_s / 6;
	                    int qi_m = vit_s % 6;
	                    int64_t blk_rep = vit_gi * stride;

	                    /* Stride-representative block: use precomputed bin winner */
	                    if (vit_bin_cand[vit_gi][vit_s] >= 0)
	                        best_candidate[blk_rep] = vit_bin_cand[vit_gi][vit_s];

	                    /* Non-representative blocks in the stride group: their
	                     * own best candidate inside the chosen bin */
	                    for (vit_b = blk_rep + 1;
	                         vit_b < (vit_gi + 1) * stride && vit_b < n_blocks;
	                         vit_b++) {
	                        int vit_c;
	                        float best_e = 1e30f;
	                        int best_c = best_candidate[blk_rep];
	                        for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
	                            if (CAND_TO_QUHIT[vit_c / N_CAND_M] != qi_d) continue;
	                            if (CAND_TO_QUHIT[vit_c % N_CAND_M] != qi_m) continue;
	                            if (candidate_errors[vit_b][vit_c] < best_e) {
	                                best_e = candidate_errors[vit_b][vit_c];
	                                best_c = vit_c;
	                            }
	                        }
	                        best_candidate[vit_b] = best_c;
	                    }
	                }
	            }

	            /* ── Step F: 5 % greedy override (pure MSE safety net) ──
	             * Needs none of the workspace above, so it also runs when the
	             * graph refinement was skipped. */
	            for (vit_b = 0; vit_b < n_blocks; vit_b++) {
	                int vit_c;
	                float cur_err = candidate_errors[vit_b][best_candidate[vit_b]];
	                float g_best = cur_err;
	                int g_cand = best_candidate[vit_b];
	                for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
	                    if (candidate_errors[vit_b][vit_c] < g_best) {
	                        g_best = candidate_errors[vit_b][vit_c];
	                        g_cand = vit_c;
	                    }
	                }
	                if (g_best < cur_err * HEX_GREEDY_OVERRIDE_RATIO)
	                    best_candidate[vit_b] = g_cand;
	            }

	            free(vit_path);
	            free(vit_dp);
	            free(vit_back);
	            free(vit_bin_err);
	            free(vit_bin_cand);
	            free(vit_log_pri);
	        }

	        free(coarse_marg);
	        free(fine_marg);
	        hpc_destroy(graph);
	    }
	} else {
	    /* Pure MSE mode (or a single block): per-block argmin over all
	     * candidates, first index winning ties. */
	    for (int64_t blk = 0; blk < n_blocks; blk++) {
	        float best_err = candidate_errors[blk][0];
	        int best_idx = 0;
	        for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) {
	            if (candidate_errors[blk][c] < best_err) {
	                best_err = candidate_errors[blk][c];
	                best_idx = c;
	            }
	        }
	        best_candidate[blk] = best_idx;
	    }
	}

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 3.9 — ROLLING DC BOUNDARY CONDITION PRE-PASS
	*
	* Transforms the tensor from a collection of isolated 256-element
	* Q2_K superblocks into a single, continuous error-cancelling waveform.
	*
	* After Phase 3 has selected the optimal (d, dmin) candidate for every
	* block, this sequential pass computes the net DC residual left by each
	* block using a cheap round-nearest forward quantization, then feeds the
	* negated, exponentially-decayed residual as a correction bias into the
	* WLS solver of the immediately following block.
	*
	* Mathematically, for block N with final DC residual R_N = Σ εᵢ:
	*
	* dc_bias[N+1] = −DC_DECAY × R_N / QK_K (per-element offset)
	*
	* Block N+1's WLS targets become x′ᵢ = xᵢ − dc_bias[N+1], steering the
	* quantizer toward codes whose reconstruction deq ≈ x′, so that
	*
	* Σ (xᵢ − deqᵢ) ≈ dc_bias[N+1] × QK_K = −DC_DECAY × R_N
	*
	* The accumulated cross-block DC collapses geometrically:
	*
	* R₀, DC_DECAY·R₀, DC_DECAY²·R₀, … → 0
	*
	* The result is written into block_dc_bias[n_blocks]. Phase 4 reads
	* this array (safe: written sequentially before the parallel loop).
	* ══════════════════════════════════════════════════════════════════ */

	#define DC_DECAY 0.85f /* Boundary-condition leak factor (0 = isolated, 1 = full) */

	/* block_dc_bias[blk] = per-element offset subtracted from block blk's
	 * quantisation targets. Zero-initialised; if the allocation fails the
	 * pointer stays NULL and this pre-pass is skipped. */
	float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));

	if (block_dc_bias) {
	    /* Σ(x − deq) of the previous block; 0 for the first block */
	    float rolling_dc = 0.0f;

	    for (int64_t blk = 0; blk < n_blocks; blk++) {
	        const float *bx = weights + blk * QK_K;
	        int cidx = best_candidate[blk];
	        float dm0 = gguf_fp16_to_fp32(candidate_d   [blk][cidx]);
	        float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);

	        uint8_t dc_Ls[16], dc_Lm[16];
	        hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
	                             dm0, mm0, dc_Ls, dc_Lm);

	        /* Bias applied to THIS block's targets: the NEGATED, decayed
	         * residual of the previous block. Targets become x′ = x − dc_bias,
	         * so deq ≈ x′ gives Σ(x − deq) ≈ dc_bias·QK_K = −DC_DECAY·R_prev,
	         * i.e. this block's DC opposes the previous one. Without the
	         * negation the residual is reproduced rather than cancelled. */
	        float dc_bias = -(DC_DECAY * rolling_dc) / (float)QK_K;
	        block_dc_bias[blk] = dc_bias;

	        /* Quick round-nearest quant to estimate DC residual for NEXT block.
	         * We quantize the adjusted target x′ = x − dc_bias, then measure
	         * the residual of the ORIGINAL weight against the chosen code. */
	        float dc_res = 0.0f;
	        for (int j = 0; j < N_SUB; j++) {
	            float d_sub = dm0 * (float)dc_Ls[j];
	            float m_sub = mm0 * (float)dc_Lm[j];
	            for (int k = 0; k < 16; k++) {
	                float x_adj = bx[16*j + k] - dc_bias;
	                int q = 0;
	                if (d_sub >= 1e-15f) {
	                    q = gguf_nearest_int((x_adj + m_sub) / d_sub);
	                    if (q < 0) q = 0;
	                    if (q > 3) q = 3;
	                }
	                float deq = d_sub * (float)q - m_sub;
	                /* Residual against ORIGINAL weight (not adjusted) */
	                dc_res += bx[16*j + k] - deq;
	            }
	        }
	        rolling_dc = dc_res;
	    }
	}

	/* ══════════════════════════════════════════════════════════════════
	* PHASE 4: Assemble blocks via least-squares (d, dmin) extraction
	* ══════════════════════════════════════════════════════════════════ */

	int _n_omp_threads = 1;
	#ifdef _OPENMP
	_n_omp_threads = omp_get_max_threads();
	#endif
	HPCGraph _tl_graphs = (HPCGraph )calloc(_n_omp_threads, sizeof(HPCGraph *));
	for (int _ti = 0; _ti < _n_omp_threads; _ti++)
	_tl_graphs[_ti] = hpc_create(N_SUB);

	#pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
	for (int64_t blk = 0; blk < n_blocks; blk++) {
	const float block_x = weights + blk QK_K;
	int cidx = best_candidate[blk];
	uint8_t Ls_blk[16], Lm_blk[16];

	/* ── Rolling DC boundary condition ──────────────────────────────
	* dc_adj shifts every WLS target in this block so that the net
	* quantisation error steers toward cancelling the previous block's
	* DC residual (written by the sequential Phase 3.9 pre-pass). */
	float dc_adj = (block_dc_bias) ? block_dc_bias[blk] : 0.0f;

	/* Adjusted weight view — WLS and Shor work on this array;
	* the final error is always reported against the original block_x. */
	float adj_block_x[QK_K];
	{
	int _i;
	for (_i = 0; _i < QK_K; _i++)
	adj_block_x[_i] = block_x[_i] - dc_adj;
	}

	float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
	float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);

	hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
	dm, mm, Ls_blk, Lm_blk);

	uint16_t prev_dm16 = 0, prev_mm16 = 0;
	for (int ls_iter = 0; ls_iter < 5; ls_iter++) {

	uint8_t state_ls[N_SUB][6];
	uint8_t state_lm[N_SUB][6];
	float state_err[N_SUB][6];

	for (int j = 0; j < N_SUB; j++) {
	const float sx = adj_block_x + 16 j;
	for (int v = 0; v < 6; v++) state_err[j][v] = 1e30f;

	for (int try_ls = 0; try_ls <= 15; try_ls++) {
	float d_sub = dm * (float)try_ls;
	for (int try_lm = 0; try_lm <= 15; try_lm++) {
	float m_sub = mm * (float)try_lm;
	float sub_err = 0.0f;

	for (int k = 0; k < 16; k++) {
	float x = sx[k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j + k] : 1.0f;
	int q = 0;
	if (d_sub >= 1e-15f) {
	q = gguf_nearest_int((x + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	}
	float deq = d_sub * (float)q - m_sub;
	float diff = x - deq;
	sub_err += diff * diff * w;
	}

	for (int v = 0; v < 6; v++) {
	if (sub_err < state_err[j][v]) {
	for (int u = 5; u > v; u--) {
	state_err[j][u] = state_err[j][u-1];
	state_ls[j][u] = state_ls[j][u-1];
	state_lm[j][u] = state_lm[j][u-1];
	}
	state_err[j][v] = sub_err;
	state_ls[j][v] = (uint8_t)try_ls;
	state_lm[j][v] = (uint8_t)try_lm;
	break;
	}
	}
	}
	}
	}

	int _tid = 0;
	#ifdef _OPENMP
	_tid = omp_get_thread_num();
	#endif
	HPCGraph *sg = _tl_graphs[_tid];
	hpc_reset_for_subblock(sg, N_SUB);
	{
	float min_sub_err[N_SUB];
	for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];

	for (int j = 0; j < N_SUB; j++) {
	triality_dft(&sg->locals[j]);
	double amp_re[6];
	double amp_norm = 0.0;
	for (int v = 0; v < 6; v++) {
	float err_spread = state_err[j][5] - state_err[j][0];
	float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f;
	if (sub_temp < 1e-12f) sub_temp = 1e-12f;
	amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp);
	amp_norm += amp_re[v] * amp_re[v];
	}
	if (amp_norm > 1e-30) {
	double inv = 1.0 / sqrt(amp_norm);
	for (int v = 0; v < 6; v++) amp_re[v] *= inv;
	}
	for (int v = 0; v < 6; v++) {
	sg->locals[j].edge_re[v] = amp_re[v];
	sg->locals[j].edge_im[v] = 0.0;
	}
	sg->locals[j].primary = VIEW_EDGE;
	sg->locals[j].dirty = DIRTY_VERTEX \| DIRTY_DIAGONAL \| DIRTY_FOLDED;
	sg->locals[j].delta_valid = 0;
	triality_update_mask(&sg->locals[j]);
	}

	for (int j = 0; j < N_SUB - 1; j++)
	hpc_cz(sg, j, j + 1);

	double sub_marg[N_SUB][6];
	int sub_measured[N_SUB];
	memset(sub_marg, 0, sizeof(sub_marg));
	memset(sub_measured, 0, sizeof(sub_measured));

	shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);

	for (int j = 0; j < N_SUB; j++) {
	double best_prob = -1.0;
	int best_v = 0;
	for (int v = 0; v < 6; v++) {
	if (sub_marg[j][v] > best_prob) {
	best_prob = sub_marg[j][v];
	best_v = v;
	}
	}
	Ls_blk[j] = state_ls[j][best_v];
	Lm_blk[j] = state_lm[j][best_v];
	}
	}

	uint8_t L[QK_K];
	for (int j = 0; j < N_SUB; j++) {
	float d_sub = dm * (float)Ls_blk[j];
	float m_sub = mm * (float)Lm_blk[j];
	if (d_sub < 1e-15f) {
	for (int k = 0; k < 16; k++) L[16*j+k] = 0;
	continue;
	}
	for (int k = 0; k < 16; k++) {
	int q = gguf_nearest_int((adj_block_x[16*j+k] + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	L[16*j+k] = (uint8_t)q;
	}
	}

	double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
	for (int j = 0; j < N_SUB; j++) {
	float ls_f = (float)Ls_blk[j];
	float lm_f = (float)Lm_blk[j];
	for (int k = 0; k < 16; k++) {
	float x = adj_block_x[16*j+k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j+k] : 1.0f;
	float a = ls_f * (float)L[16*j+k];
	float b = lm_f;
	Saa += w * a * a;
	Sab += w * a * b;
	Sbb += w * b * b;
	Sxa += w * x * a;
	Sxb += w * x * b;
	}
	}

	double det = Saa * Sbb - Sab * Sab;
	if (fabs(det) > 1e-30) {
	double d_new = (Sbb * Sxa - Sab * Sxb) / det;
	double dm_new = (Sab * Sxa - Saa * Sxb) / det;
	float d_seed = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
	float m_seed = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
	if (d_new > 0.0 && d_new < 4.0 * (d_seed + 1e-10))
	dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new));
	if (dm_new > 0.0 && dm_new < 4.0 * (m_seed + 1e-10))
	mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
	}

	uint16_t cur_dm16 = gguf_fp32_to_fp16(dm);
	uint16_t cur_mm16 = gguf_fp32_to_fp16(mm);
	if (cur_dm16 == prev_dm16 && cur_mm16 == prev_mm16) break;
	prev_dm16 = cur_dm16;
	prev_mm16 = cur_mm16;
	}

	/* ── FP16 ULP neighborhood search for (d, dmin) — Expanded to ±8 ── */
	{
	uint16_t base_d16 = gguf_fp32_to_fp16(dm);
	uint16_t base_m16 = gguf_fp32_to_fp16(mm);
	uint16_t best_d16 = base_d16, best_m16 = base_m16;
	float best_ulp_err = 1e30f;

	for (int dd = -8; dd <= 8; dd++) {
	int cd16 = (int)base_d16 + dd;
	if (cd16 < 0 \|\| cd16 > 0x7BFF) continue;
	float trial_dm = gguf_fp16_to_fp32((uint16_t)cd16);

	for (int dm_delta = -8; dm_delta <= 8; dm_delta++) {
	int cm16 = (int)base_m16 + dm_delta;
	if (cm16 < 0 \|\| cm16 > 0x7BFF) continue;
	float trial_mm = gguf_fp16_to_fp32((uint16_t)cm16);

	float err = 0.0f;
	for (int j = 0; j < N_SUB; j++) {
	float d_sub = trial_dm * (float)Ls_blk[j];
	float m_sub = trial_mm * (float)Lm_blk[j];
	for (int k = 0; k < 16; k++) {
	float x = adj_block_x[16*j+k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j+k] : 1.0f;
	int q;
	if (d_sub < 1e-15f) { q = 0; }
	else {
	q = gguf_nearest_int((x + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	}
	float deq = d_sub * (float)q - m_sub;
	float diff = x - deq;
	err += diff * diff * w;
	}
	}
	if (err < best_ulp_err) {
	best_ulp_err = err;
	best_d16 = (uint16_t)cd16;
	best_m16 = (uint16_t)cm16;
	}
	}
	}
	dm = gguf_fp16_to_fp32(best_d16);
	mm = gguf_fp16_to_fp32(best_m16);
	}

	for (int j = 0; j < N_SUB; j++) {
	const float sx = adj_block_x + 16 j;
	float best_sub_err = 1e30f;
	uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
	for (int try_ls = 0; try_ls <= 15; try_ls++) {
	float d_sub = dm * (float)try_ls;
	for (int try_lm = 0; try_lm <= 15; try_lm++) {
	float m_sub = mm * (float)try_lm;
	float sub_err = 0.0f;
	for (int k = 0; k < 16; k++) {
	float x = sx[k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j + k] : 1.0f;
	int q;
	if (d_sub < 1e-15f) { q = 0; }
	else {
	q = gguf_nearest_int((x + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	}
	float deq = d_sub * (float)q - m_sub;
	float diff = x - deq;
	sub_err += diff * diff * w;
	}
	if (sub_err < best_sub_err) {
	best_sub_err = sub_err;
	best_ls = (uint8_t)try_ls;
	best_lm = (uint8_t)try_lm;
	}
	}
	}
	Ls_blk[j] = best_ls;
	Lm_blk[j] = best_lm;
	}

	output[blk].d = gguf_fp32_to_fp16(dm);
	output[blk].dmin = gguf_fp32_to_fp16(mm);

	for (int j = 0; j < N_SUB; j++)
	output[blk].scales[j] = Ls_blk[j] \| (Lm_blk[j] << 4);

	/* ── Final quantization: D₆ Hadamard Greedy Descent (deterministic) ──
	*
	* The original Simulated Annealing acceptance rule is replaced by a
	* strict greedy descent: only accept a flip if it strictly reduces the
	* D₆ Hadamard metric (4·‖vesica‖² + DC²). This makes error shaping
	* fully deterministic and thread-safe (no rand() inside omp parallel),
	* consistent with the Viterbi philosophy applied in Phase 3.
	*
	* The metric measures both:
	* - Vesica Piscis term: correlated error between weights i and i+QK_K/2
	* (targets the first non-DC harmonic — halfwave symmetry)
	* - DC term: total signed error across the 256-weight superblock
	* (captured and propagated to the next block by Phase 3.9)
	*/
	uint8_t L[QK_K];
	{
	float q_cont_all[QK_K];
	int q_base_all[QK_K];
	int q_shaped_all[QK_K];

	for (int i = 0; i < QK_K; i++) {
	int jj = i >> 4;
	float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
	float m_s = mm * (float)(output[blk].scales[jj] >> 4);
	if (d_s < 1e-15f) {
	q_cont_all[i] = 0.0f;
	q_base_all[i] = 0;
	} else {
	/* Quantize the DC-adjusted target */
	float qc = (adj_block_x[i] + m_s) / d_s;
	q_cont_all[i] = qc;
	int qr = gguf_nearest_int(qc);
	if (qr < 0) qr = 0; if (qr > 3) qr = 3;
	q_base_all[i] = qr;
	}
	}
	memcpy(q_shaped_all, q_base_all, QK_K * sizeof(int));

	float e_live[QK_K];
	for (int i = 0; i < QK_K; i++) {
	int jj = i >> 4;
	float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
	float m_s = mm * (float)(output[blk].scales[jj] >> 4);
	/* Decoder semantics: deq = d_s·q − m_s, which is −m_s when
	* d_s == 0 (NOT 0 — the −dmin·lm term always applies). */
	float deq = d_s * (float)q_shaped_all[i] - m_s;
	/* Residual against the adjusted target (DC-corrected view) */
	e_live[i] = adj_block_x[i] - deq;
	}

	float v_live[QK_K / 2];
	float vesica_cur = 0.0f, dc_cur = 0.0f;
	for (int i = 0; i < QK_K / 2; i++) {
	v_live[i] = e_live[i] + e_live[i + QK_K / 2];
	vesica_cur += v_live[i] * v_live[i];
	}
	for (int i = 0; i < QK_K; i++) dc_cur += e_live[i];
	float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;

	/* Deterministic greedy descent: accept only strict improvements */
	for (int pass = 0; pass < QK_K; pass++) {
	int best_k = -1;
	int best_q_alt = 0;
	float best_delta = 0.0f; /* strictly positive threshold */

	for (int k = 0; k < QK_K; k++) {
	int jj = k >> 4;
	float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
	if (d_s < 1e-15f) continue;

	int q_cur = q_shaped_all[k];
	int q_try = (q_cont_all[k] - (float)q_cur >= 0.0f)
	? q_cur + 1 : q_cur - 1;
	if (q_try < 0 \|\| q_try > 3) continue;

	float m_s = mm * (float)(output[blk].scales[jj] >> 4);
	float e_new = adj_block_x[k] - (d_s * (float)q_try - m_s);
	float de = e_new - e_live[k];

	int pi = (k < QK_K / 2) ? k : k - QK_K / 2;
	float v_new = v_live[pi] + de;

	float vesica_alt = vesica_cur - v_live[pi]v_live[pi] + v_newv_new;
	float dc_alt = dc_cur + de;
	float delta = metric_cur - (4.0f * vesica_alt + dc_alt * dc_alt);

	if (delta > best_delta) {
	best_delta = delta;
	best_k = k;
	best_q_alt = q_try;
	}
	}

	if (best_k < 0) break; /* converged — no further improvement */

	q_shaped_all[best_k] = best_q_alt;
	{
	int jj_c = best_k >> 4;
	float d_c = dm * (float)(output[blk].scales[jj_c] & 0xF);
	float m_c = mm * (float)(output[blk].scales[jj_c] >> 4);
	float e_new_c = adj_block_x[best_k] - (d_c * (float)best_q_alt - m_c);
	float de_c = e_new_c - e_live[best_k];
	int pi_c = (best_k < QK_K / 2) ? best_k : best_k - QK_K / 2;
	float v_new_c = v_live[pi_c] + de_c;
	vesica_cur += v_new_c * v_new_c - v_live[pi_c] * v_live[pi_c];
	dc_cur += de_c;
	metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
	v_live[pi_c] = v_new_c;
	e_live[best_k]= e_new_c;
	}
	}

	/* Choose base vs shaped on the EXTENDED objective vs originals */
	float err_base = 0.0f, err_shaped = 0.0f;
	float e_qb[QK_K], e_qs[QK_K];
	for (int i = 0; i < QK_K; i++) {
	int jj = i >> 4;
	float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
	float m_s = mm * (float)(output[blk].scales[jj] >> 4);
	float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
	float deq_b = d_s * (float)q_base_all[i] - m_s; /* −m_s when d_s==0 */
	float deq_s = d_s * (float)q_shaped_all[i] - m_s;
	float xv = block_x[i]; /* original weight for error report */
	e_qb[i] = xv - deq_b;
	e_qs[i] = xv - deq_s;
	err_base += e_qb[i] * e_qb[i] * w;
	err_shaped += e_qs[i] * e_qs[i] * w;
	}
	err_base += hex_spectral_penalty(e_qb, QK_K);
	err_shaped += hex_spectral_penalty(e_qs, QK_K);
	{
	int use_shaped = (err_shaped <= err_base);
	for (int i = 0; i < QK_K; i++)
	L[i] = (uint8_t)(use_shaped ? q_shaped_all[i] : q_base_all[i]);
	}
	}

	/* ── Cross-weight error diffusion — intra-sub-block Floyd-Steinberg ──
	*
	* Implements cross-weight error diffusion within each 16-weight sub-block.
	* After the greedy descent has committed quantisation codes, the residual
	* of each weight is partially propagated forward to the next position in
	* the same sub-block (7/16 of the error), re-quantising if the diffused
	* target falls in a different bin.
	*
	* This is the "cross-weight" dimension of the error-diffusion request:
	* neighbouring weights share and partially absorb each other's rounding
	* error, shaping the within-block spectrum away from the DC component
	* that Phase 3.9 already propagates between blocks.
	*
	* Staying within sub-blocks avoids scale-mismatch artefacts that would
	* arise from diffusing across the dm * Ls[j] boundary between sub-blocks.
	*
	* The diffused codes are accepted only when they reduce the weighted MSE
	* against the ORIGINAL weight (not the adjusted target), so the diffusion
	* cannot increase the total reconstruction error.
	*/
	{
	int fs_j, fs_k;
	for (fs_j = 0; fs_j < N_SUB; fs_j++) {
	int base = fs_j * 16;
	float d_s = dm * (float)(output[blk].scales[fs_j] & 0xF);
	float m_s = mm * (float)(output[blk].scales[fs_j] >> 4);
	if (d_s < 1e-15f) continue;

	float carry = 0.0f; /* FS carry from position k-1 */

	for (fs_k = 0; fs_k < 16; fs_k++) {
	int idx = base + fs_k;
	float x_orig = block_x[idx];
	float x_adj = adj_block_x[idx] + carry; /* adjusted + diffused */

	/* Propose new code from diffused target */
	int q_fs = gguf_nearest_int((x_adj + m_s) / d_s);
	if (q_fs < 0) q_fs = 0; if (q_fs > 3) q_fs = 3;

	if (q_fs != (int)L[idx]) {
	/* Accept only when MSE against original weight improves */
	float w_imp = (imat_importance)
	? imat_importance[blk * QK_K + idx] : 1.0f;
	float deq_old = d_s * (float)L[idx] - m_s;
	float deq_new = d_s * (float)q_fs - m_s;
	float e_old = (x_orig - deq_old) * (x_orig - deq_old) * w_imp;
	float e_new = (x_orig - deq_new) * (x_orig - deq_new) * w_imp;
	if (e_new < e_old)
	L[idx] = (uint8_t)q_fs;
	}

	/* Propagate 7/16 of the residual (adj target vs committed code) */
	{
	float deq_final = d_s * (float)L[idx] - m_s;
	float residual = (adj_block_x[idx] - deq_final);
	carry = (fs_k < 15) ? residual * (7.0f / 16.0f) : 0.0f;
	}
	}
	}
	}

	/* ── Final closed-form (d, dmin) refit against the UNCLIPPED weights ──
	* (issues #2 / #5)
	*
	* Every earlier (d, dmin) solve fits the DC-adjusted, soft-clipped
	* target and runs BEFORE the greedy descent and Floyd-Steinberg passes
	* mutate the committed 2-bit codes. Once L[], and the 4-bit sub-block
	* scale codes (Ls = scales & 0xF, Lm = scales >> 4), are final, the two
	* fp16 scalars (d, dmin) that minimise the importance-weighted SSE
	* against the ORIGINAL weights have a closed form. Solve it and adopt it
	* only when it lowers the extended block objective (importance-weighted
	* SSE plus hex_spectral_penalty) — so that objective can never rise; it
	* is RMSE-monotone only when the spectral penalty is zero. Because the
	* integer codes are held fixed, the vesica/wave/DC error shaping baked
	* into them is preserved intact. */
	{
	double rSaa = 0, rSab = 0, rSbb = 0, rSxa = 0, rSxb = 0;
	double rA = 0, rB = 0, rS = 0; /* DC rank-1 augmentation */
	for (int j = 0; j < N_SUB; j++) {
	float ls_f = (float)(output[blk].scales[j] & 0xF);
	float lm_f = (float)(output[blk].scales[j] >> 4);
	for (int k = 0; k < 16; k++) {
	int idx = 16 * j + k;
	float x = block_x[idx]; /* unclipped original */
	float w = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
	float a = ls_f * (float)L[idx];
	float b = lm_f;
	rSaa += (double)w * a * a;
	rSab += (double)w * a * b;
	rSbb += (double)w * b * b;
	rSxa += (double)w * x * a;
	rSxb += (double)w * x * b;
	rA += a; rB += b; rS += x;
	}
	}
	/* DC term as one augmented observation (S ~ A·d − B·m), weight
	* λ_dc/n; vesica/wave handled by the extended-E acceptance. */
	{
	double rw = (double)HEX_DC_LAMBDA / (double)QK_K;
	rSaa += rw * rA * rA; rSab += rw * rA * rB;
	rSbb += rw * rB * rB; rSxa += rw * rS * rA;
	rSxb += rw * rS * rB;
	}
	double rdet = rSaa * rSbb - rSab * rSab;
	if (fabs(rdet) > 1e-30) {
	double d_ref = (rSbb * rSxa - rSab * rSxb) / rdet;
	double m_ref = (rSab * rSxa - rSaa * rSxb) / rdet;
	if (d_ref > 0.0) {
	float dm_try = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_ref));
	float mm_try = (m_ref > 0.0)
	? gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)m_ref))
	: mm;
	/* Extended-objective acceptance test vs original weights. */
	float err_cur = 0.0f, err_try = 0.0f;
	float e_rc[QK_K], e_rt[QK_K];
	for (int j = 0; j < N_SUB; j++) {
	float ls_f = (float)(output[blk].scales[j] & 0xF);
	float lm_f = (float)(output[blk].scales[j] >> 4);
	for (int k = 0; k < 16; k++) {
	int idx = 16 * j + k;
	float x = block_x[idx];
	float w = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
	float qf = (float)L[idx];
	float dc = dm * ls_f * qf - mm * lm_f;
	float dt = dm_try * ls_f * qf - mm_try * lm_f;
	e_rc[idx] = x - dc;
	e_rt[idx] = x - dt;
	err_cur += e_rc[idx] * e_rc[idx] * w;
	err_try += e_rt[idx] * e_rt[idx] * w;
	}
	}
	err_cur += hex_spectral_penalty(e_rc, QK_K);
	err_try += hex_spectral_penalty(e_rt, QK_K);
	if (err_try < err_cur) { dm = dm_try; mm = mm_try; }
	}
	}
	output[blk].d = gguf_fp32_to_fp16(dm);
	output[blk].dmin = gguf_fp32_to_fp16(mm);
	}

	/* ══ PHASE 4.6: MONOTONE COORDINATE-DESCENT POLISH (RMSE-guaranteed) ══
	*
	* Objective-function mismatch fix: the final passes that commit the
	* 2-bit codes — the 16×16 (ls, lm) sub-block search, the ±8 ULP
	* (d, dmin) neighborhood search, and the greedy-descent error shaping
	* — all minimise error against the DC-ADJUSTED target adj_block_x.
	* The reported RMSE, however, is measured against the ORIGINAL
	* weights. The codes are therefore stranded at the optimum of a
	* SHIFTED objective, while only the scalar (d, dmin) refit above
	* targets the true one (and it holds all codes frozen).
	*
	* This polish runs alternating coordinate descent on the TRUE
	* objective (importance-weighted SSE vs the original weights):
	*
	* (1) For each 16-weight sub-block, an exact joint re-search of
	* (ls, lm) over the full 16×16 grid with per-weight optimal
	* q ∈ {0..3}, committed only on strict improvement of the
	* extended objective E. With λ_dc = λ_vw = 0 sub-blocks are
	* independent given (d, dmin); with spectral terms active the
	* coupling (DC: all subs; fold: sub j ↔ sub j⊕8) is handled
	* exactly via live residual bookkeeping.
	* (2) Closed-form weighted LS refit of the two fp16 scalars
	* (d, dmin) with all codes held fixed, committed only on
	* strict improvement (same guard as the refit above).
	*
	* All moves are accept-only-if-better on E ⇒ the extended block
	* objective is monotonically non-increasing; at λ = 0 this reduces
	* to RMSE-monotone (final RMSE can only go DOWN relative to the
	* unpatched pipeline), at λ > 0 small RMSE giveback is permitted
	* exactly where it buys dot-product error cancellation. The state space is finite
	* (4-bit codes, fp16 scalars), so the loop terminates; in practice
	* it converges in 2–3 sweeps. The vesica/DC spectral shaping baked
	* into L survives wherever it is SSE-neutral, and is overridden
	* only where it was costing true reconstruction error. */
	{
	uint8_t pl_Ls[16], pl_Lm[16];
	for (int j = 0; j < N_SUB; j++) {
	pl_Ls[j] = output[blk].scales[j] & 0xF;
	pl_Lm[j] = output[blk].scales[j] >> 4;
	}

	for (int pol_iter = 0; pol_iter < 6; pol_iter++) {
	int pol_improved = 0;

	/* ── (1) Exact per-sub-block (ls, lm, q) re-search on the
	* EXTENDED objective. Under the spectral terms sub-blocks
	* are no longer independent: every sub couples to all others
	* through the DC term and to its fold partner (sub j ⊕ 8,
	* i.e. weights i ↔ i+128) through vesica² − wave². The
	* search therefore keeps live residuals pe[] and scores each
	* candidate against the whole-block penalty with the partner
	* residuals held fixed — exact coordinate descent on E. */
	float pe[QK_K];
	float sub_sse[16], sub_dc[16], pair_cross[8];
	float dc_tot = 0.0f, cross_tot = 0.0f;
	for (int j = 0; j < N_SUB; j++) {
	float d_sub = dm * (float)pl_Ls[j];
	float m_sub = mm * (float)pl_Lm[j];
	sub_sse[j] = 0.0f;
	sub_dc[j] = 0.0f;
	for (int k = 0; k < 16; k++) {
	int idx = 16 * j + k;
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + idx] : 1.0f;
	/* deq = d·ls·q − dmin·lm; equals −m_sub at ls==0 */
	float e = block_x[idx] - (d_sub * (float)L[idx] - m_sub);
	pe[idx] = e;
	sub_sse[j] += e * e * w;
	sub_dc[j] += e;
	}
	dc_tot += sub_dc[j];
	}
	for (int p = 0; p < 8; p++) {
	pair_cross[p] = 0.0f;
	for (int k = 0; k < 16; k++)
	pair_cross[p] += pe[16p + k] pe[16*(p+8) + k];
	cross_tot += pair_cross[p];
	}

	for (int j = 0; j < N_SUB; j++) {
	const float sx = block_x + 16 j;
	int pi = j & 7; /* fold-pair index */
	int pj = j ^ 8; /* partner sub-block */
	const float ppe = pe + 16 pj; /* partner residuals */
	float dc_rest = dc_tot - sub_dc[j];
	float cross_rest = cross_tot - pair_cross[pi];

	/* Extended score of the CURRENT committed state */
	float best_sub = sub_sse[j]
	+ (HEX_DC_LAMBDA / (float)QK_K) * dc_tot * dc_tot
	+ (HEX_VW_LAMBDA / (float)QK_K) * 4.0f * cross_tot;
	int best_ls = -1, best_lm = 0;
	uint8_t best_q[16];
	float best_e[16];
	float best_sse = 0.0f, best_dcc = 0.0f, best_cxc = 0.0f;

	for (int try_ls = 0; try_ls <= 15; try_ls++) {
	float d_sub = dm * (float)try_ls;
	for (int try_lm = 0; try_lm <= 15; try_lm++) {
	float m_sub = mm * (float)try_lm;
	float sub_err = 0.0f, dcc = 0.0f, cxc = 0.0f;
	uint8_t q_loc[16];
	float e_loc[16];
	int aborted = 0;
	for (int k = 0; k < 16; k++) {
	float x = sx[k];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + 16*j + k] : 1.0f;
	int q = 0;
	if (d_sub >= 1e-15f) {
	q = gguf_nearest_int((x + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	}
	q_loc[k] = (uint8_t)q;
	/* deq = d·ls·q − dmin·lm; −m_sub at ls==0 */
	float e = x - (d_sub * (float)q - m_sub);
	e_loc[k] = e;
	sub_err += e * e * w;
	dcc += e;
	cxc += e * ppe[k];
	/* SSE-partial prune is a valid lower bound
	* only while the spectral terms are ≥ 0,
	* i.e. when the (signable) vw credit is off */
	if (HEX_VW_LAMBDA == 0.0f &&
	sub_err >= best_sub) { aborted = 1; break; }
	}
	if (aborted) continue;
	float score = sub_err
	+ (HEX_DC_LAMBDA / (float)QK_K)
	* (dc_rest + dcc) * (dc_rest + dcc)
	+ (HEX_VW_LAMBDA / (float)QK_K) * 4.0f
	* (cross_rest + cxc);
	if (score < best_sub) {
	best_sub = score;
	best_ls = try_ls;
	best_lm = try_lm;
	memcpy(best_q, q_loc, 16);
	memcpy(best_e, e_loc, sizeof(e_loc));
	best_sse = sub_err;
	best_dcc = dcc;
	best_cxc = cxc;
	}
	}
	}

	if (best_ls >= 0) { /* strict improvement in E found */
	pl_Ls[j] = (uint8_t)best_ls;
	pl_Lm[j] = (uint8_t)best_lm;
	memcpy(L + 16 * j, best_q, 16);
	memcpy(pe + 16 * j, best_e, sizeof(best_e));
	sub_sse[j] = best_sse;
	sub_dc[j] = best_dcc;
	pair_cross[pi] = best_cxc;
	dc_tot = dc_rest + best_dcc;
	cross_tot = cross_rest + best_cxc;
	pol_improved = 1;
	}
	}

	/* ── (2) Closed-form (d, dmin) refit vs ORIGINAL, codes fixed ── */
	{
	double pSaa = 0, pSab = 0, pSbb = 0, pSxa = 0, pSxb = 0;
	double pA = 0, pB = 0, pS = 0; /* DC rank-1 augmentation */
	for (int j = 0; j < N_SUB; j++) {
	float ls_f = (float)pl_Ls[j];
	float lm_f = (float)pl_Lm[j];
	for (int k = 0; k < 16; k++) {
	int idx = 16 * j + k;
	float x = block_x[idx];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + idx] : 1.0f;
	float a = ls_f * (float)L[idx];
	float b = lm_f;
	pSaa += (double)w * a * a;
	pSab += (double)w * a * b;
	pSbb += (double)w * b * b;
	pSxa += (double)w * x * a;
	pSxb += (double)w * x * b;
	pA += a; pB += b; pS += x;
	}
	}
	{
	double pw = (double)HEX_DC_LAMBDA / (double)QK_K;
	pSaa += pw * pA * pA; pSab += pw * pA * pB;
	pSbb += pw * pB * pB; pSxa += pw * pS * pA;
	pSxb += pw * pS * pB;
	}
	double pdet = pSaa * pSbb - pSab * pSab;
	if (fabs(pdet) > 1e-30) {
	double d_ref = (pSbb * pSxa - pSab * pSxb) / pdet;
	double m_ref = (pSab * pSxa - pSaa * pSxb) / pdet;
	if (d_ref > 0.0) {
	float dm_try = gguf_fp16_to_fp32(
	gguf_fp32_to_fp16((float)d_ref));
	float mm_try = (m_ref > 0.0)
	? gguf_fp16_to_fp32(
	gguf_fp32_to_fp16((float)m_ref))
	: mm;
	float err_cur = 0.0f, err_try = 0.0f;
	float e_pc[QK_K], e_pt[QK_K];
	for (int j = 0; j < N_SUB; j++) {
	float ls_f = (float)pl_Ls[j];
	float lm_f = (float)pl_Lm[j];
	for (int k = 0; k < 16; k++) {
	int idx = 16 * j + k;
	float x = block_x[idx];
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + idx] : 1.0f;
	float qf = (float)L[idx];
	float dc = dm * ls_f * qf - mm * lm_f;
	float dt = dm_try * ls_f * qf - mm_try * lm_f;
	e_pc[idx] = x - dc;
	e_pt[idx] = x - dt;
	err_cur += e_pc[idx] * e_pc[idx] * w;
	err_try += e_pt[idx] * e_pt[idx] * w;
	}
	}
	err_cur += hex_spectral_penalty(e_pc, QK_K);
	err_try += hex_spectral_penalty(e_pt, QK_K);
	if (err_try < err_cur) {
	dm = dm_try;
	mm = mm_try;
	pol_improved = 1;
	}
	}
	}
	}

	if (!pol_improved) {
	/* ── (3) ±2 ULP joint (d, dmin) micro-search vs ORIGINAL ──
	* The closed-form refit rounds its real-valued optimum to
	* fp16, which can land 1–2 ULP away from the best
	* representable pair (and the earlier ±8 ULP search ran
	* against the DC-shifted objective). With codes fixed,
	* scan the (2·HEX_POLISH_ULP+1)² fp16 neighborhood on the
	* true objective;
	* accept only strict improvement, then loop once more so
	* move (1) can re-optimise codes for the new scalars.
	* Monotone ⇒ final RMSE can only decrease. */
	uint16_t base_d16 = gguf_fp32_to_fp16(dm);
	uint16_t base_m16 = gguf_fp32_to_fp16(mm);

	float cur_err = 0.0f;
	float e_u[QK_K];
	for (int j = 0; j < N_SUB; j++) {
	float d_sub = dm * (float)pl_Ls[j];
	float m_sub = mm * (float)pl_Lm[j];
	for (int k = 0; k < 16; k++) {
	int idx = 16 * j + k;
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + idx] : 1.0f;
	e_u[idx] = block_x[idx] -
	(d_sub * (float)L[idx] - m_sub);
	cur_err += e_u[idx] * e_u[idx] * w;
	}
	}
	cur_err += hex_spectral_penalty(e_u, QK_K);

	float best_err = cur_err;
	uint16_t best_d16 = base_d16, best_m16 = base_m16;
	for (int dd = -HEX_POLISH_ULP; dd <= HEX_POLISH_ULP; dd++) {
	int cd16 = (int)base_d16 + dd;
	if (cd16 < 0 \|\| cd16 > 0x7BFF) continue;
	float t_dm = gguf_fp16_to_fp32((uint16_t)cd16);
	for (int dmm = -HEX_POLISH_ULP; dmm <= HEX_POLISH_ULP; dmm++) {
	if (dd == 0 && dmm == 0) continue;
	int cm16 = (int)base_m16 + dmm;
	if (cm16 < 0 \|\| cm16 > 0x7BFF) continue;
	float t_mm = gguf_fp16_to_fp32((uint16_t)cm16);

	float err = 0.0f;
	/* SSE-partial prune valid only without the
	* signable vesica/wave credit */
	for (int j = 0;
	j < N_SUB && (HEX_VW_LAMBDA != 0.0f \|\|
	err < best_err); j++) {
	float d_sub = t_dm * (float)pl_Ls[j];
	float m_sub = t_mm * (float)pl_Lm[j];
	for (int k = 0; k < 16; k++) {
	int idx = 16 * j + k;
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + idx] : 1.0f;
	e_u[idx] = block_x[idx] -
	(d_sub * (float)L[idx] - m_sub);
	err += e_u[idx] * e_u[idx] * w;
	}
	}
	if (HEX_DC_LAMBDA != 0.0f \|\| HEX_VW_LAMBDA != 0.0f)
	err = (err < best_err \|\| HEX_VW_LAMBDA != 0.0f)
	? err + hex_spectral_penalty(e_u, QK_K)
	: err;
	if (err < best_err) {
	best_err = err;
	best_d16 = (uint16_t)cd16;
	best_m16 = (uint16_t)cm16;
	}
	}
	}
	if (best_d16 != base_d16 \|\| best_m16 != base_m16) {
	dm = gguf_fp16_to_fp32(best_d16);
	mm = gguf_fp16_to_fp32(best_m16);
	pol_improved = 1;
	}
	}

	if (!pol_improved) break; /* converged on true objective */
	}

	/* Write back polished codes and scalars */
	for (int j = 0; j < N_SUB; j++)
	output[blk].scales[j] = pl_Ls[j] \| (pl_Lm[j] << 4);
	output[blk].d = gguf_fp32_to_fp16(dm);
	output[blk].dmin = gguf_fp32_to_fp16(mm);
	}

	/* ══ PHASE 4.7: CANDIDATE FLOOR (worst-case bound) ══
	*
	* candidate_errors[blk][c] is the EXACT weighted SSE of a directly
	* encodable configuration (fp16 d/dmin + derived Ls/Lm + nearest
	* rounding vs the original weights). The multi-stage assembly
	* (DC-shifted WLS, shaping, diffusion, polish) usually improves on
	* its seed, but each stage optimises a slightly different objective
	* and coordinate descent can land in a worse basin. Compare the
	* finished block against the best raw candidate and fall back when
	* the pipeline ended up worse — guaranteeing
	* final weighted SSE ≤ min_c candidate_errors[blk][c]. */
	{
	float fin_err = 0.0f;
	float e_f[QK_K];
	for (int j = 0; j < N_SUB; j++) {
	float d_sub = dm * (float)(output[blk].scales[j] & 0xF);
	float m_sub = mm * (float)(output[blk].scales[j] >> 4);
	for (int k = 0; k < 16; k++) {
	int idx = 16 * j + k;
	float w = (imat_importance) ?
	imat_importance[blk * QK_K + idx] : 1.0f;
	e_f[idx] = block_x[idx] -
	(d_sub * (float)L[idx] - m_sub);
	fin_err += e_f[idx] * e_f[idx] * w;
	}
	}
	fin_err += hex_spectral_penalty(e_f, QK_K);

	float g_best = candidate_errors[blk][0];
	int g_cand = 0;
	for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) {
	if (candidate_errors[blk][c] < g_best) {
	g_best = candidate_errors[blk][c];
	g_cand = c;
	}
	}

	if (g_best < fin_err) {
	/* Rebuild the block exactly as the candidate was scored */
	float c_dm = gguf_fp16_to_fp32(candidate_d [blk][g_cand]);
	float c_mm = gguf_fp16_to_fp32(candidate_dmin[blk][g_cand]);
	uint8_t c_Ls[16], c_Lm[16];
	hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
	c_dm, c_mm, c_Ls, c_Lm);
	for (int j = 0; j < N_SUB; j++) {
	float d_sub = c_dm * (float)c_Ls[j];
	float m_sub = c_mm * (float)c_Lm[j];
	for (int k = 0; k < 16; k++) {
	int idx = 16 * j + k;
	int q = 0;
	if (d_sub >= 1e-15f) {
	q = gguf_nearest_int((block_x[idx] + m_sub) / d_sub);
	if (q < 0) q = 0; if (q > 3) q = 3;
	}
	L[idx] = (uint8_t)q;
	}
	output[blk].scales[j] = c_Ls[j] \| (c_Lm[j] << 4);
	}
	dm = c_dm; mm = c_mm;
	output[blk].d = candidate_d [blk][g_cand];
	output[blk].dmin = candidate_dmin[blk][g_cand];
	}
	}

	for (int j = 0; j < QK_K; j += 128) {
	for (int l = 0; l < 32; l++) {
	output[blk].qs[j / 4 + l] = L[j + l]
	\| (L[j + l + 32] << 2)
	\| (L[j + l + 64] << 4)
	\| (L[j + l + 96] << 6);
	}
	}

	float berr = gguf_q2_k_block_error(block_x, &output[blk]);
	if (isnan(berr)) {
	printf("NaN block error at blk %ld! dm=%f mm=%f\n", (long)blk, dm, mm);
	for (int j=0; j<16; j++) printf("Ls[%d]=%d Lm[%d]=%d\n", j, Ls_blk[j], j, Lm_blk[j]);
	exit(1);
	}
	total_err += berr;
	}

	for (int _ti = 0; _ti < _n_omp_threads; _ti++)
	hpc_destroy(_tl_graphs[_ti]);
	free(_tl_graphs);

	free(block_dc_bias);
	free(seeds);
	free(candidate_errors);
	free(candidate_d);
	free(candidate_dmin);
	free(best_candidate);
	if (out_total_error) *out_total_error = total_err;

	if (verbose) {
	float rmse = sqrtf(total_err / (float)n_elements);

	double w_sum2 = 0.0;
	for (int64_t i = 0; i < n_elements; i++)
	w_sum2 += (double)weights[i] * (double)weights[i];
	w_sigma = (float)sqrt(w_sum2 / (double)n_elements);
	float rmse_over_sigma = (w_sigma > 1e-15f) ? rmse / w_sigma : 0.0f;

	const char *fidelity_class;
	const char *fidelity_icon;
	if (rmse <= 1.0e-04f) {
	fidelity_class = "ULTRA (≤1e-04)";
	fidelity_icon = "★★★★";
	} else if (rmse <= 3.0e-04f) {
	fidelity_class = "HIGH (≤3e-04)";
	fidelity_icon = "★★★☆";
	} else if (rmse <= 1.0e-03f) {
	fidelity_class = "GOOD (≤1e-03)";
	fidelity_icon = "★★☆☆";
	} else {
	fidelity_class = "STANDARD";
	fidelity_icon = "★☆☆☆";
	}

	printf("\n ┌──── Shor Measurement Q2_K Report ────────────────────────────────┐\n");
	printf(" │ Elements: %-12lld Blocks: %-12lld │\n",
	(long long)n_elements, (long long)(n_elements / QK_K));
	printf(" │ Weight σ: %-12.4e Range: [%.4e, %.4e] │\n",
	w_sigma, w_sigma * -4.0f, w_sigma * 4.0f);
	printf(" │ Total MSE: %-12.6f │\n", total_err);
	printf(" │ RMSE: %-12.4e RMSE/σ: %-8.4f │\n",
	rmse, rmse_over_sigma);
	printf(" │ Fidelity: %s %-14s │\n",
	fidelity_icon, fidelity_class);
	printf(" │ Engine: Shor Griffiths-Niu (IDFT6 + feed-forward) │\n");
	printf(" └─────────────────────────────────────────────────────────────────┘\n");
	}
	}


	/* ═══════════════════════════════════════════════════════════════════════════
	* PROGRESS REPORTING
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Redraw a one-line console progress bar on stdout.
	 *
	 *   current     items completed so far; clamped to [0, total] for display
	 *   total       total item count; the call is a no-op when total <= 0
	 *   label       text shown after the timing fields; NULL is shown as ""
	 *   start_time  time() value captured when the work began
	 *
	 * Every call starts with '\r' so successive calls overwrite the same
	 * terminal line; a newline is emitted once current reaches total.
	 * Keeps a function-local static between calls, so it is meant to be
	 * driven from a single thread. */
	static void print_progress_bar(int current, int total, const char *label,
	                               time_t start_time)
	{
	    /* Byte length of the text that followed the bar on the previous redraw.
	     * '\r' only moves the cursor, so when the new text is shorter (e.g. a
	     * shorter tensor name) the old tail would otherwise stay visible. */
	    static int prev_tail_len = 0;
	    const int bar_width = 40;

	    if (total <= 0) {
	        return;
	    }
	    if (!label) {
	        label = "";                 /* passing NULL to "%s" is undefined */
	    }
	    /* Keep the percentage within 0..100 and the ETA non-negative. */
	    if (current < 0) {
	        current = 0;
	    }
	    if (current > total) {
	        current = total;
	    }

	    float pct = (float)current / (float)total;
	    int filled = (int)(pct * bar_width);

	    /* Wall-clock elapsed: clock() sums CPU time over all OpenMP threads,
	     * which inflated elapsed/ETA by ~the thread count on multicore. */
	    double elapsed = difftime(time(NULL), start_time);
	    /* Below 1% the extrapolation is too noisy to show; report 0 instead. */
	    double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;

	    printf("\r [");
	    for (int i = 0; i < bar_width; i++) {
	        if (i < filled) {
	            printf("█");
	        } else if (i == filled) {
	            printf("▓");            /* leading edge of the bar */
	        } else {
	            printf("░");
	        }
	    }

	    int tail_len = printf("] %3d%% (%d/%d) %.0fs ETA:%.0fs %s",
	                          (int)(pct * 100), current, total, elapsed, eta, label);
	    if (tail_len >= 0) {
	        /* Blank out whatever the previous, longer line left behind. */
	        if (tail_len < prev_tail_len) {
	            printf("%*s", prev_tail_len - tail_len, "");
	        }
	        prev_tail_len = tail_len;
	    }

	    if (current == total) {
	        printf("\n");
	        prev_tail_len = 0;          /* the next bar starts on a fresh line */
	    }
	    /* Flush last so the closing newline is pushed out as well. */
	    fflush(stdout);
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* GGUF FILE WRITER — Assembles the complete output file
	* ═══════════════════════════════════════════════════════════════════════════ */

	static int write_gguf(const char output_path, const STMultiFile mf,
	const ModelArchitecture *arch,
	const TokenizerData *tokenizer,
	OptimizerMode opt_mode,
	const IMatrixData *imatrix,
	int verbose)
	{
	FILE *fp = fopen(output_path, "wb");
	if (!fp) {
	fprintf(stderr, " ERROR: Cannot open '%s' for writing\n", output_path);
	return -1;
	}

	printf("\n ╔════════════════════════════════════════════════════════════════╗\n");
	printf(" ║ WRITING GGUF FILE ║\n");
	printf(" ╚════════════════════════════════════════════════════════════════╝\n\n");

	/* ── Determine which tensors to include ── */
	int include_list = (int )calloc(mf->n_tensors, sizeof(int));
	int n_include = 0;
	for (int i = 0; i < mf->n_tensors; i++) {
	if (!should_skip_tensor(mf->tensor_map[i].name)) {
	include_list[n_include++] = i;
	} else {
	if (verbose) printf(" SKIP: %s (not needed in GGUF)\n", mf->tensor_map[i].name);
	}
	}

	/* ── Count metadata KV pairs ── */
	int n_kv = 0;
	n_kv++; /* general.architecture */
	n_kv++; /* general.name */
	n_kv++; /* general.quantization_version */
	n_kv++; /* general.file_type */
	n_kv++; /* {arch}.context_length */
	n_kv++; /* {arch}.embedding_length */
	n_kv++; /* {arch}.block_count */
	n_kv++; /* {arch}.feed_forward_length */
	n_kv++; /* {arch}.attention.head_count */
	n_kv++; /* {arch}.attention.head_count_kv */
	n_kv++; /* {arch}.attention.layer_norm_rms_epsilon */
	n_kv++; /* {arch}.rope.freq_base */
	n_kv++; /* {arch}.vocab_size */

	/* Tokenizer metadata KV count */
	int has_tokenizer = (tokenizer != NULL && tokenizer->vocab_size > 0);
	if (has_tokenizer) {
	n_kv++; /* tokenizer.ggml.model */
	n_kv++; /* tokenizer.ggml.tokens */
	n_kv++; /* tokenizer.ggml.scores */
	n_kv++; /* tokenizer.ggml.token_type */
	n_kv++; /* tokenizer.ggml.bos_token_id */
	n_kv++; /* tokenizer.ggml.eos_token_id */
	n_kv++; /* tokenizer.ggml.unknown_token_id */
	if (tokenizer->n_merges > 0)
	n_kv++; /* tokenizer.ggml.merges */
	}

	/* ── Check for weight tying ──
	* If tie_word_embeddings is set and there's no separate lm_head,
	* llama.cpp handles this internally — do NOT duplicate the tensor.
	* Only add output.weight if the model has a separate lm_head.weight. */
	int has_lm_head = (st_multi_find_tensor(mf, "lm_head.weight") >= 0);
	int total_tensors = n_include;

	if (arch->tie_word_embeddings && !has_lm_head) {
	printf(" Weight-tied embeddings detected — llama.cpp handles internally\n\n");
	}

	/* ── Prepare tensor info ── */
	char (*gguf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN);
	GGMLType *tensor_types = calloc(total_tensors, sizeof(GGMLType));
	int64_t *tensor_sizes = calloc(total_tensors, sizeof(int64_t));
	uint64_t data_offset = 0;
	uint64_t *tensor_offsets = calloc(total_tensors, sizeof(uint64_t));
	int tensor_src_idx = calloc(total_tensors, sizeof(int)); / map to unified ST index */
	char (*tensor_hf_names)[ST_MAX_NAME_LEN] = calloc(total_tensors, ST_MAX_NAME_LEN);

	GGMLType quant_type = GGML_TYPE_Q2_K;

	for (int i = 0; i < n_include; i++) {
	int src = include_list[i];
	const STTensorInfo *ti = st_multi_tensor_info(mf, src);
	map_tensor_name(mf->tensor_map[src].name, gguf_names[i], ST_MAX_NAME_LEN);
	strncpy(tensor_hf_names[i], mf->tensor_map[src].name, ST_MAX_NAME_LEN - 1);
	tensor_src_idx[i] = src;

	if (should_quantize(ti, gguf_names[i])) {
	if (is_attention_tensor(gguf_names[i])) {
	tensor_types[i] = GGML_TYPE_Q4_0;
	int64_t n_blocks_q4 = (ti->n_elements + QK4_0 - 1) / QK4_0;
	tensor_sizes[i] = n_blocks_q4 * sizeof(BlockQ4_0);
	if (verbose)
	printf(" [ATTN→Q4_0] %s (%ld elements)\n",
	gguf_names[i], (long)ti->n_elements);
	} else {
	tensor_types[i] = quant_type;
	tensor_sizes[i] = ggml_type_size(quant_type, ti->n_elements);
	}
	} else if (ti->n_dims >= 2) {
	tensor_types[i] = GGML_TYPE_F16;
	tensor_sizes[i] = ti->n_elements * sizeof(uint16_t);
	} else {
	tensor_types[i] = GGML_TYPE_F32;
	tensor_sizes[i] = ti->n_elements * sizeof(float);
	}

	tensor_offsets[i] = data_offset;

	data_offset += tensor_sizes[i];
	data_offset = (data_offset + GGUF_DEFAULT_ALIGNMENT - 1) &
	~(uint64_t)(GGUF_DEFAULT_ALIGNMENT - 1);
	}

	/* ── Write header ── */
	gguf_write_header(fp, total_tensors, n_kv);

	/* ── Write metadata KV pairs ── */
	gguf_write_kv_string(fp, "general.architecture", arch->architecture);
	gguf_write_kv_string(fp, "general.name", arch->name);
	gguf_write_kv_uint32(fp, "general.quantization_version", 2);
	gguf_write_kv_uint32(fp, "general.file_type", 10); /* Q2_K = 10 */

	char kbuf[128];
	snprintf(kbuf, sizeof(kbuf), "%s.context_length", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->context_length);

	snprintf(kbuf, sizeof(kbuf), "%s.embedding_length", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->embedding_length);

	snprintf(kbuf, sizeof(kbuf), "%s.block_count", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->block_count);

	snprintf(kbuf, sizeof(kbuf), "%s.feed_forward_length", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->feed_forward_length);

	snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->head_count);

	snprintf(kbuf, sizeof(kbuf), "%s.attention.head_count_kv", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->head_count_kv);

	snprintf(kbuf, sizeof(kbuf), "%s.attention.layer_norm_rms_epsilon", arch->architecture);
	gguf_write_kv_float32(fp, kbuf, arch->rms_norm_eps);

	snprintf(kbuf, sizeof(kbuf), "%s.rope.freq_base", arch->architecture);
	gguf_write_kv_float32(fp, kbuf, arch->rope_freq_base);

	snprintf(kbuf, sizeof(kbuf), "%s.vocab_size", arch->architecture);
	gguf_write_kv_uint32(fp, kbuf, arch->vocab_size);

	/* ── Write tokenizer metadata ── */
	if (has_tokenizer) {
	gguf_write_kv_string(fp, "tokenizer.ggml.model", tokenizer->model_type);
	gguf_write_kv_string_array(fp, "tokenizer.ggml.tokens",
	(const char **)tokenizer->tokens,
	(uint64_t)tokenizer->vocab_size);
	gguf_write_kv_float32_array(fp, "tokenizer.ggml.scores",
	tokenizer->scores,
	(uint64_t)tokenizer->vocab_size);
	gguf_write_kv_int32_array(fp, "tokenizer.ggml.token_type",
	tokenizer->token_types,
	(uint64_t)tokenizer->vocab_size);
	gguf_write_kv_uint32(fp, "tokenizer.ggml.bos_token_id",
	(uint32_t)tokenizer->bos_id);
	gguf_write_kv_uint32(fp, "tokenizer.ggml.eos_token_id",
	(uint32_t)tokenizer->eos_id);
	gguf_write_kv_uint32(fp, "tokenizer.ggml.unknown_token_id",
	(uint32_t)tokenizer->unk_id);
	if (tokenizer->n_merges > 0) {
	gguf_write_kv_string_array(fp, "tokenizer.ggml.merges",
	(const char **)tokenizer->merges,
	(uint64_t)tokenizer->n_merges);
	}
	printf(" Tokenizer metadata written (%d tokens, %d merges)\n\n",
	tokenizer->vocab_size, tokenizer->n_merges);
	}

	/* ── Write tensor info descriptors ── */
	for (int i = 0; i < total_tensors; i++) {
	int src = tensor_src_idx[i];
	const STTensorInfo *ti = st_multi_tensor_info(mf, src);
	uint64_t dims[ST_MAX_DIMS];
	int nd = ti->n_dims;
	for (int d = 0; d < nd; d++) {
	dims[d] = (uint64_t)ti->shape[nd - 1 - d];
	}
	gguf_write_tensor_info(fp, gguf_names[i],
	ti->n_dims, dims,
	tensor_types[i], tensor_offsets[i]);
	}

	/* ── Alignment padding before data section ── */
	gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT);

	/* ── Write tensor data ── */
	printf(" Quantizing and writing %d tensors...\n\n", total_tensors);

	float total_error_sum = 0.0f;
	int quant_count = 0;
	int64_t total_elements_quantized = 0;
	int64_t total_bytes_quantized = 0;
	int64_t total_bytes_unquantized = 0;
	time_t quant_start = time(NULL);

	for (int i = 0; i < total_tensors; i++) {
	int src = tensor_src_idx[i];
	const STTensorInfo *ti = st_multi_tensor_info(mf, src);

	print_progress_bar(i, total_tensors, gguf_names[i], quant_start);

	if (tensor_types[i] == GGML_TYPE_Q2_K) {
	float *f32_data = st_multi_tensor_to_f32(mf, src);
	if (!f32_data) {
	fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n",
	ti->name);
	continue;
	}

	int64_t n_elements = ti->n_elements;
	float tensor_error = 0.0f;

	int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
	if (padded > n_elements) {
	float grown = realloc(f32_data, padded sizeof(float));
	if (!grown) {
	fprintf(stderr, "\n ERROR: Out of memory padding '%s'\n",
	ti->name);
	free(f32_data);
	continue;
	}
	f32_data = grown;
	for (int64_t j = n_elements; j < padded; j++)
	f32_data[j] = 0.0f;
	n_elements = padded;
	}

	int64_t n_blocks = n_elements / QK_K;
	BlockQ2K *quant_data = calloc(n_blocks, sizeof(BlockQ2K));

	const float *imp = NULL;
	if (imatrix) {
	const IMatrixEntry *ime = imatrix_find_any(imatrix,
	gguf_names[i], tensor_hf_names[i]);
	if (ime && ime->n_values > 0) {
	imp = ime->normalized;
	if (verbose)
	printf("\n imatrix: using %d importance weights for %s\n",
	ime->n_values, gguf_names[i]);
	}
	}

	quantize_tensor_q2k_hpc(f32_data, n_elements,
	quant_data, &tensor_error,
	opt_mode, imp, verbose);

	fwrite(quant_data, sizeof(BlockQ2K), n_blocks, fp);

	float rmse = sqrtf(tensor_error / (float)ti->n_elements);

	double wss = 0.0;
	for (int64_t j = 0; j < ti->n_elements; j++)
	wss += (double)f32_data[j] * (double)f32_data[j];
	float w_sig = (float)sqrt(wss / (double)ti->n_elements);

	const char *fid;
	if (rmse <= 1.0e-04f) fid = "★★★★ ULTRA";
	else if (rmse <= 3.0e-04f) fid = "★★★☆ HIGH";
	else if (rmse <= 1.0e-03f) fid = "★★☆☆ GOOD";
	else fid = "★☆☆☆ STD";

	if (verbose) {
	printf("\n [Q2_K·Shor] %-47s\n", gguf_names[i]);
	printf(" %10ld elements → %ld bytes σ=%.2e RMSE=%.4e %s\n",
	(long)ti->n_elements,
	(long)(n_blocks * sizeof(BlockQ2K)),
	w_sig, rmse, fid);
	}

	total_error_sum += tensor_error;
	total_elements_quantized += ti->n_elements;
	total_bytes_quantized += n_blocks * sizeof(BlockQ2K);
	quant_count++;

	free(quant_data);
	free(f32_data);
	} else if (tensor_types[i] == GGML_TYPE_Q4_0) {
	float *f32_data = st_multi_tensor_to_f32(mf, src);
	if (!f32_data) {
	fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n",
	ti->name);
	continue;
	}

	int64_t n_elements = ti->n_elements;

	int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
	if (padded > n_elements) {
	float grown = realloc(f32_data, padded sizeof(float));
	if (!grown) {
	fprintf(stderr, "\n ERROR: Out of memory padding '%s'\n",
	ti->name);
	free(f32_data);
	continue;
	}
	f32_data = grown;
	for (int64_t j = n_elements; j < padded; j++)
	f32_data[j] = 0.0f;
	n_elements = padded;
	}

	int64_t n_blocks_q4 = n_elements / QK4_0;
	BlockQ4_0 *q4_data = calloc(n_blocks_q4, sizeof(BlockQ4_0));
	float tensor_error = 0.0f;

	const float *imp = NULL;
	if (imatrix) {
	const IMatrixEntry *ime = imatrix_find_any(imatrix,
	gguf_names[i], tensor_hf_names[i]);
	if (ime && ime->n_values > 0) {
	imp = ime->normalized;
	if (verbose)
	printf("\n imatrix: using %d importance weights for %s\n",
	ime->n_values, gguf_names[i]);
	}
	}

	quantize_tensor_q4_0_hpc(f32_data, n_elements,
	q4_data, &tensor_error,
	imp, verbose);

	fwrite(q4_data, sizeof(BlockQ4_0), n_blocks_q4, fp);

	float rmse = sqrtf(tensor_error / (float)ti->n_elements);

	double wss4 = 0.0;
	for (int64_t j = 0; j < ti->n_elements; j++)
	wss4 += (double)f32_data[j] * (double)f32_data[j];
	float w_sig4 = (float)sqrt(wss4 / (double)ti->n_elements);

	const char *fid4;
	if (rmse <= 1.0e-04f) fid4 = "★★★★ ULTRA";
	else if (rmse <= 3.0e-04f) fid4 = "★★★☆ HIGH";
	else if (rmse <= 1.0e-03f) fid4 = "★★☆☆ GOOD";
	else fid4 = "★☆☆☆ STD";

	if (verbose) {
	printf("\n [Q4_0·Shor] %-47s\n", gguf_names[i]);
	printf(" %10ld elements → %ld bytes σ=%.2e RMSE=%.4e %s\n",
	(long)ti->n_elements,
	(long)(n_blocks_q4 * sizeof(BlockQ4_0)),
	w_sig4, rmse, fid4);
	}

	total_error_sum += tensor_error;
	total_elements_quantized += ti->n_elements;
	total_bytes_quantized += n_blocks_q4 * sizeof(BlockQ4_0);
	quant_count++;

	free(q4_data);
	free(f32_data);
	} else if (tensor_types[i] == GGML_TYPE_F16) {
	float *f32_data = st_multi_tensor_to_f32(mf, src);
	if (!f32_data) {
	fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n",
	ti->name);
	continue;
	}

	uint16_t f16_data = (uint16_t )malloc(ti->n_elements * sizeof(uint16_t));
	for (int64_t j = 0; j < ti->n_elements; j++)
	f16_data[j] = gguf_fp32_to_fp16(f32_data[j]);

	fwrite(f16_data, sizeof(uint16_t), ti->n_elements, fp);

	total_bytes_unquantized += ti->n_elements * sizeof(uint16_t);

	if (verbose) {
	printf("\n [F16 ] %-50s %10ld elements → %ld bytes\n",
	gguf_names[i], (long)ti->n_elements,
	(long)(ti->n_elements * sizeof(uint16_t)));
	}

	free(f16_data);
	free(f32_data);
	} else {
	float *f32_data = st_multi_tensor_to_f32(mf, src);
	if (!f32_data) {
	fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n",
	ti->name);
	continue;
	}

	fwrite(f32_data, sizeof(float), ti->n_elements, fp);

	total_bytes_unquantized += ti->n_elements * sizeof(float);

	if (verbose) {
	printf("\n [F32 ] %-50s %10ld elements → %ld bytes\n",
	gguf_names[i], (long)ti->n_elements,
	(long)(ti->n_elements * sizeof(float)));
	}

	free(f32_data);
	}

	gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT);
	}

	print_progress_bar(total_tensors, total_tensors, "done", quant_start);

	long final_size = ftell(fp);
	fclose(fp);

	int64_t original_f32_size = 0;
	for (int i = 0; i < total_tensors; i++) {
	const STTensorInfo *ti = st_multi_tensor_info(mf, tensor_src_idx[i]);
	original_f32_size += ti->n_elements * sizeof(float);
	}
	float compression_ratio = (original_f32_size > 0) ?
	(float)original_f32_size / (float)final_size : 0.0f;
	float effective_bpw = (total_elements_quantized > 0) ?
	8.0f * (float)total_bytes_quantized / (float)total_elements_quantized :
	0.0f;
	float total_rmse = (total_elements_quantized > 0) ?
	sqrtf(total_error_sum / (float)total_elements_quantized) : 0.0f;
	float mean_mse_per_tensor = (quant_count > 0) ?
	total_error_sum / (float)quant_count : 0.0f;

	const char overall_fid, overall_icon;
	if (total_rmse <= 1.0e-04f) { overall_fid = "ULTRA (≤1e-04)"; overall_icon = "★★★★"; }
	else if (total_rmse <= 3.0e-04f) { overall_fid = "HIGH (≤3e-04)"; overall_icon = "★★★☆"; }
	else if (total_rmse <= 1.0e-03f) { overall_fid = "GOOD (≤1e-03)"; overall_icon = "★★☆☆"; }
	else { overall_fid = "STANDARD"; overall_icon = "★☆☆☆"; }

	printf("\n ╔════════════════════════════════════════════════════════════════╗\n");
	printf(" ║ SHOR-OPTIMIZED QUANTIZATION SUMMARY ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ ║\n");
	printf(" ║ Engine: Griffiths-Niu Sequential Measurement ║\n");
	printf(" ║ Protocol: IDFT6 → feed-forward → Born → collapse ║\n");
	printf(" ║ Origin: tesseract_factor.c (Shor's algorithm) ║\n");
	printf(" ║ ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ Tensors quantized: %-33d ║\n", quant_count);
	printf(" ║ Elements quantized: %15ld ║\n",
	(long)total_elements_quantized);
	printf(" ║ Quantized data: %12ld bytes (%6.1f MB) ║\n",
	(long)total_bytes_quantized,
	(double)total_bytes_quantized / (1024.0 * 1024.0));
	printf(" ║ Unquantized data: %12ld bytes (%6.1f MB) ║\n",
	(long)total_bytes_unquantized,
	(double)total_bytes_unquantized / (1024.0 * 1024.0));
	printf(" ║ Effective bits/weight: %15.2f ║\n",
	effective_bpw);
	printf(" ║ Compression ratio: %15.1fx ║\n",
	compression_ratio);
	printf(" ║ ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ FIDELITY METRICS (target: 1e-04) ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ ║\n");
	printf(" ║ Total MSE: %15.6e ║\n",
	total_error_sum);
	printf(" ║ Per-element RMSE: %15.4e ║\n",
	total_rmse);
	printf(" ║ Mean MSE/tensor: %15.6e ║\n",
	mean_mse_per_tensor);
	printf(" ║ ║\n");
	printf(" ║ Fidelity class: %s %-14s ║\n",
	overall_icon, overall_fid);
	if (total_rmse <= 1.0e-04f)
	printf(" ║ ✓ RMSE ≤ 1e-04: TARGET MET — maximum fidelity achieved ║\n");
	else if (total_rmse <= 3.0e-04f)
	printf(" ║ ◐ RMSE ≤ 3e-04: near target — high fidelity achieved ║\n");
	else
	printf(" ║ ○ RMSE > 3e-04: below target — weight σ may be large ║\n");
	printf(" ║ ║\n");
	printf(" ╠════════════════════════════════════════════════════════════════╣\n");
	printf(" ║ Output file: %ld bytes (%.1f MB)%*s║\n",
	final_size, (double)final_size / (1024.0 * 1024.0),
	(int)(27 - snprintf(NULL, 0, "%ld bytes (%.1f MB)",
	final_size, (double)final_size / (1024.0 * 1024.0))), "");
	printf(" ╚════════════════════════════════════════════════════════════════╝\n\n");

	free(include_list);
	free(gguf_names);
	free(tensor_types);
	free(tensor_sizes);
	free(tensor_offsets);
	free(tensor_src_idx);
	free(tensor_hf_names);

	return 0;
	}

	/* ═══════════════════════════════════════════════════════════════════════════
	* LIBRARY API — Exported functions for Python ctypes integration
	*
	* When built with -DHEXSTATE_LIBRARY, these are the only public symbols.
	* The Python GGUF pipeline handles metadata/IO; C handles HPC quantization.
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* One-time setup of the HExState subsystems used by the quantizers.
	 *
	 * Must run before any quantization call. The first invocation seeds the C
	 * PRNG with a fixed value (so results are reproducible) and runs the three
	 * subsystem init/reset routines; every later invocation returns at once.
	 */
	void hexstate_init(void)
	{
	    static int already_done = 0;

	    if (already_done)
	        return;

	    srand(42); /* fixed seed: deterministic output from run to run */
	    triality_exotic_init();
	    s6_exotic_init();
	    triality_stats_reset();

	    already_done = 1;
	}

	/* Quantize a single tensor's F32 data to Q2_K using HPC optimization.
	 *
	 * Library entry point: makes sure the HExState subsystems are initialized,
	 * then forwards to quantize_tensor_q2k_hpc() with no importance weights.
	 *
	 * Parameters:
	 *   weights:    input F32 data (must be padded to a multiple of 256)
	 *   n_elements: number of elements (must be a multiple of 256)
	 *   output:     output buffer (must be n_elements/256 * 84 bytes);
	 *               reinterpreted as an array of BlockQ2K
	 *   out_error:  pointer to receive total MSE (can be NULL); forwarded
	 *               unchanged to quantize_tensor_q2k_hpc()
	 *   opt_mode:   0=HPC, 1=MSE, 2=Hybrid (recommended); cast to OptimizerMode
	 *   verbose:    1 for per-block diagnostics
	 */
	void hexstate_quantize_tensor_q2k(const float *weights, int64_t n_elements,
	                                  void *output, float *out_error,
	                                  int opt_mode, int verbose)
	{
	    hexstate_init();
	    quantize_tensor_q2k_hpc(weights, n_elements,
	                            (BlockQ2K *)output, out_error,
	                            (OptimizerMode)opt_mode, NULL, verbose);
	}

	/* Q2_K quantization of a single F32 tensor with importance-matrix weights.
	 *
	 * Identical to hexstate_quantize_tensor_q2k() except that imat_importance
	 * is passed through to quantize_tensor_q2k_hpc() instead of NULL.
	 *
	 * Parameters:
	 *   weights:         input F32 data
	 *   n_elements:      number of elements
	 *   output:          output buffer, reinterpreted as an array of BlockQ2K
	 *   out_error:       pointer to receive the error total; forwarded unchanged
	 *   opt_mode:        optimizer selector, cast to OptimizerMode
	 *   imat_importance: importance weights forwarded to the quantizer
	 *                    (layout not visible here — confirm against
	 *                    quantize_tensor_q2k_hpc)
	 *   verbose:         1 for per-block diagnostics
	 */
	void hexstate_quantize_tensor_q2k_imat(const float *weights, int64_t n_elements,
	                                       void *output, float *out_error,
	                                       int opt_mode,
	                                       const float *imat_importance,
	                                       int verbose)
	{
	    hexstate_init();
	    quantize_tensor_q2k_hpc(weights, n_elements,
	                            (BlockQ2K *)output, out_error,
	                            (OptimizerMode)opt_mode, imat_importance, verbose);
	}

	/* Size in bytes of one Q2_K block (84 bytes per 256 elements). */
	int hexstate_q2k_block_bytes(void)
	{
	    return (int)sizeof(BlockQ2K);
	}
	/* Number of elements covered by one Q2_K block (the QK_K constant). */
	int hexstate_q2k_block_elements(void)
	{
	    const int elements_per_block = QK_K;
	    return elements_per_block;
	}

	/* HPC-optimized Q4_0 quantization for attention tensors.
	 * Called from Python requantizer via ctypes.
	 *
	 * The error total is accumulated in a local and copied out only when the
	 * caller supplied a destination, so out_error may safely be NULL.
	 *
	 * Parameters:
	 *   weights:         input F32 weights
	 *   n_elements:      number of elements (must be a multiple of 32)
	 *   output:          output buffer (must be n_elements/32 * 18 bytes);
	 *                    reinterpreted as an array of BlockQ4_0
	 *   out_error:       pointer to receive total MSE (can be NULL)
	 *   imat_importance: optional per-element importance weights
	 *   verbose:         1 for per-block diagnostics
	 */
	void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
	                                       void *output, float *out_error,
	                                       const float *imat_importance,
	                                       int verbose)
	{
	    hexstate_init();

	    float err = 0.0f;
	    quantize_tensor_q4_0_hpc(weights, n_elements,
	                             (BlockQ4_0 *)output, &err,
	                             imat_importance, verbose);

	    if (out_error)
	        *out_error = err;
	}

	/* Size in bytes of one Q8_0 block (sizeof(hex_block_q8_0)). */
	int hexstate_q8_0_block_bytes(void)
	{
	    const size_t block_size = sizeof(hex_block_q8_0);
	    return (int)block_size;
	}
	/* Number of elements covered by one Q8_0 block (the QK8_0 constant). */
	int hexstate_q8_0_block_elements(void)
	{
	    const int elements_per_block = QK8_0;
	    return elements_per_block;
	}

	/* HPC-optimized Q8_0 quantization, exported for library callers.
	 *
	 * Ensures the HExState subsystems are initialized (as the Q2_K and Q4_0
	 * entry points do) and forwards to quantize_tensor_q8_0_hpc().
	 *
	 * Parameters:
	 *   weights:         input F32 weights
	 *   n_elements:      number of elements (presumably a multiple of QK8_0 —
	 *                    confirm against quantize_tensor_q8_0_hpc)
	 *   output:          output buffer, reinterpreted as an array of
	 *                    hex_block_q8_0
	 *   out_error:       pointer to receive the error total; forwarded unchanged
	 *                    (NOTE(review): NULL handling depends on the callee —
	 *                    confirm before passing NULL)
	 *   imat_importance: importance weights forwarded to the quantizer
	 *   verbose:         1 for per-block diagnostics
	 */
	void hexstate_quantize_tensor_q8_0_hpc(const float *weights, int64_t n_elements,
	                                       void *output, float *out_error,
	                                       const float *imat_importance, int verbose)
	{
	    /* Idempotent; keeps this entry point usable as the first library call. */
	    hexstate_init();
	    quantize_tensor_q8_0_hpc(weights, n_elements,
	                             (hex_block_q8_0 *)output, out_error,
	                             imat_importance, verbose);
	}

	#ifndef HEXSTATE_LIBRARY
	/* ═══════════════════════════════════════════════════════════════════════════
	* MAIN
	* ═══════════════════════════════════════════════════════════════════════════ */

	/* Command-line entry point.
	 *
	 * Usage: <prog> <input> <output.gguf> [options]
	 *
	 * Steps: parse options, open the input (a single .safetensors file or a
	 * directory of shards), detect the model architecture (honouring --config
	 * and --qwen), load the tokenizer and the optional importance matrix, then
	 * hand everything to write_gguf().
	 *
	 * Returns 1 on usage, option or load errors; otherwise the value returned
	 * by write_gguf().
	 */
	int main(int argc, char **argv)
	{
	    srand((unsigned)time(NULL));

	    /* Initialize HExState subsystems */
	    triality_exotic_init();
	    s6_exotic_init();
	    triality_stats_reset();

	    printf("\n");
	    printf(" ╔════════════════════════════════════════════════════════════════╗\n");
	    printf(" ║ ║\n");
	    printf(" ║ HExState GGUF QUANTIZER v3.0 — Shor-Optimized ║\n");
	    printf(" ║ ║\n");
	    printf(" ║ Architecture: HPCGraph Sensitivity Propagation ║\n");
	    printf(" ║ Optimization: Shor's Griffiths-Niu Measurement + iMatrix ║\n");
	    printf(" ║ Output: GGUF v3 (Q2_K, 2.625 bpw) ║\n");
	    printf(" ║ ║\n");
	    printf(" ║ \"The weight and the quantized are opposite faces.\" ║\n");
	    printf(" ║ ║\n");
	    printf(" ╚════════════════════════════════════════════════════════════════╝\n\n");

	    if (argc < 3) {
	        printf(" Usage: %s <input> <output.gguf> [options]\n\n", argv[0]);
	        printf(" Input:\n");
	        printf(" Single .safetensors file, or\n");
	        printf(" Model directory with sharded .safetensors files\n\n");
	        printf(" Options:\n");
	        printf(" --optimizer hpc|mse|hybrid Scale optimization (default: hybrid)\n");
	        printf(" --imatrix <file> Importance matrix for Q2_K quality\n");
	        printf(" --config <file> Explicit config.json for arch detection\n");
	        printf(" --qwen Force Qwen 3.5/3.6 architecture\n");
	        printf(" --verbose Per-block diagnostics\n\n");
	        return 1;
	    }

	    const char *input_path = argv[1];
	    const char *output_path = argv[2];
	    OptimizerMode opt_mode = OPT_HYBRID;
	    const char *imatrix_path = NULL;
	    const char *config_override = NULL;
	    int verbose = 0;
	    int force_qwen = 0;

	    /* Parse options */
	    for (int i = 3; i < argc; i++) {
	        const char *opt = argv[i];
	        int takes_value = strcmp(opt, "--optimizer") == 0 ||
	                          strcmp(opt, "--imatrix") == 0 ||
	                          strcmp(opt, "--config") == 0;

	        /* A value-taking option at the end of argv is a distinct error,
	         * not an unknown option. */
	        if (takes_value && i + 1 >= argc) {
	            fprintf(stderr, " ERROR: Option '%s' requires a value\n", opt);
	            return 1;
	        }

	        if (strcmp(opt, "--optimizer") == 0) {
	            i++;
	            if (strcmp(argv[i], "hpc") == 0) opt_mode = OPT_HPC;
	            else if (strcmp(argv[i], "mse") == 0) opt_mode = OPT_MSE;
	            else if (strcmp(argv[i], "hybrid") == 0) opt_mode = OPT_HYBRID;
	            else {
	                fprintf(stderr, " ERROR: Unknown optimizer '%s'. Use hpc, mse, or hybrid.\n", argv[i]);
	                return 1;
	            }
	        } else if (strcmp(opt, "--imatrix") == 0) {
	            imatrix_path = argv[++i];
	        } else if (strcmp(opt, "--config") == 0) {
	            config_override = argv[++i];
	        } else if (strcmp(opt, "--qwen") == 0) {
	            force_qwen = 1;
	        } else if (strcmp(opt, "--verbose") == 0) {
	            verbose = 1;
	        } else {
	            fprintf(stderr, " ERROR: Unknown option '%s'\n", opt);
	            return 1;
	        }
	    }

	    const char *opt_names[] = {"HPC (BP only)", "MSE (grid search)", "Hybrid (HPC+MSE)"};
	    printf(" Input: %s\n", input_path);
	    printf(" Output: %s\n", output_path);
	    printf(" Quant type: Q2_K (2.625 bpw)\n");
	    printf(" Optimizer: %s\n", opt_names[opt_mode]);
	    if (imatrix_path) printf(" iMatrix: %s\n", imatrix_path);
	    if (config_override) printf(" Config: %s\n", config_override);
	    if (force_qwen) printf(" Model: Qwen 3.5/3.6 (forced via --qwen)\n");
	    printf("\n");

	    /* ── Phase 1: Load model ── */
	    printf(" Phase 1: Loading model...\n");
	    time_t t_start = time(NULL);

	    /* Determine if input is a file or directory */
	    struct stat st;
	    if (stat(input_path, &st) != 0) {
	        fprintf(stderr, " ERROR: Cannot access '%s'\n", input_path);
	        return 1;
	    }

	    STMultiFile *mf = NULL;
	    char input_dir[512] = "";

	    if (S_ISDIR(st.st_mode)) {
	        /* Input is a directory — open all shards */
	        mf = st_open_dir(input_path);

	        /* Copy at most sizeof - 2 characters so a trailing '/' always fits. */
	        snprintf(input_dir, sizeof(input_dir) - 1, "%s", input_path);
	        size_t dlen = strlen(input_dir);
	        if (dlen > 0 && input_dir[dlen - 1] != '/') {
	            input_dir[dlen] = '/';
	            input_dir[dlen + 1] = '\0';
	        }
	    } else {
	        /* Input is a single file — wrap in STMultiFile */
	        STFile *sf = st_open(input_path);
	        if (!sf) {
	            fprintf(stderr, " ERROR: Failed to open '%s'\n", input_path);
	            return 1;
	        }
	        mf = (STMultiFile *)calloc(1, sizeof(STMultiFile));
	        if (!mf) {
	            /* The process exits right away, so the open shard is
	             * reclaimed by the OS. */
	            fprintf(stderr, " ERROR: Out of memory wrapping '%s'\n", input_path);
	            return 1;
	        }
	        mf->shards[0] = sf;
	        mf->n_shards = 1;
	        for (int i = 0; i < sf->n_tensors && mf->n_tensors < ST_MAX_TENSORS; i++) {
	            /* Destination was zeroed by calloc, so the copy stays terminated. */
	            strncpy(mf->tensor_map[mf->n_tensors].name,
	                    sf->tensors[i].name, ST_MAX_NAME_LEN - 1);
	            mf->tensor_map[mf->n_tensors].shard_idx = 0;
	            mf->tensor_map[mf->n_tensors].tensor_idx = i;
	            mf->n_tensors++;
	        }

	        /* Extract directory from file path */
	        snprintf(input_dir, sizeof(input_dir), "%s", input_path);
	        char *last_slash = strrchr(input_dir, '/');
	        if (last_slash) {
	            *(last_slash + 1) = '\0';
	        } else {
	            strcpy(input_dir, "./");
	        }
	    }

	    if (!mf) {
	        fprintf(stderr, " ERROR: Failed to load model from '%s'\n", input_path);
	        return 1;
	    }

	    st_multi_print_summary(mf);

	    time_t t_load = time(NULL);
	    printf(" Loaded in %.0f seconds\n\n", difftime(t_load, t_start));

	    /* ── Phase 2: Detect architecture ── */
	    printf(" Phase 2: Detecting model architecture...\n");

	    char config_path[1024];
	    const char *config_ptr = NULL;

	    /* An explicit --config takes precedence over the model directory. */
	    if (config_override) {
	        FILE *check = fopen(config_override, "rb");
	        if (check) {
	            fclose(check);
	            config_ptr = config_override;
	            printf(" Using config override: %s\n", config_override);
	        } else {
	            fprintf(stderr, " WARNING: Cannot open config '%s'; falling back to model directory\n",
	                    config_override);
	        }
	    }

	    /* Otherwise try to read config.json from model directory */
	    if (!config_ptr) {
	        snprintf(config_path, sizeof(config_path), "%sconfig.json", input_dir);
	        FILE *check = fopen(config_path, "rb");
	        if (check) {
	            fclose(check);
	            config_ptr = config_path;
	            printf(" Found config.json: %s\n", config_path);
	        }
	    }

	    ModelArchitecture arch;
	    detect_architecture(mf, &arch, config_ptr);

	    /* --qwen override: force Qwen 3.5/3.6 architecture parameters */
	    if (force_qwen) {
	        strcpy(arch.architecture, "qwen2");
	        strcpy(arch.name, "Qwen3.6-HExState-Q2K");
	        printf(" [--qwen] Forcing qwen2-compatible architecture\n");
	    }

	    printf(" ╔═══════════════════════════════════════════════════════════════╗\n");
	    printf(" ║ Model Architecture ║\n");
	    printf(" ╠═══════════════════════════════════════════════════════════════╣\n");
	    printf(" ║ Architecture: %-40s ║\n", arch.architecture);
	    printf(" ║ Layers: %-40u ║\n", arch.block_count);
	    printf(" ║ Hidden size: %-40u ║\n", arch.embedding_length);
	    printf(" ║ Attention heads: %-40u ║\n", arch.head_count);
	    printf(" ║ KV heads: %-40u ║\n", arch.head_count_kv);
	    printf(" ║ Vocab size: %-40u ║\n", arch.vocab_size);
	    printf(" ║ FFN size: %-40u ║\n", arch.feed_forward_length);
	    printf(" ║ Context length: %-40u ║\n", arch.context_length);
	    printf(" ║ Has bias: %-40s ║\n", arch.has_bias ? "yes" : "no");
	    printf(" ║ Tied embeddings: %-40s ║\n", arch.tie_word_embeddings ? "yes" : "no");
	    printf(" ╚═══════════════════════════════════════════════════════════════╝\n\n");

	    /* ── Phase 2b: Load tokenizer ── */
	    printf(" Phase 2b: Loading tokenizer...\n");
	    TokenizerData *tokenizer = NULL;
	    {
	        /* Sized like config_path: input_dir alone can hold up to 511
	         * characters, so 512-byte buffers could truncate the file name. */
	        char tok_json[1024], tok_config[1024];
	        snprintf(tok_json, sizeof(tok_json), "%stokenizer.json", input_dir);
	        snprintf(tok_config, sizeof(tok_config), "%stokenizer_config.json", input_dir);

	        tokenizer = tok_load(tok_json, tok_config);
	        if (tokenizer) {
	            tok_print_summary(tokenizer);
	        } else {
	            printf(" No tokenizer found in '%s'\n", input_dir);
	            printf(" (Output GGUF will lack tokenizer data — not inference-ready)\n\n");
	        }
	    }

	    /* ── Phase 2c: Load importance matrix (optional) ── */
	    IMatrixData *imatrix = NULL;
	    if (imatrix_path) {
	        printf(" Phase 2c: Loading importance matrix...\n");
	        imatrix = imatrix_load(imatrix_path);
	        if (imatrix) {
	            imatrix_print_summary(imatrix);
	        } else {
	            printf(" WARNING: Failed to load imatrix from '%s'\n", imatrix_path);
	            printf(" Proceeding without importance weighting.\n\n");
	        }
	    }

	    /* ── Phase 3-5: Quantize and write GGUF ── */
	    printf(" Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n");
	    int result = write_gguf(output_path, mf, &arch, tokenizer,
	                            opt_mode, imatrix, verbose);

	    /* Wall-clock total: clock() sums CPU time over all OpenMP threads */
	    time_t t_end = time(NULL);
	    printf(" Total time: %.0f seconds\n\n", difftime(t_end, t_start));

	    if (imatrix) imatrix_free(imatrix);
	    if (tokenizer) tok_free(tokenizer);
	    st_multi_close(mf);
	    return result;
	}
	#endif /* HEXSTATE_LIBRARY */