Instructions to use ByteDance-Seed/Stable-DiffCoder-8B-Base with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use ByteDance-Seed/Stable-DiffCoder-8B-Base with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="ByteDance-Seed/Stable-DiffCoder-8B-Base", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("ByteDance-Seed/Stable-DiffCoder-8B-Base", trust_remote_code=True)
model = AutoModel.from_pretrained("ByteDance-Seed/Stable-DiffCoder-8B-Base", trust_remote_code=True)

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use ByteDance-Seed/Stable-DiffCoder-8B-Base with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "ByteDance-Seed/Stable-DiffCoder-8B-Base"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ByteDance-Seed/Stable-DiffCoder-8B-Base",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/ByteDance-Seed/Stable-DiffCoder-8B-Base

SGLang

How to use ByteDance-Seed/Stable-DiffCoder-8B-Base with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "ByteDance-Seed/Stable-DiffCoder-8B-Base" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ByteDance-Seed/Stable-DiffCoder-8B-Base",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "ByteDance-Seed/Stable-DiffCoder-8B-Base" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ByteDance-Seed/Stable-DiffCoder-8B-Base",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use ByteDance-Seed/Stable-DiffCoder-8B-Base with Docker Model Runner:
```
docker model run hf.co/ByteDance-Seed/Stable-DiffCoder-8B-Base
```

Fix pulled from Instruct model

by Seas0 - opened Mar 25

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+27

-13

Files changed (2) hide show

config.json +4 -5
modeling_stable_diffcoder.py +23 -8

config.json CHANGED Viewed

@@ -1,8 +1,7 @@
 {
-  "architectures": [
-    "StableDiffcoderForCausalLM"
-  ],
   "auto_map": {
     "AutoModelForCausalLM": "modeling_stable_diffcoder.StableDiffcoderForCausalLM"
   },
   "attention_bias": false,
@@ -21,11 +20,11 @@
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "resid_pdrop": 0.1,
-  "rms_norm_eps": 1e-06,
   "rope_theta": 500000.0,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "5.3.0",
   "use_cache": true,
   "vocab_size": 155136
-}

 {
+  "architectures": ["StableDiffcoderForCausalLM"],
   "auto_map": {
+    "AutoModel": "modeling_stable_diffcoder.StableDiffcoderForCausalLM",
     "AutoModelForCausalLM": "modeling_stable_diffcoder.StableDiffcoderForCausalLM"
   },
   "attention_bias": false,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "resid_pdrop": 0.1,
+  "rms_norm_eps": 1e-6,
   "rope_theta": 500000.0,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "5.3.0",
   "use_cache": true,
   "vocab_size": 155136
+}

modeling_stable_diffcoder.py CHANGED Viewed

@@ -137,8 +137,10 @@ class StableDiffcoderForCausalLM(LlamaForCausalLM):
         prompt_length = input_ids.shape[1]
         gen_block_list = [block_length for _ in range(gen_blocks)]
-        res_block = block_length - (prompt_length % block_length)
-        if res_block > 0:
             gen_block_list = [res_block] + gen_block_list
             gen_block_list[-1] = block_length - res_block
             gen_blocks += 1
@@ -156,16 +158,23 @@ class StableDiffcoderForCausalLM(LlamaForCausalLM):
         nfe = 0
         final_flag = False
         prefill_length = prompt_length // block_length * block_length
         if prefill_length > 0:
             cur_attn_mask = block_diffusion_attention_mask[
                 ..., :prefill_length, :prefill_length
             ]
             self(
                 x[:, :prefill_length],
                 past_key_values=past_key_values,
                 attention_mask=cur_attn_mask,
                 use_cache=True,
-            ).past_key_values
         for block_id, block_size in enumerate(gen_block_list):
             block_start = (
@@ -182,7 +191,7 @@ class StableDiffcoderForCausalLM(LlamaForCausalLM):
             replace_position[:, block_start:block_end] = True
             for token_count in num_transfer_tokens:
-                if token_count:
                     nfe += 1
                     mask_map = x[:, block_start:block_end] == mask_id
                     attention_mask = block_diffusion_attention_mask[
@@ -205,22 +214,28 @@ class StableDiffcoderForCausalLM(LlamaForCausalLM):
                         remasking,
                         mask_map,
                         x[:, block_start:block_end],
-                        token_count if threshold is None else None,
                         threshold,
-                        shift=False,
                     )
                     x[:, block_start:block_end][transfer_map] = x0[transfer_map]
                 if (x[:, block_start:block_end] == mask_id).sum() == 0:
                     if (
                         eos_id is not None
-                        and (x[:, block_start:block_end] == eos_id).sum() > 0
                     ):
                         final_flag = True
                         x = x[:, :block_end]
-                        eos_pos = (x == eos_id).nonzero(as_tuple=True)[1][0].item()
                         x[0, eos_pos:] = eos_id
                         break
                     nfe += 1
                     self(
                         x[:, block_start:block_end],

         prompt_length = input_ids.shape[1]
         gen_block_list = [block_length for _ in range(gen_blocks)]
        # Fix 3: Only add a residual block if the prompt length is NOT evenly divisible by block_length
+        remainder = prompt_length % block_length
+        if remainder != 0:
+            res_block = block_length - remainder
             gen_block_list = [res_block] + gen_block_list
             gen_block_list[-1] = block_length - res_block
             gen_blocks += 1
         nfe = 0
         final_flag = False
         prefill_length = prompt_length // block_length * block_length
         if prefill_length > 0:
             cur_attn_mask = block_diffusion_attention_mask[
                 ..., :prefill_length, :prefill_length
             ]
            # Fix 1: Explicitly pass cache_position for the prefill step on newer transformers.
            # This is not strictly necessary, since transformers generates it automatically for
            # prefilling when unspecified, but the official `generate` method does pass it, so we
            # do the same for consistency and to avoid potential issues in future transformers updates.
+            cache_pos = torch.arange(prefill_length, device=x.device)
             self(
                 x[:, :prefill_length],
                 past_key_values=past_key_values,
                 attention_mask=cur_attn_mask,
                 use_cache=True,
+                cache_position=cache_pos,
+            )
         for block_id, block_size in enumerate(gen_block_list):
             block_start = (
             replace_position[:, block_start:block_end] = True
             for token_count in num_transfer_tokens:
+                if token_count > 0:
                     nfe += 1
                     mask_map = x[:, block_start:block_end] == mask_id
                     attention_mask = block_diffusion_attention_mask[
                         remasking,
                         mask_map,
                         x[:, block_start:block_end],
+                        token_count.item() if threshold is None else None,
                         threshold,
+                        shift=shift,
                     )
                     x[:, block_start:block_end][transfer_map] = x0[transfer_map]
                 if (x[:, block_start:block_end] == mask_id).sum() == 0:
                    # Fix 2: Calculate where the generated tokens ACTUALLY start in this block
+                    gen_start = max(block_start, prompt_length)
                     if (
                         eos_id is not None
+                        and gen_start < block_end
+                        and (x[:, gen_start:block_end] == eos_id).sum() > 0
                     ):
                         final_flag = True
                         x = x[:, :block_end]
+                        eos_pos = (x[:, gen_start:block_end] == eos_id).nonzero(as_tuple=True)[1][0].item() + gen_start
                         x[0, eos_pos:] = eos_id
                         break
                     nfe += 1
                     self(
                         x[:, block_start:block_end],