leejet · fszontagh · Jun 25, 2026
diff --git a/README.md b/README.md
@@ -53,6 +53,7 @@ API and command-line option may change frequently.***
     - [ERNIE-Image](./docs/ernie_image.md)
     - [Boogu Image](./docs/boogu_image.md)
     - [Krea2](./docs/krea2.md)
+    - [SeFi-Image](./docs/sefi_image.md)
     - [HiDream-O1-Image](./docs/hidream_o1_image.md)
     - [Ideogram4](./docs/ideogram4.md)
   - Image Edit Models

diff --git a/assets/sefi_image/example.png b/assets/sefi_image/example.png
diff --git a/docs/sefi_image.md b/docs/sefi_image.md
@@ -0,0 +1,50 @@
+# How to Use
+
+SeFi-Image uses a Flux2-style dual-time transformer (semantic + texture streams), the standard Flux2 VAE, and Qwen3-VL as the LLM text encoder. Tech report: [arXiv:2606.22568](https://arxiv.org/abs/2606.22568).
+
+## Download weights
+
+SeFi-Image ships in three scales (1B / 2B / 5B), each in three variant families (Base / RL / turbo). All checkpoints are gated on Hugging Face under https://huggingface.co/SeFi-Image.
+
+- 1B and 2B variants pair with Qwen3-VL-2B-Instruct.
+- 5B variants pair with Qwen3-VL-4B-Instruct.
+- All variants use the standard Flux2 VAE (`flux2_ae.safetensors` from https://huggingface.co/black-forest-labs/FLUX.2-dev).
+
+Convert the transformer and text encoder to sd.cpp safetensors:
+
+```bash
+python3 script/convert_sefi.py     <hf_repo_dir>                          <out_dir>/sefi_<scale>_<family>.safetensors
+python3 script/convert_qwen3_vl.py  <hf_repo_dir>/Qwen3-VL-<X>B-Instruct  <out_dir>/qwen3_vl_<X>b.safetensors
+```
+
+## Variant defaults
+
+| Family | timestep_shift_alpha | steps | cfg-scale |
+|---|---|---|---|
+| Base | 0.3 | 50 | 4.0 |
+| RL | 0.3 | 50 | 4.0 |
+| turbo | 1.0 | 4 | 1.0 |
+
+The dispatcher picks `timestep_shift_alpha` from the filename (a `turbo` substring selects 1.0; anything else selects 0.3). Override via `--extra-sample-args sefi_alpha=<value>` or `sefi_delta_t=<value>`.
+
+## Examples
+
+### 1B / 2B turbo
+
+```
+./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
+```
+
+### 1B / 2B base
+
+```
+./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_base.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 4.0 --steps 50 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
+```
+
+### 5B (needs streaming on 12 GiB VRAM)
+
+```
+./build/bin/sd-cli --diffusion-model /path/to/sefi_5b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_4b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --max-vram 8 --stream-layers --offload-to-cpu -o out.png
+```
+
+<img alt="SeFi-Image 5B turbo example" src="../assets/sefi_image/example.png" />
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
@@ -81,6 +81,7 @@ enum prediction_t {
     FLOW_PRED,
     FLUX_FLOW_PRED,
     FLUX2_FLOW_PRED,
+    SEFI_FLOW_PRED,
     PREDICTION_COUNT
 };
 

diff --git a/script/convert_qwen3_vl.py b/script/convert_qwen3_vl.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+"""Convert a Qwen3-VL HF safetensors checkpoint into a sd.cpp-loadable form.
+
+The HF dump prefixes text-tower keys with ``model.language_model.`` and
+vision-tower keys with ``model.visual.``. sd.cpp expects ``model.<rest>`` for
+the text side; the vision side is converted by sd.cpp's own
+``convert_qwen3_vl_vision_name`` and is left as-is here.
+
+Operates on raw safetensors bytes so any dtype (BF16/F16/F32) is preserved.
+
+Usage:
+    python3 script/convert_qwen3_vl.py <hf_qwen3_vl_dir_or_safetensors> <output.safetensors>
+"""
+
+import argparse
+import json
+import os
+import struct
+import sys
+
+
+def rewrite_key(key: str) -> str:
+    """Translate one HF tensor name into the name written to the output.
+
+    Keys under ``model.language_model.`` have that prefix replaced by
+    ``model.``; every other key is returned unchanged.
+    """
+    hf_text_prefix = "model.language_model."
+    if not key.startswith(hf_text_prefix):
+        return key
+    return "model." + key[len(hf_text_prefix):]
+
+
+def read_safetensors_header(path: str):
+    """Read and parse the JSON header of a safetensors file.
+
+    The file starts with an 8-byte little-endian unsigned header length,
+    followed by that many bytes of JSON.
+
+    Args:
+        path: Path to the safetensors file.
+
+    Returns:
+        A ``(header, data_start)`` tuple: the parsed JSON header and the byte
+        offset just past the header (``8 + header length``).
+
+    Raises:
+        ValueError: If the file is too short to hold the length prefix or the
+            header it announces, or if the header is not valid JSON.
+    """
+    with open(path, "rb") as f:
+        prefix = f.read(8)
+        if len(prefix) != 8:
+            raise ValueError(f"{path}: too short to be a safetensors file")
+        hdr_len = struct.unpack("<Q", prefix)[0]
+        # Check the announced length before reading: a file that is not
+        # safetensors can claim an enormous header and make read() try to
+        # allocate it.
+        if hdr_len > os.fstat(f.fileno()).st_size - 8:
+            raise ValueError(f"{path}: header length {hdr_len} exceeds file size")
+        hdr_bytes = f.read(hdr_len)
+    return json.loads(hdr_bytes), 8 + hdr_len
+
+
+def collect_shard_paths(path: str):
+    """Resolve *path* to the list of safetensors files to read.
+
+    Args:
+        path: Either a single safetensors file or a directory. A directory
+            is searched for ``model.safetensors.index.json`` first (sharded
+            checkpoint) and then for ``model.safetensors`` (single file).
+
+    Returns:
+        A list of file paths. For a sharded checkpoint the list holds each
+        distinct shard named in the index's ``weight_map``, sorted.
+
+    Raises:
+        FileNotFoundError: If *path* does not exist, or is a directory that
+            contains neither the index nor ``model.safetensors``.
+    """
+    if os.path.isfile(path):
+        return [path]
+    if not os.path.isdir(path):
+        raise FileNotFoundError(path)
+
+    index_path = os.path.join(path, "model.safetensors.index.json")
+    if os.path.isfile(index_path):
+        # Decode as UTF-8 explicitly instead of relying on the platform's
+        # locale encoding.
+        with open(index_path, encoding="utf-8") as f:
+            weight_map = json.load(f)["weight_map"]
+        # Several tensors can map to the same shard; the set keeps one each.
+        return sorted({os.path.join(path, name) for name in weight_map.values()})
+
+    single = os.path.join(path, "model.safetensors")
+    if os.path.isfile(single):
+        return [single]
+    raise FileNotFoundError(f"No Qwen3-VL safetensors in {path}")
+
+
+def stage_tensors(input_path: str):
+    """List every tensor found under *input_path* without reading its data.
+
+    Args:
+        input_path: A safetensors file or checkpoint directory, resolved by
+            ``collect_shard_paths``.
+
+    Returns:
+        A list of ``(new_key, shard_path, data_off, info)`` tuples, where
+        ``new_key`` is the rewritten tensor name, ``data_off`` is the offset
+        returned by ``read_safetensors_header`` for that shard, and ``info``
+        is the tensor's original header entry. The ``__metadata__`` header
+        entry is skipped.
+
+    Raises:
+        ValueError: If two source tensors end up with the same output name.
+    """
+    entries = []
+    origin = {}  # output name -> shard it was first seen in
+    for shard_path in collect_shard_paths(input_path):
+        hdr, data_off = read_safetensors_header(shard_path)
+        for key, info in hdr.items():
+            if key == "__metadata__":
+                continue
+            new_key = rewrite_key(key)
+            # Two tensors sharing one output name cannot both be stored in
+            # the consolidated header, so stop here rather than drop one.
+            if new_key in origin:
+                raise ValueError(
+                    f"Duplicate tensor name {new_key!r} "
+                    f"(in {origin[new_key]} and {shard_path})"
+                )
+            origin[new_key] = shard_path
+            entries.append((new_key, shard_path, data_off, info))
+    return entries
+
+
+def write_consolidated(out_path: str, entries):
+    """Write *entries* into one safetensors file at *out_path*.
+
+    Tensors are written in sorted name order and packed back to back, so
+    ``data_offsets`` are recomputed. ``dtype`` and ``shape`` are copied from
+    the source header and the payload bytes are copied unchanged.
+
+    Args:
+        out_path: Destination file; created or overwritten.
+        entries: Iterable of ``(new_key, shard_path, data_off, info)`` tuples.
+            ``info["data_offsets"]`` is read relative to ``data_off`` inside
+            ``shard_path``.
+
+    Raises:
+        ValueError: If *out_path* is one of the source shards, or if two
+            entries share the same name.
+        IOError: If a shard ends before a tensor's bytes have been read.
+    """
+    entries = sorted(entries, key=lambda e: e[0])
+
+    # Opening out_path with "wb" truncates it immediately, which would destroy
+    # a shard that still has to be read. Refuse before touching anything.
+    if os.path.exists(out_path):
+        for shard_path in {e[1] for e in entries}:
+            if os.path.samefile(out_path, shard_path):
+                raise ValueError(
+                    f"Output {out_path} is also an input shard; "
+                    "choose a different output path"
+                )
+
+    new_header = {}
+    cur_offset = 0
+    for new_key, _shard_path, _data_off, info in entries:
+        if new_key in new_header:
+            raise ValueError(f"Duplicate tensor name {new_key!r}")
+        start, end = info["data_offsets"]
+        size = end - start
+        new_header[new_key] = {
+            "dtype": info["dtype"],
+            "shape": info["shape"],
+            "data_offsets": [cur_offset, cur_offset + size],
+        }
+        cur_offset += size
+
+    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
+    # Pad with spaces to a multiple of 8 so that, after the 8-byte length
+    # prefix, tensor data starts on an 8-byte boundary.
+    pad = (-len(header_json)) % 8
+    header_json = header_json + (b" " * pad)
+
+    # One read handle per shard, reused for every tensor in that shard.
+    sources = {}
+    try:
+        with open(out_path, "wb") as out:
+            out.write(struct.pack("<Q", len(header_json)))
+            out.write(header_json)
+            for _new_key, shard_path, data_off, info in entries:
+                src = sources.get(shard_path)
+                if src is None:
+                    src = sources[shard_path] = open(shard_path, "rb")
+                start, end = info["data_offsets"]
+                src.seek(data_off + start)
+                remaining = end - start
+                # Copy in bounded chunks so large tensors are never held in
+                # memory whole.
+                while remaining > 0:
+                    chunk = src.read(min(8 * 1024 * 1024, remaining))
+                    if not chunk:
+                        raise IOError(f"Truncated tensor in {shard_path}")
+                    out.write(chunk)
+                    remaining -= len(chunk)
+    finally:
+        for src in sources.values():
+            src.close()
+
+
+def main():
+    """Command-line entry point: stage the input tensors, then write them out.
+
+    Parses the ``input`` and ``output`` positional arguments, creates the
+    output's parent directory if needed, and prints the tensor count and the
+    final file size.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("input", help="HF Qwen3-VL directory or single safetensors file")
+    cli.add_argument("output", help="Output single safetensors path")
+    opts = cli.parse_args()
+
+    staged = stage_tensors(opts.input)
+    print(f"Tensors: {len(staged)}")
+    print(f"Writing -> {opts.output}")
+
+    # A bare filename has an empty dirname; fall back to the current directory.
+    out_dir = os.path.dirname(opts.output) or "."
+    os.makedirs(out_dir, exist_ok=True)
+    write_consolidated(opts.output, staged)
+
+    size_gb = os.path.getsize(opts.output) / 1e9
+    print(f"Done. Output size: {size_gb:.2f} GB")
+
+
+# Run the converter only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()