Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ API and command-line option may change frequently.***
- [ERNIE-Image](./docs/ernie_image.md)
- [Boogu Image](./docs/boogu_image.md)
- [Krea2](./docs/krea2.md)
- [SeFi-Image](./docs/sefi_image.md)
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
- [Ideogram4](./docs/ideogram4.md)
- Image Edit Models
Expand Down
Binary file added assets/sefi_image/example.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
50 changes: 50 additions & 0 deletions docs/sefi_image.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# How to Use

SeFi-Image uses a Flux2-style dual-time transformer (semantic + texture streams), the standard Flux2 VAE, and Qwen3-VL as the LLM text encoder. Tech report: [arXiv:2606.22568](https://arxiv.org/abs/2606.22568).

## Download weights

SeFi-Image models ship in three scales (1B / 2B / 5B), each available in three families (Base / RL / turbo). All checkpoints are gated on Hugging Face under https://huggingface.co/SeFi-Image.

- 1B and 2B variants pair with Qwen3-VL-2B-Instruct.
- 5B variants pair with Qwen3-VL-4B-Instruct.
- All variants use the standard Flux2 VAE (`flux2_ae.safetensors` from https://huggingface.co/black-forest-labs/FLUX.2-dev).

Convert the transformer and text encoder to sd.cpp safetensors:

```bash
python3 script/convert_sefi.py <hf_repo_dir> <out_dir>/sefi_<scale>_<family>.safetensors
python3 script/convert_qwen3_vl.py <hf_repo_dir>/Qwen3-VL-<X>B-Instruct <out_dir>/qwen3_vl_<X>b.safetensors
```

## Variant defaults

| Family | timestep_shift_alpha | steps | cfg-scale |
|---|---|---|---|
| Base | 0.3 | 50 | 4.0 |
| RL | 0.3 | 50 | 4.0 |
| turbo | 1.0 | 4 | 1.0 |

The dispatcher picks `alpha` from the filename (`turbo` substring => 1.0, otherwise 0.3). Override via `--extra-sample-args sefi_alpha=<value>` or `sefi_delta_t=<value>`.

## Examples

### 1B / 2B turbo

```
./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
```

### 1B / 2B base

```
./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_base.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 4.0 --steps 50 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png
```

### 5B (needs streaming on 12 GiB VRAM)

```
./build/bin/sd-cli --diffusion-model /path/to/sefi_5b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_4b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --max-vram 8 --stream-layers --offload-to-cpu -o out.png
```

<img alt="SeFi-Image 5B turbo example" src="../assets/sefi_image/example.png" />
1 change: 1 addition & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ enum prediction_t {
FLOW_PRED,
FLUX_FLOW_PRED,
FLUX2_FLOW_PRED,
SEFI_FLOW_PRED,
PREDICTION_COUNT
};

Expand Down
112 changes: 112 additions & 0 deletions script/convert_qwen3_vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""Convert a Qwen3-VL HF safetensors checkpoint into a sd.cpp-loadable form.

The HF dump prefixes text-tower keys with ``model.language_model.`` and
vision-tower keys with ``model.visual.``. sd.cpp expects ``model.<rest>`` for
the text side; the vision side is converted by sd.cpp's own
``convert_qwen3_vl_vision_name`` and is left as-is here.

Operates on raw safetensors bytes so any dtype (BF16/F16/F32) is preserved.

Usage:
python3 script/convert_qwen3_vl.py <hf_qwen3_vl_dir_or_safetensors> <output.safetensors>
"""

import argparse
import json
import os
import struct
import sys


def rewrite_key(key: str) -> str:
    """Map a Hugging Face Qwen3-VL tensor name to the name sd.cpp expects.

    Keys under ``model.language_model.`` are re-rooted directly under
    ``model.``; every other key (for example ``model.visual.*``) is returned
    unchanged.
    """
    text_prefix = "model.language_model."
    if not key.startswith(text_prefix):
        return key
    remainder = key[len(text_prefix):]
    return "model." + remainder


def read_safetensors_header(path: str):
    """Read and parse the JSON header of a safetensors file.

    The file is expected to start with an 8-byte little-endian unsigned
    integer giving the header length, followed by that many bytes of JSON.

    Args:
        path: Path to the safetensors file.

    Returns:
        A ``(header, data_offset)`` tuple: the parsed JSON header and the
        absolute file offset of the first byte after the header.

    Raises:
        ValueError: If the file is too short to hold the length prefix or the
            declared header, or if the header is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    with open(path, "rb") as f:
        file_size = os.fstat(f.fileno()).st_size
        prefix = f.read(8)
        if len(prefix) != 8:
            raise ValueError(f"{path}: too short to be a safetensors file")
        (hdr_len,) = struct.unpack("<Q", prefix)
        # Check the declared length before reading: a garbage prefix from a
        # non-safetensors file would otherwise request an enormous read.
        if hdr_len > file_size - 8:
            raise ValueError(
                f"{path}: header length {hdr_len} exceeds file size {file_size}"
            )
        hdr_bytes = f.read(hdr_len)
    return json.loads(hdr_bytes), 8 + hdr_len


def collect_shard_paths(path: str):
    """Resolve an input path to the list of safetensors files to read.

    Args:
        path: Either a directory or a single file.

    Returns:
        For a directory containing ``model.safetensors.index.json``, the
        sorted, de-duplicated shard paths named in its ``weight_map``. For a
        directory containing only ``model.safetensors``, a one-element list
        with that file. For a regular file, a one-element list with ``path``.

    Raises:
        FileNotFoundError: If ``path`` is a directory with neither file, or
            ``path`` is neither a directory nor a regular file.
    """
    if os.path.isdir(path):
        index_path = os.path.join(path, "model.safetensors.index.json")
        if os.path.isfile(index_path):
            # Decode explicitly as UTF-8 rather than relying on the
            # platform's locale default, which varies between systems.
            with open(index_path, encoding="utf-8") as f:
                weight_map = json.load(f)["weight_map"]
            return sorted({os.path.join(path, name) for name in weight_map.values()})
        single = os.path.join(path, "model.safetensors")
        if os.path.isfile(single):
            return [single]
        raise FileNotFoundError(f"No Qwen3-VL safetensors in {path}")
    if os.path.isfile(path):
        return [path]
    raise FileNotFoundError(path)


def stage_tensors(input_path: str):
    """List every tensor found under ``input_path`` with its renamed key.

    Each shard returned by ``collect_shard_paths`` has its header read, and
    every header entry except ``__metadata__`` becomes one tuple
    ``(rewritten_key, shard_path, data_offset, info)``, where ``info`` is the
    entry's header record and ``data_offset`` is the second value returned by
    ``read_safetensors_header`` for that shard.
    """
    staged = []
    for shard in collect_shard_paths(input_path):
        header, data_start = read_safetensors_header(shard)
        staged.extend(
            (rewrite_key(name), shard, data_start, meta)
            for name, meta in header.items()
            if name != "__metadata__"
        )
    return staged


def write_consolidated(out_path: str, entries):
    """Write the staged tensors into one safetensors file at ``out_path``.

    Tensors are laid out in ascending key order. The output header records
    each tensor's dtype and shape unchanged, with ``data_offsets`` recomputed
    for the new layout. The header JSON is space-padded to a multiple of
    8 bytes. Tensor bytes are copied verbatim in chunks of at most 8 MiB.

    Args:
        out_path: Destination file; created or truncated.
        entries: Iterable of ``(new_key, shard_path, data_off, info)`` tuples,
            where ``info`` has ``dtype``, ``shape`` and ``data_offsets`` and
            ``data_off`` is the absolute offset of the shard's data region.

    Raises:
        ValueError: If two entries share the same ``new_key``, or if
            ``out_path`` refers to one of the input shard files.
        IOError: If a shard ends before a tensor's declared byte range.
    """
    entries = sorted(entries, key=lambda e: e[0])

    # Opening the destination with "wb" truncates it, so refuse to write over
    # a file that still has to be read.
    if os.path.exists(out_path):
        for shard_path in {e[1] for e in entries}:
            if os.path.exists(shard_path) and os.path.samefile(out_path, shard_path):
                raise ValueError(
                    f"Output path {out_path} is also an input shard; "
                    "choose a different output file"
                )

    new_header = {}
    cur_offset = 0
    for new_key, _shard_path, _data_off, info in entries:
        # A repeated key would replace the earlier header record while both
        # payloads were still written, leaving bytes no entry refers to.
        if new_key in new_header:
            raise ValueError(f"Duplicate tensor name after renaming: {new_key}")
        start, end = info["data_offsets"]
        size = end - start
        new_header[new_key] = {
            "dtype": info["dtype"],
            "shape": info["shape"],
            "data_offsets": [cur_offset, cur_offset + size],
        }
        cur_offset += size

    header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8")
    pad = (-len(header_json)) % 8
    header_json = header_json + (b" " * pad)

    # Keep one read handle per shard instead of reopening it for every tensor.
    sources = {}
    try:
        with open(out_path, "wb") as out:
            out.write(struct.pack("<Q", len(header_json)))
            out.write(header_json)
            for _new_key, shard_path, data_off, info in entries:
                src = sources.get(shard_path)
                if src is None:
                    src = sources[shard_path] = open(shard_path, "rb")
                start, end = info["data_offsets"]
                src.seek(data_off + start)
                remaining = end - start
                while remaining > 0:
                    chunk = src.read(min(8 * 1024 * 1024, remaining))
                    if not chunk:
                        raise IOError(f"Truncated tensor in {shard_path}")
                    out.write(chunk)
                    remaining -= len(chunk)
    finally:
        for src in sources.values():
            src.close()


def main():
    """Command-line entry point: parse arguments, stage tensors, write output.

    Takes two positional arguments, the input path and the output path. The
    output's parent directory is created if needed, and the tensor count and
    final file size are printed.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("input", help="HF Qwen3-VL directory or single safetensors file")
    cli.add_argument("output", help="Output single safetensors path")
    opts = cli.parse_args()

    staged = stage_tensors(opts.input)
    print(f"Tensors: {len(staged)}")
    print(f"Writing -> {opts.output}")

    # A bare filename has an empty dirname; fall back to the current directory.
    parent_dir = os.path.dirname(opts.output) or "."
    os.makedirs(parent_dir, exist_ok=True)
    write_consolidated(opts.output, staged)

    size_gb = os.path.getsize(opts.output) / 1e9
    print(f"Done. Output size: {size_gb:.2f} GB")


# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Loading
Loading