From d29917bb8efe27a54148cb292efd8a6b5d115e06 Mon Sep 17 00:00:00 2001
From: Terry Jia <terryjia88@gmail.com>
Date: Mon, 15 Jun 2026 14:32:21 -0400
Subject: [PATCH] feat: Ideogram structured-caption nodes

---
 comfy_api/latest/_io.py              |  39 +++++
 comfy_extras/color_util.py           |  23 +++
 comfy_extras/nodes_bounding_boxes.py | 253 +++++++++++++++++++++++++++
 comfy_extras/nodes_color.py          |   5 +-
 comfy_extras/nodes_json_prompt.py    |  77 ++++++++
 comfy_extras/nodes_string.py         |  53 ++++++
 nodes.py                             |   2 +
 7 files changed, 449 insertions(+), 3 deletions(-)
 create mode 100644 comfy_extras/color_util.py
 create mode 100644 comfy_extras/nodes_bounding_boxes.py
 create mode 100644 comfy_extras/nodes_json_prompt.py

diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py
index 012fae3aca8f..58e49d8e2d29 100644
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -891,6 +891,14 @@ class TrackDict(TypedDict):
         track_visibility: torch.Tensor
     Type = TrackDict
 
+@comfytype(io_type="DICT")
+class Dict(ComfyTypeIO):  # "DICT" io type; the payload is typed as a plain Python dict
+    Type = dict
+
+@comfytype(io_type="ARRAY")
+class Array(ComfyTypeIO):  # "ARRAY" io type; the payload is typed as a plain Python list
+    Type = list
+
 @comfytype(io_type="COMFY_MULTITYPED_V3")
 class MultiType:
     Type = Any
@@ -1279,6 +1287,19 @@ def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str
       def as_dict(self):
           return super().as_dict()
 
+
+@comfytype(io_type="COLORS")
+class Colors(ComfyTypeIO):  # "COLORS" io type: a list of Color values
+    Type = list[Color.Type]
+
+    class Input(WidgetInput):  # widget-style input; socketless unless the caller passes socketless=False
+        def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
+                     socketless: bool=True, default: list[str]=None, advanced: bool=None):
+            super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
+            if default is None:  # give every instance its own empty list as the default
+                self.default = []
+
+
 @comfytype(io_type="BOUNDING_BOX")
 class BoundingBox(ComfyTypeIO):
     class BoundingBoxDict(TypedDict):
@@ -1326,6 +1347,20 @@ def as_dict(self):
             return d
 
 
+@comfytype(io_type="BOUNDING_BOXES")
+class BoundingBoxes(ComfyTypeIO):  # "BOUNDING_BOXES" io type: a list of boxes that each carry metadata
+    class BoundingBoxWithMetadata(BoundingBox.BoundingBoxDict):  # the single-box dict plus a metadata dict
+        metadata: dict
+    Type = list[BoundingBoxWithMetadata]
+
+    class Input(WidgetInput):  # widget-style input; socketless unless the caller passes socketless=False
+        def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
+                     socketless: bool=True, default: list[dict]=None, advanced: bool=None):
+            super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
+            if default is None:  # give every instance its own empty list as the default
+                self.default = []
+
+
 @comfytype(io_type="HISTOGRAM")
 class Histogram(ComfyTypeIO):
     """A histogram represented as a list of bin counts."""
@@ -2376,6 +2411,8 @@ def as_dict(self):
     "AnyType",
     "MultiType",
     "Tracks",
+    "Dict",
+    "Array",
     "Color",
     # Dynamic Types
     "MatchType",
@@ -2394,6 +2431,8 @@ def as_dict(self):
     "PriceBadgeDepends",
     "PriceBadge",
     "BoundingBox",
+    "BoundingBoxes",
+    "Colors",
     "Curve",
     "Histogram",
     "Range",
diff --git a/comfy_extras/color_util.py b/comfy_extras/color_util.py
new file mode 100644
index 000000000000..d50795ae352a
--- /dev/null
+++ b/comfy_extras/color_util.py
@@ -0,0 +1,23 @@
+def hex_to_rgb(value: str) -> tuple[int, int, int]:
+    """Parse "RRGGBB" (leading "#" optional) into an (r, g, b) tuple; anything else yields white."""
+    # Non-string input (e.g. a null palette entry) takes the white fallback instead of raising.
+    h = value.lstrip("#") if isinstance(value, str) else ""
+    # int(x, 16) also accepts signs, whitespace and non-ASCII digits, so check the digits explicitly.
+    if len(h) != 6 or any(ch not in "0123456789abcdefABCDEF" for ch in h):
+        return (255, 255, 255)
+    return (int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16))
+
+
+def readable_color(rgb: tuple[int, int, int]) -> tuple[int, int, int]:
+    """Return rgb unchanged when its luma is >= 130, else blend it toward white up to luma 130."""
+    r, g, b = rgb
+    if (luma := 0.299 * r + 0.587 * g + 0.114 * b) >= 130:
+        return (r, g, b)
+    mix = (130 - luma) / (255 - luma)  # share of white needed to lift the luma to 130
+    return tuple(round(channel + (255 - channel) * mix) for channel in (r, g, b))
+
+
+def normalize_palette(colors) -> list[str]:
+    """Return the non-empty string entries of colors, upper-cased; a dict gives its values, None gives []."""
+    entries = colors.values() if isinstance(colors, dict) else (colors or ())
+    return [entry.upper() for entry in entries if isinstance(entry, str) and entry]
diff --git a/comfy_extras/nodes_bounding_boxes.py b/comfy_extras/nodes_bounding_boxes.py
new file mode 100644
index 000000000000..77cbf864998e
--- /dev/null
+++ b/comfy_extras/nodes_bounding_boxes.py
@@ -0,0 +1,253 @@
+import numpy as np
+import torch
+from PIL import Image, ImageDraw, ImageEnhance, ImageFont
+from typing_extensions import override
+
+from comfy_api.latest import ComfyExtension, io
+from comfy_extras.color_util import hex_to_rgb, normalize_palette, readable_color
+
+_PREVIEW_LONG_EDGE = 1024
+_PREVIEW_DIM = 0.25
+
+
+def pixels_to_fractions(box: dict, width: int, height: int) -> dict:
+    """Convert a pixel box (x, y, width, height) into fractions of the canvas (x, y, w, h)."""
+    span_x, span_y = width or 1, height or 1  # a zero canvas size divides by 1 instead
+    return {
+        "x": box.get("x", 0) / span_x,
+        "y": box.get("y", 0) / span_y,
+        "w": box.get("width", 0) / span_x,
+        "h": box.get("height", 0) / span_y,
+    }
+
+
+def fractions_to_pixels(box: dict, width: int, height: int) -> dict:
+    """Scale a fractional box (x, y, w, h) to rounded pixels, flipping a negative w/h onto a positive span."""
+    left, top, span_w, span_h = (box.get(key, 0.0) for key in ("x", "y", "w", "h"))
+    if span_w < 0:
+        left, span_w = left + span_w, -span_w
+    if span_h < 0:
+        top, span_h = top + span_h, -span_h
+    return {
+        "x": round(left * width),
+        "y": round(top * height),
+        "width": round(span_w * width),
+        "height": round(span_h * height),
+    }
+
+
+def fractions_to_bbox_frame(boxes: list, width: int, height: int) -> list:
+    """Return the pixel boxes of all dict entries wrapped as a single frame, or [] when there are none."""
+    frame = []
+    for candidate in boxes:
+        if isinstance(candidate, dict):
+            frame.append(fractions_to_pixels(candidate, width, height))
+    return [frame] if frame else []
+
+
+def _font(size: int):  # default PIL font at the requested size, or unsized when that fails
+    try:
+        return ImageFont.load_default(size)  # presumably needs a Pillow whose load_default accepts a size; confirm
+    except Exception:  # best-effort: any failure falls back to the unsized default font
+        return ImageFont.load_default()
+
+
+def _wrap(draw, text: str, font, max_w: float) -> list[str]:
+    """Greedily wrap text to max_w per draw.textlength; each "\\n" paragraph yields at least one line."""
+    wrapped = []
+    for paragraph in text.split("\n"):
+        current = ""
+        for word in paragraph.split():
+            candidate = f"{current} {word}" if current else word
+            if current and draw.textlength(candidate, font=font) > max_w:
+                wrapped.append(current)
+                candidate = word  # the word that overflowed opens the next line
+            current = candidate
+        wrapped.append(current)
+    return wrapped
+
+
+def _bg_from_image(image) -> Image.Image | None:  # first item of an image batch as a PIL image, else None
+    if image is None:  # no background was supplied
+        return None
+    try:
+        arr = (image[0].detach().cpu().numpy() * 255).clip(0, 255).astype(np.uint8)  # assumes 0..1 floats — TODO confirm
+        return Image.fromarray(arr)
+    except Exception:  # best-effort: anything that cannot be converted is treated as "no background"
+        return None
+
+
+def render_preview(regions, width, height, bg=None):
+    """Draw the regions over a dimmed background (or flat grey) and return a (1, H, W, 3) float tensor."""
+    if bg is not None:
+        iw, ih = bg.size
+        long_edge = max(iw, ih) or 1
+        scale = min(1.0, _PREVIEW_LONG_EDGE / long_edge)  # only ever shrink, never upscale
+        rw, rh = max(1, round(iw * scale)), max(1, round(ih * scale))
+        base = bg.convert("RGB").resize((rw, rh), Image.LANCZOS)
+        base = ImageEnhance.Brightness(base).enhance(_PREVIEW_DIM)
+        img = base.convert("RGBA")
+    else:
+        long_edge = max(width, height) or 1
+        scale = min(1.0, _PREVIEW_LONG_EDGE / long_edge)
+        rw, rh = max(1, round(width * scale)), max(1, round(height * scale))
+        grey = round(_PREVIEW_DIM * 128)
+        img = Image.new("RGBA", (rw, rh), (grey, grey, grey, 255))
+
+    overlay = Image.new("RGBA", (rw, rh), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(overlay)
+    fs = max(10, round(rh / 64))
+    font, tag_font = _font(fs), _font(max(9, fs - 2))
+    line_h = fs + 2
+
+    for i, region in enumerate(regions):
+        if not isinstance(region, dict):
+            continue
+        palette = normalize_palette(region.get("palette") or [])  # same filter as build_elements; drops non-strings
+        r, g, b = hex_to_rgb(palette[0]) if palette else (140, 140, 140)
+        x1 = max(0, min(rw, round(region.get("x", 0) * rw)))
+        y1 = max(0, min(rh, round(region.get("y", 0) * rh)))
+        x2 = max(0, min(rw, round((region.get("x", 0) + region.get("w", 0)) * rw)))
+        y2 = max(0, min(rh, round((region.get("y", 0) + region.get("h", 0)) * rh)))
+        if x2 < x1:
+            x1, x2 = x2, x1
+        if y2 < y1:
+            y1, y2 = y2, y1
+
+        draw.rectangle([x1, y1, x2, y2], outline=(r, g, b, 255), width=2)
+
+        swatches = palette[:5]
+        if swatches and (x2 - x1) > 2:
+            sh = max(5, fs // 2)
+            seg = (x2 - x1) / len(swatches)
+            for p, hexc in enumerate(swatches):
+                sx = x1 + round(p * seg)
+                draw.rectangle([sx, y1, x1 + round((p + 1) * seg), y1 + sh], fill=hex_to_rgb(hexc))
+
+        etype = "text" if region.get("type") == "text" else "obj"
+        tag = str(i + 1).zfill(2)
+        tw = draw.textlength(tag, font=tag_font)
+        draw.rectangle([x1, y1, x1 + tw + 6, y1 + fs + 2], fill=(r, g, b, 255))
+        tag_fill = (0, 0, 0, 255) if (0.299 * r + 0.587 * g + 0.114 * b) > 140 else (255, 255, 255, 255)
+        draw.text((x1 + 3, y1 + 1), tag, fill=tag_fill, font=tag_font)
+
+        body = region.get("desc", "") or ""
+        if etype == "text" and region.get("text"):
+            body = '"%s"%s' % (region["text"], " — " + body if body else "")
+        if body and (x2 - x1) > 8:
+            ty = y1 + fs + 5
+            for line in _wrap(draw, body, font, x2 - x1 - 8):
+                if ty > y2:  # stop once the text would run past the bottom of the box
+                    break
+                draw.text((x1 + 4, ty), line, fill=readable_color((r, g, b)) + (255,), font=font)
+                ty += line_h
+
+    composed = Image.alpha_composite(img, overlay).convert("RGB")
+    arr = np.asarray(composed, dtype=np.float32) / 255.0
+    return torch.from_numpy(arr).unsqueeze(0)
+
+
+def boxes_to_regions(boxes, width: int, height: int) -> list:
+    """Turn editor boxes into fractional region dicts (x, y, w, h, type, text, desc, palette)."""
+    if not isinstance(boxes, list):
+        return []
+    converted: list = []
+    for entry in boxes:
+        if not isinstance(entry, dict):
+            continue
+        info = entry.get("metadata")
+        if not isinstance(info, dict):
+            info = {}
+        # Geometry keys come first, then the metadata fields with their defaults.
+        region = pixels_to_fractions(entry, width, height)
+        region.update(type=info.get("type", "obj"), text=info.get("text", ""),
+                      desc=info.get("desc", ""), palette=info.get("palette", []))
+        converted.append(region)
+    return converted
+
+
+def _norm_bbox(region: dict) -> list[int]:
+    """Return [ymin, xmin, ymax, xmax] for a fractional region on a 0..1000 grid, clamped and ordered."""
+    def grid(value: float) -> int:
+        return max(0, min(1000, round(value * 1000)))
+
+    # Work with edges rather than origin + extent.
+    left, top = region.get("x", 0.0), region.get("y", 0.0)
+    right, bottom = left + region.get("w", 0.0), top + region.get("h", 0.0)
+    # Sorting each clamped pair puts the smaller edge first, which handles negative extents.
+    ymin, ymax = sorted((grid(top), grid(bottom)))
+    xmin, xmax = sorted((grid(left), grid(right)))
+    return [ymin, xmin, ymax, xmax]
+
+
+def build_elements(regions: list) -> list:
+    """Convert region dicts into prompt elements: type, bbox, text (text type only), desc, color_palette."""
+    built = []
+    for region in regions:
+        if not isinstance(region, dict):
+            continue
+        is_text = region.get("type") == "text"
+        item = {"type": "text" if is_text else "obj", "bbox": _norm_bbox(region)}
+        if is_text:
+            item["text"] = region.get("text", "")
+        item["desc"] = region.get("desc", "")
+        swatches = normalize_palette(region.get("palette", []))
+        if swatches:
+            item["color_palette"] = swatches[:5]  # at most five colours per element
+        built.append(item)
+    return built
+
+
+class CreateBoundingBoxes(io.ComfyNode):  # node: canvas-drawn boxes -> preview image, pixel bboxes, prompt elements
+    @classmethod
+    def define_schema(cls):
+        editor_state = io.BoundingBoxes.Input(
+            "editor_state",
+            socketless=False,  # explicit override of the Input's socketless=True default
+            tooltip="Draw bounding boxes and set each box type, text, description, color palette. Start with background element first and foreground last.",
+        )
+        return io.Schema(
+            node_id="CreateBoundingBoxes",
+            display_name="Create Bounding Boxes",
+            category="utilities",
+            description="Draw bounding boxes in a canvas. Outputs Ideogram prompt elements, pixel-space bounding boxes, and a preview image.",
+            inputs=[
+                io.Image.Input(
+                    "background",
+                    optional=True,
+                    tooltip="Optional image used as background in the canvas and preview.",
+                ),
+                io.Int.Input("width", default=1024, min=64, max=16384, step=16,
+                             tooltip="Width of the canvas and the pixel grid for the bounding boxes."),
+                io.Int.Input("height", default=1024, min=64, max=16384, step=16,
+                             tooltip="Height of the canvas and the pixel grid for the bounding boxes."),
+                editor_state,
+            ],
+            outputs=[  # order must match the positional values passed to NodeOutput in execute
+                io.Image.Output(display_name="preview"),
+                io.BoundingBox.Output(display_name="bboxes"),
+                io.Array.Output(display_name="elements"),
+            ],
+            is_experimental=True,
+        )
+
+    @classmethod
+    def execute(cls, width, height, editor_state=None, background=None) -> io.NodeOutput:
+        regions = boxes_to_regions(editor_state, width, height)  # pixel boxes -> 0..1 fractions plus metadata fields
+        preview = render_preview(regions, width, height, _bg_from_image(background))  # a usable background sets the preview size
+        return io.NodeOutput(
+            preview,
+            fractions_to_bbox_frame(regions, width, height),  # one frame holding every box, or [] when there are none
+            build_elements(regions),
+            ui={"dims": [width, height]},  # presumably consumed by the frontend canvas — TODO confirm
+        )
+
+
+class BoundingBoxesExtension(ComfyExtension):  # extension exposing the nodes defined in this module
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [CreateBoundingBoxes]  # the only node defined in this module
+
+
+async def comfy_entrypoint() -> BoundingBoxesExtension:  # module-level hook returning this module's extension
+    return BoundingBoxesExtension()
diff --git a/comfy_extras/nodes_color.py b/comfy_extras/nodes_color.py
index 688254e4eb22..1bf256bf6543 100644
--- a/comfy_extras/nodes_color.py
+++ b/comfy_extras/nodes_color.py
@@ -1,5 +1,6 @@
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, io
+from comfy_extras.color_util import hex_to_rgb
 
 
 class ColorToRGBInt(io.ComfyNode):
@@ -24,9 +25,7 @@ def execute(cls, color: str) -> io.NodeOutput:
         # expect format #RRGGBB
         if len(color) != 7 or color[0] != "#":
             raise ValueError("Color must be in format #RRGGBB")
-        r = int(color[1:3], 16)
-        g = int(color[3:5], 16)
-        b = int(color[5:7], 16)
+        r, g, b = hex_to_rgb(color)
 
         rgb_int = r * 256 * 256 + g * 256 + b
         return io.NodeOutput(rgb_int, color)
diff --git a/comfy_extras/nodes_json_prompt.py b/comfy_extras/nodes_json_prompt.py
new file mode 100644
index 000000000000..206f5aa710e2
--- /dev/null
+++ b/comfy_extras/nodes_json_prompt.py
@@ -0,0 +1,77 @@
+from typing_extensions import override
+
+from comfy_api.latest import ComfyExtension, io
+from comfy_extras.color_util import normalize_palette
+
+
+class BuildJsonPromptIdeogram(io.ComfyNode):  # node: assembles the caption dict from elements and style fields
+    @classmethod
+    def define_schema(cls):
+        color_palette = io.Colors.Input(
+            "color_palette",
+            socketless=False,  # explicit override of the Input's socketless=True default
+            tooltip="Hex color codes that steer the image's dominant colors. Up to 16 entries.",
+        )
+        return io.Schema(
+            node_id="BuildJsonPromptIdeogram",
+            display_name="Build JSON Prompt (Ideogram)",
+            category="text",
+            description="Build a JSON prompt for the Ideogram 4 model.",
+            inputs=[
+                io.Array.Input("element", tooltip="Prompt elements from the node Create Bounding Boxes."),
+                io.String.Input("high_level_description", multiline=True, default="",
+                                tooltip="Optional description of the image in one or two sentences. Strongly recommended."),
+                io.String.Input("background", multiline=True, default="",
+                                tooltip="Mandatory description of the image background or environment."),
+                io.DynamicCombo.Input("style", options=[
+                    io.DynamicCombo.Option("none", []),
+                    io.DynamicCombo.Option("photo", [io.String.Input("photo", default="", tooltip="Camera or lens details for photographic outputs (e.g. 35mm, f/1.4, bokeh).")]),
+                    io.DynamicCombo.Option("art_style", [io.String.Input("art_style", default="", tooltip="Art style description (e.g. flat vector illustration, bold outlines).")]),
+                ]),
+                io.String.Input("aesthetics", default="", tooltip="Mandatory aesthetic keywords (e.g. moody, cinematic, desaturated)."),
+                io.String.Input("lighting", default="", tooltip="Mandatory lighting description (e.g. golden hour, rim light, dramatic shadows)."),
+                io.String.Input("medium", default="", tooltip="Mandatory medium type (e.g. photograph, illustration, 3d_render, painting, graphic_design). When style = photo, set to photograph."),
+                color_palette,
+            ],
+            outputs=[io.Dict.Output(display_name="prompt")],
+            is_experimental=True,
+        )
+
+    @classmethod
+    def execute(cls, element, style, high_level_description="", background="",
+                aesthetics="", lighting="", medium="", color_palette=None) -> io.NodeOutput:
+        elements = element if isinstance(element, list) else []  # anything that is not a list becomes no elements
+        kind = style.get("style", "none") if isinstance(style, dict) else "none"  # a non-dict style is treated as "none"
+        photo = style.get("photo", "") if isinstance(style, dict) else ""
+        art_style = style.get("art_style", "") if isinstance(style, dict) else ""
+        palette = normalize_palette(color_palette or [])  # NOTE(review): the tooltip's 16-entry limit is not enforced here
+
+        caption: dict = {}
+        if high_level_description.strip():  # whitespace-only descriptions are left out
+            caption["high_level_description"] = high_level_description
+        if kind != "none":  # with style "none" the aesthetics, lighting, medium and palette inputs are all unused
+            style_desc: dict = {"aesthetics": aesthetics, "lighting": lighting}
+            if kind == "photo":
+                style_desc["photo"] = photo
+                style_desc["medium"] = medium
+            else:  # same medium value; only the key insertion order differs from the photo branch
+                style_desc["medium"] = medium
+                style_desc["art_style"] = art_style
+            if palette:
+                style_desc["color_palette"] = palette
+            caption["style_description"] = style_desc
+        caption["compositional_deconstruction"] = {  # always present, even with an empty background and no elements
+            "background": background,
+            "elements": elements,
+        }
+        return io.NodeOutput(caption)
+
+
+class JsonPromptExtension(ComfyExtension):  # extension exposing the nodes defined in this module
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [BuildJsonPromptIdeogram]  # the only node defined in this module
+
+
+async def comfy_entrypoint() -> JsonPromptExtension:  # module-level hook returning this module's extension
+    return JsonPromptExtension()
diff --git a/comfy_extras/nodes_string.py b/comfy_extras/nodes_string.py
index 97485c8c5edb..21929ae630e7 100644
--- a/comfy_extras/nodes_string.py
+++ b/comfy_extras/nodes_string.py
@@ -440,6 +440,57 @@ def execute(cls, json_string, key):
         except (json.JSONDecodeError, TypeError):
             return io.NodeOutput("")
 
+
+def _dump_json(value, indent):  # shared serializer for the dictionary/array to-string nodes
+    return json.dumps(value, ensure_ascii=False, indent=indent or None)  # indent 0 becomes None: compact single-line output
+
+
+class ConvertDictionaryToString(io.ComfyNode):  # node: DICT input -> JSON text
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ConvertDictionaryToString",
+            display_name="Convert Dictionary to String",
+            category="text",
+            search_aliases=["json", "dict to json", "stringify", "serialize", "dict to string"],
+            inputs=[
+                io.Dict.Input("dictionary"),
+                io.Int.Input("indent", default=2, min=0, max=8,
+                             tooltip="Spaces per indent level. 0 produces compact single-line string."),
+            ],
+            outputs=[
+                io.String.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, dictionary, indent=2):
+        return io.NodeOutput(_dump_json(dictionary, indent))  # non-ASCII characters are emitted unescaped (ensure_ascii=False)
+
+
+class ConvertArrayToString(io.ComfyNode):  # node: ARRAY input -> JSON text
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ConvertArrayToString",
+            display_name="Convert Array to String",
+            category="text",
+            search_aliases=["json", "list to json", "stringify", "serialize", "list to string", "array to json"],
+            inputs=[
+                io.Array.Input("array"),
+                io.Int.Input("indent", default=2, min=0, max=8,
+                             tooltip="Spaces per indent level. 0 produces compact single-line string."),
+            ],
+            outputs=[
+                io.String.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, array, indent=2):
+        return io.NodeOutput(_dump_json(array, indent))  # non-ASCII characters are emitted unescaped (ensure_ascii=False)
+
+
 class StringExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[io.ComfyNode]]:
@@ -457,6 +508,8 @@ async def get_node_list(self) -> list[type[io.ComfyNode]]:
             RegexExtract,
             RegexReplace,
             JsonExtractString,
+            ConvertDictionaryToString,
+            ConvertArrayToString,
         ]
 
 async def comfy_entrypoint() -> StringExtension:
diff --git a/nodes.py b/nodes.py
index b1a663f4cbc8..c7fbd3475a2c 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2368,6 +2368,8 @@ async def init_builtin_extra_nodes():
         "nodes_images.py",
         "nodes_video_model.py",
         "nodes_ideogram4.py",
+        "nodes_bounding_boxes.py",
+        "nodes_json_prompt.py",
         "nodes_train.py",
         "nodes_dataset.py",
         "nodes_sag.py",