From d29917bb8efe27a54148cb292efd8a6b5d115e06 Mon Sep 17 00:00:00 2001 From: Terry Jia Date: Mon, 15 Jun 2026 14:32:21 -0400 Subject: [PATCH] feat: Ideogram structured-caption nodes --- comfy_api/latest/_io.py | 39 +++++ comfy_extras/color_util.py | 23 +++ comfy_extras/nodes_bounding_boxes.py | 253 +++++++++++++++++++++++++++ comfy_extras/nodes_color.py | 5 +- comfy_extras/nodes_json_prompt.py | 77 ++++++++ comfy_extras/nodes_string.py | 53 ++++++ nodes.py | 2 + 7 files changed, 449 insertions(+), 3 deletions(-) create mode 100644 comfy_extras/color_util.py create mode 100644 comfy_extras/nodes_bounding_boxes.py create mode 100644 comfy_extras/nodes_json_prompt.py diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 012fae3aca8f..58e49d8e2d29 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -891,6 +891,14 @@ class TrackDict(TypedDict): track_visibility: torch.Tensor Type = TrackDict +@comfytype(io_type="DICT") +class Dict(ComfyTypeIO): + Type = dict + +@comfytype(io_type="ARRAY") +class Array(ComfyTypeIO): + Type = list + @comfytype(io_type="COMFY_MULTITYPED_V3") class MultiType: Type = Any @@ -1279,6 +1287,19 @@ def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str def as_dict(self): return super().as_dict() + +@comfytype(io_type="COLORS") +class Colors(ComfyTypeIO): + Type = list[Color.Type] + + class Input(WidgetInput): + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, + socketless: bool=True, default: list[str]=None, advanced: bool=None): + super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced) + if default is None: + self.default = [] + + @comfytype(io_type="BOUNDING_BOX") class BoundingBox(ComfyTypeIO): class BoundingBoxDict(TypedDict): @@ -1326,6 +1347,20 @@ def as_dict(self): return d +@comfytype(io_type="BOUNDING_BOXES") +class BoundingBoxes(ComfyTypeIO): + class BoundingBoxWithMetadata(BoundingBox.BoundingBoxDict): + metadata: dict + Type = list[BoundingBoxWithMetadata] + + class Input(WidgetInput): + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, + socketless: bool=True, default: list[dict]=None, advanced: bool=None): + super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced) + if default is None: + self.default = [] + + @comfytype(io_type="HISTOGRAM") class Histogram(ComfyTypeIO): """A histogram represented as a list of bin counts.""" @@ -2376,6 +2411,8 @@ def as_dict(self): "AnyType", "MultiType", "Tracks", + "Dict", + "Array", "Color", # Dynamic Types "MatchType", @@ -2394,6 +2431,8 @@ def as_dict(self): "PriceBadgeDepends", "PriceBadge", "BoundingBox", + "BoundingBoxes", + "Colors", "Curve", "Histogram", "Range", diff --git a/comfy_extras/color_util.py b/comfy_extras/color_util.py new file mode 100644 index 000000000000..d50795ae352a --- /dev/null +++ b/comfy_extras/color_util.py @@ -0,0 +1,23 @@ +def hex_to_rgb(value: str) -> tuple[int, int, int]: + h = value.lstrip("#") + if len(h) != 6: + return (255, 255, 255) + try: + return (int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)) + except ValueError: + return (255, 255, 255) + + +def readable_color(rgb: tuple[int, int, int]) -> tuple[int, int, int]: + r, g, b = rgb + lum = 0.299 * r + 0.587 * g + 0.114 * b + if lum >= 130: + return (r, g, b) + t = (130 - lum) / (255 - lum) + return (round(r + (255 - r) * t), round(g + (255 - g) * t), round(b + (255 - b) * t)) + + +def normalize_palette(colors) -> list[str]: + if isinstance(colors, dict): + colors = colors.values() + return [c.upper() for c in colors if isinstance(c, str) and c] diff --git a/comfy_extras/nodes_bounding_boxes.py b/comfy_extras/nodes_bounding_boxes.py new file mode 100644 index 000000000000..77cbf864998e --- /dev/null +++ b/comfy_extras/nodes_bounding_boxes.py @@ -0,0 +1,253 @@ +import numpy as np +import torch +from PIL import Image, ImageDraw, ImageEnhance, ImageFont +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io +from comfy_extras.color_util import hex_to_rgb, normalize_palette, readable_color + +_PREVIEW_LONG_EDGE = 1024 +_PREVIEW_DIM = 0.25 + + +def pixels_to_fractions(box: dict, width: int, height: int) -> dict: + w = width or 1 + h = height or 1 + return { + "x": box.get("x", 0) / w, + "y": box.get("y", 0) / h, + "w": box.get("width", 0) / w, + "h": box.get("height", 0) / h, + } + + +def fractions_to_pixels(box: dict, width: int, height: int) -> dict: + x, y = box.get("x", 0.0), box.get("y", 0.0) + w, h = box.get("w", 0.0), box.get("h", 0.0) + if w < 0: + x, w = x + w, -w + if h < 0: + y, h = y + h, -h + return { + "x": round(x * width), + "y": round(y * height), + "width": round(w * width), + "height": round(h * height), + } + + +def fractions_to_bbox_frame(boxes: list, width: int, height: int) -> list: + pixels = [ + fractions_to_pixels(box, width, height) + for box in boxes + if isinstance(box, dict) + ] + return [pixels] if pixels else [] + + +def _font(size: int): + try: + return ImageFont.load_default(size) + except Exception: + return ImageFont.load_default() + + +def _wrap(draw, text: str, font, max_w: float) -> list[str]: + lines = [] + for para in text.split("\n"): + line = "" + for word in para.split(): + test = word if not line else line + " " + word + if line and draw.textlength(test, font=font) > max_w: + lines.append(line) + line = word + else: + line = test + lines.append(line) + return lines + + +def _bg_from_image(image) -> Image.Image | None: + if image is None: + return None + try: + arr = (image[0].detach().cpu().numpy() * 255).clip(0, 255).astype(np.uint8) + return Image.fromarray(arr) + except Exception: + return None + + +def render_preview(regions, width, height, bg=None): + if bg is not None: + iw, ih = bg.size + long_edge = max(iw, ih) or 1 + scale = min(1.0, _PREVIEW_LONG_EDGE / long_edge) + rw, rh = max(1, round(iw * scale)), max(1, round(ih * scale)) + base = bg.convert("RGB").resize((rw, rh), Image.LANCZOS) + base = ImageEnhance.Brightness(base).enhance(_PREVIEW_DIM) + img = base.convert("RGBA") + else: + long_edge = max(width, height) or 1 + scale = min(1.0, _PREVIEW_LONG_EDGE / long_edge) + rw, rh = max(1, round(width * scale)), max(1, round(height * scale)) + grey = round(_PREVIEW_DIM * 128) + img = Image.new("RGBA", (rw, rh), (grey, grey, grey, 255)) + + overlay = Image.new("RGBA", (rw, rh), (0, 0, 0, 0)) + draw = ImageDraw.Draw(overlay) + fs = max(10, round(rh / 64)) + font = _font(fs) + tag_font = _font(max(9, fs - 2)) + line_h = fs + 2 + + for i, region in enumerate(regions): + if not isinstance(region, dict): + continue + palette = [c for c in (region.get("palette") or []) if c] + r, g, b = hex_to_rgb(palette[0]) if palette else (140, 140, 140) + x1 = max(0, min(rw, round(region.get("x", 0) * rw))) + y1 = max(0, min(rh, round(region.get("y", 0) * rh))) + x2 = max(0, min(rw, round((region.get("x", 0) + region.get("w", 0)) * rw))) + y2 = max(0, min(rh, round((region.get("y", 0) + region.get("h", 0)) * rh))) + if x2 < x1: + x1, x2 = x2, x1 + if y2 < y1: + y1, y2 = y2, y1 + + draw.rectangle([x1, y1, x2, y2], outline=(r, g, b, 255), width=2) + + swatches = palette[:5] + if swatches and (x2 - x1) > 2: + sh = max(5, fs // 2) + seg = (x2 - x1) / len(swatches) + for p, hexc in enumerate(swatches): + sx = x1 + round(p * seg) + draw.rectangle([sx, y1, x1 + round((p + 1) * seg), y1 + sh], fill=hex_to_rgb(hexc)) + + etype = "text" if region.get("type") == "text" else "obj" + tag = str(i + 1).zfill(2) + tw = draw.textlength(tag, font=tag_font) + draw.rectangle([x1, y1, x1 + tw + 6, y1 + fs + 2], fill=(r, g, b, 255)) + tag_fill = (0, 0, 0, 255) if (0.299 * r + 0.587 * g + 0.114 * b) > 140 else (255, 255, 255, 255) + draw.text((x1 + 3, y1 + 1), tag, fill=tag_fill, font=tag_font) + + body = region.get("desc", "") or "" + if etype == "text" and region.get("text"): + body = '"%s"%s' % (region["text"], " — " + body if body else "") + if body and (x2 - x1) > 8: + ty = y1 + fs + 5 + for line in _wrap(draw, body, font, x2 - x1 - 8): + if ty > y2: + break + draw.text((x1 + 4, ty), line, fill=readable_color((r, g, b)) + (255,), font=font) + ty += line_h + + composed = Image.alpha_composite(img, overlay).convert("RGB") + arr = np.asarray(composed, dtype=np.float32) / 255.0 + return torch.from_numpy(arr).unsqueeze(0) + + +def boxes_to_regions(boxes, width: int, height: int) -> list: + regions: list = [] + if not isinstance(boxes, list): + return regions + for box in boxes: + if not isinstance(box, dict): + continue + meta = box.get("metadata") + meta = meta if isinstance(meta, dict) else {} + regions.append({ + **pixels_to_fractions(box, width, height), + "type": meta.get("type", "obj"), + "text": meta.get("text", ""), + "desc": meta.get("desc", ""), + "palette": meta.get("palette", []), + }) + return regions + + +def _norm_bbox(region: dict) -> list[int]: + def grid(value: float) -> int: + return max(0, min(1000, round(value * 1000))) + + x, y = region.get("x", 0.0), region.get("y", 0.0) + w, h = region.get("w", 0.0), region.get("h", 0.0) + ymin, xmin, ymax, xmax = grid(y), grid(x), grid(y + h), grid(x + w) + if ymin > ymax: + ymin, ymax = ymax, ymin + if xmin > xmax: + xmin, xmax = xmax, xmin + return [ymin, xmin, ymax, xmax] + + +def build_elements(regions: list) -> list: + elements = [] + for region in regions: + if not isinstance(region, dict): + continue + etype = "text" if region.get("type") == "text" else "obj" + element = {"type": etype} + element["bbox"] = _norm_bbox(region) + if etype == "text": + element["text"] = region.get("text", "") + element["desc"] = region.get("desc", "") + palette = normalize_palette(region.get("palette", [])) + if palette: + element["color_palette"] = palette[:5] + elements.append(element) + return elements + + +class CreateBoundingBoxes(io.ComfyNode): + @classmethod + def define_schema(cls): + editor_state = io.BoundingBoxes.Input( + "editor_state", + socketless=False, + tooltip="Draw bounding boxes and set each box type, text, description, color palette. Start with background element first and foreground last.", + ) + return io.Schema( + node_id="CreateBoundingBoxes", + display_name="Create Bounding Boxes", + category="utilities", + description="Draw bounding boxes in a canvas. Outputs Ideogram prompt elements, pixel-space bounding boxes, and a preview image.", + inputs=[ + io.Image.Input( + "background", + optional=True, + tooltip="Optional image used as background in the canvas and preview.", + ), + io.Int.Input("width", default=1024, min=64, max=16384, step=16, + tooltip="Width of the canvas and the pixel grid for the bounding boxes."), + io.Int.Input("height", default=1024, min=64, max=16384, step=16, + tooltip="Height of the canvas and the pixel grid for the bounding boxes."), + editor_state, + ], + outputs=[ + io.Image.Output(display_name="preview"), + io.BoundingBox.Output(display_name="bboxes"), + io.Array.Output(display_name="elements"), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, width, height, editor_state=None, background=None) -> io.NodeOutput: + regions = boxes_to_regions(editor_state, width, height) + preview = render_preview(regions, width, height, _bg_from_image(background)) + return io.NodeOutput( + preview, + fractions_to_bbox_frame(regions, width, height), + build_elements(regions), + ui={"dims": [width, height]}, + ) + + +class BoundingBoxesExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [CreateBoundingBoxes] + + +async def comfy_entrypoint() -> BoundingBoxesExtension: + return BoundingBoxesExtension() diff --git a/comfy_extras/nodes_color.py b/comfy_extras/nodes_color.py index 688254e4eb22..1bf256bf6543 100644 --- a/comfy_extras/nodes_color.py +++ b/comfy_extras/nodes_color.py @@ -1,5 +1,6 @@ from typing_extensions import override from comfy_api.latest import ComfyExtension, io +from comfy_extras.color_util import hex_to_rgb class ColorToRGBInt(io.ComfyNode): @@ -24,9 +25,7 @@ def execute(cls, color: str) -> io.NodeOutput: # expect format #RRGGBB if len(color) != 7 or color[0] != "#": raise ValueError("Color must be in format #RRGGBB") - r = int(color[1:3], 16) - g = int(color[3:5], 16) - b = int(color[5:7], 16) + r, g, b = hex_to_rgb(color) rgb_int = r * 256 * 256 + g * 256 + b return io.NodeOutput(rgb_int, color) diff --git a/comfy_extras/nodes_json_prompt.py b/comfy_extras/nodes_json_prompt.py new file mode 100644 index 000000000000..206f5aa710e2 --- /dev/null +++ b/comfy_extras/nodes_json_prompt.py @@ -0,0 +1,77 @@ +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io +from comfy_extras.color_util import normalize_palette + + +class BuildJsonPromptIdeogram(io.ComfyNode): + @classmethod + def define_schema(cls): + color_palette = io.Colors.Input( + "color_palette", + socketless=False, + tooltip="Hex color codes that steer the image's dominant colors. Up to 16 entries.", + ) + return io.Schema( + node_id="BuildJsonPromptIdeogram", + display_name="Build JSON Prompt (Ideogram)", + category="text", + description="Build a JSON prompt for the Ideogram 4 model.", + inputs=[ + io.Array.Input("element", tooltip="Prompt elements from the node Create Bounding Boxes."), + io.String.Input("high_level_description", multiline=True, default="", + tooltip="Optional description of the image in one or two sentences. Strongly recommended."), + io.String.Input("background", multiline=True, default="", + tooltip="Mandatory description of the image background or environment."), + io.DynamicCombo.Input("style", options=[ + io.DynamicCombo.Option("none", []), + io.DynamicCombo.Option("photo", [io.String.Input("photo", default="", tooltip="Camera or lens details for photographic outputs (e.g. 35mm, f/1.4, bokeh).")]), + io.DynamicCombo.Option("art_style", [io.String.Input("art_style", default="", tooltip="Art style description (e.g. flat vector illustration, bold outlines).")]), + ]), + io.String.Input("aesthetics", default="", tooltip="Mandatory aesthetic keywords (e.g. moody, cinematic, desaturated)."), + io.String.Input("lighting", default="", tooltip="Mandatory lighting description (e.g. golden hour, rim light, dramatic shadows)."), + io.String.Input("medium", default="", tooltip="Mandatory medium type (e.g. photograph, illustration, 3d_render, painting, graphic_design). When style = photo, set to photograph."), + color_palette, + ], + outputs=[io.Dict.Output(display_name="prompt")], + is_experimental=True, + ) + + @classmethod + def execute(cls, element, style, high_level_description="", background="", + aesthetics="", lighting="", medium="", color_palette=None) -> io.NodeOutput: + elements = element if isinstance(element, list) else [] + kind = style.get("style", "none") if isinstance(style, dict) else "none" + photo = style.get("photo", "") if isinstance(style, dict) else "" + art_style = style.get("art_style", "") if isinstance(style, dict) else "" + palette = normalize_palette(color_palette or []) + + caption: dict = {} + if high_level_description.strip(): + caption["high_level_description"] = high_level_description + if kind != "none": + style_desc: dict = {"aesthetics": aesthetics, "lighting": lighting} + if kind == "photo": + style_desc["photo"] = photo + style_desc["medium"] = medium + else: + style_desc["medium"] = medium + style_desc["art_style"] = art_style + if palette: + style_desc["color_palette"] = palette + caption["style_description"] = style_desc + caption["compositional_deconstruction"] = { + "background": background, + "elements": elements, + } + return io.NodeOutput(caption) + + +class JsonPromptExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [BuildJsonPromptIdeogram] + + +async def comfy_entrypoint() -> JsonPromptExtension: + return JsonPromptExtension() diff --git a/comfy_extras/nodes_string.py b/comfy_extras/nodes_string.py index 97485c8c5edb..21929ae630e7 100644 --- a/comfy_extras/nodes_string.py +++ b/comfy_extras/nodes_string.py @@ -440,6 +440,57 @@ def execute(cls, json_string, key): except (json.JSONDecodeError, TypeError): return io.NodeOutput("") + +def _dump_json(value, indent): + return json.dumps(value, ensure_ascii=False, indent=indent or None) + + +class ConvertDictionaryToString(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ConvertDictionaryToString", + display_name="Convert Dictionary to String", + category="text", + search_aliases=["json", "dict to json", "stringify", "serialize", "dict to string"], + inputs=[ + io.Dict.Input("dictionary"), + io.Int.Input("indent", default=2, min=0, max=8, + tooltip="Spaces per indent level. 0 produces compact single-line string."), + ], + outputs=[ + io.String.Output(), + ], + ) + + @classmethod + def execute(cls, dictionary, indent=2): + return io.NodeOutput(_dump_json(dictionary, indent)) + + +class ConvertArrayToString(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ConvertArrayToString", + display_name="Convert Array to String", + category="text", + search_aliases=["json", "list to json", "stringify", "serialize", "list to string", "array to json"], + inputs=[ + io.Array.Input("array"), + io.Int.Input("indent", default=2, min=0, max=8, + tooltip="Spaces per indent level. 0 produces compact single-line string."), + ], + outputs=[ + io.String.Output(), + ], + ) + + @classmethod + def execute(cls, array, indent=2): + return io.NodeOutput(_dump_json(array, indent)) + + class StringExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: @@ -457,6 +508,8 @@ async def get_node_list(self) -> list[type[io.ComfyNode]]: RegexExtract, RegexReplace, JsonExtractString, + ConvertDictionaryToString, + ConvertArrayToString, ] async def comfy_entrypoint() -> StringExtension: diff --git a/nodes.py b/nodes.py index b1a663f4cbc8..c7fbd3475a2c 100644 --- a/nodes.py +++ b/nodes.py @@ -2368,6 +2368,8 @@ async def init_builtin_extra_nodes(): "nodes_images.py", "nodes_video_model.py", "nodes_ideogram4.py", + "nodes_bounding_boxes.py", + "nodes_json_prompt.py", "nodes_train.py", "nodes_dataset.py", "nodes_sag.py",