From e84a3ef63c452800c2884fc195affedef07b9b31 Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 1 Jun 2026 23:55:40 +0530 Subject: [PATCH 01/30] update --- src/diffusers/commands/diffusers_cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/commands/diffusers_cli.py b/src/diffusers/commands/diffusers_cli.py index a27ac24f2a3e..95b8dd5f3938 100644 --- a/src/diffusers/commands/diffusers_cli.py +++ b/src/diffusers/commands/diffusers_cli.py @@ -15,6 +15,7 @@ from argparse import ArgumentParser +from .agentic import register_agentic_commands from .custom_blocks import CustomBlocksCommand from .env import EnvironmentCommand from .fp16_safetensors import FP16SafetensorsCommand @@ -28,6 +29,7 @@ def main(): EnvironmentCommand.register_subcommand(commands_parser) FP16SafetensorsCommand.register_subcommand(commands_parser) CustomBlocksCommand.register_subcommand(commands_parser) + register_agentic_commands(commands_parser) # Let's go args = parser.parse_args() From 59be75317e59bbb4b295a1c57fdf949843fa0866 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 2 Jun 2026 00:03:03 +0530 Subject: [PATCH 02/30] update --- src/diffusers/commands/agentic/README.md | 246 ++++++++++ src/diffusers/commands/agentic/__init__.py | 18 + src/diffusers/commands/agentic/_common.py | 533 +++++++++++++++++++++ src/diffusers/commands/agentic/app.py | 38 ++ src/diffusers/commands/agentic/audio.py | 133 +++++ src/diffusers/commands/agentic/image.py | 170 +++++++ src/diffusers/commands/agentic/modular.py | 249 ++++++++++ src/diffusers/commands/agentic/tasks.py | 91 ++++ src/diffusers/commands/agentic/video.py | 147 ++++++ 9 files changed, 1625 insertions(+) create mode 100644 src/diffusers/commands/agentic/README.md create mode 100644 src/diffusers/commands/agentic/__init__.py create mode 100644 src/diffusers/commands/agentic/_common.py create mode 100644 src/diffusers/commands/agentic/app.py create mode 100644 src/diffusers/commands/agentic/audio.py create mode 100644 
src/diffusers/commands/agentic/image.py create mode 100644 src/diffusers/commands/agentic/modular.py create mode 100644 src/diffusers/commands/agentic/tasks.py create mode 100644 src/diffusers/commands/agentic/video.py diff --git a/src/diffusers/commands/agentic/README.md b/src/diffusers/commands/agentic/README.md new file mode 100644 index 000000000000..07f261692cd3 --- /dev/null +++ b/src/diffusers/commands/agentic/README.md @@ -0,0 +1,246 @@ +# Agentic CLI for Diffusers + +Single-command access to common Diffusers use-cases. Designed for AI agents +and humans who need to run image/video/audio generation **without writing +Python scripts**. + +Every command below is reachable as `diffusers-cli `. Run +`diffusers-cli --help` for full option documentation. + +## How it works + +The module integrates with the main CLI through a single function call in +`diffusers_cli.py` — removing it disables everything with no side effects. + +``` +src/diffusers/commands/agentic/ +├── app.py # register_agentic_commands(subparsers) — single integration point +├── _common.py # Shared helpers (arg groups, pipeline detection, loading, remote, I/O) +├── image.py # text-to-image, image-to-image, inpaint +├── video.py # text-to-video, image-to-video +├── audio.py # text-to-audio +├── modular.py # generic ModularPipeline runner with free-form inputs +└── tasks.py # `tasks` — list every registered agentic command +``` + +## Discovering tasks + +```bash +diffusers-cli tasks # human-readable +diffusers-cli tasks --json # for agents +``` + +## Pipeline detection (DiffusionPipeline vs ModularPipeline) + +Every inference command auto-detects whether the `--model` is a regular +`DiffusionPipeline` repo (`model_index.json`) or a custom +`ModularPipeline` repo (`modular_model_index.json`) via a single Hub +listing — no weights are downloaded. If you point a task-shaped command at +a modular repo, it exits with a hint to use `diffusers-cli modular` +instead. 
The reverse is also true: `modular` rejects a regular repo and +points back at the task-shaped command. + +## Pushing outputs to a bucket + +Every inference command (and `modular`) accepts `--push-to ` +to upload the generated files to a Hugging Face **bucket** after they're +saved locally. The bucket is created if it doesn't exist and files land +under a prefix named after the task (e.g. `text-to-image/`). + +```bash +diffusers-cli text-to-image \ + --model stabilityai/stable-diffusion-xl-base-1.0 \ + --prompt "a watercolor of a fox" \ + --num-images 4 \ + --push-to your-username/cli-generations +``` + +The upload is a single `batch_bucket_files` round-trip regardless of how +many files were generated. The JSON payload reports `hf://buckets/...` +URIs so an agent can pipe them into a follow-up tool. + +## Running remotely (HF Jobs) and fetching outputs back + +Every inference command supports `--remote`, which submits the same call +to Hugging Face Jobs via `huggingface_hub.run_uv_job`, then by default +**waits for the job to finish and downloads the outputs back to your +local machine**. + +The flow: + +1. If `--push-to` isn't set, default it to `/jobs-artifacts` + (the canonical jobs bucket — `https://huggingface.co/buckets//jobs-artifacts`). +2. Generate a random `run_id` and pass it via `DIFFUSERS_CLI_RUN_ID` env + so the container writes its files under `/` inside the bucket. +3. Submit the job (your `HF_TOKEN` is forwarded as a secret). +4. Poll `inspect_job` every `--poll-interval` seconds until the stage is + `COMPLETED` / `CANCELED` / `ERROR` / `DELETED`. +5. List `/` in the bucket and `download_bucket_files` everything + into the local `--output` directory (default `./outputs/`). + +Pass `--no-wait` to fire-and-forget — the command prints the job id and +returns immediately; you can fetch later via `huggingface-cli buckets`. 
+ +| Option | Description | +|--------|-------------| +| `--remote` | Run on HF Jobs instead of locally | +| `--flavor` | Hardware flavor (default `a10g-small`) | +| `--timeout` | Job timeout (e.g. `30m`, `2h`) | +| `--dependencies` | Extra pip deps. Repeat for multiple | +| `--namespace` | HF namespace (defaults to the current user) | +| `--no-wait` | Skip polling/download — submit and exit | +| `--poll-interval` | Seconds between job-status polls (default 5) | + +```bash +# Submit text-to-image to HF Jobs on an A100, wait, download to ./outputs/ +diffusers-cli text-to-image \ + --model stabilityai/stable-diffusion-xl-base-1.0 \ + --prompt "a watercolor of a fox in autumn leaves" \ + --num-images 4 \ + --remote --flavor a100-large --timeout 30m +``` + +```bash +# Same call, fire-and-forget +diffusers-cli text-to-image ... --remote --no-wait +``` + +## Common options + +Every inference command supports: + +| Option | Description | +|--------|-------------| +| `--model` / `-m` | Model id on the Hub or local path | +| `--device` | `cpu`, `cuda`, `cuda:0`, `mps` (defaults to best available) | +| `--dtype` | `auto`, `float16`, `bfloat16`, `float32` | +| `--variant` | Optional weight variant (e.g. `fp16`) | +| `--revision` | Model revision (branch, tag, or SHA) | +| `--token` | Hugging Face token for gated/private models | +| `--trust-remote-code` | Allow custom code from the Hub | +| `--output` / `-o` | Output file or directory | +| `--json` | Machine-readable JSON summary on stdout | +| `--seed` | Random seed for reproducibility | +| `--pipeline-kwargs` | JSON object of extra kwargs forwarded to the pipeline call | + +## Commands + +### Image + +1. Generate an image from a text prompt + ```bash + diffusers-cli text-to-image \ + --model stabilityai/stable-diffusion-xl-base-1.0 \ + --prompt "a watercolor of a fox in autumn leaves" \ + --output fox.png + ``` + +2. 
Generate with explicit sampling controls + ```bash + diffusers-cli text-to-image \ + --model stabilityai/stable-diffusion-xl-base-1.0 \ + --prompt "studio portrait of a cyberpunk hacker" \ + --negative-prompt "blurry, low quality" \ + --num-inference-steps 30 \ + --guidance-scale 7.5 \ + --height 1024 --width 1024 \ + --seed 42 + ``` + +3. Generate multiple variants at once + ```bash + diffusers-cli text-to-image \ + --model black-forest-labs/FLUX.1-schnell \ + --prompt "a still life with citrus and ceramics" \ + --num-images 4 \ + --output ./outputs/still-life/ + ``` + +4. Transform an existing image with a prompt (image-to-image) + ```bash + diffusers-cli image-to-image \ + --model stabilityai/stable-diffusion-xl-refiner-1.0 \ + --image input.jpg \ + --prompt "make it look like an oil painting" \ + --strength 0.6 \ + --output painted.png + ``` + +5. Inpaint a masked region of an image + ```bash + diffusers-cli inpaint \ + --model stabilityai/stable-diffusion-2-inpainting \ + --image photo.png \ + --mask mask.png \ + --prompt "a golden retriever sitting on the bench" \ + --output filled.png + ``` + +6. Emit JSON for downstream tooling + ```bash + diffusers-cli text-to-image \ + --model stabilityai/sdxl-turbo \ + --prompt "neon city at night" \ + --json + ``` + +### Video + +7. Generate a short clip from a text prompt + ```bash + diffusers-cli text-to-video \ + --model THUDM/CogVideoX-2b \ + --prompt "a panda surfing on a wave at sunset" \ + --num-frames 49 \ + --fps 8 \ + --output panda.mp4 + ``` + +8. Animate a single still image + ```bash + diffusers-cli image-to-video \ + --model stabilityai/stable-video-diffusion-img2vid-xt \ + --image still.png \ + --prompt "subtle camera dolly forward" \ + --num-frames 25 \ + --output animated.mp4 + ``` + +### Audio + +9. 
Generate music or a sound effect from a text prompt + ```bash + diffusers-cli text-to-audio \ + --model cvssp/audioldm2 \ + --prompt "a calm piano melody in a quiet room" \ + --audio-length-in-s 10 \ + --output music.wav + ``` + +### Modular pipelines + +Modular pipelines have an open-ended input surface defined by the block +graph, so the CLI doesn't try to predict it — pass inputs verbatim. + +10. Run a modular pipeline with free-form inputs + ```bash + diffusers-cli modular \ + --model your-username/my-modular-pipeline \ + --inputs prompt="a calm landscape" \ + --inputs num_inference_steps=25 \ + --inputs-json '{"guidance_scale": 4.5}' \ + --output-key image \ + --output out.png + ``` + +The output type is auto-detected — a PIL image (or list of PIL images) +becomes PNG(s), a sequence of frames becomes an MP4, a numpy audio array +becomes a WAV, and anything else is JSON-serialized. + +### Roadmap + +Open an issue if you'd like to help land one: + +- **Video**: `video-to-video` +- **Conditioning**: ControlNet, T2I-Adapter, instruction editing (Flux-Kontext, InstructPix2Pix) +- **Quantization / export**: `convert` (fp16/safetensors/GGUF), `quantize` (bitsandbytes, torchao) diff --git a/src/diffusers/commands/agentic/__init__.py b/src/diffusers/commands/agentic/__init__.py new file mode 100644 index 000000000000..9076bc7ee4c9 --- /dev/null +++ b/src/diffusers/commands/agentic/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .app import register_agentic_commands + + +__all__ = ["register_agentic_commands"] diff --git a/src/diffusers/commands/agentic/_common.py b/src/diffusers/commands/agentic/_common.py new file mode 100644 index 000000000000..6910f9daa4db --- /dev/null +++ b/src/diffusers/commands/agentic/_common.py @@ -0,0 +1,533 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared helpers for the agentic CLI surface. + +These utilities are intentionally small and dependency-light. Each diffusers +agentic subcommand should be able to be read end-to-end by an agent without +needing to follow many layers of indirection. 
def add_loading_arguments(parser: ArgumentParser) -> None:
    """Register the model-loading options used by every inference subcommand.

    Adds ``--model``/``-m`` (required), ``--device``, ``--dtype``,
    ``--variant``, ``--revision``, ``--token`` and ``--trust-remote-code``
    to ``parser``, in that order.
    """
    # Each entry is (flag names, add_argument keyword options). Keeping the
    # options as data makes the registration order -- and therefore the order
    # shown by ``--help`` -- visible at a glance.
    option_table = (
        (
            ("--model", "-m"),
            {"required": True, "help": "Model id on the Hugging Face Hub or local path."},
        ),
        (
            ("--device",),
            {"default": None, "help": "Device to run on (e.g. cpu, cuda, cuda:0, mps)."},
        ),
        (
            ("--dtype",),
            {"default": "auto", "choices": DTYPE_CHOICES, "help": "Torch dtype for pipeline weights."},
        ),
        (
            ("--variant",),
            {"default": None, "help": 'Optional weight variant (e.g. "fp16").'},
        ),
        (
            ("--revision",),
            {"default": None, "help": "Model revision (branch, tag, or commit SHA)."},
        ),
        (
            ("--token",),
            {"default": None, "help": "Hugging Face token for gated/private models."},
        ),
        (
            ("--trust-remote-code",),
            {"action": "store_true", "help": "Allow custom code from the Hub."},
        ),
    )
    for flag_names, options in option_table:
        parser.add_argument(*flag_names, **options)
" + "Only Hub-hosted kernels are exposed — they auto-download on first " + "use and avoid a local install. 'default' leaves the backend untouched." + ), + ) + parser.add_argument("--vae-tiling", action="store_true", help="Enable VAE tiling (lower peak VRAM).") + parser.add_argument("--vae-slicing", action="store_true", help="Enable VAE slicing (lower peak VRAM).") + parser.add_argument( + "--context-parallel", + action="store_true", + help=( + "Enable Ulysses-style context parallelism (ulysses_anything mode, supports arbitrary " + "sequence lengths). Requires launching the CLI under torchrun with ≥2 GPUs." + ), + ) + + +def add_generation_arguments(parser: ArgumentParser) -> None: + """Arguments shared by image/video generation subcommands.""" + parser.add_argument("--prompt", "-p", default=None, help="Text prompt.") + parser.add_argument("--negative-prompt", default=None, help="Negative text prompt.") + parser.add_argument("--num-inference-steps", type=int, default=None, help="Number of denoising steps.") + parser.add_argument("--guidance-scale", type=float, default=None, help="Classifier-free guidance scale.") + parser.add_argument("--height", type=int, default=None, help="Output height in pixels.") + parser.add_argument("--width", type=int, default=None, help="Output width in pixels.") + parser.add_argument("--num-images", type=int, default=1, help="Number of images to generate.") + parser.add_argument("--seed", type=int, default=None, help="Random seed for reproducibility.") + parser.add_argument( + "--pipeline-kwargs", + default=None, + help="JSON object of extra kwargs forwarded to the pipeline call.", + ) + + +def add_output_arguments(parser: ArgumentParser) -> None: + """Output formatting arguments.""" + parser.add_argument( + "--output", + "-o", + default=None, + help="Output file or directory. 
def resolve_dtype(name: Optional[str]):
    """Map a CLI dtype string to a torch dtype.

    Both the long names (``float32``, ``float16``, ``bfloat16``) and the short
    aliases (``fp32``, ``fp16``, ``bf16``) are accepted, matching the values
    offered by ``--dtype``.

    Args:
        name: The ``--dtype`` value. ``None`` is treated like ``"auto"``.

    Returns:
        The string ``"auto"`` when no explicit dtype was requested, otherwise
        the corresponding ``torch.dtype``.

    Raises:
        ValueError: If ``name`` is not one of the recognised spellings.
    """
    if name in (None, "auto"):
        return "auto"

    # Deferred import: torch is only needed once an explicit dtype is requested.
    import torch

    mapping = {
        "float32": torch.float32,
        "fp32": torch.float32,
        "float16": torch.float16,
        "fp16": torch.float16,
        "bfloat16": torch.bfloat16,
        "bf16": torch.bfloat16,
    }
    try:
        return mapping[name]
    except KeyError:
        raise ValueError(f"Unknown dtype: {name}") from None
the pipeline ready to run on ``device``. + + Calls ``.to(device)`` by default; when ``--cpu-offload`` is set the chosen + offload helper (``model``, ``sequential``, or ``group``) handles placement instead. + """ + if args.cpu_offload is None: + return pipeline.to(device) + if args.cpu_offload == "model": + pipeline.enable_model_cpu_offload(device=device) + elif args.cpu_offload == "group": + import torch + + pipeline.enable_group_offload( + onload_device=torch.device(device), + offload_type="leaf_level", + use_stream=device.startswith("cuda"), + ) + return pipeline + + +def _enable_context_parallel(pipeline: Any) -> None: + """Enable Ulysses-style context-parallel inference on the transformer/UNet.""" + import torch + + if not torch.distributed.is_available() or not torch.distributed.is_initialized(): + raise SystemExit( + "--context-parallel requires torch.distributed to be initialized. " + "Launch the CLI under torchrun, e.g.: " + "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli ...`." 
def _set_attention_backend(pipeline: Any, backend: str) -> None:
    """Best-effort switch of the attention backend on the transformer/UNet.

    The first of ``pipeline.transformer`` / ``pipeline.unet`` that exposes
    ``set_attention_backend`` receives the request; the other is not touched.
    A failure never aborts the run, but it is reported as a ``RuntimeWarning``
    so an explicitly requested backend is not dropped silently.

    Args:
        pipeline: A loaded pipeline object.
        backend: Backend name as given to ``--attention-backend``.
    """
    import warnings

    for attr in ("transformer", "unet"):
        module = getattr(pipeline, attr, None)
        if module is None or not hasattr(module, "set_attention_backend"):
            continue
        try:
            module.set_attention_backend(backend)
        except (ValueError, ImportError, RuntimeError) as err:
            # Still best-effort: generation continues without the backend.
            warnings.warn(
                f"Could not set attention backend {backend!r} on pipeline.{attr}: {err}. "
                "Continuing without it.",
                RuntimeWarning,
                stacklevel=2,
            )
        return

    warnings.warn(
        f"Attention backend {backend!r} was not applied: the pipeline has no "
        "transformer/unet exposing set_attention_backend.",
        RuntimeWarning,
        stacklevel=2,
    )
# Namespace keys that control *how* a remote job runs locally, not what runs
# inside the container. They are stripped when forwarding argv to the container.
HF_JOBS_KEYS = frozenset(
    {"remote", "flavor", "timeout", "dependencies", "namespace", "no_wait", "poll_interval", "func"}
)

# Credentials must never be copied into the job's argv, where they would be
# stored in plain text with the job. ``maybe_submit_remote`` already hands the
# token to the job as the ``HF_TOKEN`` secret.
_SECRET_KEYS = frozenset({"token"})


def _forward_args(args: Namespace, task: str) -> list[str]:
    """Reconstruct argv for the remote container from a parsed Namespace.

    Args:
        args: The locally parsed CLI namespace.
        task: Subcommand name; becomes the first element of the result.

    Returns:
        ``[task, *flags]`` where each namespace entry is rendered as follows:

        - keys in ``HF_JOBS_KEYS`` (local job control) and ``_SECRET_KEYS``
          (credentials) are dropped;
        - ``None`` and ``False`` values are dropped;
        - ``True`` becomes a bare ``--flag``;
        - list/tuple values become repeated ``--flag value`` pairs;
        - anything else becomes ``--flag str(value)``.
    """
    out: list[str] = [task]
    for key, value in vars(args).items():
        if key in HF_JOBS_KEYS or key in _SECRET_KEYS:
            continue
        if value is None or value is False:
            continue
        flag = "--" + key.replace("_", "-")
        if value is True:
            out.append(flag)
        elif isinstance(value, (list, tuple)):
            for item in value:
                out.extend((flag, str(item)))
        else:
            out.extend((flag, str(value)))
    return out
def _wait_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval: float) -> str:
    """Poll ``inspect_job`` until the job reaches a terminal stage.

    Args:
        api: Object exposing ``inspect_job(job_id=..., namespace=...)``.
        job_id: Id of the submitted job.
        namespace: Namespace the job runs under, or ``None``.
        poll_interval: Seconds to sleep between polls; negative values are
            treated as ``0``.

    Returns:
        The terminal stage as a plain string: ``"COMPLETED"``, ``"CANCELED"``,
        ``"ERROR"`` or ``"DELETED"``.
    """
    import time

    terminal = {"COMPLETED", "CANCELED", "ERROR", "DELETED"}
    delay = max(poll_interval, 0.0)
    while True:
        info = api.inspect_job(job_id=job_id, namespace=namespace)
        if info.status:
            raw_stage = info.status.stage
            # ``str()`` on a str-mixin Enum member yields "ClassName.MEMBER",
            # which would never match ``terminal`` and would poll forever.
            # Unwrap ``.value`` when present so enum and plain-string stages
            # compare the same way.
            stage = str(getattr(raw_stage, "value", raw_stage))
        else:
            stage = "UNKNOWN"
        if stage in terminal:
            return stage
        time.sleep(delay)
def format_result(args: Namespace, payload: dict[str, Any]) -> None:
    """Write the run summary to stdout.

    With ``--json`` the whole ``payload`` is emitted as a single JSON line
    (values that are not JSON-native are stringified). Otherwise each entry
    of ``payload["outputs"]`` is printed on its own line; if there are no
    outputs, the payload dict itself is printed.
    """
    if not args.json:
        produced = payload.get("outputs", [])
        if not produced:
            print(payload)
            return
        for item in produced:
            print(item)
        return

    # Machine-readable mode: exactly one JSON document followed by a newline.
    json.dump(payload, sys.stdout, default=str)
    sys.stdout.write("\n")
def register_agentic_commands(subparsers: _SubParsersAction) -> None:
    """Attach all agentic subcommands to the top-level ``diffusers-cli`` parser.

    Each command module's ``register`` is called with ``subparsers``, in the
    order image, video, audio, modular, tasks.
    """
    command_modules = (
        image_commands,
        video_commands,
        audio_commands,
        modular_commands,
        tasks_commands,
    )
    for command_module in command_modules:
        command_module.register(subparsers)
import _common + + +def register(subparsers: _SubParsersAction) -> None: + Text2AudioCommand.register_subcommand(subparsers) + + +def _save_audio(audios, sampling_rate: int, args: Namespace, task: str) -> list[str]: + """Save one or more audio arrays as WAV files.""" + import numpy as np + from scipy.io.wavfile import write as wavfile_write + + paths = _common.default_output_paths(task, len(audios), args.output, ext="wav") + saved: list[str] = [] + for audio, path in zip(audios, paths): + data = np.asarray(audio) + if data.dtype.kind == "f": + data = np.clip(data, -1.0, 1.0) + data = (data * 32767).astype(np.int16) + if data.ndim > 1 and data.shape[0] < data.shape[-1]: + # ``(channels, samples)`` → ``(samples, channels)`` for scipy. + data = data.T + wavfile_write(str(path), sampling_rate, data) + saved.append(str(path)) + return saved + + +class Text2AudioCommand(BaseDiffusersCLICommand): + task = "text-to-audio" + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "text-to-audio", + help="Generate an audio clip (music or sound) from a text prompt.", + ) + _common.add_loading_arguments(parser) + _common.add_optimization_arguments(parser) + _common.add_generation_arguments(parser) + _common.add_remote_arguments(parser) + parser.add_argument( + "--audio-length-in-s", + type=float, + default=None, + help="Duration of the generated audio in seconds.", + ) + parser.add_argument( + "--sampling-rate", + type=int, + default=None, + help="Override the sampling rate written to the WAV file.", + ) + _common.add_output_arguments(parser) + parser.set_defaults(func=Text2AudioCommand) + + def __init__(self, args: Namespace): + self.args = args + + def run(self) -> None: + if _common.maybe_submit_remote(self.args, self.task): + return + pipeline = _common.load_pipeline(self.args, "DiffusionPipeline") + + call_kwargs: dict = {} + if self.args.prompt is not None: + call_kwargs["prompt"] = 
self.args.prompt + if self.args.negative_prompt is not None: + call_kwargs["negative_prompt"] = self.args.negative_prompt + if self.args.num_inference_steps is not None: + call_kwargs["num_inference_steps"] = self.args.num_inference_steps + if self.args.guidance_scale is not None: + call_kwargs["guidance_scale"] = self.args.guidance_scale + if self.args.audio_length_in_s is not None: + call_kwargs["audio_length_in_s"] = self.args.audio_length_in_s + if self.args.num_images != 1: + call_kwargs["num_waveforms_per_prompt"] = self.args.num_images + + generator = _common.get_generator(self.args.seed, pipeline.device.type) + if generator is not None: + call_kwargs["generator"] = generator + + call_kwargs.update(_common.parse_pipeline_kwargs(self.args.pipeline_kwargs)) + + result = pipeline(**call_kwargs) + audios = getattr(result, "audios", None) + if audios is None: + audios = result[0] + + sampling_rate = self.args.sampling_rate + if sampling_rate is None: + pipeline_sr = getattr(pipeline, "sampling_rate", None) + if isinstance(pipeline_sr, int): + sampling_rate = pipeline_sr + else: + vocoder_config = getattr(getattr(pipeline, "vocoder", None), "config", None) + sampling_rate = getattr(vocoder_config, "sampling_rate", 16000) if vocoder_config else 16000 + + saved = _save_audio(audios, sampling_rate, self.args, self.task) + pushed = _common.push_outputs(self.args, saved, self.task) + + _common.format_result( + self.args, + { + "task": self.task, + "model": self.args.model, + "device": pipeline.device.type, + "outputs": saved, + "pushed": pushed, + "sampling_rate": sampling_rate, + "seed": self.args.seed, + }, + ) diff --git a/src/diffusers/commands/agentic/image.py b/src/diffusers/commands/agentic/image.py new file mode 100644 index 000000000000..94fdd81d6953 --- /dev/null +++ b/src/diffusers/commands/agentic/image.py @@ -0,0 +1,170 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Image-generation subcommands: text-to-image, image-to-image, inpaint.""" + +from __future__ import annotations + +from argparse import ArgumentParser, Namespace, _SubParsersAction + +from diffusers.utils import load_image + +from .. import BaseDiffusersCLICommand +from . import _common + + +def register(subparsers: _SubParsersAction) -> None: + Text2ImageCommand.register_subcommand(subparsers) + Image2ImageCommand.register_subcommand(subparsers) + InpaintCommand.register_subcommand(subparsers) + + +def _build_call_kwargs(args: Namespace, pipeline) -> dict: + kwargs: dict = {} + if args.prompt is not None: + kwargs["prompt"] = args.prompt + if args.negative_prompt is not None: + kwargs["negative_prompt"] = args.negative_prompt + if args.num_inference_steps is not None: + kwargs["num_inference_steps"] = args.num_inference_steps + if args.guidance_scale is not None: + kwargs["guidance_scale"] = args.guidance_scale + if args.height is not None: + kwargs["height"] = args.height + if args.width is not None: + kwargs["width"] = args.width + if args.num_images != 1: + kwargs["num_images_per_prompt"] = args.num_images + + generator = _common.get_generator(args.seed, pipeline.device.type) + if generator is not None: + kwargs["generator"] = generator + + kwargs.update(_common.parse_pipeline_kwargs(args.pipeline_kwargs)) + return kwargs + + +def _save_images(images, task: str, args: Namespace) -> list[str]: + paths = 
_common.default_output_paths(task, len(images), args.output, ext="png") + saved: list[str] = [] + for image, path in zip(images, paths): + image.save(path) + saved.append(str(path)) + return saved + + +class _BaseImageCommand(BaseDiffusersCLICommand): + task: str = "" + auto_cls: str = "" + + def __init__(self, args: Namespace): + self.args = args + + def run(self) -> None: + if _common.maybe_submit_remote(self.args, self.task): + return + + pipeline = _common.load_pipeline(self.args, self.auto_cls) + call_kwargs = _build_call_kwargs(self.args, pipeline) + self._attach_inputs(call_kwargs) + + result = pipeline(**call_kwargs) + saved = _save_images(result.images, self.task, self.args) + pushed = _common.push_outputs(self.args, saved, self.task) + + _common.format_result( + self.args, + { + "task": self.task, + "model": self.args.model, + "device": pipeline.device.type, + "outputs": saved, + "pushed": pushed, + "seed": self.args.seed, + }, + ) + + def _attach_inputs(self, call_kwargs: dict) -> None: # noqa: B027 + """Hook for subclasses to attach image/mask conditioning.""" + + +class Text2ImageCommand(_BaseImageCommand): + task = "text-to-image" + auto_cls = "AutoPipelineForText2Image" + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "text-to-image", + help="Generate an image from a text prompt.", + ) + _common.add_loading_arguments(parser) + _common.add_optimization_arguments(parser) + _common.add_generation_arguments(parser) + _common.add_remote_arguments(parser) + _common.add_output_arguments(parser) + parser.set_defaults(func=Text2ImageCommand) + + +class Image2ImageCommand(_BaseImageCommand): + task = "image-to-image" + auto_cls = "AutoPipelineForImage2Image" + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "image-to-image", + help="Transform an input image conditioned on a text prompt.", + 
) + _common.add_loading_arguments(parser) + _common.add_optimization_arguments(parser) + _common.add_generation_arguments(parser) + _common.add_remote_arguments(parser) + _common.add_output_arguments(parser) + + parser.add_argument("--image", required=True, help="Path or URL to the conditioning image.") + parser.add_argument("--strength", type=float, default=None, help="How much to transform the input (0-1).") + parser.set_defaults(func=Image2ImageCommand) + + def _attach_inputs(self, call_kwargs: dict) -> None: + call_kwargs["image"] = load_image(self.args.image) + if self.args.strength is not None: + call_kwargs["strength"] = self.args.strength + + +class InpaintCommand(_BaseImageCommand): + task = "inpaint" + auto_cls = "AutoPipelineForInpainting" + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "inpaint", + help="Inpaint a region of an image defined by a mask.", + ) + _common.add_loading_arguments(parser) + _common.add_optimization_arguments(parser) + _common.add_generation_arguments(parser) + _common.add_remote_arguments(parser) + _common.add_output_arguments(parser) + parser.add_argument("--image", required=True, help="Path or URL to the base image.") + parser.add_argument("--mask", required=True, help="Path or URL to the mask image (white=inpaint).") + parser.add_argument("--strength", type=float, default=None, help="Strength of the inpainting transform (0-1).") + parser.set_defaults(func=InpaintCommand) + + def _attach_inputs(self, call_kwargs: dict) -> None: + call_kwargs["image"] = load_image(self.args.image) + call_kwargs["mask_image"] = load_image(self.args.mask) + if self.args.strength is not None: + call_kwargs["strength"] = self.args.strength diff --git a/src/diffusers/commands/agentic/modular.py b/src/diffusers/commands/agentic/modular.py new file mode 100644 index 000000000000..304c8b17329f --- /dev/null +++ b/src/diffusers/commands/agentic/modular.py @@ -0,0 
+1,249 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""``diffusers-cli modular`` — run a custom ModularPipeline. + +Modular pipelines don't fit the ``task -> AutoPipelineFor*`` taxonomy: the +pipeline blocks themselves define the surface. This command takes free-form +``--inputs key=value`` (or a JSON blob) and forwards them to the modular +pipeline call, then auto-detects the result type so the agent doesn't need +to know whether it asked for an image, video, or audio output. +""" + +from __future__ import annotations + +import json +from argparse import ArgumentParser, Namespace, _SubParsersAction +from pathlib import Path +from typing import Any + +from .. import BaseDiffusersCLICommand +from . import _common + + +def register(subparsers: _SubParsersAction) -> None: + ModularCommand.register_subcommand(subparsers) + + +def _parse_inputs(args: Namespace) -> dict[str, Any]: + """Combine ``--inputs-json`` and repeated ``--inputs key=value`` into one dict. + + Values from ``--inputs`` are JSON-decoded when possible (so booleans, + numbers, lists, and nested objects survive); plain strings fall back to + raw text. 
+ """ + out: dict[str, Any] = {} + if args.inputs_json: + try: + decoded = json.loads(args.inputs_json) + except json.JSONDecodeError as e: + raise SystemExit(f"--inputs-json must be valid JSON: {e}") from e + if not isinstance(decoded, dict): + raise SystemExit("--inputs-json must decode to a JSON object.") + out.update(decoded) + + for pair in args.inputs or []: + if "=" not in pair: + raise SystemExit(f"--inputs entries must look like key=value, got {pair!r}.") + key, _, raw = pair.partition("=") + try: + out[key] = json.loads(raw) + except json.JSONDecodeError: + out[key] = raw + return out + + +def _save_auto(value: Any, args: Namespace, task: str) -> list[str]: + """Save ``value`` based on its runtime type and return the written paths.""" + pil_images = _as_pil_list(value) + if pil_images is not None: + paths = _common.default_output_paths(task, len(pil_images), args.output, ext="png") + for img, path in zip(pil_images, paths): + img.save(path) + return [str(p) for p in paths] + + frames = _as_frame_sequence(value) + if frames is not None: + from diffusers.utils import export_to_video + + path = _common.default_output_paths(task, 1, args.output, ext="mp4")[0] + export_to_video(frames, str(path), fps=args.fps) + return [str(path)] + + audios = _as_audio_arrays(value) + if audios is not None: + from .audio import _save_audio + + return _save_audio(audios, args.sampling_rate or 16000, args, task) + + # Fallback: dump as JSON. 
+ path = _common.default_output_paths(task, 1, args.output, ext="json")[0] + Path(path).write_text(json.dumps(value, default=str, indent=2)) + return [str(path)] + + +def _as_pil_list(value: Any): + try: + from PIL.Image import Image as PILImage + except ImportError: + return None + if isinstance(value, PILImage): + return [value] + if isinstance(value, (list, tuple)) and value and all(isinstance(v, PILImage) for v in value): + return list(value) + return None + + +def _as_frame_sequence(value: Any): + """A frame sequence is a list of PIL images or numpy frames meant to be a single clip.""" + try: + from PIL.Image import Image as PILImage + except ImportError: + PILImage = None # type: ignore[assignment] + + if isinstance(value, (list, tuple)) and len(value) >= 2: + first = value[0] + if PILImage is not None and isinstance(first, PILImage): + # Heuristic: distinguish "list of images we want as PNGs" from "frame sequence". + # The modular pipeline call already returned a single value, so we treat a + # homogeneous list of >=2 images as a clip. 
+ return list(value) + try: + import numpy as np + + if isinstance(first, np.ndarray): + return list(value) + except ImportError: + pass + return None + + +def _as_audio_arrays(value: Any): + try: + import numpy as np + except ImportError: + return None + if isinstance(value, np.ndarray) and value.ndim <= 2: + return [value] + if ( + isinstance(value, (list, tuple)) + and value + and all(isinstance(v, np.ndarray) for v in value) + ): + return list(value) + return None + + +class ModularCommand(BaseDiffusersCLICommand): + task = "modular" + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "modular", + help="Run a custom ModularPipeline with free-form inputs.", + ) + _common.add_loading_arguments(parser) + _common.add_optimization_arguments(parser) + parser.add_argument( + "--inputs", + action="append", + default=None, + help='Inputs as key=value (value JSON-decoded when possible). Repeat to add multiple.', + ) + parser.add_argument( + "--inputs-json", + default=None, + help="Inputs as a single JSON object (merged with any --inputs entries).", + ) + parser.add_argument( + "--output-key", + default=None, + help='Optional intermediate to extract (e.g. "image", "video"). 
' + "Forwarded to ModularPipeline as the ``output`` argument.", + ) + parser.add_argument( + "--fps", + type=int, + default=8, + help="FPS used when the output happens to be a frame sequence.", + ) + parser.add_argument( + "--sampling-rate", + type=int, + default=None, + help="Sample rate used when the output happens to be an audio array.", + ) + _common.add_remote_arguments(parser) + _common.add_output_arguments(parser) + parser.set_defaults(func=ModularCommand) + + def __init__(self, args: Namespace): + self.args = args + + def run(self) -> None: + if _common.maybe_submit_remote(self.args, self.task): + return + + pipeline = self._load_modular() + call_kwargs = _parse_inputs(self.args) + if self.args.output_key is not None: + call_kwargs["output"] = self.args.output_key + + result = pipeline(**call_kwargs) + saved = _save_auto(result, self.args, self.task) + pushed = _common.push_outputs(self.args, saved, self.task) + + _common.format_result( + self.args, + { + "task": self.task, + "model": self.args.model, + "pipeline_class": type(pipeline).__name__, + "outputs": saved, + "pushed": pushed, + "output_key": self.args.output_key, + }, + ) + + def _load_modular(self): + from diffusers import ModularPipeline + + dtype = _common.resolve_dtype(self.args.dtype) + device = _common.resolve_device(self.args.device) + + from_pretrained_kwargs: dict[str, Any] = { + "trust_remote_code": self.args.trust_remote_code, + } + if dtype != "auto": + from_pretrained_kwargs["torch_dtype"] = dtype + if self.args.revision: + from_pretrained_kwargs["revision"] = self.args.revision + if self.args.token: + from_pretrained_kwargs["token"] = self.args.token + + pipeline = ModularPipeline.from_pretrained(self.args.model, **from_pretrained_kwargs) + if not hasattr(pipeline, "to"): + return pipeline + + pipeline = _common.map_to_device(pipeline, self.args, device) + if self.args.vae_tiling and hasattr(pipeline, "enable_vae_tiling"): + pipeline.enable_vae_tiling() + if self.args.vae_slicing and 
hasattr(pipeline, "enable_vae_slicing"): + pipeline.enable_vae_slicing() + if self.args.attention_backend != "default": + _common._set_attention_backend(pipeline, self.args.attention_backend) + if self.args.context_parallel: + _common._enable_context_parallel(pipeline) + return pipeline diff --git a/src/diffusers/commands/agentic/tasks.py b/src/diffusers/commands/agentic/tasks.py new file mode 100644 index 000000000000..be2999469783 --- /dev/null +++ b/src/diffusers/commands/agentic/tasks.py @@ -0,0 +1,91 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""``diffusers-cli tasks`` — list every registered agentic subcommand. + +Designed so an agent can discover the surface area without parsing +``--help`` output. +""" + +from __future__ import annotations + +import json +import sys +from argparse import ArgumentParser, Namespace, _SubParsersAction + +from .. import BaseDiffusersCLICommand + + +AGENTIC_TASK_NAMES: tuple[str, ...] = ( + "text-to-image", + "image-to-image", + "inpaint", + "text-to-video", + "image-to-video", + "text-to-audio", + "modular", +) + + +def register(subparsers: _SubParsersAction) -> None: + ListTasksCommand.register_subcommand(subparsers, subparsers) + + +def list_agentic_tasks(subparsers: _SubParsersAction) -> list[dict]: + """Return ``[{name, description}, ...]`` for every registered agentic task. 
+ + Reads metadata directly from the live argparse subparsers so the list + can never drift from the actual commands. + """ + choices = getattr(subparsers, "choices", {}) or {} + actions = [a for a in getattr(subparsers, "_choices_actions", [])] + descriptions = {a.dest: a.help for a in actions} + + out: list[dict] = [] + for name in AGENTIC_TASK_NAMES: + if name not in choices: + continue + out.append({"name": name, "description": descriptions.get(name, "")}) + return out + + +class ListTasksCommand(BaseDiffusersCLICommand): + task = "tasks" + + # The live subparsers object is captured at registration time so ``run`` + # can introspect it without needing access to ``main``'s locals. + _root_subparsers: _SubParsersAction | None = None + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction, root_subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "tasks", + help="List every registered agentic task with a one-line description.", + ) + parser.add_argument("--json", action="store_true", help="Emit machine-readable JSON.") + parser.set_defaults(func=ListTasksCommand) + ListTasksCommand._root_subparsers = root_subparsers + + def __init__(self, args: Namespace): + self.args = args + + def run(self) -> None: + tasks = list_agentic_tasks(self._root_subparsers) if self._root_subparsers else [] + if self.args.json: + json.dump({"tasks": tasks}, sys.stdout) + sys.stdout.write("\n") + return + width = max((len(t["name"]) for t in tasks), default=0) + for entry in tasks: + print(f"{entry['name']:<{width}} {entry['description'] or ''}") diff --git a/src/diffusers/commands/agentic/video.py b/src/diffusers/commands/agentic/video.py new file mode 100644 index 000000000000..e4dcdc4bb8a2 --- /dev/null +++ b/src/diffusers/commands/agentic/video.py @@ -0,0 +1,147 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Video-generation subcommands: text-to-video, image-to-video. + +There is no AutoPipeline for video, so these commands load via +``DiffusionPipeline`` and rely on the repo's ``model_index.json`` to pick +the right pipeline class (CogVideoX, Hunyuan, LTX, Wan, etc.). +""" + +from __future__ import annotations + +from argparse import ArgumentParser, Namespace, _SubParsersAction + +from diffusers.utils import load_image + +from .. import BaseDiffusersCLICommand +from . 
import _common + + +def register(subparsers: _SubParsersAction) -> None: + Text2VideoCommand.register_subcommand(subparsers) + Image2VideoCommand.register_subcommand(subparsers) + + +def _add_video_arguments(parser: ArgumentParser) -> None: + parser.add_argument("--num-frames", type=int, default=None, help="Number of frames to generate.") + parser.add_argument("--fps", type=int, default=8, help="Frames per second for the output video.") + + +def _build_call_kwargs(args: Namespace, pipeline) -> dict: + kwargs: dict = {} + if args.prompt is not None: + kwargs["prompt"] = args.prompt + if args.negative_prompt is not None: + kwargs["negative_prompt"] = args.negative_prompt + if args.num_inference_steps is not None: + kwargs["num_inference_steps"] = args.num_inference_steps + if args.guidance_scale is not None: + kwargs["guidance_scale"] = args.guidance_scale + if args.height is not None: + kwargs["height"] = args.height + if args.width is not None: + kwargs["width"] = args.width + if args.num_frames is not None: + kwargs["num_frames"] = args.num_frames + + generator = _common.get_generator(args.seed, pipeline.device.type) + if generator is not None: + kwargs["generator"] = generator + + kwargs.update(_common.parse_pipeline_kwargs(args.pipeline_kwargs)) + return kwargs + + +def _save_video(frames, args: Namespace, task: str) -> str: + from diffusers.utils import export_to_video + + path = _common.default_output_paths(task, 1, args.output, ext="mp4")[0] + export_to_video(frames, str(path), fps=args.fps) + return str(path) + + +class _BaseVideoCommand(BaseDiffusersCLICommand): + task: str = "" + + def __init__(self, args: Namespace): + self.args = args + + def run(self) -> None: + if _common.maybe_submit_remote(self.args, self.task): + return + pipeline = _common.load_pipeline(self.args, "DiffusionPipeline") + call_kwargs = _build_call_kwargs(self.args, pipeline) + self._attach_inputs(call_kwargs) + + result = pipeline(**call_kwargs) + frames = result.frames[0] if 
hasattr(result, "frames") else result[0] + out_path = _save_video(frames, self.args, self.task) + pushed = _common.push_outputs(self.args, [out_path], self.task) + + _common.format_result( + self.args, + { + "task": self.task, + "model": self.args.model, + "device": pipeline.device.type, + "outputs": [out_path], + "pushed": pushed, + "fps": self.args.fps, + "seed": self.args.seed, + }, + ) + + def _attach_inputs(self, call_kwargs: dict) -> None: # noqa: B027 + """Hook for subclasses to attach conditioning inputs.""" + + +class Text2VideoCommand(_BaseVideoCommand): + task = "text-to-video" + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "text-to-video", + help="Generate a video clip from a text prompt.", + ) + _common.add_loading_arguments(parser) + _common.add_optimization_arguments(parser) + _common.add_generation_arguments(parser) + _add_video_arguments(parser) + _common.add_remote_arguments(parser) + _common.add_output_arguments(parser) + parser.set_defaults(func=Text2VideoCommand) + + +class Image2VideoCommand(_BaseVideoCommand): + task = "image-to-video" + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "image-to-video", + help="Generate a video clip conditioned on an input image.", + ) + _common.add_loading_arguments(parser) + _common.add_optimization_arguments(parser) + _common.add_generation_arguments(parser) + _add_video_arguments(parser) + _common.add_remote_arguments(parser) + _common.add_output_arguments(parser) + parser.add_argument("--image", required=True, help="Path or URL to the conditioning image.") + parser.set_defaults(func=Image2VideoCommand) + + def _attach_inputs(self, call_kwargs: dict) -> None: + call_kwargs["image"] = load_image(self.args.image) From 4194c3980256be65cbafc762705b27e095a94eb6 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 2 Jun 2026 00:22:19 +0530 
Subject: [PATCH 03/30] update --- src/diffusers/commands/agentic/_common.py | 55 +++++++++++++++++++++-- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/src/diffusers/commands/agentic/_common.py b/src/diffusers/commands/agentic/_common.py index 6910f9daa4db..d81937a25b76 100644 --- a/src/diffusers/commands/agentic/_common.py +++ b/src/diffusers/commands/agentic/_common.py @@ -369,6 +369,16 @@ def default_output_paths(task: str, num: int, explicit: Optional[str], ext: str ) +def _rewrite_model_arg(forwarded: list[str], new_path: str) -> list[str]: + """Return a copy of ``forwarded`` with the ``--model`` value replaced by ``new_path``.""" + out = list(forwarded) + for i, token in enumerate(out): + if token in ("--model", "-m") and i + 1 < len(out): + out[i + 1] = new_path + return out + return out + + def _forward_args(args: Namespace, task: str) -> list[str]: """Reconstruct argv for the remote container from a parsed Namespace. @@ -409,6 +419,11 @@ def maybe_submit_remote(args: Namespace, task: str) -> bool: from huggingface_hub import HfApi, get_token, run_uv_job + try: + from huggingface_hub import Volume + except ImportError: + Volume = None + hf_token = args.token or get_token() api = HfApi(token=hf_token) @@ -423,9 +438,15 @@ def maybe_submit_remote(args: Namespace, task: str) -> bool: dependencies.extend(args.dependencies) secrets = {"HF_TOKEN": hf_token} if hf_token else None - env = {RUN_ID_ENV: run_id} + env = { + RUN_ID_ENV: run_id, + "HF_ENABLE_PARALLEL_LOADING": "1", # thread-pool the safetensors load step + } - job = run_uv_job( + # Mount the model repo into the job's filesystem so the container reads it + # from local disk instead of downloading on every run. Requires + # huggingface_hub >= 1.16. Falls back to the download path otherwise. 
+ run_uv_job_kwargs: dict[str, Any] = dict( script=_UV_RUNNER_SCRIPT, script_args=forwarded, dependencies=dependencies, @@ -436,6 +457,14 @@ def maybe_submit_remote(args: Namespace, task: str) -> bool: env=env, token=hf_token, ) + if Volume is not None and not Path(args.model).exists(): + mount_path = "/model" + run_uv_job_kwargs["volumes"] = [ + Volume(type="model", source=args.model, mount_path=mount_path) + ] + run_uv_job_kwargs["script_args"] = _rewrite_model_arg(forwarded, mount_path) + + job = run_uv_job(**run_uv_job_kwargs) payload: dict[str, Any] = { "task": "remote-submit", @@ -450,6 +479,12 @@ def maybe_submit_remote(args: Namespace, task: str) -> bool: format_result(args, payload) return True + print( + f"[diffusers-cli] submitted job {job.id} (run_id={run_id}); " + f"watch at {getattr(job, 'url', 'https://huggingface.co/jobs')}", + file=sys.stderr, + flush=True, + ) final_status = _wait_for_job(api, job.id, args.namespace, args.poll_interval) payload["job_status"] = final_status payload["outputs"] = _download_job_artifacts(api, args.push_to, run_id, args.output) @@ -458,14 +493,28 @@ def maybe_submit_remote(args: Namespace, task: str) -> bool: def _wait_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval: float) -> str: - """Poll ``inspect_job`` until the job reaches a terminal stage; return that stage as a string.""" + """Poll ``inspect_job`` until the job reaches a terminal stage; return that stage as a string. + + Prints a heartbeat each poll and a labelled line on every stage transition so + the local terminal isn't silent for the multi-minute install/download/run + window of a remote inference job. 
+ """ import time terminal = {"COMPLETED", "CANCELED", "ERROR", "DELETED"} + last_stage: Optional[str] = None while True: info = api.inspect_job(job_id=job_id, namespace=namespace) stage = str(info.status.stage) if info.status else "UNKNOWN" + if stage != last_stage: + if last_stage is not None: + print("", file=sys.stderr, flush=True) + print(f"[diffusers-cli] job {job_id}: {stage}", file=sys.stderr, flush=True) + last_stage = stage + else: + print(".", end="", file=sys.stderr, flush=True) if stage in terminal: + print("", file=sys.stderr, flush=True) return stage time.sleep(poll_interval) From d8eb952a1546cb881e8824250dd8f78669ed1dc5 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 2 Jun 2026 00:26:01 +0530 Subject: [PATCH 04/30] update --- src/diffusers/commands/agentic/_common.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/diffusers/commands/agentic/_common.py b/src/diffusers/commands/agentic/_common.py index d81937a25b76..bd0cb1fd0de4 100644 --- a/src/diffusers/commands/agentic/_common.py +++ b/src/diffusers/commands/agentic/_common.py @@ -352,7 +352,14 @@ def default_output_paths(task: str, num: int, explicit: Optional[str], ext: str # pin. ``--dependencies "diffusers @ git+..."`` on the local command appends # additional dependencies but does not replace this default install. DIFFUSERS_SOURCE = "diffusers @ git+https://github.com/huggingface/diffusers@diffuser-cli-for-agent" -_DEFAULT_REMOTE_DEPS = (DIFFUSERS_SOURCE, "accelerate", "transformers", "safetensors") +_DEFAULT_REMOTE_DEPS = ( + DIFFUSERS_SOURCE, + "accelerate", + "transformers", + "safetensors", + "torch==2.10.*", + "torchvision", +) # Entry point for ``uv run`` inside the container. 
``uv run`` accepts a file path, # URL, or *command*; passing the ``diffusers-cli`` console script name makes UV @@ -459,9 +466,7 @@ def maybe_submit_remote(args: Namespace, task: str) -> bool: ) if Volume is not None and not Path(args.model).exists(): mount_path = "/model" - run_uv_job_kwargs["volumes"] = [ - Volume(type="model", source=args.model, mount_path=mount_path) - ] + run_uv_job_kwargs["volumes"] = [Volume(type="model", source=args.model, mount_path=mount_path)] run_uv_job_kwargs["script_args"] = _rewrite_model_arg(forwarded, mount_path) job = run_uv_job(**run_uv_job_kwargs) From 95f33c7ddcde604ed576751cacdd3f67510e0560 Mon Sep 17 00:00:00 2001 From: DN6 Date: Wed, 3 Jun 2026 22:37:37 +0530 Subject: [PATCH 05/30] update --- src/diffusers/commands/agentic/README.md | 246 ------ src/diffusers/commands/agentic/__init__.py | 18 - src/diffusers/commands/agentic/_common.py | 587 -------------- src/diffusers/commands/agentic/app.py | 38 - src/diffusers/commands/agentic/audio.py | 133 ---- src/diffusers/commands/agentic/image.py | 170 ----- src/diffusers/commands/agentic/modular.py | 249 ------ src/diffusers/commands/agentic/tasks.py | 91 --- src/diffusers/commands/agentic/video.py | 147 ---- src/diffusers/commands/custom_blocks.py | 114 ++- src/diffusers/commands/diffusers_cli.py | 4 +- src/diffusers/commands/inference.py | 846 +++++++++++++++++++++ 12 files changed, 898 insertions(+), 1745 deletions(-) delete mode 100644 src/diffusers/commands/agentic/README.md delete mode 100644 src/diffusers/commands/agentic/__init__.py delete mode 100644 src/diffusers/commands/agentic/_common.py delete mode 100644 src/diffusers/commands/agentic/app.py delete mode 100644 src/diffusers/commands/agentic/audio.py delete mode 100644 src/diffusers/commands/agentic/image.py delete mode 100644 src/diffusers/commands/agentic/modular.py delete mode 100644 src/diffusers/commands/agentic/tasks.py delete mode 100644 src/diffusers/commands/agentic/video.py create mode 100644 
src/diffusers/commands/inference.py diff --git a/src/diffusers/commands/agentic/README.md b/src/diffusers/commands/agentic/README.md deleted file mode 100644 index 07f261692cd3..000000000000 --- a/src/diffusers/commands/agentic/README.md +++ /dev/null @@ -1,246 +0,0 @@ -# Agentic CLI for Diffusers - -Single-command access to common Diffusers use-cases. Designed for AI agents -and humans who need to run image/video/audio generation **without writing -Python scripts**. - -Every command below is reachable as `diffusers-cli `. Run -`diffusers-cli --help` for full option documentation. - -## How it works - -The module integrates with the main CLI through a single function call in -`diffusers_cli.py` — removing it disables everything with no side effects. - -``` -src/diffusers/commands/agentic/ -├── app.py # register_agentic_commands(subparsers) — single integration point -├── _common.py # Shared helpers (arg groups, pipeline detection, loading, remote, I/O) -├── image.py # text-to-image, image-to-image, inpaint -├── video.py # text-to-video, image-to-video -├── audio.py # text-to-audio -├── modular.py # generic ModularPipeline runner with free-form inputs -└── tasks.py # `tasks` — list every registered agentic command -``` - -## Discovering tasks - -```bash -diffusers-cli tasks # human-readable -diffusers-cli tasks --json # for agents -``` - -## Pipeline detection (DiffusionPipeline vs ModularPipeline) - -Every inference command auto-detects whether the `--model` is a regular -`DiffusionPipeline` repo (`model_index.json`) or a custom -`ModularPipeline` repo (`modular_model_index.json`) via a single Hub -listing — no weights are downloaded. If you point a task-shaped command at -a modular repo, it exits with a hint to use `diffusers-cli modular` -instead. The reverse is also true: `modular` rejects a regular repo and -points back at the task-shaped command. 
- -## Pushing outputs to a bucket - -Every inference command (and `modular`) accepts `--push-to ` -to upload the generated files to a Hugging Face **bucket** after they're -saved locally. The bucket is created if it doesn't exist and files land -under a prefix named after the task (e.g. `text-to-image/`). - -```bash -diffusers-cli text-to-image \ - --model stabilityai/stable-diffusion-xl-base-1.0 \ - --prompt "a watercolor of a fox" \ - --num-images 4 \ - --push-to your-username/cli-generations -``` - -The upload is a single `batch_bucket_files` round-trip regardless of how -many files were generated. The JSON payload reports `hf://buckets/...` -URIs so an agent can pipe them into a follow-up tool. - -## Running remotely (HF Jobs) and fetching outputs back - -Every inference command supports `--remote`, which submits the same call -to Hugging Face Jobs via `huggingface_hub.run_uv_job`, then by default -**waits for the job to finish and downloads the outputs back to your -local machine**. - -The flow: - -1. If `--push-to` isn't set, default it to `/jobs-artifacts` - (the canonical jobs bucket — `https://huggingface.co/buckets//jobs-artifacts`). -2. Generate a random `run_id` and pass it via `DIFFUSERS_CLI_RUN_ID` env - so the container writes its files under `/` inside the bucket. -3. Submit the job (your `HF_TOKEN` is forwarded as a secret). -4. Poll `inspect_job` every `--poll-interval` seconds until the stage is - `COMPLETED` / `CANCELED` / `ERROR` / `DELETED`. -5. List `/` in the bucket and `download_bucket_files` everything - into the local `--output` directory (default `./outputs/`). - -Pass `--no-wait` to fire-and-forget — the command prints the job id and -returns immediately; you can fetch later via `huggingface-cli buckets`. - -| Option | Description | -|--------|-------------| -| `--remote` | Run on HF Jobs instead of locally | -| `--flavor` | Hardware flavor (default `a10g-small`) | -| `--timeout` | Job timeout (e.g. 
`30m`, `2h`) | -| `--dependencies` | Extra pip deps. Repeat for multiple | -| `--namespace` | HF namespace (defaults to the current user) | -| `--no-wait` | Skip polling/download — submit and exit | -| `--poll-interval` | Seconds between job-status polls (default 5) | - -```bash -# Submit text-to-image to HF Jobs on an A100, wait, download to ./outputs/ -diffusers-cli text-to-image \ - --model stabilityai/stable-diffusion-xl-base-1.0 \ - --prompt "a watercolor of a fox in autumn leaves" \ - --num-images 4 \ - --remote --flavor a100-large --timeout 30m -``` - -```bash -# Same call, fire-and-forget -diffusers-cli text-to-image ... --remote --no-wait -``` - -## Common options - -Every inference command supports: - -| Option | Description | -|--------|-------------| -| `--model` / `-m` | Model id on the Hub or local path | -| `--device` | `cpu`, `cuda`, `cuda:0`, `mps` (defaults to best available) | -| `--dtype` | `auto`, `float16`, `bfloat16`, `float32` | -| `--variant` | Optional weight variant (e.g. `fp16`) | -| `--revision` | Model revision (branch, tag, or SHA) | -| `--token` | Hugging Face token for gated/private models | -| `--trust-remote-code` | Allow custom code from the Hub | -| `--output` / `-o` | Output file or directory | -| `--json` | Machine-readable JSON summary on stdout | -| `--seed` | Random seed for reproducibility | -| `--pipeline-kwargs` | JSON object of extra kwargs forwarded to the pipeline call | - -## Commands - -### Image - -1. Generate an image from a text prompt - ```bash - diffusers-cli text-to-image \ - --model stabilityai/stable-diffusion-xl-base-1.0 \ - --prompt "a watercolor of a fox in autumn leaves" \ - --output fox.png - ``` - -2. 
Generate with explicit sampling controls - ```bash - diffusers-cli text-to-image \ - --model stabilityai/stable-diffusion-xl-base-1.0 \ - --prompt "studio portrait of a cyberpunk hacker" \ - --negative-prompt "blurry, low quality" \ - --num-inference-steps 30 \ - --guidance-scale 7.5 \ - --height 1024 --width 1024 \ - --seed 42 - ``` - -3. Generate multiple variants at once - ```bash - diffusers-cli text-to-image \ - --model black-forest-labs/FLUX.1-schnell \ - --prompt "a still life with citrus and ceramics" \ - --num-images 4 \ - --output ./outputs/still-life/ - ``` - -4. Transform an existing image with a prompt (image-to-image) - ```bash - diffusers-cli image-to-image \ - --model stabilityai/stable-diffusion-xl-refiner-1.0 \ - --image input.jpg \ - --prompt "make it look like an oil painting" \ - --strength 0.6 \ - --output painted.png - ``` - -5. Inpaint a masked region of an image - ```bash - diffusers-cli inpaint \ - --model stabilityai/stable-diffusion-2-inpainting \ - --image photo.png \ - --mask mask.png \ - --prompt "a golden retriever sitting on the bench" \ - --output filled.png - ``` - -6. Emit JSON for downstream tooling - ```bash - diffusers-cli text-to-image \ - --model stabilityai/sdxl-turbo \ - --prompt "neon city at night" \ - --json - ``` - -### Video - -7. Generate a short clip from a text prompt - ```bash - diffusers-cli text-to-video \ - --model THUDM/CogVideoX-2b \ - --prompt "a panda surfing on a wave at sunset" \ - --num-frames 49 \ - --fps 8 \ - --output panda.mp4 - ``` - -8. Animate a single still image - ```bash - diffusers-cli image-to-video \ - --model stabilityai/stable-video-diffusion-img2vid-xt \ - --image still.png \ - --prompt "subtle camera dolly forward" \ - --num-frames 25 \ - --output animated.mp4 - ``` - -### Audio - -9. 
Generate music or a sound effect from a text prompt - ```bash - diffusers-cli text-to-audio \ - --model cvssp/audioldm2 \ - --prompt "a calm piano melody in a quiet room" \ - --audio-length-in-s 10 \ - --output music.wav - ``` - -### Modular pipelines - -Modular pipelines have an open-ended input surface defined by the block -graph, so the CLI doesn't try to predict it — pass inputs verbatim. - -14. Run a modular pipeline with free-form inputs - ```bash - diffusers-cli modular \ - --model your-username/my-modular-pipeline \ - --inputs prompt="a calm landscape" \ - --inputs num_inference_steps=25 \ - --inputs-json '{"guidance_scale": 4.5}' \ - --output-key image \ - --output out.png - ``` - -The output type is auto-detected — a PIL image (or list of PIL images) -becomes PNG(s), a sequence of frames becomes an MP4, a numpy audio array -becomes a WAV, and anything else is JSON-serialized. - -### Roadmap - -Open an issue if you'd like to help land one: - -- **Video**: `video-to-video` -- **Conditioning**: ControlNet, T2I-Adapter, instruction editing (Flux-Kontext, InstructPix2Pix) -- **Quantization / export**: `convert` (fp16/safetensors/GGUF), `quantize` (bitsandbytes, torchao) diff --git a/src/diffusers/commands/agentic/__init__.py b/src/diffusers/commands/agentic/__init__.py deleted file mode 100644 index 9076bc7ee4c9..000000000000 --- a/src/diffusers/commands/agentic/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from .app import register_agentic_commands - - -__all__ = ["register_agentic_commands"] diff --git a/src/diffusers/commands/agentic/_common.py b/src/diffusers/commands/agentic/_common.py deleted file mode 100644 index bd0cb1fd0de4..000000000000 --- a/src/diffusers/commands/agentic/_common.py +++ /dev/null @@ -1,587 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Shared helpers for the agentic CLI surface. - -These utilities are intentionally small and dependency-light. Each diffusers -agentic subcommand should be able to be read end-to-end by an agent without -needing to follow many layers of indirection. 
-""" - -from __future__ import annotations - -import json -import os -import sys -from argparse import ArgumentParser, Namespace -from pathlib import Path -from typing import Any, Optional - - -DEFAULT_OUTPUT_DIR = "outputs" - - -DTYPE_CHOICES = ("auto", "float16", "fp16", "bfloat16", "bf16", "float32", "fp32") -CPU_OFFLOAD_CHOICES = ("model", "group") -ATTENTION_BACKEND_CHOICES = ( - "default", - "flash_hub", - "flash_varlen_hub", - "flash_4_hub", - "sage_hub", -) - - -def add_loading_arguments(parser: ArgumentParser) -> None: - """Arguments shared by every inference subcommand.""" - parser.add_argument("--model", "-m", required=True, help="Model id on the Hugging Face Hub or local path.") - parser.add_argument("--device", default=None, help="Device to run on (e.g. cpu, cuda, cuda:0, mps).") - parser.add_argument( - "--dtype", - default="auto", - choices=DTYPE_CHOICES, - help="Torch dtype for pipeline weights.", - ) - parser.add_argument("--variant", default=None, help='Optional weight variant (e.g. "fp16").') - parser.add_argument("--revision", default=None, help="Model revision (branch, tag, or commit SHA).") - parser.add_argument("--token", default=None, help="Hugging Face token for gated/private models.") - parser.add_argument("--trust-remote-code", action="store_true", help="Allow custom code from the Hub.") - - -def add_optimization_arguments(parser: ArgumentParser) -> None: - """Optional pipeline-optimization flags. All default to off.""" - parser.add_argument( - "--cpu-offload", - choices=CPU_OFFLOAD_CHOICES, - default=None, - help=( - "Offload pipeline components to CPU during inference. " - "'model' uses enable_model_cpu_offload, " - "'group' uses pipeline.enable_group_offload(leaf_level, use_stream=True)." - ), - ) - parser.add_argument( - "--attention-backend", - choices=ATTENTION_BACKEND_CHOICES, - default="default", - help=( - "Override the attention backend on the transformer/UNet. 
" - "Only Hub-hosted kernels are exposed — they auto-download on first " - "use and avoid a local install. 'default' leaves the backend untouched." - ), - ) - parser.add_argument("--vae-tiling", action="store_true", help="Enable VAE tiling (lower peak VRAM).") - parser.add_argument("--vae-slicing", action="store_true", help="Enable VAE slicing (lower peak VRAM).") - parser.add_argument( - "--context-parallel", - action="store_true", - help=( - "Enable Ulysses-style context parallelism (ulysses_anything mode, supports arbitrary " - "sequence lengths). Requires launching the CLI under torchrun with ≥2 GPUs." - ), - ) - - -def add_generation_arguments(parser: ArgumentParser) -> None: - """Arguments shared by image/video generation subcommands.""" - parser.add_argument("--prompt", "-p", default=None, help="Text prompt.") - parser.add_argument("--negative-prompt", default=None, help="Negative text prompt.") - parser.add_argument("--num-inference-steps", type=int, default=None, help="Number of denoising steps.") - parser.add_argument("--guidance-scale", type=float, default=None, help="Classifier-free guidance scale.") - parser.add_argument("--height", type=int, default=None, help="Output height in pixels.") - parser.add_argument("--width", type=int, default=None, help="Output width in pixels.") - parser.add_argument("--num-images", type=int, default=1, help="Number of images to generate.") - parser.add_argument("--seed", type=int, default=None, help="Random seed for reproducibility.") - parser.add_argument( - "--pipeline-kwargs", - default=None, - help="JSON object of extra kwargs forwarded to the pipeline call.", - ) - - -def add_output_arguments(parser: ArgumentParser) -> None: - """Output formatting arguments.""" - parser.add_argument( - "--output", - "-o", - default=None, - help="Output file or directory. 
Defaults to ./outputs/-..", - ) - parser.add_argument( - "--push-to", - default=None, - help=( - "Upload the generated files to this HF bucket id after saving " - "(created if missing). When --remote is set, defaults to " - "/jobs-artifacts; remote runs always write to that bucket " - "and fetch the results back locally." - ), - ) - parser.add_argument("--json", action="store_true", help="Emit a machine-readable JSON summary on stdout.") - - -def add_remote_arguments(parser: ArgumentParser) -> None: - """Optional HF Jobs arguments — works on every inference subcommand.""" - parser.add_argument( - "--remote", - action="store_true", - help="Submit this command to Hugging Face Jobs instead of running locally.", - ) - parser.add_argument( - "--flavor", - default="a10g-small", - help="HF Jobs hardware flavor for --remote (e.g. a10g-small, a100-large, cpu-basic).", - ) - parser.add_argument( - "--timeout", - default=None, - help="HF Jobs timeout for --remote (e.g. 30m, 2h).", - ) - parser.add_argument( - "--dependencies", - action="append", - default=None, - help="Extra pip dependencies for the --remote job. Repeat to add multiple.", - ) - parser.add_argument( - "--namespace", - default=None, - help="HF namespace to run the --remote job under (defaults to the current user).", - ) - parser.add_argument( - "--no-wait", - action="store_true", - help=( - "Don't wait for the --remote job to finish — submit and print the job id. " - "Default behaviour is to poll until completion and download outputs locally." - ), - ) - parser.add_argument( - "--poll-interval", - type=float, - default=5.0, - help="Seconds between job-status polls when waiting for --remote completion.", - ) - - -def resolve_dtype(name: Optional[str]): - """Map a CLI dtype string to a torch dtype. - - Returns ``"auto"`` when the user wants diffusers to pick. 
- """ - if name in (None, "auto"): - return "auto" - - import torch - - mapping = { - "fp32": torch.float32, - "fp16": torch.float16, - "bf16": torch.bfloat16, - } - if name not in mapping: - raise ValueError(f"Unknown dtype: {name}") - return mapping[name] - - -def resolve_device(name: Optional[str]) -> str: - """Pick a device, defaulting to the best available one.""" - if name: - return name - import torch - - if torch.cuda.is_available(): - return "cuda" - - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - return "mps" - - return "cpu" - - -def load_pipeline(args: Namespace, pipeline_cls_name: str) -> Any: - """Load a diffusers pipeline class by name and move it to the chosen device. - - ``pipeline_cls_name`` can be any class exported from ``diffusers`` — - typically one of ``AutoPipelineForText2Image``, ``AutoPipelineForImage2Image``, - ``AutoPipelineForInpainting``, or ``DiffusionPipeline`` for video/audio. - """ - import diffusers - - pipeline_cls = getattr(diffusers, pipeline_cls_name) - from_pretrained_kwargs: dict[str, Any] = { - "torch_dtype": resolve_dtype(args.dtype), - "trust_remote_code": args.trust_remote_code, - } - if args.variant: - from_pretrained_kwargs["variant"] = args.variant - if args.revision: - from_pretrained_kwargs["revision"] = args.revision - if args.token: - from_pretrained_kwargs["token"] = args.token - - pipeline = pipeline_cls.from_pretrained(args.model, **from_pretrained_kwargs) - pipeline = map_to_device(pipeline, args, resolve_device(args.device)) - if args.vae_tiling and hasattr(pipeline, "enable_vae_tiling"): - pipeline.enable_vae_tiling() - if args.vae_slicing and hasattr(pipeline, "enable_vae_slicing"): - pipeline.enable_vae_slicing() - if args.attention_backend != "default": - _set_attention_backend(pipeline, args.attention_backend) - if args.context_parallel: - _enable_context_parallel(pipeline) - return pipeline - - -def map_to_device(pipeline: Any, args: Namespace, device: str) -> Any: - """Get 
the pipeline ready to run on ``device``. - - Calls ``.to(device)`` by default; when ``--cpu-offload`` is set the chosen - offload helper (``model``, ``sequential``, or ``group``) handles placement instead. - """ - if args.cpu_offload is None: - return pipeline.to(device) - if args.cpu_offload == "model": - pipeline.enable_model_cpu_offload(device=device) - elif args.cpu_offload == "group": - import torch - - pipeline.enable_group_offload( - onload_device=torch.device(device), - offload_type="leaf_level", - use_stream=device.startswith("cuda"), - ) - return pipeline - - -def _enable_context_parallel(pipeline: Any) -> None: - """Enable Ulysses-style context-parallel inference on the transformer/UNet.""" - import torch - - if not torch.distributed.is_available() or not torch.distributed.is_initialized(): - raise SystemExit( - "--context-parallel requires torch.distributed to be initialized. " - "Launch the CLI under torchrun, e.g.: " - "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli ...`." 
- ) - - from diffusers import ContextParallelConfig - - cfg = ContextParallelConfig( - ulysses_degree=torch.distributed.get_world_size(), - ring_degree=1, - ulysses_anything=True, - ) - for attr in ("transformer", "unet"): - module = getattr(pipeline, attr, None) - if module is not None and hasattr(module, "enable_parallelism"): - module.enable_parallelism(config=cfg) - return - - -def _set_attention_backend(pipeline: Any, backend: str) -> None: - for attr in ("transformer", "unet"): - module = getattr(pipeline, attr, None) - if module is not None and hasattr(module, "set_attention_backend"): - try: - module.set_attention_backend(backend) - except (ValueError, ImportError, RuntimeError): - pass - return - - -def get_generator(seed: Optional[int], device: str): - if seed is None: - return None - import torch - - generator_device = "cpu" if device == "mps" else device - return torch.Generator(device=generator_device).manual_seed(seed) - - -def parse_pipeline_kwargs(raw: Optional[str]) -> dict[str, Any]: - if not raw: - return {} - try: - parsed = json.loads(raw) - except json.JSONDecodeError as e: - raise SystemExit(f"--pipeline-kwargs must be valid JSON: {e}") from e - if not isinstance(parsed, dict): - raise SystemExit("--pipeline-kwargs must decode to a JSON object.") - return parsed - - -def default_output_paths(task: str, num: int, explicit: Optional[str], ext: str = "png") -> list[Path]: - """Resolve output file paths for ``num`` generated artifacts. - - - If ``explicit`` is a directory (or ends with /), write into it. - - If ``explicit`` is a file and ``num == 1``, write to that file. - - If ``explicit`` is a file template and ``num > 1``, append ``-`` before the suffix. - - Otherwise default to ``./outputs/-.``. 
- """ - if explicit is None: - base = Path(DEFAULT_OUTPUT_DIR) - base.mkdir(parents=True, exist_ok=True) - return [base / f"{task}-{i}.{ext}" for i in range(num)] - - p = Path(explicit) - if explicit.endswith(os.sep) or p.is_dir(): - p.mkdir(parents=True, exist_ok=True) - return [p / f"{task}-{i}.{ext}" for i in range(num)] - - p.parent.mkdir(parents=True, exist_ok=True) - if num == 1: - return [p] - stem, suffix = p.stem, p.suffix or f".{ext}" - return [p.with_name(f"{stem}-{i}{suffix}") for i in range(num)] - - -# Source for the diffusers install used by --remote jobs. While iterating on a -# feature branch, point at the branch URL; once merged, switch back to a release -# pin. ``--dependencies "diffusers @ git+..."`` on the local command appends -# additional dependencies but does not replace this default install. -DIFFUSERS_SOURCE = "diffusers @ git+https://github.com/huggingface/diffusers@diffuser-cli-for-agent" -_DEFAULT_REMOTE_DEPS = ( - DIFFUSERS_SOURCE, - "accelerate", - "transformers", - "safetensors", - "torch==2.10.*", - "torchvision", -) - -# Entry point for ``uv run`` inside the container. ``uv run`` accepts a file path, -# URL, or *command*; passing the ``diffusers-cli`` console script name makes UV -# install the deps above (which register the entry point) and then exec the CLI. -_UV_RUNNER_SCRIPT = "diffusers-cli" - - -RUN_ID_ENV = "DIFFUSERS_CLI_RUN_ID" - -# Namespace keys that control *how* a remote job runs locally, not what runs -# inside the container. They are stripped when forwarding argv to the container. 
-HF_JOBS_KEYS = frozenset( - {"remote", "flavor", "timeout", "dependencies", "namespace", "no_wait", "poll_interval", "func"} -) - - -def _rewrite_model_arg(forwarded: list[str], new_path: str) -> list[str]: - """Return a copy of ``forwarded`` with the ``--model`` value replaced by ``new_path``.""" - out = list(forwarded) - for i, token in enumerate(out): - if token in ("--model", "-m") and i + 1 < len(out): - out[i + 1] = new_path - return out - return out - - -def _forward_args(args: Namespace, task: str) -> list[str]: - """Reconstruct argv for the remote container from a parsed Namespace. - - Skips the local-only job-control keys above. Boolean flags are emitted - only when True. List values become repeated ``--flag value`` pairs. - """ - out: list[str] = [task] - for key, value in vars(args).items(): - if key in HF_JOBS_KEYS: - continue - if value is None or value is False: - continue - flag = "--" + key.replace("_", "-") - if value is True: - out.append(flag) - elif isinstance(value, list): - for item in value: - out.extend([flag, str(item)]) - else: - out.extend([flag, str(value)]) - return out - - -def maybe_submit_remote(args: Namespace, task: str) -> bool: - """If ``--remote`` was set, submit this invocation to HF Jobs and return True. - - The local ``run()`` should bail immediately when this returns True. - - Auto-defaults ``--push-to`` to ``/jobs-artifacts`` so the remote - container has somewhere to write before tear-down. By default, polls - the job until completion and downloads the artifacts back to the local - output directory; pass ``--no-wait`` to fire-and-forget. 
- """ - if not args.remote: - return False - - import uuid - - from huggingface_hub import HfApi, get_token, run_uv_job - - try: - from huggingface_hub import Volume - except ImportError: - Volume = None - - hf_token = args.token or get_token() - api = HfApi(token=hf_token) - - if not args.push_to: - args.push_to = f"{api.whoami()['name']}/jobs-artifacts" - - run_id = uuid.uuid4().hex[:12] - - forwarded = _forward_args(args, task) - dependencies = list(_DEFAULT_REMOTE_DEPS) - if args.dependencies: - dependencies.extend(args.dependencies) - - secrets = {"HF_TOKEN": hf_token} if hf_token else None - env = { - RUN_ID_ENV: run_id, - "HF_ENABLE_PARALLEL_LOADING": "1", # thread-pool the safetensors load step - } - - # Mount the model repo into the job's filesystem so the container reads it - # from local disk instead of downloading on every run. Requires - # huggingface_hub >= 1.16. Falls back to the download path otherwise. - run_uv_job_kwargs: dict[str, Any] = dict( - script=_UV_RUNNER_SCRIPT, - script_args=forwarded, - dependencies=dependencies, - flavor=args.flavor, - timeout=args.timeout, - namespace=args.namespace, - secrets=secrets, - env=env, - token=hf_token, - ) - if Volume is not None and not Path(args.model).exists(): - mount_path = "/model" - run_uv_job_kwargs["volumes"] = [Volume(type="model", source=args.model, mount_path=mount_path)] - run_uv_job_kwargs["script_args"] = _rewrite_model_arg(forwarded, mount_path) - - job = run_uv_job(**run_uv_job_kwargs) - - payload: dict[str, Any] = { - "task": "remote-submit", - "job_id": getattr(job, "id", None), - "job_status": str(getattr(job, "status", "")), - "flavor": args.flavor, - "push_to": args.push_to, - "run_id": run_id, - } - - if args.no_wait: - format_result(args, payload) - return True - - print( - f"[diffusers-cli] submitted job {job.id} (run_id={run_id}); " - f"watch at {getattr(job, 'url', 'https://huggingface.co/jobs')}", - file=sys.stderr, - flush=True, - ) - final_status = _wait_for_job(api, job.id, 
args.namespace, args.poll_interval) - payload["job_status"] = final_status - payload["outputs"] = _download_job_artifacts(api, args.push_to, run_id, args.output) - format_result(args, payload) - return True - - -def _wait_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval: float) -> str: - """Poll ``inspect_job`` until the job reaches a terminal stage; return that stage as a string. - - Prints a heartbeat each poll and a labelled line on every stage transition so - the local terminal isn't silent for the multi-minute install/download/run - window of a remote inference job. - """ - import time - - terminal = {"COMPLETED", "CANCELED", "ERROR", "DELETED"} - last_stage: Optional[str] = None - while True: - info = api.inspect_job(job_id=job_id, namespace=namespace) - stage = str(info.status.stage) if info.status else "UNKNOWN" - if stage != last_stage: - if last_stage is not None: - print("", file=sys.stderr, flush=True) - print(f"[diffusers-cli] job {job_id}: {stage}", file=sys.stderr, flush=True) - last_stage = stage - else: - print(".", end="", file=sys.stderr, flush=True) - if stage in terminal: - print("", file=sys.stderr, flush=True) - return stage - time.sleep(poll_interval) - - -def _download_job_artifacts(api: Any, bucket_id: str, run_id: str, output: Optional[str]) -> list[str]: - """Download every file under ``/`` from ``bucket_id`` to a local directory. - - ``output`` is always treated as a directory (created if missing) — remote - runs produce many files, so a file-path target wouldn't make sense. 
- """ - from huggingface_hub import BucketFile - - local_dir = Path(output) if output else Path(DEFAULT_OUTPUT_DIR) - local_dir.mkdir(parents=True, exist_ok=True) - - pairs: list[tuple[Any, Path]] = [] - for entry in api.list_bucket_tree(bucket_id, prefix=f"{run_id}/", recursive=True): - if not isinstance(entry, BucketFile): - continue - pairs.append((entry, local_dir / Path(entry.path).name)) - - if not pairs: - return [] - api.download_bucket_files(bucket_id, files=pairs) - return [str(local) for _, local in pairs] - - -def push_outputs(args: Namespace, saved_paths: list[str], task: str) -> Optional[dict[str, Any]]: - """Upload ``saved_paths`` to the ``--push-to`` bucket, returning a summary. - - Returns None when ``--push-to`` is unset. Creates the bucket if needed. - When ``DIFFUSERS_CLI_RUN_ID`` is set (i.e. we're inside a remote job), - files land under ``/`` so the local side can isolate this run's - output; otherwise they land under ``/``. - """ - if not args.push_to: - return None - target = args.push_to - - from huggingface_hub import HfApi - - api = HfApi(token=args.token) - api.create_bucket(target, exist_ok=True) - - prefix = os.environ.get(RUN_ID_ENV) or task - add = [(local, f"{prefix}/{Path(local).name}") for local in saved_paths] - api.batch_bucket_files(target, add=add) - - uploaded = [f"hf://buckets/{target}/{dest}" for _, dest in add] - return {"bucket_id": target, "uploaded": uploaded} - - -def format_result(args: Namespace, payload: dict[str, Any]) -> None: - """Print either a human-friendly summary or JSON, depending on --json.""" - if args.json: - json.dump(payload, sys.stdout, default=str) - sys.stdout.write("\n") - return - - outputs = payload.get("outputs", []) - if outputs: - for path in outputs: - print(path) - else: - print(payload) diff --git a/src/diffusers/commands/agentic/app.py b/src/diffusers/commands/agentic/app.py deleted file mode 100644 index 3ca7b50ae1ed..000000000000 --- a/src/diffusers/commands/agentic/app.py +++ /dev/null 
@@ -1,38 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Single integration point for the agentic CLI. - -Removing the call to ``register_agentic_commands`` from -``diffusers_cli.py`` disables the entire surface with no side effects. -""" - -from __future__ import annotations - -from argparse import _SubParsersAction - -from . import audio as audio_commands -from . import image as image_commands -from . import modular as modular_commands -from . import tasks as tasks_commands -from . import video as video_commands - - -def register_agentic_commands(subparsers: _SubParsersAction) -> None: - """Register every agentic subcommand on the top-level ``diffusers-cli`` parser.""" - image_commands.register(subparsers) - video_commands.register(subparsers) - audio_commands.register(subparsers) - modular_commands.register(subparsers) - tasks_commands.register(subparsers) diff --git a/src/diffusers/commands/agentic/audio.py b/src/diffusers/commands/agentic/audio.py deleted file mode 100644 index 42c2fd0da210..000000000000 --- a/src/diffusers/commands/agentic/audio.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Audio-generation subcommands: text-to-audio.""" - -from __future__ import annotations - -from argparse import ArgumentParser, Namespace, _SubParsersAction - -from .. import BaseDiffusersCLICommand -from . import _common - - -def register(subparsers: _SubParsersAction) -> None: - Text2AudioCommand.register_subcommand(subparsers) - - -def _save_audio(audios, sampling_rate: int, args: Namespace, task: str) -> list[str]: - """Save one or more audio arrays as WAV files.""" - import numpy as np - from scipy.io.wavfile import write as wavfile_write - - paths = _common.default_output_paths(task, len(audios), args.output, ext="wav") - saved: list[str] = [] - for audio, path in zip(audios, paths): - data = np.asarray(audio) - if data.dtype.kind == "f": - data = np.clip(data, -1.0, 1.0) - data = (data * 32767).astype(np.int16) - if data.ndim > 1 and data.shape[0] < data.shape[-1]: - # ``(channels, samples)`` → ``(samples, channels)`` for scipy. 
- data = data.T - wavfile_write(str(path), sampling_rate, data) - saved.append(str(path)) - return saved - - -class Text2AudioCommand(BaseDiffusersCLICommand): - task = "text-to-audio" - - @staticmethod - def register_subcommand(subparsers: _SubParsersAction) -> None: - parser: ArgumentParser = subparsers.add_parser( - "text-to-audio", - help="Generate an audio clip (music or sound) from a text prompt.", - ) - _common.add_loading_arguments(parser) - _common.add_optimization_arguments(parser) - _common.add_generation_arguments(parser) - _common.add_remote_arguments(parser) - parser.add_argument( - "--audio-length-in-s", - type=float, - default=None, - help="Duration of the generated audio in seconds.", - ) - parser.add_argument( - "--sampling-rate", - type=int, - default=None, - help="Override the sampling rate written to the WAV file.", - ) - _common.add_output_arguments(parser) - parser.set_defaults(func=Text2AudioCommand) - - def __init__(self, args: Namespace): - self.args = args - - def run(self) -> None: - if _common.maybe_submit_remote(self.args, self.task): - return - pipeline = _common.load_pipeline(self.args, "DiffusionPipeline") - - call_kwargs: dict = {} - if self.args.prompt is not None: - call_kwargs["prompt"] = self.args.prompt - if self.args.negative_prompt is not None: - call_kwargs["negative_prompt"] = self.args.negative_prompt - if self.args.num_inference_steps is not None: - call_kwargs["num_inference_steps"] = self.args.num_inference_steps - if self.args.guidance_scale is not None: - call_kwargs["guidance_scale"] = self.args.guidance_scale - if self.args.audio_length_in_s is not None: - call_kwargs["audio_length_in_s"] = self.args.audio_length_in_s - if self.args.num_images != 1: - call_kwargs["num_waveforms_per_prompt"] = self.args.num_images - - generator = _common.get_generator(self.args.seed, pipeline.device.type) - if generator is not None: - call_kwargs["generator"] = generator - - 
call_kwargs.update(_common.parse_pipeline_kwargs(self.args.pipeline_kwargs)) - - result = pipeline(**call_kwargs) - audios = getattr(result, "audios", None) - if audios is None: - audios = result[0] - - sampling_rate = self.args.sampling_rate - if sampling_rate is None: - pipeline_sr = getattr(pipeline, "sampling_rate", None) - if isinstance(pipeline_sr, int): - sampling_rate = pipeline_sr - else: - vocoder_config = getattr(getattr(pipeline, "vocoder", None), "config", None) - sampling_rate = getattr(vocoder_config, "sampling_rate", 16000) if vocoder_config else 16000 - - saved = _save_audio(audios, sampling_rate, self.args, self.task) - pushed = _common.push_outputs(self.args, saved, self.task) - - _common.format_result( - self.args, - { - "task": self.task, - "model": self.args.model, - "device": pipeline.device.type, - "outputs": saved, - "pushed": pushed, - "sampling_rate": sampling_rate, - "seed": self.args.seed, - }, - ) diff --git a/src/diffusers/commands/agentic/image.py b/src/diffusers/commands/agentic/image.py deleted file mode 100644 index 94fdd81d6953..000000000000 --- a/src/diffusers/commands/agentic/image.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Image-generation subcommands: text-to-image, image-to-image, inpaint.""" - -from __future__ import annotations - -from argparse import ArgumentParser, Namespace, _SubParsersAction - -from diffusers.utils import load_image - -from .. import BaseDiffusersCLICommand -from . import _common - - -def register(subparsers: _SubParsersAction) -> None: - Text2ImageCommand.register_subcommand(subparsers) - Image2ImageCommand.register_subcommand(subparsers) - InpaintCommand.register_subcommand(subparsers) - - -def _build_call_kwargs(args: Namespace, pipeline) -> dict: - kwargs: dict = {} - if args.prompt is not None: - kwargs["prompt"] = args.prompt - if args.negative_prompt is not None: - kwargs["negative_prompt"] = args.negative_prompt - if args.num_inference_steps is not None: - kwargs["num_inference_steps"] = args.num_inference_steps - if args.guidance_scale is not None: - kwargs["guidance_scale"] = args.guidance_scale - if args.height is not None: - kwargs["height"] = args.height - if args.width is not None: - kwargs["width"] = args.width - if args.num_images != 1: - kwargs["num_images_per_prompt"] = args.num_images - - generator = _common.get_generator(args.seed, pipeline.device.type) - if generator is not None: - kwargs["generator"] = generator - - kwargs.update(_common.parse_pipeline_kwargs(args.pipeline_kwargs)) - return kwargs - - -def _save_images(images, task: str, args: Namespace) -> list[str]: - paths = _common.default_output_paths(task, len(images), args.output, ext="png") - saved: list[str] = [] - for image, path in zip(images, paths): - image.save(path) - saved.append(str(path)) - return saved - - -class _BaseImageCommand(BaseDiffusersCLICommand): - task: str = "" - auto_cls: str = "" - - def __init__(self, args: Namespace): - self.args = args - - def run(self) -> None: - if _common.maybe_submit_remote(self.args, self.task): - return - - pipeline = _common.load_pipeline(self.args, self.auto_cls) - call_kwargs = _build_call_kwargs(self.args, pipeline) - 
self._attach_inputs(call_kwargs) - - result = pipeline(**call_kwargs) - saved = _save_images(result.images, self.task, self.args) - pushed = _common.push_outputs(self.args, saved, self.task) - - _common.format_result( - self.args, - { - "task": self.task, - "model": self.args.model, - "device": pipeline.device.type, - "outputs": saved, - "pushed": pushed, - "seed": self.args.seed, - }, - ) - - def _attach_inputs(self, call_kwargs: dict) -> None: # noqa: B027 - """Hook for subclasses to attach image/mask conditioning.""" - - -class Text2ImageCommand(_BaseImageCommand): - task = "text-to-image" - auto_cls = "AutoPipelineForText2Image" - - @staticmethod - def register_subcommand(subparsers: _SubParsersAction) -> None: - parser: ArgumentParser = subparsers.add_parser( - "text-to-image", - help="Generate an image from a text prompt.", - ) - _common.add_loading_arguments(parser) - _common.add_optimization_arguments(parser) - _common.add_generation_arguments(parser) - _common.add_remote_arguments(parser) - _common.add_output_arguments(parser) - parser.set_defaults(func=Text2ImageCommand) - - -class Image2ImageCommand(_BaseImageCommand): - task = "image-to-image" - auto_cls = "AutoPipelineForImage2Image" - - @staticmethod - def register_subcommand(subparsers: _SubParsersAction) -> None: - parser: ArgumentParser = subparsers.add_parser( - "image-to-image", - help="Transform an input image conditioned on a text prompt.", - ) - _common.add_loading_arguments(parser) - _common.add_optimization_arguments(parser) - _common.add_generation_arguments(parser) - _common.add_remote_arguments(parser) - _common.add_output_arguments(parser) - - parser.add_argument("--image", required=True, help="Path or URL to the conditioning image.") - parser.add_argument("--strength", type=float, default=None, help="How much to transform the input (0-1).") - parser.set_defaults(func=Image2ImageCommand) - - def _attach_inputs(self, call_kwargs: dict) -> None: - call_kwargs["image"] = 
load_image(self.args.image) - if self.args.strength is not None: - call_kwargs["strength"] = self.args.strength - - -class InpaintCommand(_BaseImageCommand): - task = "inpaint" - auto_cls = "AutoPipelineForInpainting" - - @staticmethod - def register_subcommand(subparsers: _SubParsersAction) -> None: - parser: ArgumentParser = subparsers.add_parser( - "inpaint", - help="Inpaint a region of an image defined by a mask.", - ) - _common.add_loading_arguments(parser) - _common.add_optimization_arguments(parser) - _common.add_generation_arguments(parser) - _common.add_remote_arguments(parser) - _common.add_output_arguments(parser) - parser.add_argument("--image", required=True, help="Path or URL to the base image.") - parser.add_argument("--mask", required=True, help="Path or URL to the mask image (white=inpaint).") - parser.add_argument("--strength", type=float, default=None, help="Strength of the inpainting transform (0-1).") - parser.set_defaults(func=InpaintCommand) - - def _attach_inputs(self, call_kwargs: dict) -> None: - call_kwargs["image"] = load_image(self.args.image) - call_kwargs["mask_image"] = load_image(self.args.mask) - if self.args.strength is not None: - call_kwargs["strength"] = self.args.strength diff --git a/src/diffusers/commands/agentic/modular.py b/src/diffusers/commands/agentic/modular.py deleted file mode 100644 index 304c8b17329f..000000000000 --- a/src/diffusers/commands/agentic/modular.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""``diffusers-cli modular`` — run a custom ModularPipeline. - -Modular pipelines don't fit the ``task -> AutoPipelineFor*`` taxonomy: the -pipeline blocks themselves define the surface. This command takes free-form -``--inputs key=value`` (or a JSON blob) and forwards them to the modular -pipeline call, then auto-detects the result type so the agent doesn't need -to know whether it asked for an image, video, or audio output. -""" - -from __future__ import annotations - -import json -from argparse import ArgumentParser, Namespace, _SubParsersAction -from pathlib import Path -from typing import Any - -from .. import BaseDiffusersCLICommand -from . import _common - - -def register(subparsers: _SubParsersAction) -> None: - ModularCommand.register_subcommand(subparsers) - - -def _parse_inputs(args: Namespace) -> dict[str, Any]: - """Combine ``--inputs-json`` and repeated ``--inputs key=value`` into one dict. - - Values from ``--inputs`` are JSON-decoded when possible (so booleans, - numbers, lists, and nested objects survive); plain strings fall back to - raw text. 
- """ - out: dict[str, Any] = {} - if args.inputs_json: - try: - decoded = json.loads(args.inputs_json) - except json.JSONDecodeError as e: - raise SystemExit(f"--inputs-json must be valid JSON: {e}") from e - if not isinstance(decoded, dict): - raise SystemExit("--inputs-json must decode to a JSON object.") - out.update(decoded) - - for pair in args.inputs or []: - if "=" not in pair: - raise SystemExit(f"--inputs entries must look like key=value, got {pair!r}.") - key, _, raw = pair.partition("=") - try: - out[key] = json.loads(raw) - except json.JSONDecodeError: - out[key] = raw - return out - - -def _save_auto(value: Any, args: Namespace, task: str) -> list[str]: - """Save ``value`` based on its runtime type and return the written paths.""" - pil_images = _as_pil_list(value) - if pil_images is not None: - paths = _common.default_output_paths(task, len(pil_images), args.output, ext="png") - for img, path in zip(pil_images, paths): - img.save(path) - return [str(p) for p in paths] - - frames = _as_frame_sequence(value) - if frames is not None: - from diffusers.utils import export_to_video - - path = _common.default_output_paths(task, 1, args.output, ext="mp4")[0] - export_to_video(frames, str(path), fps=args.fps) - return [str(path)] - - audios = _as_audio_arrays(value) - if audios is not None: - from .audio import _save_audio - - return _save_audio(audios, args.sampling_rate or 16000, args, task) - - # Fallback: dump as JSON. 
- path = _common.default_output_paths(task, 1, args.output, ext="json")[0] - Path(path).write_text(json.dumps(value, default=str, indent=2)) - return [str(path)] - - -def _as_pil_list(value: Any): - try: - from PIL.Image import Image as PILImage - except ImportError: - return None - if isinstance(value, PILImage): - return [value] - if isinstance(value, (list, tuple)) and value and all(isinstance(v, PILImage) for v in value): - return list(value) - return None - - -def _as_frame_sequence(value: Any): - """A frame sequence is a list of PIL images or numpy frames meant to be a single clip.""" - try: - from PIL.Image import Image as PILImage - except ImportError: - PILImage = None # type: ignore[assignment] - - if isinstance(value, (list, tuple)) and len(value) >= 2: - first = value[0] - if PILImage is not None and isinstance(first, PILImage): - # Heuristic: distinguish "list of images we want as PNGs" from "frame sequence". - # The modular pipeline call already returned a single value, so we treat a - # homogeneous list of >=2 images as a clip. 
- return list(value) - try: - import numpy as np - - if isinstance(first, np.ndarray): - return list(value) - except ImportError: - pass - return None - - -def _as_audio_arrays(value: Any): - try: - import numpy as np - except ImportError: - return None - if isinstance(value, np.ndarray) and value.ndim <= 2: - return [value] - if ( - isinstance(value, (list, tuple)) - and value - and all(isinstance(v, np.ndarray) for v in value) - ): - return list(value) - return None - - -class ModularCommand(BaseDiffusersCLICommand): - task = "modular" - - @staticmethod - def register_subcommand(subparsers: _SubParsersAction) -> None: - parser: ArgumentParser = subparsers.add_parser( - "modular", - help="Run a custom ModularPipeline with free-form inputs.", - ) - _common.add_loading_arguments(parser) - _common.add_optimization_arguments(parser) - parser.add_argument( - "--inputs", - action="append", - default=None, - help='Inputs as key=value (value JSON-decoded when possible). Repeat to add multiple.', - ) - parser.add_argument( - "--inputs-json", - default=None, - help="Inputs as a single JSON object (merged with any --inputs entries).", - ) - parser.add_argument( - "--output-key", - default=None, - help='Optional intermediate to extract (e.g. "image", "video"). 
' - "Forwarded to ModularPipeline as the ``output`` argument.", - ) - parser.add_argument( - "--fps", - type=int, - default=8, - help="FPS used when the output happens to be a frame sequence.", - ) - parser.add_argument( - "--sampling-rate", - type=int, - default=None, - help="Sample rate used when the output happens to be an audio array.", - ) - _common.add_remote_arguments(parser) - _common.add_output_arguments(parser) - parser.set_defaults(func=ModularCommand) - - def __init__(self, args: Namespace): - self.args = args - - def run(self) -> None: - if _common.maybe_submit_remote(self.args, self.task): - return - - pipeline = self._load_modular() - call_kwargs = _parse_inputs(self.args) - if self.args.output_key is not None: - call_kwargs["output"] = self.args.output_key - - result = pipeline(**call_kwargs) - saved = _save_auto(result, self.args, self.task) - pushed = _common.push_outputs(self.args, saved, self.task) - - _common.format_result( - self.args, - { - "task": self.task, - "model": self.args.model, - "pipeline_class": type(pipeline).__name__, - "outputs": saved, - "pushed": pushed, - "output_key": self.args.output_key, - }, - ) - - def _load_modular(self): - from diffusers import ModularPipeline - - dtype = _common.resolve_dtype(self.args.dtype) - device = _common.resolve_device(self.args.device) - - from_pretrained_kwargs: dict[str, Any] = { - "trust_remote_code": self.args.trust_remote_code, - } - if dtype != "auto": - from_pretrained_kwargs["torch_dtype"] = dtype - if self.args.revision: - from_pretrained_kwargs["revision"] = self.args.revision - if self.args.token: - from_pretrained_kwargs["token"] = self.args.token - - pipeline = ModularPipeline.from_pretrained(self.args.model, **from_pretrained_kwargs) - if not hasattr(pipeline, "to"): - return pipeline - - pipeline = _common.map_to_device(pipeline, self.args, device) - if self.args.vae_tiling and hasattr(pipeline, "enable_vae_tiling"): - pipeline.enable_vae_tiling() - if self.args.vae_slicing and 
hasattr(pipeline, "enable_vae_slicing"): - pipeline.enable_vae_slicing() - if self.args.attention_backend != "default": - _common._set_attention_backend(pipeline, self.args.attention_backend) - if self.args.context_parallel: - _common._enable_context_parallel(pipeline) - return pipeline diff --git a/src/diffusers/commands/agentic/tasks.py b/src/diffusers/commands/agentic/tasks.py deleted file mode 100644 index be2999469783..000000000000 --- a/src/diffusers/commands/agentic/tasks.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""``diffusers-cli tasks`` — list every registered agentic subcommand. - -Designed so an agent can discover the surface area without parsing -``--help`` output. -""" - -from __future__ import annotations - -import json -import sys -from argparse import ArgumentParser, Namespace, _SubParsersAction - -from .. import BaseDiffusersCLICommand - - -AGENTIC_TASK_NAMES: tuple[str, ...] = ( - "text-to-image", - "image-to-image", - "inpaint", - "text-to-video", - "image-to-video", - "text-to-audio", - "modular", -) - - -def register(subparsers: _SubParsersAction) -> None: - ListTasksCommand.register_subcommand(subparsers, subparsers) - - -def list_agentic_tasks(subparsers: _SubParsersAction) -> list[dict]: - """Return ``[{name, description}, ...]`` for every registered agentic task. 
- - Reads metadata directly from the live argparse subparsers so the list - can never drift from the actual commands. - """ - choices = getattr(subparsers, "choices", {}) or {} - actions = [a for a in getattr(subparsers, "_choices_actions", [])] - descriptions = {a.dest: a.help for a in actions} - - out: list[dict] = [] - for name in AGENTIC_TASK_NAMES: - if name not in choices: - continue - out.append({"name": name, "description": descriptions.get(name, "")}) - return out - - -class ListTasksCommand(BaseDiffusersCLICommand): - task = "tasks" - - # The live subparsers object is captured at registration time so ``run`` - # can introspect it without needing access to ``main``'s locals. - _root_subparsers: _SubParsersAction | None = None - - @staticmethod - def register_subcommand(subparsers: _SubParsersAction, root_subparsers: _SubParsersAction) -> None: - parser: ArgumentParser = subparsers.add_parser( - "tasks", - help="List every registered agentic task with a one-line description.", - ) - parser.add_argument("--json", action="store_true", help="Emit machine-readable JSON.") - parser.set_defaults(func=ListTasksCommand) - ListTasksCommand._root_subparsers = root_subparsers - - def __init__(self, args: Namespace): - self.args = args - - def run(self) -> None: - tasks = list_agentic_tasks(self._root_subparsers) if self._root_subparsers else [] - if self.args.json: - json.dump({"tasks": tasks}, sys.stdout) - sys.stdout.write("\n") - return - width = max((len(t["name"]) for t in tasks), default=0) - for entry in tasks: - print(f"{entry['name']:<{width}} {entry['description'] or ''}") diff --git a/src/diffusers/commands/agentic/video.py b/src/diffusers/commands/agentic/video.py deleted file mode 100644 index e4dcdc4bb8a2..000000000000 --- a/src/diffusers/commands/agentic/video.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Video-generation subcommands: text-to-video, image-to-video. - -There is no AutoPipeline for video, so these commands load via -``DiffusionPipeline`` and rely on the repo's ``model_index.json`` to pick -the right pipeline class (CogVideoX, Hunyuan, LTX, Wan, etc.). -""" - -from __future__ import annotations - -from argparse import ArgumentParser, Namespace, _SubParsersAction - -from diffusers.utils import load_image - -from .. import BaseDiffusersCLICommand -from . 
import _common - - -def register(subparsers: _SubParsersAction) -> None: - Text2VideoCommand.register_subcommand(subparsers) - Image2VideoCommand.register_subcommand(subparsers) - - -def _add_video_arguments(parser: ArgumentParser) -> None: - parser.add_argument("--num-frames", type=int, default=None, help="Number of frames to generate.") - parser.add_argument("--fps", type=int, default=8, help="Frames per second for the output video.") - - -def _build_call_kwargs(args: Namespace, pipeline) -> dict: - kwargs: dict = {} - if args.prompt is not None: - kwargs["prompt"] = args.prompt - if args.negative_prompt is not None: - kwargs["negative_prompt"] = args.negative_prompt - if args.num_inference_steps is not None: - kwargs["num_inference_steps"] = args.num_inference_steps - if args.guidance_scale is not None: - kwargs["guidance_scale"] = args.guidance_scale - if args.height is not None: - kwargs["height"] = args.height - if args.width is not None: - kwargs["width"] = args.width - if args.num_frames is not None: - kwargs["num_frames"] = args.num_frames - - generator = _common.get_generator(args.seed, pipeline.device.type) - if generator is not None: - kwargs["generator"] = generator - - kwargs.update(_common.parse_pipeline_kwargs(args.pipeline_kwargs)) - return kwargs - - -def _save_video(frames, args: Namespace, task: str) -> str: - from diffusers.utils import export_to_video - - path = _common.default_output_paths(task, 1, args.output, ext="mp4")[0] - export_to_video(frames, str(path), fps=args.fps) - return str(path) - - -class _BaseVideoCommand(BaseDiffusersCLICommand): - task: str = "" - - def __init__(self, args: Namespace): - self.args = args - - def run(self) -> None: - if _common.maybe_submit_remote(self.args, self.task): - return - pipeline = _common.load_pipeline(self.args, "DiffusionPipeline") - call_kwargs = _build_call_kwargs(self.args, pipeline) - self._attach_inputs(call_kwargs) - - result = pipeline(**call_kwargs) - frames = result.frames[0] if 
hasattr(result, "frames") else result[0] - out_path = _save_video(frames, self.args, self.task) - pushed = _common.push_outputs(self.args, [out_path], self.task) - - _common.format_result( - self.args, - { - "task": self.task, - "model": self.args.model, - "device": pipeline.device.type, - "outputs": [out_path], - "pushed": pushed, - "fps": self.args.fps, - "seed": self.args.seed, - }, - ) - - def _attach_inputs(self, call_kwargs: dict) -> None: # noqa: B027 - """Hook for subclasses to attach conditioning inputs.""" - - -class Text2VideoCommand(_BaseVideoCommand): - task = "text-to-video" - - @staticmethod - def register_subcommand(subparsers: _SubParsersAction) -> None: - parser: ArgumentParser = subparsers.add_parser( - "text-to-video", - help="Generate a video clip from a text prompt.", - ) - _common.add_loading_arguments(parser) - _common.add_optimization_arguments(parser) - _common.add_generation_arguments(parser) - _add_video_arguments(parser) - _common.add_remote_arguments(parser) - _common.add_output_arguments(parser) - parser.set_defaults(func=Text2VideoCommand) - - -class Image2VideoCommand(_BaseVideoCommand): - task = "image-to-video" - - @staticmethod - def register_subcommand(subparsers: _SubParsersAction) -> None: - parser: ArgumentParser = subparsers.add_parser( - "image-to-video", - help="Generate a video clip conditioned on an input image.", - ) - _common.add_loading_arguments(parser) - _common.add_optimization_arguments(parser) - _common.add_generation_arguments(parser) - _add_video_arguments(parser) - _common.add_remote_arguments(parser) - _common.add_output_arguments(parser) - parser.add_argument("--image", required=True, help="Path or URL to the conditioning image.") - parser.set_defaults(func=Image2VideoCommand) - - def _attach_inputs(self, call_kwargs: dict) -> None: - call_kwargs["image"] = load_image(self.args.image) diff --git a/src/diffusers/commands/custom_blocks.py b/src/diffusers/commands/custom_blocks.py index 
953240c5a2c3..22c38e6256b3 100644 --- a/src/diffusers/commands/custom_blocks.py +++ b/src/diffusers/commands/custom_blocks.py @@ -12,94 +12,89 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Usage example: - TODO +"""``diffusers-cli custom_blocks`` — save a custom ``ModularPipelineBlocks`` subclass. + +Parses a local ``block.py``, finds a ``ModularPipelineBlocks`` subclass, +dynamically imports it, and calls ``save_pretrained`` in the current +working directory so the result can be pushed to the Hub and consumed by +``diffusers-cli inference``. """ +from __future__ import annotations + import ast import importlib.util import os -from argparse import ArgumentParser, Namespace +from argparse import ArgumentParser, Namespace, _SubParsersAction from pathlib import Path from ..utils import logging from . import BaseDiffusersCLICommand -EXPECTED_PARENT_CLASSES = ["ModularPipelineBlocks"] -CONFIG = "config.json" - - -def conversion_command_factory(args: Namespace): - return CustomBlocksCommand(args.block_module_name, args.block_class_name) +_EXPECTED_BASE_CLASSES = ("ModularPipelineBlocks",) class CustomBlocksCommand(BaseDiffusersCLICommand): + task = "custom_blocks" + @staticmethod - def register_subcommand(parser: ArgumentParser): - conversion_parser = parser.add_parser("custom_blocks") - conversion_parser.add_argument( - "--block_module_name", - type=str, + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "custom_blocks", + help="Save a custom ModularPipelineBlocks subclass via save_pretrained.", + ) + parser.add_argument( + "--block-module-name", default="block.py", - help="Module filename in which the custom block will be implemented.", + help="Module filename in which the custom block is implemented (default: block.py).", ) - conversion_parser.add_argument( - "--block_class_name", - type=str, + parser.add_argument( + 
"--block-class-name", default=None, - help="Name of the custom block. If provided None, we will try to infer it.", + help="Name of the custom block class. If None, the first ModularPipelineBlocks subclass found is used.", ) - conversion_parser.set_defaults(func=conversion_command_factory) + parser.set_defaults(func=CustomBlocksCommand) - def __init__(self, block_module_name: str = "block.py", block_class_name: str = None): + def __init__(self, args: Namespace): self.logger = logging.get_logger("diffusers-cli/custom_blocks") - self.block_module_name = Path(block_module_name) - self.block_class_name = block_class_name + self.block_module_name = Path(args.block_module_name) + self.block_class_name = args.block_class_name + + def run(self) -> None: + candidates = self._get_class_names(self.block_module_name) + classes_found = list({cls for cls, _ in candidates}) - def run(self): - # determine the block to be saved. - out = self._get_class_names(self.block_module_name) - classes_found = list({cls for cls, _ in out}) + if not candidates: + raise ValueError( + f"No ModularPipelineBlocks subclass found in {self.block_module_name}. " + "Ensure your block class inherits from `ModularPipelineBlocks` directly." + ) if self.block_class_name is not None: - child_class, parent_class = self._choose_block(out, self.block_class_name) - if child_class is None and parent_class is None: + child_class = next((cls for cls, _ in candidates if cls == self.block_class_name), None) + if child_class is None: raise ValueError( - "`block_class_name` could not be retrieved. Available classes from " - f"{self.block_module_name}:\n{classes_found}" + f"--block-class-name {self.block_class_name!r} not found in " + f"{self.block_module_name}. Available: {classes_found}" ) else: self.logger.info( - f"Found classes: {classes_found} will be using {classes_found[0]}. " - "If this needs to be changed, re-run the command specifying `block_class_name`." 
+ f"Found classes: {classes_found} — using {classes_found[0]}. " + "Re-run with --block-class-name to override." ) - child_class, parent_class = out[0][0], out[0][1] + child_class, _ = candidates[0] - # dynamically get the custom block and initialize it to call `save_pretrained` in the current directory. - # the user is responsible for running it, so I guess that is safe? module_name = f"__dynamic__{self.block_module_name.stem}" spec = importlib.util.spec_from_file_location(module_name, str(self.block_module_name)) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) getattr(module, child_class)().save_pretrained(os.getcwd()) - # or, we could create it manually. - # automap = self._create_automap(parent_class=parent_class, child_class=child_class) - # with open(CONFIG, "w") as f: - # json.dump(automap, f) - - def _choose_block(self, candidates, chosen=None): - for cls, base in candidates: - if cls == chosen: - return cls, base - return None, None - - def _get_class_names(self, file_path): + def _get_class_names(self, file_path: Path) -> list[tuple[str, str]]: source = file_path.read_text(encoding="utf-8") try: - tree = ast.parse(source, filename=file_path) + tree = ast.parse(source, filename=str(file_path)) except SyntaxError as e: raise ValueError(f"Could not parse {file_path!r}: {e}") from e @@ -107,26 +102,17 @@ def _get_class_names(self, file_path): for node in tree.body: if not isinstance(node, ast.ClassDef): continue - - # extract all base names for this class base_names = [bname for b in node.bases if (bname := self._get_base_name(b)) is not None] - - # for each allowed base that appears in the class's bases, emit a tuple - for allowed in EXPECTED_PARENT_CLASSES: + for allowed in _EXPECTED_BASE_CLASSES: if allowed in base_names: results.append((node.name, allowed)) - return results - def _get_base_name(self, node: ast.expr): + @staticmethod + def _get_base_name(node: ast.expr) -> str | None: if isinstance(node, ast.Name): return 
node.id - elif isinstance(node, ast.Attribute): - val = self._get_base_name(node.value) + if isinstance(node, ast.Attribute): + val = CustomBlocksCommand._get_base_name(node.value) return f"{val}.{node.attr}" if val else node.attr return None - - def _create_automap(self, parent_class, child_class): - module = str(self.block_module_name).replace(".py", "").rsplit(".", 1)[-1] - auto_map = {f"{parent_class}": f"{module}.{child_class}"} - return {"auto_map": auto_map} diff --git a/src/diffusers/commands/diffusers_cli.py b/src/diffusers/commands/diffusers_cli.py index 95b8dd5f3938..ceb806af1a3e 100644 --- a/src/diffusers/commands/diffusers_cli.py +++ b/src/diffusers/commands/diffusers_cli.py @@ -15,10 +15,10 @@ from argparse import ArgumentParser -from .agentic import register_agentic_commands from .custom_blocks import CustomBlocksCommand from .env import EnvironmentCommand from .fp16_safetensors import FP16SafetensorsCommand +from .inference import InferenceCommand def main(): @@ -29,7 +29,7 @@ def main(): EnvironmentCommand.register_subcommand(commands_parser) FP16SafetensorsCommand.register_subcommand(commands_parser) CustomBlocksCommand.register_subcommand(commands_parser) - register_agentic_commands(commands_parser) + InferenceCommand.register_subcommand(commands_parser) # Let's go args = parser.parse_args() diff --git a/src/diffusers/commands/inference.py b/src/diffusers/commands/inference.py new file mode 100644 index 000000000000..614994c07016 --- /dev/null +++ b/src/diffusers/commands/inference.py @@ -0,0 +1,846 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""``diffusers-cli inference`` — single agentic entry point. + +Runs any diffusers pipeline (standard or modular) by forwarding +``--pipeline-kwargs`` verbatim, saves the output by sniffing its runtime +type, and can submit the same call to HF Jobs via ``--remote`` (with the +model repo volume-mounted and the results downloaded back). +""" + +from __future__ import annotations + +import json +import os +import sys +from argparse import ArgumentParser, Namespace, _SubParsersAction +from pathlib import Path +from typing import Any, Optional + +from diffusers.utils import load_image + +from . import BaseDiffusersCLICommand + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +DEFAULT_OUTPUT_DIR = "outputs" +DTYPE_CHOICES = ("auto", "float16", "fp16", "bfloat16", "bf16", "float32", "fp32") +CPU_OFFLOAD_CHOICES = ("model", "group") +ATTENTION_BACKEND_CHOICES = ( + "default", + "flash_hub", + "flash_varlen_hub", + "flash_4_hub", + "sage_hub", +) + +_MODULAR_INDEX = "modular_model_index.json" + +# Keys whose string value should be resolved via ``diffusers.utils.load_image`` +# before being passed to the pipeline call. +_IMAGE_INPUT_KEYS = ( + "image", + "mask_image", + "control_image", + "ip_adapter_image", + "image_2", +) + +# Source for the diffusers install used by --remote jobs. While iterating on a +# feature branch, point at the branch URL; once merged, switch back to a release +# pin. 
``--dependencies "diffusers @ git+..."`` on the local command appends +# additional dependencies but does not replace this default install. +DIFFUSERS_SOURCE = "diffusers @ git+https://github.com/huggingface/diffusers@diffuser-cli-for-agent" +_DEFAULT_REMOTE_DEPS = ( + DIFFUSERS_SOURCE, + "accelerate", + "transformers", + "safetensors", + "torch==2.10.*", + "torchvision", +) + +# Entry point for ``uv run`` inside the container. ``uv run`` accepts a file +# path, URL, or command; passing the installed console script name makes UV +# install the deps above (which register the entry point) and exec the CLI. +_UV_RUNNER_SCRIPT = "diffusers-cli" + +RUN_ID_ENV = "DIFFUSERS_CLI_RUN_ID" + +# Namespace keys that control *how* a remote job runs locally, not what runs +# inside the container. They are stripped when forwarding argv to the container. +HF_JOBS_KEYS = frozenset( + {"remote", "flavor", "timeout", "dependencies", "namespace", "no_wait", "poll_interval", "func"} +) + + +# --------------------------------------------------------------------------- +# Argparse helpers +# --------------------------------------------------------------------------- + + +def _add_loading_arguments(parser: ArgumentParser) -> None: + parser.add_argument("--model", "-m", required=True, help="Model id on the Hugging Face Hub or local path.") + parser.add_argument("--device", default=None, help="Device to run on (e.g. cpu, cuda, cuda:0, mps).") + parser.add_argument("--dtype", default="auto", choices=DTYPE_CHOICES, help="Torch dtype for pipeline weights.") + parser.add_argument("--variant", default=None, help='Optional weight variant (e.g. 
"fp16").') + parser.add_argument("--revision", default=None, help="Model revision (branch, tag, or commit SHA).") + parser.add_argument("--token", default=None, help="Hugging Face token for gated/private models.") + parser.add_argument("--trust-remote-code", action="store_true", help="Allow custom code from the Hub.") + + +def _add_optimization_arguments(parser: ArgumentParser) -> None: + parser.add_argument( + "--cpu-offload", + choices=CPU_OFFLOAD_CHOICES, + default=None, + help=( + "Offload pipeline components to CPU during inference. " + "'model' uses enable_model_cpu_offload, " + "'group' uses pipeline.enable_group_offload(leaf_level, use_stream=True)." + ), + ) + parser.add_argument( + "--attention-backend", + choices=ATTENTION_BACKEND_CHOICES, + default="default", + help=( + "Override the attention backend on the transformer/UNet. " + "Only Hub-hosted kernels are exposed — they auto-download on first use." + ), + ) + parser.add_argument("--vae-tiling", action="store_true", help="Enable VAE tiling (lower peak VRAM).") + parser.add_argument("--vae-slicing", action="store_true", help="Enable VAE slicing (lower peak VRAM).") + parser.add_argument( + "--context-parallel", + action="store_true", + help=( + "Enable Ulysses-style context parallelism (ulysses_anything mode). " + "Requires launching the CLI under torchrun with ≥2 GPUs." + ), + ) + + +def _add_output_arguments(parser: ArgumentParser) -> None: + parser.add_argument( + "--output", + "-o", + default=None, + help="Output file or directory. Defaults to ./outputs/-..", + ) + parser.add_argument( + "--push-to", + default=None, + help=( + "Upload the generated files to this HF bucket id after saving (created if missing). " + "When --remote is set, defaults to /jobs-artifacts." 
+ ), + ) + parser.add_argument("--json", action="store_true", help="Emit a machine-readable JSON summary on stdout.") + + +def _add_remote_arguments(parser: ArgumentParser) -> None: + parser.add_argument( + "--remote", + action="store_true", + help="Submit this command to Hugging Face Jobs instead of running locally.", + ) + parser.add_argument( + "--flavor", + default="a10g-small", + help="HF Jobs hardware flavor for --remote (e.g. a10g-small, a100-large, cpu-basic).", + ) + parser.add_argument("--timeout", default=None, help="HF Jobs timeout for --remote (e.g. 30m, 2h).") + parser.add_argument( + "--dependencies", + action="append", + default=None, + help="Extra pip dependencies for the --remote job. Repeat to add multiple.", + ) + parser.add_argument( + "--namespace", + default=None, + help="HF namespace to run the --remote job under (defaults to the current user).", + ) + parser.add_argument( + "--no-wait", + action="store_true", + help="Don't wait for the --remote job to finish — submit and print the job id.", + ) + parser.add_argument( + "--poll-interval", + type=float, + default=5.0, + help="Seconds between job-status polls when waiting for --remote completion.", + ) + + +# --------------------------------------------------------------------------- +# Pipeline loading + optimization +# --------------------------------------------------------------------------- + + +def _resolve_dtype(name: Optional[str]): + if name in (None, "auto"): + return "auto" + import torch + + mapping = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16} + if name not in mapping: + raise ValueError(f"Unknown dtype: {name}") + return mapping[name] + + +def _resolve_device(name: Optional[str]) -> str: + if name: + return name + import torch + + if torch.cuda.is_available(): + return "cuda" + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return "mps" + return "cpu" + + +def _map_to_device(pipeline: Any, args: Namespace, device: str) -> 
Any: + """Move the pipeline to ``device``, or hand off to the chosen CPU-offload helper.""" + if args.cpu_offload is None: + return pipeline.to(device) + if args.cpu_offload == "model": + pipeline.enable_model_cpu_offload(device=device) + elif args.cpu_offload == "group": + import torch + + pipeline.enable_group_offload( + onload_device=torch.device(device), + offload_type="leaf_level", + use_stream=device.startswith("cuda"), + ) + return pipeline + + +def _set_attention_backend(pipeline: Any, backend: str) -> None: + for attr in ("transformer", "unet"): + module = getattr(pipeline, attr, None) + if module is not None and hasattr(module, "set_attention_backend"): + try: + module.set_attention_backend(backend) + except (ValueError, ImportError, RuntimeError): + pass + return + + +def _enable_context_parallel(pipeline: Any) -> None: + import torch + + if not torch.distributed.is_available() or not torch.distributed.is_initialized(): + raise SystemExit( + "--context-parallel requires torch.distributed to be initialized. " + "Launch the CLI under torchrun, e.g.: " + "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli inference ...`." 
+ ) + + from diffusers import ContextParallelConfig + + cfg = ContextParallelConfig( + ulysses_degree=torch.distributed.get_world_size(), + ring_degree=1, + ulysses_anything=True, + ) + for attr in ("transformer", "unet"): + module = getattr(pipeline, attr, None) + if module is not None and hasattr(module, "enable_parallelism"): + module.enable_parallelism(config=cfg) + return + + +def _apply_optimizations(pipeline: Any, args: Namespace) -> None: + """Apply VAE tiling/slicing, attention backend, and context-parallel toggles.""" + if args.vae_tiling and hasattr(pipeline, "enable_vae_tiling"): + pipeline.enable_vae_tiling() + if args.vae_slicing and hasattr(pipeline, "enable_vae_slicing"): + pipeline.enable_vae_slicing() + if args.attention_backend != "default": + _set_attention_backend(pipeline, args.attention_backend) + if args.context_parallel: + _enable_context_parallel(pipeline) + + +def _from_pretrained_kwargs(args: Namespace) -> dict[str, Any]: + dtype = _resolve_dtype(args.dtype) + kwargs: dict[str, Any] = {"trust_remote_code": args.trust_remote_code} + if dtype != "auto": + kwargs["torch_dtype"] = dtype + if args.variant: + kwargs["variant"] = args.variant + if args.revision: + kwargs["revision"] = args.revision + if args.token: + kwargs["token"] = args.token + return kwargs + + +def _load_pipeline(args: Namespace, modular: bool) -> Any: + import diffusers + + pipeline_cls = diffusers.ModularPipeline if modular else diffusers.DiffusionPipeline + pipeline = pipeline_cls.from_pretrained(args.model, **_from_pretrained_kwargs(args)) + if not hasattr(pipeline, "to"): + return pipeline + pipeline = _map_to_device(pipeline, args, _resolve_device(args.device)) + _apply_optimizations(pipeline, args) + return pipeline + + +# --------------------------------------------------------------------------- +# Modular pipeline detection + introspection +# --------------------------------------------------------------------------- + + +def _is_modular_repo(args: Namespace) -> 
bool: + local = Path(args.model) + if local.exists(): + return (local / _MODULAR_INDEX).exists() + + from huggingface_hub import HfApi + from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError + + try: + files = set(HfApi(token=args.token).list_repo_files(args.model, revision=args.revision)) + except (RepositoryNotFoundError, HfHubHTTPError): + return False + return _MODULAR_INDEX in files + + +def _describe_modular(args: Namespace) -> None: + """Load just the block definitions and print the input schema.""" + from diffusers import ModularPipelineBlocks + + kwargs: dict[str, Any] = {"trust_remote_code": args.trust_remote_code} + if args.revision: + kwargs["revision"] = args.revision + if args.token: + kwargs["token"] = args.token + + blocks = ModularPipelineBlocks.from_pretrained(args.model, **kwargs) + schema = [ + { + "name": p.name, + "type_hint": str(p.type_hint) if p.type_hint is not None else None, + "default": p.default, + "required": p.required, + "description": p.description, + } + for p in blocks.inputs + ] + payload = { + "task": "inference-describe", + "model": args.model, + "blocks_class": type(blocks).__name__, + "inputs": schema, + } + + if args.json: + json.dump(payload, sys.stdout, default=str) + sys.stdout.write("\n") + return + + print(f"{type(blocks).__name__} ({args.model}) inputs:") + for entry in schema: + tag = "required" if entry["required"] else f"optional, default={entry['default']!r}" + print(f" {entry['name']} ({tag})") + if entry["type_hint"]: + print(f" type: {entry['type_hint']}") + if entry["description"]: + print(f" desc: {entry['description']}") + + +# --------------------------------------------------------------------------- +# Pipeline call helpers +# --------------------------------------------------------------------------- + + +def _parse_pipeline_kwargs(raw: Optional[str]) -> dict[str, Any]: + if not raw: + return {} + try: + parsed = json.loads(raw) + except json.JSONDecodeError as e: + raise 
SystemExit(f"--pipeline-kwargs must be valid JSON: {e}") from e + if not isinstance(parsed, dict): + raise SystemExit("--pipeline-kwargs must decode to a JSON object.") + return parsed + + +def _resolve_image_inputs(call_kwargs: dict[str, Any]) -> None: + """Replace string paths/URLs at known image-input keys with PIL images.""" + for key in _IMAGE_INPUT_KEYS: + value = call_kwargs.get(key) + if isinstance(value, str): + call_kwargs[key] = load_image(value) + + +def _get_generator(seed: Optional[int], device: str): + if seed is None: + return None + import torch + + generator_device = "cpu" if device == "mps" else device + return torch.Generator(device=generator_device).manual_seed(seed) + + +def _result_to_savable(result: Any) -> Any: + """Unwrap a pipeline-output object into the raw payload the saver can sniff.""" + if hasattr(result, "images"): + return result.images + if hasattr(result, "frames"): + frames = result.frames + return frames[0] if isinstance(frames, (list, tuple)) and frames else frames + if hasattr(result, "audios"): + return result.audios + return result + + +# --------------------------------------------------------------------------- +# Output saving (auto-sniff by type) +# --------------------------------------------------------------------------- + + +def _default_output_paths(task: str, num: int, explicit: Optional[str], ext: str) -> list[Path]: + if explicit is None: + base = Path(DEFAULT_OUTPUT_DIR) + base.mkdir(parents=True, exist_ok=True) + return [base / f"{task}-{i}.{ext}" for i in range(num)] + + p = Path(explicit) + if explicit.endswith(os.sep) or p.is_dir(): + p.mkdir(parents=True, exist_ok=True) + return [p / f"{task}-{i}.{ext}" for i in range(num)] + + p.parent.mkdir(parents=True, exist_ok=True) + if num == 1: + return [p] + stem, suffix = p.stem, p.suffix or f".{ext}" + return [p.with_name(f"{stem}-{i}{suffix}") for i in range(num)] + + +def _as_pil_list(value: Any): + try: + from PIL.Image import Image as PILImage + except 
ImportError: + return None + if isinstance(value, PILImage): + return [value] + if isinstance(value, (list, tuple)) and value and all(isinstance(v, PILImage) for v in value): + return list(value) + return None + + +def _as_frame_sequence(value: Any): + try: + from PIL.Image import Image as PILImage + except ImportError: + PILImage = None # type: ignore[assignment] + + if isinstance(value, (list, tuple)) and len(value) >= 2: + first = value[0] + if PILImage is not None and isinstance(first, PILImage): + return list(value) + try: + import numpy as np + + if isinstance(first, np.ndarray): + return list(value) + except ImportError: + pass + return None + + +def _as_audio_arrays(value: Any): + try: + import numpy as np + except ImportError: + return None + if isinstance(value, np.ndarray) and value.ndim <= 2: + return [value] + if isinstance(value, (list, tuple)) and value and all(isinstance(v, np.ndarray) for v in value): + return list(value) + return None + + +def _save_audio_arrays(audios, sampling_rate: int, args: Namespace, task: str) -> list[str]: + """Write each numpy audio array to a 16-bit PCM WAV at ``sampling_rate`` Hz.""" + import numpy as np + from scipy.io.wavfile import write as wavfile_write + + paths = _default_output_paths(task, len(audios), args.output, ext="wav") + saved: list[str] = [] + for audio, path in zip(audios, paths): + data = np.asarray(audio) + if data.dtype.kind == "f": + data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16) + if data.ndim > 1 and data.shape[0] < data.shape[-1]: + data = data.T # (channels, samples) → (samples, channels) for scipy. 
+ wavfile_write(str(path), sampling_rate, data) + saved.append(str(path)) + return saved + + +def _save_auto(value: Any, args: Namespace, task: str) -> list[str]: + """Save ``value`` by sniffing its runtime type.""" + pil_images = _as_pil_list(value) + if pil_images is not None: + paths = _default_output_paths(task, len(pil_images), args.output, ext="png") + for img, path in zip(pil_images, paths): + img.save(path) + return [str(p) for p in paths] + + frames = _as_frame_sequence(value) + if frames is not None: + from diffusers.utils import export_to_video + + path = _default_output_paths(task, 1, args.output, ext="mp4")[0] + export_to_video(frames, str(path), fps=getattr(args, "fps", 8)) + return [str(path)] + + audios = _as_audio_arrays(value) + if audios is not None: + return _save_audio_arrays(audios, getattr(args, "sampling_rate", None) or 16000, args, task) + + path = _default_output_paths(task, 1, args.output, ext="json")[0] + Path(path).write_text(json.dumps(value, default=str, indent=2)) + return [str(path)] + + +# --------------------------------------------------------------------------- +# Hub bucket upload (--push-to) +# --------------------------------------------------------------------------- + + +def _push_outputs(args: Namespace, saved_paths: list[str], task: str) -> Optional[dict[str, Any]]: + """Upload ``saved_paths`` to the ``--push-to`` bucket. 
Returns a summary or None.""" + if not args.push_to: + return None + + from huggingface_hub import HfApi + + api = HfApi(token=args.token) + api.create_bucket(args.push_to, exist_ok=True) + + prefix = os.environ.get(RUN_ID_ENV) or task + add = [(local, f"{prefix}/{Path(local).name}") for local in saved_paths] + api.batch_bucket_files(args.push_to, add=add) + + uploaded = [f"hf://buckets/{args.push_to}/{dest}" for _, dest in add] + return {"bucket_id": args.push_to, "uploaded": uploaded} + + +# --------------------------------------------------------------------------- +# Remote submission (HF Jobs) +# --------------------------------------------------------------------------- + + +def _build_task_kwargs(args: Namespace) -> dict[str, Any]: + """Pick out the kwargs the container should invoke the task with.""" + out: dict[str, Any] = {} + for key, value in vars(args).items(): + if key in HF_JOBS_KEYS or value is None or value is False: + continue + out[key] = value + return out + + +def _kwargs_to_argv(task: str, task_kwargs: dict[str, Any]) -> list[str]: + """Render ``task_kwargs`` as the argv list the container's argparse will see.""" + argv: list[str] = [task] + for key, value in task_kwargs.items(): + flag = "--" + key.replace("_", "-") + if value is True: + argv.append(flag) + elif isinstance(value, list): + for item in value: + argv.extend([flag, str(item)]) + else: + argv.extend([flag, str(value)]) + return argv + + +def _maybe_submit_remote(args: Namespace, task: str) -> bool: + """If ``--remote`` was set, submit this invocation to HF Jobs and return True.""" + if not args.remote: + return False + + import uuid + + from huggingface_hub import HfApi, get_token, run_uv_job + + try: + from huggingface_hub import Volume + except ImportError: + Volume = None + + hf_token = args.token or get_token() + api = HfApi(token=hf_token) + + if not args.push_to: + args.push_to = f"{api.whoami()['name']}/jobs-artifacts" + + run_id = uuid.uuid4().hex[:12] + + task_kwargs = 
_build_task_kwargs(args) + dependencies = list(_DEFAULT_REMOTE_DEPS) + if args.dependencies: + dependencies.extend(args.dependencies) + + secrets = {"HF_TOKEN": hf_token} if hf_token else None + env = { + RUN_ID_ENV: run_id, + "HF_ENABLE_PARALLEL_LOADING": "1", # thread-pool the safetensors load step + } + + # Mount the model repo into the job's filesystem so the container reads it + # from local disk instead of downloading. Requires huggingface_hub >= 1.16. + volumes = None + if Volume is not None and not Path(args.model).exists(): + mount_path = "/model" + volumes = [Volume(type="model", source=args.model, mount_path=mount_path)] + task_kwargs["model"] = mount_path + + run_uv_job_kwargs: dict[str, Any] = { + "script": _UV_RUNNER_SCRIPT, + "script_args": _kwargs_to_argv(task, task_kwargs), + "dependencies": dependencies, + "flavor": args.flavor, + "timeout": args.timeout, + "namespace": args.namespace, + "secrets": secrets, + "env": env, + "token": hf_token, + } + if volumes is not None: + run_uv_job_kwargs["volumes"] = volumes + + job = run_uv_job(**run_uv_job_kwargs) + + payload: dict[str, Any] = { + "task": "remote-submit", + "job_id": getattr(job, "id", None), + "job_status": str(getattr(job, "status", "")), + "flavor": args.flavor, + "push_to": args.push_to, + "run_id": run_id, + } + + if args.no_wait: + _format_result(args, payload) + return True + + print( + f"[diffusers-cli] submitted job {job.id} (run_id={run_id}); " + f"watch at {getattr(job, 'url', 'https://huggingface.co/jobs')}", + file=sys.stderr, + flush=True, + ) + final_status = _wait_for_job(api, job.id, args.namespace, args.poll_interval) + payload["job_status"] = final_status + payload["outputs"] = _download_job_artifacts(api, args.push_to, run_id, args.output) + _format_result(args, payload) + return True + + +def _wait_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval: float) -> str: + """Stream container logs to stderr until the job terminates; return the final stage.""" 
+ fetch = getattr(api, "fetch_job_logs", None) + if fetch is not None: + try: + for line in fetch(job_id=job_id, namespace=namespace, follow=True): + print(line, file=sys.stderr, flush=True) + except TypeError: + return _poll_for_job(api, job_id, namespace, poll_interval) + info = api.inspect_job(job_id=job_id, namespace=namespace) + return str(info.status.stage) if info.status else "UNKNOWN" + return _poll_for_job(api, job_id, namespace, poll_interval) + + +def _poll_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval: float) -> str: + """Heartbeat-style fallback when ``fetch_job_logs`` isn't available.""" + import time + + terminal = {"COMPLETED", "CANCELED", "ERROR", "DELETED"} + last_stage: Optional[str] = None + while True: + info = api.inspect_job(job_id=job_id, namespace=namespace) + stage = str(info.status.stage) if info.status else "UNKNOWN" + if stage != last_stage: + if last_stage is not None: + print("", file=sys.stderr, flush=True) + print(f"[diffusers-cli] job {job_id}: {stage}", file=sys.stderr, flush=True) + last_stage = stage + else: + print(".", end="", file=sys.stderr, flush=True) + if stage in terminal: + print("", file=sys.stderr, flush=True) + return stage + time.sleep(poll_interval) + + +def _download_job_artifacts(api: Any, bucket_id: str, run_id: str, output: Optional[str]) -> list[str]: + """Download every file under ``/`` from ``bucket_id`` into a local directory.""" + from huggingface_hub import BucketFile + + local_dir = Path(output) if output else Path(DEFAULT_OUTPUT_DIR) + local_dir.mkdir(parents=True, exist_ok=True) + + pairs: list[tuple[Any, Path]] = [] + for entry in api.list_bucket_tree(bucket_id, prefix=f"{run_id}/", recursive=True): + if not isinstance(entry, BucketFile): + continue + pairs.append((entry, local_dir / Path(entry.path).name)) + + if not pairs: + return [] + api.download_bucket_files(bucket_id, files=pairs) + return [str(local) for _, local in pairs] + + +# 
--------------------------------------------------------------------------- +# Result formatting +# --------------------------------------------------------------------------- + + +def _format_result(args: Namespace, payload: dict[str, Any]) -> None: + """Print either a human-friendly summary or JSON, depending on --json.""" + if args.json: + json.dump(payload, sys.stdout, default=str) + sys.stdout.write("\n") + return + + outputs = payload.get("outputs", []) + if outputs: + for path in outputs: + print(path) + else: + print(payload) + + +# --------------------------------------------------------------------------- +# The one and only agentic subcommand +# --------------------------------------------------------------------------- + + +class InferenceCommand(BaseDiffusersCLICommand): + task = "inference" + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "inference", + help="Run any diffusers pipeline (standard or modular) by forwarding --pipeline-kwargs verbatim.", + ) + _add_loading_arguments(parser) + _add_optimization_arguments(parser) + parser.add_argument( + "--pipeline-kwargs", + default=None, + help=( + "JSON object of kwargs passed to the pipeline call. String values at known " + f"image-input keys ({', '.join(_IMAGE_INPUT_KEYS)}) are auto-loaded as PIL images." + ), + ) + parser.add_argument( + "--output-key", + default=None, + help="For modular pipelines: name of the intermediate to extract (passed as `output=` to the call).", + ) + parser.add_argument( + "--describe", + action="store_true", + help=( + "For modular pipelines: print the input schema from block definitions and exit. " + "Weights are NOT downloaded. Errors on standard (non-modular) pipelines." 
+ ), + ) + parser.add_argument("--seed", type=int, default=None, help="Random seed for reproducibility.") + parser.add_argument( + "--fps", + type=int, + default=8, + help="FPS used when the output happens to be a frame sequence.", + ) + parser.add_argument( + "--sampling-rate", + type=int, + default=None, + help="Sample rate used when the output happens to be an audio array.", + ) + _add_remote_arguments(parser) + _add_output_arguments(parser) + parser.set_defaults(func=InferenceCommand) + + def __init__(self, args: Namespace): + self.args = args + + def run(self) -> None: + is_modular = _is_modular_repo(self.args) + + if self.args.describe: + if not is_modular: + raise SystemExit( + "--describe only works for modular pipeline repos " + "(those that ship modular_model_index.json)." + ) + _describe_modular(self.args) + return + + if _maybe_submit_remote(self.args, self.task): + return + + pipeline = _load_pipeline(self.args, modular=is_modular) + + call_kwargs = _parse_pipeline_kwargs(self.args.pipeline_kwargs) + _resolve_image_inputs(call_kwargs) + + if self.args.output_key is not None: + call_kwargs["output"] = self.args.output_key + + generator = _get_generator(self.args.seed, getattr(pipeline, "device", None) and pipeline.device.type or "cpu") + if generator is not None: + call_kwargs["generator"] = generator + + result = pipeline(**call_kwargs) + savable = result if is_modular else _result_to_savable(result) + saved = _save_auto(savable, self.args, self.task) + pushed = _push_outputs(self.args, saved, self.task) + + _format_result( + self.args, + { + "task": self.task, + "model": self.args.model, + "device": pipeline.device.type if hasattr(pipeline, "device") else None, + "pipeline_class": type(pipeline).__name__, + "modular": is_modular, + "outputs": saved, + "pushed": pushed, + "seed": self.args.seed, + "output_key": self.args.output_key, + }, + ) + + From accfa06e69ed05e229c198b402281a83769ee7dd Mon Sep 17 00:00:00 2001 From: DN6 Date: Wed, 3 Jun 2026 
22:39:23 +0530 Subject: [PATCH 06/30] update --- src/diffusers/commands/inference.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/diffusers/commands/inference.py b/src/diffusers/commands/inference.py index 614994c07016..178c6e52115e 100644 --- a/src/diffusers/commands/inference.py +++ b/src/diffusers/commands/inference.py @@ -1,4 +1,4 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. +# Copyright 2026 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -802,8 +802,7 @@ def run(self) -> None: if self.args.describe: if not is_modular: raise SystemExit( - "--describe only works for modular pipeline repos " - "(those that ship modular_model_index.json)." + "--describe only works for modular pipeline repos " "(those that ship modular_model_index.json)." ) _describe_modular(self.args) return @@ -842,5 +841,3 @@ def run(self) -> None: "output_key": self.args.output_key, }, ) - - From 4d4d9e8ecda362ff4ebe1a3fcb4388714783cfe9 Mon Sep 17 00:00:00 2001 From: DN6 Date: Wed, 3 Jun 2026 22:44:20 +0530 Subject: [PATCH 07/30] update --- src/diffusers/commands/custom_blocks.py | 114 +++++++++++++----------- 1 file changed, 64 insertions(+), 50 deletions(-) diff --git a/src/diffusers/commands/custom_blocks.py b/src/diffusers/commands/custom_blocks.py index 22c38e6256b3..953240c5a2c3 100644 --- a/src/diffusers/commands/custom_blocks.py +++ b/src/diffusers/commands/custom_blocks.py @@ -12,89 +12,94 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""``diffusers-cli custom_blocks`` — save a custom ``ModularPipelineBlocks`` subclass. 
- -Parses a local ``block.py``, finds a ``ModularPipelineBlocks`` subclass, -dynamically imports it, and calls ``save_pretrained`` in the current -working directory so the result can be pushed to the Hub and consumed by -``diffusers-cli inference``. """ - -from __future__ import annotations +Usage example: + TODO +""" import ast import importlib.util import os -from argparse import ArgumentParser, Namespace, _SubParsersAction +from argparse import ArgumentParser, Namespace from pathlib import Path from ..utils import logging from . import BaseDiffusersCLICommand -_EXPECTED_BASE_CLASSES = ("ModularPipelineBlocks",) +EXPECTED_PARENT_CLASSES = ["ModularPipelineBlocks"] +CONFIG = "config.json" -class CustomBlocksCommand(BaseDiffusersCLICommand): - task = "custom_blocks" +def conversion_command_factory(args: Namespace): + return CustomBlocksCommand(args.block_module_name, args.block_class_name) + +class CustomBlocksCommand(BaseDiffusersCLICommand): @staticmethod - def register_subcommand(subparsers: _SubParsersAction) -> None: - parser: ArgumentParser = subparsers.add_parser( - "custom_blocks", - help="Save a custom ModularPipelineBlocks subclass via save_pretrained.", - ) - parser.add_argument( - "--block-module-name", + def register_subcommand(parser: ArgumentParser): + conversion_parser = parser.add_parser("custom_blocks") + conversion_parser.add_argument( + "--block_module_name", + type=str, default="block.py", - help="Module filename in which the custom block is implemented (default: block.py).", + help="Module filename in which the custom block will be implemented.", ) - parser.add_argument( - "--block-class-name", + conversion_parser.add_argument( + "--block_class_name", + type=str, default=None, - help="Name of the custom block class. If None, the first ModularPipelineBlocks subclass found is used.", + help="Name of the custom block. 
If provided None, we will try to infer it.", ) - parser.set_defaults(func=CustomBlocksCommand) + conversion_parser.set_defaults(func=conversion_command_factory) - def __init__(self, args: Namespace): + def __init__(self, block_module_name: str = "block.py", block_class_name: str = None): self.logger = logging.get_logger("diffusers-cli/custom_blocks") - self.block_module_name = Path(args.block_module_name) - self.block_class_name = args.block_class_name - - def run(self) -> None: - candidates = self._get_class_names(self.block_module_name) - classes_found = list({cls for cls, _ in candidates}) + self.block_module_name = Path(block_module_name) + self.block_class_name = block_class_name - if not candidates: - raise ValueError( - f"No ModularPipelineBlocks subclass found in {self.block_module_name}. " - "Ensure your block class inherits from `ModularPipelineBlocks` directly." - ) + def run(self): + # determine the block to be saved. + out = self._get_class_names(self.block_module_name) + classes_found = list({cls for cls, _ in out}) if self.block_class_name is not None: - child_class = next((cls for cls, _ in candidates if cls == self.block_class_name), None) - if child_class is None: + child_class, parent_class = self._choose_block(out, self.block_class_name) + if child_class is None and parent_class is None: raise ValueError( - f"--block-class-name {self.block_class_name!r} not found in " - f"{self.block_module_name}. Available: {classes_found}" + "`block_class_name` could not be retrieved. Available classes from " + f"{self.block_module_name}:\n{classes_found}" ) else: self.logger.info( - f"Found classes: {classes_found} — using {classes_found[0]}. " - "Re-run with --block-class-name to override." + f"Found classes: {classes_found} will be using {classes_found[0]}. " + "If this needs to be changed, re-run the command specifying `block_class_name`." 
) - child_class, _ = candidates[0] + child_class, parent_class = out[0][0], out[0][1] + # dynamically get the custom block and initialize it to call `save_pretrained` in the current directory. + # the user is responsible for running it, so I guess that is safe? module_name = f"__dynamic__{self.block_module_name.stem}" spec = importlib.util.spec_from_file_location(module_name, str(self.block_module_name)) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) getattr(module, child_class)().save_pretrained(os.getcwd()) - def _get_class_names(self, file_path: Path) -> list[tuple[str, str]]: + # or, we could create it manually. + # automap = self._create_automap(parent_class=parent_class, child_class=child_class) + # with open(CONFIG, "w") as f: + # json.dump(automap, f) + + def _choose_block(self, candidates, chosen=None): + for cls, base in candidates: + if cls == chosen: + return cls, base + return None, None + + def _get_class_names(self, file_path): source = file_path.read_text(encoding="utf-8") try: - tree = ast.parse(source, filename=str(file_path)) + tree = ast.parse(source, filename=file_path) except SyntaxError as e: raise ValueError(f"Could not parse {file_path!r}: {e}") from e @@ -102,17 +107,26 @@ def _get_class_names(self, file_path: Path) -> list[tuple[str, str]]: for node in tree.body: if not isinstance(node, ast.ClassDef): continue + + # extract all base names for this class base_names = [bname for b in node.bases if (bname := self._get_base_name(b)) is not None] - for allowed in _EXPECTED_BASE_CLASSES: + + # for each allowed base that appears in the class's bases, emit a tuple + for allowed in EXPECTED_PARENT_CLASSES: if allowed in base_names: results.append((node.name, allowed)) + return results - @staticmethod - def _get_base_name(node: ast.expr) -> str | None: + def _get_base_name(self, node: ast.expr): if isinstance(node, ast.Name): return node.id - if isinstance(node, ast.Attribute): - val = 
CustomBlocksCommand._get_base_name(node.value) + elif isinstance(node, ast.Attribute): + val = self._get_base_name(node.value) return f"{val}.{node.attr}" if val else node.attr return None + + def _create_automap(self, parent_class, child_class): + module = str(self.block_module_name).replace(".py", "").rsplit(".", 1)[-1] + auto_map = {f"{parent_class}": f"{module}.{child_class}"} + return {"auto_map": auto_map} From f97aef8f517a0c417e5ed72abc7929b2d39665f0 Mon Sep 17 00:00:00 2001 From: DN6 Date: Wed, 3 Jun 2026 23:29:13 +0530 Subject: [PATCH 08/30] update --- src/diffusers/commands/diffusers_cli.py | 2 + src/diffusers/commands/inference.py | 212 +++++++++++++++++------- 2 files changed, 156 insertions(+), 58 deletions(-) diff --git a/src/diffusers/commands/diffusers_cli.py b/src/diffusers/commands/diffusers_cli.py index ceb806af1a3e..80f449426c54 100644 --- a/src/diffusers/commands/diffusers_cli.py +++ b/src/diffusers/commands/diffusers_cli.py @@ -16,6 +16,7 @@ from argparse import ArgumentParser from .custom_blocks import CustomBlocksCommand +from .describe import DescribeCommand from .env import EnvironmentCommand from .fp16_safetensors import FP16SafetensorsCommand from .inference import InferenceCommand @@ -30,6 +31,7 @@ def main(): FP16SafetensorsCommand.register_subcommand(commands_parser) CustomBlocksCommand.register_subcommand(commands_parser) InferenceCommand.register_subcommand(commands_parser) + DescribeCommand.register_subcommand(commands_parser) # Let's go args = parser.parse_args() diff --git a/src/diffusers/commands/inference.py b/src/diffusers/commands/inference.py index 178c6e52115e..b92f3fcbedd0 100644 --- a/src/diffusers/commands/inference.py +++ b/src/diffusers/commands/inference.py @@ -14,10 +14,9 @@ """``diffusers-cli inference`` — single agentic entry point. 
-Runs any diffusers pipeline (standard or modular) by forwarding -``--pipeline-kwargs`` verbatim, saves the output by sniffing its runtime -type, and can submit the same call to HF Jobs via ``--remote`` (with the -model repo volume-mounted and the results downloaded back). +Runs any diffusers pipeline (standard or modular) by forwarding ``--pipeline-kwargs`` verbatim, saves the output by +sniffing its runtime type, and can submit the same call to HF Jobs via ``--remote`` (with the model repo volume-mounted +and the results downloaded back). """ from __future__ import annotations @@ -49,8 +48,6 @@ "sage_hub", ) -_MODULAR_INDEX = "modular_model_index.json" - # Keys whose string value should be resolved via ``diffusers.utils.load_image`` # before being passed to the pipeline call. _IMAGE_INPUT_KEYS = ( @@ -313,55 +310,170 @@ def _load_pipeline(args: Namespace, modular: bool) -> Any: # --------------------------------------------------------------------------- -def _is_modular_repo(args: Namespace) -> bool: +def _try_fetch_config(args: Namespace, filename: str) -> Optional[str]: + """Try to resolve ``filename`` for ``args.model`` (local path or Hub repo). 
None if absent.""" local = Path(args.model) if local.exists(): - return (local / _MODULAR_INDEX).exists() + candidate = local / filename + return str(candidate) if candidate.exists() else None - from huggingface_hub import HfApi - from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError + from huggingface_hub import hf_hub_download + from huggingface_hub.utils import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError try: - files = set(HfApi(token=args.token).list_repo_files(args.model, revision=args.revision)) - except (RepositoryNotFoundError, HfHubHTTPError): - return False - return _MODULAR_INDEX in files + return hf_hub_download(args.model, filename, revision=args.revision, token=args.token) + except (EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError): + return None -def _describe_modular(args: Namespace) -> None: - """Load just the block definitions and print the input schema.""" - from diffusers import ModularPipelineBlocks +def _is_modular_repo(args: Namespace) -> bool: + """Detect by trying ``DiffusionPipeline.config_name`` first; modular iff that's absent.""" + from diffusers import DiffusionPipeline - kwargs: dict[str, Any] = {"trust_remote_code": args.trust_remote_code} - if args.revision: - kwargs["revision"] = args.revision - if args.token: - kwargs["token"] = args.token + return _try_fetch_config(args, DiffusionPipeline.config_name) is None - blocks = ModularPipelineBlocks.from_pretrained(args.model, **kwargs) - schema = [ - { - "name": p.name, - "type_hint": str(p.type_hint) if p.type_hint is not None else None, - "default": p.default, - "required": p.required, - "description": p.description, - } - for p in blocks.inputs - ] - payload = { - "task": "inference-describe", - "model": args.model, - "blocks_class": type(blocks).__name__, - "inputs": schema, - } + +def _describe(args: Namespace) -> None: + """Print the pipeline's input schema. 
+ + Tries ``DiffusionPipeline.config_name`` (= ``model_index.json``) first; if present, introspects the declared + pipeline class's ``__call__`` signature. Otherwise falls back to ``ModularPipelineBlocks.from_pretrained`` and + reads the block-declared ``inputs``. No weights downloaded either way. + """ + import inspect + + import diffusers + + standard_index = _try_fetch_config(args, diffusers.DiffusionPipeline.config_name) + + if standard_index is not None: + with open(standard_index) as f: + index = json.load(f) + class_name = index.get("_class_name") + pipeline_cls = getattr(diffusers, class_name, None) + if pipeline_cls is None: + raise SystemExit( + f"Pipeline class {class_name!r} declared in {diffusers.DiffusionPipeline.config_name} " + "is not exported by the installed diffusers." + ) + sig = inspect.signature(pipeline_cls.__call__) + descriptions = _parse_docstring_args(pipeline_cls.__call__.__doc__) if getattr(args, "verbose", False) else {} + schema: list[dict[str, Any]] = [] + for name, param in sig.parameters.items(): + if name == "self": + continue + if param.kind in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD): + continue + has_default = param.default is not inspect.Parameter.empty + schema.append( + { + "name": name, + "type_hint": str(param.annotation) if param.annotation is not inspect.Parameter.empty else None, + "default": param.default if has_default else None, + "required": not has_default, + "description": descriptions.get(name, ""), + } + ) + else: + kwargs: dict[str, Any] = {"trust_remote_code": args.trust_remote_code} + if args.revision: + kwargs["revision"] = args.revision + if args.token: + kwargs["token"] = args.token + try: + blocks = diffusers.ModularPipelineBlocks.from_pretrained(args.model, **kwargs) + except Exception as e: + raise SystemExit( + f"Could not describe {args.model!r}: no {diffusers.DiffusionPipeline.config_name} and " + f"loading as a modular pipeline failed ({type(e).__name__}: {e}). 
" + "Is this a diffusers pipeline repo? Pass --trust-remote-code if it ships custom block code." + ) from e + class_name = type(blocks).__name__ + schema = [ + { + "name": p.name, + "type_hint": str(p.type_hint) if p.type_hint is not None else None, + "default": p.default, + "required": p.required, + "description": p.description, + } + for p in blocks.inputs + ] if args.json: + payload = { + "task": "inference-describe", + "model": args.model, + "pipeline_class": class_name, + "inputs": schema, + } json.dump(payload, sys.stdout, default=str) sys.stdout.write("\n") return - print(f"{type(blocks).__name__} ({args.model}) inputs:") + _print_schema(class_name, args.model, schema) + + +def _parse_docstring_args(docstring: Optional[str]) -> dict[str, str]: + """Extract per-argument descriptions from a Google-style ``Args:`` block. + + Returns a ``{name: description}`` mapping. Best-effort — unrecognised formats just yield an empty dict rather than + raising. + """ + if not docstring: + return {} + + import re + + lines = docstring.expandtabs().splitlines() + start = None + section_indent = 0 + for i, line in enumerate(lines): + if line.strip() in ("Args:", "Arguments:", "Parameters:"): + start = i + 1 + section_indent = len(line) - len(line.lstrip()) + break + if start is None: + return {} + + descriptions: dict[str, str] = {} + current_name: Optional[str] = None + current_lines: list[str] = [] + arg_indent: Optional[int] = None + name_pattern = re.compile(r"^(\w+)\s*(?:\([^)]*\))?\s*:?\s*(.*)$") + + def _flush() -> None: + if current_name and current_lines: + descriptions[current_name] = " ".join(s.strip() for s in current_lines).strip() + + for line in lines[start:]: + if not line.strip(): + continue + indent = len(line) - len(line.lstrip()) + # A new top-level section ends the Args block. 
+ if indent <= section_indent and line.strip().endswith(":"): + break + if arg_indent is None: + arg_indent = indent + if indent == arg_indent: + _flush() + current_lines = [] + match = name_pattern.match(line.strip()) + if match: + current_name = match.group(1) + tail = match.group(2).strip() + if tail: + current_lines.append(tail) + else: + current_name = None + elif current_name is not None and indent > arg_indent: + current_lines.append(line.strip()) + _flush() + return descriptions + + +def _print_schema(class_name: str, model: str, schema: list[dict[str, Any]]) -> None: + print(f"{class_name} ({model}) inputs:") for entry in schema: tag = "required" if entry["required"] else f"optional, default={entry['default']!r}" print(f" {entry['name']} ({tag})") @@ -502,7 +614,7 @@ def _save_audio_arrays(audios, sampling_rate: int, args: Namespace, task: str) - return saved -def _save_auto(value: Any, args: Namespace, task: str) -> list[str]: +def _save_output(value: Any, args: Namespace, task: str) -> list[str]: """Save ``value`` by sniffing its runtime type.""" pil_images = _as_pil_list(value) if pil_images is not None: @@ -768,14 +880,6 @@ def register_subcommand(subparsers: _SubParsersAction) -> None: default=None, help="For modular pipelines: name of the intermediate to extract (passed as `output=` to the call).", ) - parser.add_argument( - "--describe", - action="store_true", - help=( - "For modular pipelines: print the input schema from block definitions and exit. " - "Weights are NOT downloaded. Errors on standard (non-modular) pipelines." - ), - ) parser.add_argument("--seed", type=int, default=None, help="Random seed for reproducibility.") parser.add_argument( "--fps", @@ -799,14 +903,6 @@ def __init__(self, args: Namespace): def run(self) -> None: is_modular = _is_modular_repo(self.args) - if self.args.describe: - if not is_modular: - raise SystemExit( - "--describe only works for modular pipeline repos " "(those that ship modular_model_index.json)." 
- ) - _describe_modular(self.args) - return - if _maybe_submit_remote(self.args, self.task): return @@ -824,7 +920,7 @@ def run(self) -> None: result = pipeline(**call_kwargs) savable = result if is_modular else _result_to_savable(result) - saved = _save_auto(savable, self.args, self.task) + saved = _save_output(savable, self.args, self.task) pushed = _push_outputs(self.args, saved, self.task) _format_result( From 3774951818d3c2cc72cc3fd8ed6838da0ffe6cd0 Mon Sep 17 00:00:00 2001 From: DN6 Date: Wed, 3 Jun 2026 23:40:20 +0530 Subject: [PATCH 09/30] update --- src/diffusers/commands/describe.py | 81 +++++++++++++++++++++++++++++ src/diffusers/commands/inference.py | 6 +++ 2 files changed, 87 insertions(+) create mode 100644 src/diffusers/commands/describe.py diff --git a/src/diffusers/commands/describe.py b/src/diffusers/commands/describe.py new file mode 100644 index 000000000000..7eae367a8df7 --- /dev/null +++ b/src/diffusers/commands/describe.py @@ -0,0 +1,81 @@ +# Copyright 2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""``diffusers-cli describe`` — print the input schema for any pipeline repo. + +Tries ``DiffusionPipeline.config_name`` first (so standard repos get their ``__call__`` signature introspected); falls +back to ``ModularPipelineBlocks.from_pretrained`` for modular repos. No weights are downloaded — only the small index +file (and any custom block code if ``--trust-remote-code`` is set). 
+""" + +from __future__ import annotations + +from argparse import ArgumentParser, Namespace, _SubParsersAction + +from . import BaseDiffusersCLICommand +from .inference import _describe + + +class DescribeCommand(BaseDiffusersCLICommand): + task = "describe" + + @staticmethod + def register_subcommand(subparsers: _SubParsersAction) -> None: + parser: ArgumentParser = subparsers.add_parser( + "describe", + help="Print the input schema for a diffusers pipeline repo. No weights downloaded.", + ) + parser.add_argument( + "--model", + "-m", + required=True, + help="Model id on the Hugging Face Hub or local path.", + ) + parser.add_argument( + "--revision", + default=None, + help="Model revision (branch, tag, or commit SHA).", + ) + parser.add_argument( + "--token", + default=None, + help="Hugging Face token for gated/private models.", + ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Allow custom code from the Hub (required for modular pipelines that ship block code).", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help=( + "Also include per-argument descriptions from the pipeline's __call__ docstring. " + "Modular pipelines always include block-declared descriptions; --verbose populates " + "the equivalent field for standard pipelines by parsing the Google-style Args: block." 
+ ), + ) + parser.add_argument( + "--json", + action="store_true", + help="Emit a machine-readable JSON summary on stdout.", + ) + parser.set_defaults(func=DescribeCommand) + + def __init__(self, args: Namespace): + self.args = args + + def run(self) -> None: + _describe(self.args) diff --git a/src/diffusers/commands/inference.py b/src/diffusers/commands/inference.py index b92f3fcbedd0..539b47177dff 100644 --- a/src/diffusers/commands/inference.py +++ b/src/diffusers/commands/inference.py @@ -698,6 +698,12 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: if not args.remote: return False + print( + f"[diffusers-cli] preparing remote {task!r} job on flavor={args.flavor!r}...", + file=sys.stderr, + flush=True, + ) + import uuid from huggingface_hub import HfApi, get_token, run_uv_job From add747b6d82cffcba1b9d68db4b360a234c87662 Mon Sep 17 00:00:00 2001 From: DN6 Date: Fri, 12 Jun 2026 14:54:19 +0530 Subject: [PATCH 10/30] update --- src/diffusers/commands/custom_blocks.py | 27 +++--- src/diffusers/commands/describe.py | 4 +- src/diffusers/commands/diffusers_cli.py | 12 ++- src/diffusers/commands/env.py | 7 +- src/diffusers/commands/fp16_safetensors.py | 14 ++- .../commands/{inference.py => generate.py} | 90 +++++++++---------- 6 files changed, 84 insertions(+), 70 deletions(-) rename src/diffusers/commands/{inference.py => generate.py} (93%) diff --git a/src/diffusers/commands/custom_blocks.py b/src/diffusers/commands/custom_blocks.py index 953240c5a2c3..bc0889376a95 100644 --- a/src/diffusers/commands/custom_blocks.py +++ b/src/diffusers/commands/custom_blocks.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -Usage example: - TODO +"""``diffusers-cli custom_blocks`` — package a local ``ModularPipelineBlocks`` subclass for the Hub. 
+ +Parses ``block.py`` (or ``--block_module_name``), instantiates the chosen block, and calls ``save_pretrained`` in the +current working directory. """ import ast @@ -28,7 +29,6 @@ EXPECTED_PARENT_CLASSES = ["ModularPipelineBlocks"] -CONFIG = "config.json" def conversion_command_factory(args: Namespace): @@ -38,7 +38,12 @@ def conversion_command_factory(args: Namespace): class CustomBlocksCommand(BaseDiffusersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): - conversion_parser = parser.add_parser("custom_blocks") + conversion_parser = parser.add_parser( + "custom_blocks", + help="Package a local ModularPipelineBlocks subclass for the Hub.", + usage="\n diffusers-cli custom_blocks [options]", + ) + conversion_parser._optionals.title = "Options" conversion_parser.add_argument( "--block_module_name", type=str, @@ -77,19 +82,12 @@ def run(self): ) child_class, parent_class = out[0][0], out[0][1] - # dynamically get the custom block and initialize it to call `save_pretrained` in the current directory. - # the user is responsible for running it, so I guess that is safe? module_name = f"__dynamic__{self.block_module_name.stem}" spec = importlib.util.spec_from_file_location(module_name, str(self.block_module_name)) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) getattr(module, child_class)().save_pretrained(os.getcwd()) - # or, we could create it manually. 
- # automap = self._create_automap(parent_class=parent_class, child_class=child_class) - # with open(CONFIG, "w") as f: - # json.dump(automap, f) - def _choose_block(self, candidates, chosen=None): for cls, base in candidates: if cls == chosen: @@ -125,8 +123,3 @@ def _get_base_name(self, node: ast.expr): val = self._get_base_name(node.value) return f"{val}.{node.attr}" if val else node.attr return None - - def _create_automap(self, parent_class, child_class): - module = str(self.block_module_name).replace(".py", "").rsplit(".", 1)[-1] - auto_map = {f"{parent_class}": f"{module}.{child_class}"} - return {"auto_map": auto_map} diff --git a/src/diffusers/commands/describe.py b/src/diffusers/commands/describe.py index 7eae367a8df7..b5c617c10319 100644 --- a/src/diffusers/commands/describe.py +++ b/src/diffusers/commands/describe.py @@ -24,7 +24,7 @@ from argparse import ArgumentParser, Namespace, _SubParsersAction from . import BaseDiffusersCLICommand -from .inference import _describe +from .generate import _describe class DescribeCommand(BaseDiffusersCLICommand): @@ -35,7 +35,9 @@ def register_subcommand(subparsers: _SubParsersAction) -> None: parser: ArgumentParser = subparsers.add_parser( "describe", help="Print the input schema for a diffusers pipeline repo. 
No weights downloaded.", + usage="\n diffusers-cli describe [options]", ) + parser._optionals.title = "Options" parser.add_argument( "--model", "-m", diff --git a/src/diffusers/commands/diffusers_cli.py b/src/diffusers/commands/diffusers_cli.py index 80f449426c54..09a3a8ab03b7 100644 --- a/src/diffusers/commands/diffusers_cli.py +++ b/src/diffusers/commands/diffusers_cli.py @@ -19,18 +19,22 @@ from .describe import DescribeCommand from .env import EnvironmentCommand from .fp16_safetensors import FP16SafetensorsCommand -from .inference import InferenceCommand +from .generate import GenerateCommand def main(): - parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli []") - commands_parser = parser.add_subparsers(help="diffusers-cli command helpers") + parser = ArgumentParser( + prog="diffusers-cli", + usage="\n diffusers-cli [options]", + ) + parser._optionals.title = "General Options" + commands_parser = parser.add_subparsers(title="Commands", metavar="") # Register commands EnvironmentCommand.register_subcommand(commands_parser) FP16SafetensorsCommand.register_subcommand(commands_parser) CustomBlocksCommand.register_subcommand(commands_parser) - InferenceCommand.register_subcommand(commands_parser) + GenerateCommand.register_subcommand(commands_parser) DescribeCommand.register_subcommand(commands_parser) # Let's go diff --git a/src/diffusers/commands/env.py b/src/diffusers/commands/env.py index 58f31d478bf3..cab163fcdd63 100644 --- a/src/diffusers/commands/env.py +++ b/src/diffusers/commands/env.py @@ -40,7 +40,12 @@ def info_command_factory(_): class EnvironmentCommand(BaseDiffusersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser) -> None: - download_parser = parser.add_parser("env") + download_parser = parser.add_parser( + "env", + help="Print versions of diffusers and its dependencies (for bug reports).", + usage="\n diffusers-cli env", + ) + download_parser._optionals.title = "Options" 
download_parser.set_defaults(func=info_command_factory) def run(self) -> dict: diff --git a/src/diffusers/commands/fp16_safetensors.py b/src/diffusers/commands/fp16_safetensors.py index 382d6c39bd19..44e374b5707d 100644 --- a/src/diffusers/commands/fp16_safetensors.py +++ b/src/diffusers/commands/fp16_safetensors.py @@ -33,6 +33,13 @@ def conversion_command_factory(args: Namespace): + warnings.warn( + "`diffusers-cli fp16_safetensors` is deprecated and will be removed in a future version. " + "Convert weights to fp16 safetensors directly with `safetensors.torch.save_file` or via " + "`pipeline.save_pretrained(..., safe_serialization=True, variant='fp16')`.", + FutureWarning, + stacklevel=2, + ) if args.use_auth_token: warnings.warn( "The `--use_auth_token` flag is deprecated and will be removed in a future version." @@ -44,7 +51,12 @@ def conversion_command_factory(args: Namespace): class FP16SafetensorsCommand(BaseDiffusersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): - conversion_parser = parser.add_parser("fp16_safetensors") + conversion_parser = parser.add_parser( + "fp16_safetensors", + help="[DEPRECATED] Convert a Hub checkpoint's weights to fp16 safetensors and push back as a PR.", + usage="\n diffusers-cli fp16_safetensors [options]", + ) + conversion_parser._optionals.title = "Options" conversion_parser.add_argument( "--ckpt_id", type=str, diff --git a/src/diffusers/commands/inference.py b/src/diffusers/commands/generate.py similarity index 93% rename from src/diffusers/commands/inference.py rename to src/diffusers/commands/generate.py index 539b47177dff..e8f92083e1c8 100644 --- a/src/diffusers/commands/inference.py +++ b/src/diffusers/commands/generate.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""``diffusers-cli inference`` — single agentic entry point. +"""``diffusers-cli generate`` — single agentic entry point. 
Runs any diffusers pipeline (standard or modular) by forwarding ``--pipeline-kwargs`` verbatim, saves the output by -sniffing its runtime type, and can submit the same call to HF Jobs via ``--remote`` (with the model repo volume-mounted -and the results downloaded back). +sniffing its runtime type, and can submit the same call to HF Jobs via ``--remote``. """ from __future__ import annotations @@ -59,10 +58,13 @@ ) # Source for the diffusers install used by --remote jobs. While iterating on a -# feature branch, point at the branch URL; once merged, switch back to a release -# pin. ``--dependencies "diffusers @ git+..."`` on the local command appends -# additional dependencies but does not replace this default install. -DIFFUSERS_SOURCE = "diffusers @ git+https://github.com/huggingface/diffusers@diffuser-cli-for-agent" +# feature branch, point at the GitHub tarball URL — uv installs it over plain +# HTTP and the container doesn't need ``git``. Once merged, switch back to a +# PyPI release pin. ``--dependencies "diffusers @ ..."`` on the local command +# appends additional dependencies but does not replace this default install. +DIFFUSERS_SOURCE = ( + "diffusers @ https://github.com/huggingface/diffusers/archive/refs/heads/diffuser-cli-for-agent.tar.gz" +) _DEFAULT_REMOTE_DEPS = ( DIFFUSERS_SOURCE, "accelerate", @@ -250,7 +252,7 @@ def _enable_context_parallel(pipeline: Any) -> None: raise SystemExit( "--context-parallel requires torch.distributed to be initialized. " "Launch the CLI under torchrun, e.g.: " - "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli inference ...`." + "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli generate ...`." 
) from diffusers import ContextParallelConfig @@ -281,7 +283,9 @@ def _apply_optimizations(pipeline: Any, args: Namespace) -> None: def _from_pretrained_kwargs(args: Namespace) -> dict[str, Any]: dtype = _resolve_dtype(args.dtype) - kwargs: dict[str, Any] = {"trust_remote_code": args.trust_remote_code} + # disable_mmap: mmap-faults over a network-mounted volume trigger one round-trip per page; + # a sequential read is dramatically faster than the random-access mmap pattern. + kwargs: dict[str, Any] = {"trust_remote_code": args.trust_remote_code, "disable_mmap": True} if dtype != "auto": kwargs["torch_dtype"] = dtype if args.variant: @@ -357,7 +361,7 @@ def _describe(args: Namespace) -> None: "is not exported by the installed diffusers." ) sig = inspect.signature(pipeline_cls.__call__) - descriptions = _parse_docstring_args(pipeline_cls.__call__.__doc__) if getattr(args, "verbose", False) else {} + descriptions = _parse_docstring_args(pipeline_cls.__call__.__doc__) if args.verbose else {} schema: list[dict[str, Any]] = [] for name, param in sig.parameters.items(): if name == "self": @@ -402,7 +406,7 @@ def _describe(args: Namespace) -> None: if args.json: payload = { - "task": "inference-describe", + "task": "describe", "model": args.model, "pipeline_class": class_name, "inputs": schema, @@ -628,12 +632,12 @@ def _save_output(value: Any, args: Namespace, task: str) -> list[str]: from diffusers.utils import export_to_video path = _default_output_paths(task, 1, args.output, ext="mp4")[0] - export_to_video(frames, str(path), fps=getattr(args, "fps", 8)) + export_to_video(frames, str(path), fps=args.fps) return [str(path)] audios = _as_audio_arrays(value) if audios is not None: - return _save_audio_arrays(audios, getattr(args, "sampling_rate", None) or 16000, args, task) + return _save_audio_arrays(audios, args.sampling_rate or 16000, args, task) path = _default_output_paths(task, 1, args.output, ext="json")[0] Path(path).write_text(json.dumps(value, default=str, 
indent=2)) @@ -708,11 +712,6 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: from huggingface_hub import HfApi, get_token, run_uv_job - try: - from huggingface_hub import Volume - except ImportError: - Volume = None - hf_token = args.token or get_token() api = HfApi(token=hf_token) @@ -732,29 +731,25 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: "HF_ENABLE_PARALLEL_LOADING": "1", # thread-pool the safetensors load step } - # Mount the model repo into the job's filesystem so the container reads it - # from local disk instead of downloading. Requires huggingface_hub >= 1.16. - volumes = None - if Volume is not None and not Path(args.model).exists(): - mount_path = "/model" - volumes = [Volume(type="model", source=args.model, mount_path=mount_path)] - task_kwargs["model"] = mount_path - - run_uv_job_kwargs: dict[str, Any] = { - "script": _UV_RUNNER_SCRIPT, - "script_args": _kwargs_to_argv(task, task_kwargs), - "dependencies": dependencies, - "flavor": args.flavor, - "timeout": args.timeout, - "namespace": args.namespace, - "secrets": secrets, - "env": env, - "token": hf_token, - } - if volumes is not None: - run_uv_job_kwargs["volumes"] = volumes + if Path(args.model).exists(): + print( + f"[diffusers-cli] WARNING: --model {args.model!r} is a local path; the container can't see it. 
" + "Pass a Hub repo id so the job can download it.", + file=sys.stderr, + flush=True, + ) - job = run_uv_job(**run_uv_job_kwargs) + job = run_uv_job( + script=_UV_RUNNER_SCRIPT, + script_args=_kwargs_to_argv(task, task_kwargs), + dependencies=dependencies, + flavor=args.flavor, + timeout=args.timeout, + namespace=args.namespace, + secrets=secrets, + env=env, + token=hf_token, + ) payload: dict[str, Any] = { "task": "remote-submit", @@ -858,19 +853,21 @@ def _format_result(args: Namespace, payload: dict[str, Any]) -> None: # --------------------------------------------------------------------------- -# The one and only agentic subcommand +# Subcommand # --------------------------------------------------------------------------- -class InferenceCommand(BaseDiffusersCLICommand): - task = "inference" +class GenerateCommand(BaseDiffusersCLICommand): + task = "generate" @staticmethod def register_subcommand(subparsers: _SubParsersAction) -> None: parser: ArgumentParser = subparsers.add_parser( - "inference", + "generate", help="Run any diffusers pipeline (standard or modular) by forwarding --pipeline-kwargs verbatim.", + usage="\n diffusers-cli generate [options]", ) + parser._optionals.title = "Options" _add_loading_arguments(parser) _add_optimization_arguments(parser) parser.add_argument( @@ -901,7 +898,7 @@ def register_subcommand(subparsers: _SubParsersAction) -> None: ) _add_remote_arguments(parser) _add_output_arguments(parser) - parser.set_defaults(func=InferenceCommand) + parser.set_defaults(func=GenerateCommand) def __init__(self, args: Namespace): self.args = args @@ -920,7 +917,8 @@ def run(self) -> None: if self.args.output_key is not None: call_kwargs["output"] = self.args.output_key - generator = _get_generator(self.args.seed, getattr(pipeline, "device", None) and pipeline.device.type or "cpu") + device = pipeline.device.type if hasattr(pipeline, "device") else "cpu" + generator = _get_generator(self.args.seed, device) if generator is not None: 
call_kwargs["generator"] = generator From 934b5575de12074816e5fb5e1f06fcf1ad52a95f Mon Sep 17 00:00:00 2001 From: DN6 Date: Fri, 12 Jun 2026 15:15:15 +0530 Subject: [PATCH 11/30] update --- src/diffusers/commands/generate.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index e8f92083e1c8..fae4bf78a565 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -70,10 +70,13 @@ "accelerate", "transformers", "safetensors", - "torch==2.10.*", - "torchvision", ) +# Base container image — provides torch + CUDA so uv doesn't reinstall the ~3GB nvidia-* +# wheels per cold start. cuda12.8 is the highest cuda12.x tag below the HF Jobs host +# driver's CUDA 12.9 max; cuda13.x tags fail with "driver too old". +_DEFAULT_REMOTE_IMAGE = "pytorch/pytorch:2.10.0-cuda12.8-cudnn9-runtime" + # Entry point for ``uv run`` inside the container. ``uv run`` accepts a file # path, URL, or command; passing the installed console script name makes UV # install the deps above (which register the entry point) and exec the CLI. 
@@ -743,6 +746,7 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: script=_UV_RUNNER_SCRIPT, script_args=_kwargs_to_argv(task, task_kwargs), dependencies=dependencies, + image=_DEFAULT_REMOTE_IMAGE, flavor=args.flavor, timeout=args.timeout, namespace=args.namespace, From 0ae1eb087bdd34d32b55e0233db3595b77705fc5 Mon Sep 17 00:00:00 2001 From: DN6 Date: Fri, 12 Jun 2026 15:37:40 +0530 Subject: [PATCH 12/30] update --- src/diffusers/commands/generate.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index fae4bf78a565..d152582d4cec 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -72,15 +72,13 @@ "safetensors", ) -# Base container image — provides torch + CUDA so uv doesn't reinstall the ~3GB nvidia-* -# wheels per cold start. cuda12.8 is the highest cuda12.x tag below the HF Jobs host -# driver's CUDA 12.9 max; cuda13.x tags fail with "driver too old". +# Base container image — provides torch + CUDA so ``uv pip install --system`` +# only has to add the small Python deps. cuda12.8 is the highest cuda12.x tag +# below the HF Jobs host driver's CUDA 12.9 max. _DEFAULT_REMOTE_IMAGE = "pytorch/pytorch:2.10.0-cuda12.8-cudnn9-runtime" -# Entry point for ``uv run`` inside the container. ``uv run`` accepts a file -# path, URL, or command; passing the installed console script name makes UV -# install the deps above (which register the entry point) and exec the CLI. -_UV_RUNNER_SCRIPT = "diffusers-cli" +# Installed console-script name invoked inside the container after the deps land. 
+_CONTAINER_CLI_BINARY = "diffusers-cli" RUN_ID_ENV = "DIFFUSERS_CLI_RUN_ID" @@ -711,9 +709,10 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: flush=True, ) + import shlex import uuid - from huggingface_hub import HfApi, get_token, run_uv_job + from huggingface_hub import HfApi, get_token, run_job hf_token = args.token or get_token() api = HfApi(token=hf_token) @@ -742,11 +741,17 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: flush=True, ) - job = run_uv_job( - script=_UV_RUNNER_SCRIPT, - script_args=_kwargs_to_argv(task, task_kwargs), - dependencies=dependencies, + # Build the in-container shell command: install the small Python deps into the + # image's system Python (where torch + CUDA already live) via ``uv pip install + # --system``, then exec the CLI with the same argv. --break-system-packages + # bypasses PEP 668; safe here because the container is ephemeral. + install_cmd = shlex.join(["uv", "pip", "install", "--system", "--break-system-packages", *dependencies]) + cli_cmd = shlex.join([_CONTAINER_CLI_BINARY, *_kwargs_to_argv(task, task_kwargs)]) + container_cmd = ["sh", "-c", f"{install_cmd} && {cli_cmd}"] + + job = run_job( image=_DEFAULT_REMOTE_IMAGE, + command=container_cmd, flavor=args.flavor, timeout=args.timeout, namespace=args.namespace, From dcfd09c111141cea26a14eef0626c5ea9e8bb6c4 Mon Sep 17 00:00:00 2001 From: DN6 Date: Fri, 12 Jun 2026 19:19:15 +0530 Subject: [PATCH 13/30] update --- src/diffusers/commands/generate.py | 38 ++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index d152582d4cec..5398d3fa114a 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -200,7 +200,14 @@ def _resolve_dtype(name: Optional[str]): return "auto" import torch - mapping = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16} + mapping = { + "fp32": torch.float32, 
+ "float32": torch.float32, + "fp16": torch.float16, + "float16": torch.float16, + "bf16": torch.bfloat16, + "bfloat16": torch.bfloat16, + } if name not in mapping: raise ValueError(f"Unknown dtype: {name}") return mapping[name] @@ -284,8 +291,6 @@ def _apply_optimizations(pipeline: Any, args: Namespace) -> None: def _from_pretrained_kwargs(args: Namespace) -> dict[str, Any]: dtype = _resolve_dtype(args.dtype) - # disable_mmap: mmap-faults over a network-mounted volume trigger one round-trip per page; - # a sequential read is dramatically faster than the random-access mmap pattern. kwargs: dict[str, Any] = {"trust_remote_code": args.trust_remote_code, "disable_mmap": True} if dtype != "auto": kwargs["torch_dtype"] = dtype @@ -355,6 +360,10 @@ def _describe(args: Namespace) -> None: with open(standard_index) as f: index = json.load(f) class_name = index.get("_class_name") + if class_name is None: + raise SystemExit( + f"{diffusers.DiffusionPipeline.config_name} for {args.model!r} has no `_class_name` field." 
+ ) pipeline_cls = getattr(diffusers, class_name, None) if pipeline_cls is None: raise SystemExit( @@ -781,11 +790,30 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: ) final_status = _wait_for_job(api, job.id, args.namespace, args.poll_interval) payload["job_status"] = final_status + payload["timing"] = _job_timing(api, job.id, args.namespace) payload["outputs"] = _download_job_artifacts(api, args.push_to, run_id, args.output) _format_result(args, payload) return True +def _job_timing(api: Any, job_id: str, namespace: Optional[str]) -> dict[str, Optional[float]]: + """Return queue/run/total wallclock seconds for ``job_id`` from inspect_job timestamps.""" + info = api.inspect_job(job_id=job_id, namespace=namespace) + + def _delta(start, end) -> Optional[float]: + return (end - start).total_seconds() if (start is not None and end is not None) else None + + timing = { + "queued_seconds": _delta(info.created_at, info.started_at), + "run_seconds": _delta(info.started_at, info.finished_at), + "total_seconds": _delta(info.created_at, info.finished_at), + } + parts = [f"{k.replace('_seconds', '')}={v:.1f}s" for k, v in timing.items() if v is not None] + if parts: + print(f"[diffusers-cli] timing: {' '.join(parts)}", file=sys.stderr, flush=True) + return timing + + def _wait_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval: float) -> str: """Stream container logs to stderr until the job terminates; return the final stage.""" fetch = getattr(api, "fetch_job_logs", None) @@ -873,7 +901,7 @@ class GenerateCommand(BaseDiffusersCLICommand): def register_subcommand(subparsers: _SubParsersAction) -> None: parser: ArgumentParser = subparsers.add_parser( "generate", - help="Run any diffusers pipeline (standard or modular) by forwarding --pipeline-kwargs verbatim.", + help="Run any diffusers pipeline locally or remotely on HF Jobs.", usage="\n diffusers-cli generate [options]", ) parser._optionals.title = "Options" @@ -941,7 +969,7 @@ def 
run(self) -> None: { "task": self.task, "model": self.args.model, - "device": pipeline.device.type if hasattr(pipeline, "device") else None, + "device": device, "pipeline_class": type(pipeline).__name__, "modular": is_modular, "outputs": saved, From 9515c551da50be398335a5383dec9c7d83afb7d1 Mon Sep 17 00:00:00 2001 From: DN6 Date: Fri, 12 Jun 2026 19:31:45 +0530 Subject: [PATCH 14/30] update --- .ai/skills/diffusers-cli/SKILL.md | 140 ++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 .ai/skills/diffusers-cli/SKILL.md diff --git a/.ai/skills/diffusers-cli/SKILL.md b/.ai/skills/diffusers-cli/SKILL.md new file mode 100644 index 000000000000..ae9c34cdc712 --- /dev/null +++ b/.ai/skills/diffusers-cli/SKILL.md @@ -0,0 +1,140 @@ +--- +name: diffusers-cli +description: > + Use when the user wants to run a diffusers pipeline from a terminal (one-off + generation, batch jobs, smoke-testing a new model), submit jobs to HF Jobs + hardware via `--remote`, or introspect an unknown pipeline's input schema + before calling it. Prefer this over writing ad-hoc Python scripts for + generation tasks. +--- + +## Overview + +`diffusers-cli` is the shipped CLI in `src/diffusers/commands/`. Three subcommands +matter for agentic use: + +| Command | Purpose | +| --- | --- | +| `generate` | Run any `DiffusionPipeline` or `ModularPipeline` by forwarding `--pipeline-kwargs` verbatim. Saves output by sniffing its runtime type. | +| `describe` | Print the input schema (kwarg names + types + defaults + docstring) for a pipeline repo. **No weights downloaded** — only `model_index.json` (or `modular_model_index.json`) is fetched. | +| `custom_blocks` | Package a local `ModularPipelineBlocks` subclass for the Hub. | + +`env` (system info) and `fp16_safetensors` (deprecated) also exist but aren't +relevant to inference. 
+ +## The describe → generate flow + +For any model you haven't called before, run `describe` first to learn its +input contract, then `generate` with the right `--pipeline-kwargs`: + +```bash +# 1. Discover what kwargs the pipeline takes (no weight download) +diffusers-cli describe --model black-forest-labs/FLUX.1-dev --json + +# 2. Run it +diffusers-cli generate \ + --model black-forest-labs/FLUX.1-dev \ + --pipeline-kwargs '{"prompt": "a cat", "num_inference_steps": 30}' \ + --dtype bf16 +``` + +`describe`'s `--json` output is machine-readable: a list of `{name, type_hint, +default, required, description}` entries. Use `--verbose` to additionally parse +the `__call__` docstring's `Args:` block for descriptions on standard pipelines. + +## Standard vs modular detection + +`generate` auto-detects which kind of pipeline it's calling: + +1. If `model_index.json` exists on the repo → `DiffusionPipeline.from_pretrained` path +2. Otherwise → `ModularPipeline.from_pretrained` path + +You don't need to tell it which. Modular repos must pass `--trust-remote-code` +if they ship custom block code. + +## `--pipeline-kwargs` semantics + +A JSON object passed straight through to `pipeline(**kwargs)`. String values at +known image-input keys (`image`, `mask_image`, `control_image`, +`ip_adapter_image`, `image_2`) are auto-loaded as PIL images, so you can pass +URLs or local paths directly: + +```bash +diffusers-cli generate \ + --model stabilityai/stable-diffusion-xl-refiner-1.0 \ + --pipeline-kwargs '{ + "image": "https://example.com/cat.png", + "prompt": "a photorealistic cat", + "strength": 0.6 + }' +``` + +**Shell-quoting gotcha**: the JSON must be on one line (or use `\` to +line-continue). A literal newline inside the single-quoted argument lands as a +raw control char inside the string and breaks `json.loads`. 
+ +## Output handling + +`generate` sniffs the pipeline return type and saves accordingly: + +- `PIL.Image` / list of them → `outputs/generate-.png` +- Frame sequence (≥2 PILs or ndarrays) → `outputs/generate-0.mp4` (uses `--fps`, default 8) +- Numpy audio array → `outputs/generate-0.wav` (uses `--sampling-rate`) +- Anything else → JSON dump + +Override the destination with `--output ` (file or directory). + +Use `--push-to /` to upload outputs to an HF bucket after saving. +The bucket is created if it doesn't exist; objects land under +`/`. + +## Remote execution (`--remote`) + +Adds `--remote` to submit the same call as a Hugging Face Job: + +```bash +diffusers-cli generate \ + --model black-forest-labs/FLUX.1-dev \ + --pipeline-kwargs '{"prompt": "a cat"}' \ + --remote --flavor a100-large +``` + +What happens: + +1. Token is read from `args.token` or `huggingface_hub.get_token()`. +2. A bucket (`/jobs-artifacts` by default) is auto-created. +3. Job is submitted to HF Jobs via `run_job` with the pytorch image + (`pytorch/pytorch:2.10.0-cuda12.8-cudnn9-runtime`) so torch + CUDA are + preinstalled. +4. Container runs `uv pip install --system --break-system-packages + && diffusers-cli generate ...` — only ~50 MB of deps install + because torch already lives in the image's site-packages. +5. The CLI streams the container's logs to stderr until the job terminates, + then downloads any files the job uploaded to the bucket under its `run_id` + prefix. +6. A timing breakdown (`queued_seconds`, `run_seconds`, `total_seconds`) is + printed and added to the JSON payload. + +Use `--no-wait` to submit and immediately return the job id without streaming +logs. Use `--namespace` to run under a different account. + +## `--json` machine-readable mode + +All subcommands accept `--json` to emit a single JSON object on stdout instead +of human-readable text. Use this when an agent needs to parse the result — +output paths, timing, pushed-bucket URIs, etc. 
+ +## When NOT to use this skill + +- Multi-stage workflows where you need intermediate tensor manipulation between + pipelines → write Python. +- Training or fine-tuning → CLI only covers inference. +- Anything requiring custom `device_map`, `quantization_config`, or other + low-level loader knobs not exposed by the CLI flags → write Python. + +## Verifying the CLI is installed + +The console entry point lives in `pyproject.toml` (`diffusers-cli = +"diffusers.commands.diffusers_cli:main"`). If `diffusers-cli` is not on PATH +after `pip install -e .`, reinstall with `pip install -e . --force-reinstall +--no-deps` and check `which diffusers-cli`. From 404be8a381f50ac1ff2fc5ec99bbf9bb5478d7e5 Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 15 Jun 2026 14:30:16 +0530 Subject: [PATCH 15/30] update --- src/diffusers/commands/generate.py | 57 ++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index 5398d3fa114a..1b9c98ebe552 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -131,7 +131,7 @@ def _add_optimization_arguments(parser: ArgumentParser) -> None: action="store_true", help=( "Enable Ulysses-style context parallelism (ulysses_anything mode). " - "Requires launching the CLI under torchrun with ≥2 GPUs." + "Requires a DiT-based pipeline and launching the CLI under torchrun with ≥2 GPUs." 
), ) @@ -242,15 +242,23 @@ def _map_to_device(pipeline: Any, args: Namespace, device: str) -> Any: return pipeline -def _set_attention_backend(pipeline: Any, backend: str) -> None: +def _denoiser(pipeline: Any) -> Optional[Any]: + """Return the pipeline's denoiser submodule (transformer or unet) or None.""" for attr in ("transformer", "unet"): module = getattr(pipeline, attr, None) - if module is not None and hasattr(module, "set_attention_backend"): - try: - module.set_attention_backend(backend) - except (ValueError, ImportError, RuntimeError): - pass - return + if module is not None: + return module + return None + + +def _set_attention_backend(pipeline: Any, backend: str) -> None: + module = _denoiser(pipeline) + if module is None or not hasattr(module, "set_attention_backend"): + return + try: + module.set_attention_backend(backend) + except (ValueError, ImportError, RuntimeError): + pass def _enable_context_parallel(pipeline: Any) -> None: @@ -263,18 +271,22 @@ def _enable_context_parallel(pipeline: Any) -> None: "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli generate ...`." ) + transformer = getattr(pipeline, "transformer", None) + if transformer is None or not hasattr(transformer, "enable_parallelism"): + raise SystemExit( + "--context-parallel requires a DiT-based pipeline. " + f"{type(pipeline).__name__} does not expose a `transformer` with `enable_parallelism`." 
+ ) + from diffusers import ContextParallelConfig - cfg = ContextParallelConfig( - ulysses_degree=torch.distributed.get_world_size(), - ring_degree=1, - ulysses_anything=True, + transformer.enable_parallelism( + config=ContextParallelConfig( + ulysses_degree=torch.distributed.get_world_size(), + ring_degree=1, + ulysses_anything=True, + ) ) - for attr in ("transformer", "unet"): - module = getattr(pipeline, attr, None) - if module is not None and hasattr(module, "enable_parallelism"): - module.enable_parallelism(config=cfg) - return def _apply_optimizations(pipeline: Any, args: Namespace) -> None: @@ -754,8 +766,15 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: # image's system Python (where torch + CUDA already live) via ``uv pip install # --system``, then exec the CLI with the same argv. --break-system-packages # bypasses PEP 668; safe here because the container is ephemeral. + # For --context-parallel, wrap with torchrun so torch.distributed initializes + # across every visible GPU before our generate command runs. 
install_cmd = shlex.join(["uv", "pip", "install", "--system", "--break-system-packages", *dependencies]) - cli_cmd = shlex.join([_CONTAINER_CLI_BINARY, *_kwargs_to_argv(task, task_kwargs)]) + cli_argv = _kwargs_to_argv(task, task_kwargs) + if args.context_parallel: + cli_argv = ["torchrun", "--nproc-per-node=gpu", "-m", "diffusers.commands.diffusers_cli", *cli_argv] + else: + cli_argv = [_CONTAINER_CLI_BINARY, *cli_argv] + cli_cmd = shlex.join(cli_argv) container_cmd = ["sh", "-c", f"{install_cmd} && {cli_cmd}"] job = run_job( @@ -901,7 +920,7 @@ class GenerateCommand(BaseDiffusersCLICommand): def register_subcommand(subparsers: _SubParsersAction) -> None: parser: ArgumentParser = subparsers.add_parser( "generate", - help="Run any diffusers pipeline locally or remotely on HF Jobs.", + help="Run any diffusers pipeline locally or remotely with HF Jobs.", usage="\n diffusers-cli generate [options]", ) parser._optionals.title = "Options" From f3fa589f53f2ca8e5f22195561ede8fd970c90b2 Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 15 Jun 2026 14:49:08 +0530 Subject: [PATCH 16/30] update --- src/diffusers/commands/generate.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index 1b9c98ebe552..20f081a008c3 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -219,6 +219,13 @@ def _resolve_device(name: Optional[str]) -> str: import torch if torch.cuda.is_available(): + # Under torchrun, LOCAL_RANK identifies this process's assigned GPU. + # Without this pin every rank falls back to cuda:0 and OOMs because the + # whole pipeline gets replicated onto a single device. 
+ local_rank = os.environ.get("LOCAL_RANK") + if local_rank is not None: + torch.cuda.set_device(int(local_rank)) + return f"cuda:{local_rank}" return "cuda" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): return "mps" From 633461dc76af2c6ad1924b854f96a58d44ec6c60 Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 15 Jun 2026 14:57:37 +0530 Subject: [PATCH 17/30] update --- src/diffusers/commands/generate.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index 20f081a008c3..b8ee139fa17b 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -271,12 +271,20 @@ def _set_attention_backend(pipeline: Any, backend: str) -> None: def _enable_context_parallel(pipeline: Any) -> None: import torch - if not torch.distributed.is_available() or not torch.distributed.is_initialized(): - raise SystemExit( - "--context-parallel requires torch.distributed to be initialized. " - "Launch the CLI under torchrun, e.g.: " - "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli generate ...`." - ) + if not torch.distributed.is_available(): + raise SystemExit("--context-parallel requires a torch build with distributed support.") + + if not torch.distributed.is_initialized(): + # torchrun sets RANK/WORLD_SIZE/LOCAL_RANK/MASTER_* env vars but does not call + # init_process_group on our behalf — do it here. If those env vars are absent the + # process wasn't launched under torchrun, so point the user at the right command. + if "LOCAL_RANK" not in os.environ: + raise SystemExit( + "--context-parallel requires torch.distributed to be initialized. " + "Launch the CLI under torchrun, e.g.: " + "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli generate ...`." 
+ ) + torch.distributed.init_process_group(backend="nccl") transformer = getattr(pipeline, "transformer", None) if transformer is None or not hasattr(transformer, "enable_parallelism"): From 268bae965aa7f878cc776976e773f00da9accecd Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 15 Jun 2026 16:20:53 +0530 Subject: [PATCH 18/30] update --- src/diffusers/commands/generate.py | 66 ++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index b8ee139fa17b..6501984fe2ff 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -284,7 +284,9 @@ def _enable_context_parallel(pipeline: Any) -> None: "Launch the CLI under torchrun, e.g.: " "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli generate ...`." ) - torch.distributed.init_process_group(backend="nccl") + # Hybrid backend: ulysses_anything's per-rank size coordination wants Gloo on CPU + # (avoids H2D/D2H for a tiny int tensor); the main attention all-to-all stays on NCCL. + torch.distributed.init_process_group(backend="cpu:gloo,cuda:nccl") transformer = getattr(pipeline, "transformer", None) if transformer is None or not hasattr(transformer, "enable_parallelism"): @@ -831,8 +833,19 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: def _job_timing(api: Any, job_id: str, namespace: Optional[str]) -> dict[str, Optional[float]]: - """Return queue/run/total wallclock seconds for ``job_id`` from inspect_job timestamps.""" + """Return queue/run/total wallclock seconds for ``job_id`` from inspect_job timestamps. + + inspect_job sometimes returns finished_at=None for a few seconds after the container exits + while HF Jobs propagates the terminal state; retry briefly so we don't miss run/total. 
+ """ + import time + info = api.inspect_job(job_id=job_id, namespace=namespace) + for _ in range(5): + if info.finished_at is not None: + break + time.sleep(1.0) + info = api.inspect_job(job_id=job_id, namespace=namespace) def _delta(start, end) -> Optional[float]: return (end - start).total_seconds() if (start is not None and end is not None) else None @@ -993,22 +1006,33 @@ def run(self) -> None: if generator is not None: call_kwargs["generator"] = generator - result = pipeline(**call_kwargs) - savable = result if is_modular else _result_to_savable(result) - saved = _save_output(savable, self.args, self.task) - pushed = _push_outputs(self.args, saved, self.task) - - _format_result( - self.args, - { - "task": self.task, - "model": self.args.model, - "device": device, - "pipeline_class": type(pipeline).__name__, - "modular": is_modular, - "outputs": saved, - "pushed": pushed, - "seed": self.args.seed, - "output_key": self.args.output_key, - }, - ) + try: + result = pipeline(**call_kwargs) + + # Under torchrun, ranks > 0 produce identical output to rank 0 (CP shards the + # transformer compute but ranks reduce to the same final tensors). Save/push/print + # from rank 0 only to avoid clobbering bucket files 4x and printing 4x. 
+ if os.environ.get("RANK", "0") == "0": + savable = result if is_modular else _result_to_savable(result) + saved = _save_output(savable, self.args, self.task) + pushed = _push_outputs(self.args, saved, self.task) + + _format_result( + self.args, + { + "task": self.task, + "model": self.args.model, + "device": device, + "pipeline_class": type(pipeline).__name__, + "modular": is_modular, + "outputs": saved, + "pushed": pushed, + "seed": self.args.seed, + "output_key": self.args.output_key, + }, + ) + finally: + import torch + + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() From fa7a0a23eabc052f9984898f62919d2c143499d8 Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 15 Jun 2026 17:11:41 +0530 Subject: [PATCH 19/30] update --- src/diffusers/commands/describe.py | 160 +++++++++++++++- src/diffusers/commands/diffusers_cli.py | 15 +- src/diffusers/commands/generate.py | 242 +++++------------------- 3 files changed, 219 insertions(+), 198 deletions(-) diff --git a/src/diffusers/commands/describe.py b/src/diffusers/commands/describe.py index b5c617c10319..8dd240014540 100644 --- a/src/diffusers/commands/describe.py +++ b/src/diffusers/commands/describe.py @@ -21,10 +21,168 @@ from __future__ import annotations +import json from argparse import ArgumentParser, Namespace, _SubParsersAction +from typing import Any, Optional from . import BaseDiffusersCLICommand -from .generate import _describe +from ._common import try_fetch_config +from ._output import OutputFormat, out + + +def _describe(args: Namespace) -> None: + """Print the pipeline's input schema. + + Tries ``DiffusionPipeline.config_name`` (= ``model_index.json``) first; if present, introspects the declared + pipeline class's ``__call__`` signature. Otherwise falls back to ``ModularPipelineBlocks.from_pretrained`` and + reads the block-declared ``inputs``. No weights downloaded either way. 
+ """ + import inspect + + import diffusers + + model_index = try_fetch_config(args, diffusers.DiffusionPipeline.config_name) + if model_index is not None: + with open(model_index) as f: + index = json.load(f) + class_name = index.get("_class_name") + if class_name is None: + raise SystemExit( + f"{diffusers.DiffusionPipeline.config_name} for {args.model!r} has no `_class_name` field." + ) + pipeline_cls = getattr(diffusers, class_name, None) + if pipeline_cls is None: + raise SystemExit( + f"Pipeline class {class_name!r} declared in {diffusers.DiffusionPipeline.config_name} " + "is not exported by the installed diffusers." + ) + + sig = inspect.signature(pipeline_cls.__call__) + descriptions = _parse_docstring_args(pipeline_cls.__call__.__doc__) if args.verbose else {} + schema: list[dict[str, Any]] = [] + for name, param in sig.parameters.items(): + if name == "self": + continue + if param.kind in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD): + continue + has_default = param.default is not inspect.Parameter.empty + schema.append( + { + "name": name, + "type_hint": str(param.annotation) if param.annotation is not inspect.Parameter.empty else None, + "default": param.default if has_default else None, + "required": not has_default, + "description": descriptions.get(name, ""), + } + ) + else: + kwargs: dict[str, Any] = {"trust_remote_code": args.trust_remote_code} + if args.revision: + kwargs["revision"] = args.revision + if args.token: + kwargs["token"] = args.token + try: + blocks = diffusers.ModularPipelineBlocks.from_pretrained(args.model, **kwargs) + except Exception as e: + raise SystemExit( + f"Could not describe {args.model!r}: no {diffusers.DiffusionPipeline.config_name} and " + f"loading as a modular pipeline failed ({type(e).__name__}: {e}). " + "Is this a diffusers pipeline repo? Pass --trust-remote-code if it ships custom block code." 
+ ) from e + + class_name = type(blocks).__name__ + schema = [ + { + "name": p.name, + "type_hint": str(p.type_hint) if p.type_hint is not None else None, + "default": p.default, + "required": p.required, + "description": p.description, + } + for p in blocks.inputs + ] + + if args.json: + out.set_mode(OutputFormat.JSON) + + if out.mode in (OutputFormat.JSON, OutputFormat.AGENT): + # Agents get the structured schema (full payload for JSON, the inputs table for AGENT). + if out.mode == OutputFormat.JSON: + out.dict({"task": "describe", "model": args.model, "pipeline_class": class_name, "inputs": schema}) + else: + out.table(schema, headers=["name", "required", "type_hint", "default", "description"]) + return + + _print_schema(class_name, args.model, schema) + + +def _parse_docstring_args(docstring: Optional[str]) -> dict[str, str]: + """Extract per-argument descriptions from a Google-style ``Args:`` block. + + Returns a ``{name: description}`` mapping. Best-effort — unrecognised formats just yield an empty dict rather than + raising. + """ + if not docstring: + return {} + + import re + + lines = docstring.expandtabs().splitlines() + start = None + section_indent = 0 + for i, line in enumerate(lines): + if line.strip() in ("Args:", "Arguments:", "Parameters:"): + start = i + 1 + section_indent = len(line) - len(line.lstrip()) + break + if start is None: + return {} + + descriptions: dict[str, str] = {} + current_name: Optional[str] = None + current_lines: list[str] = [] + arg_indent: Optional[int] = None + name_pattern = re.compile(r"^(\w+)\s*(?:\([^)]*\))?\s*:?\s*(.*)$") + + def _flush() -> None: + if current_name and current_lines: + descriptions[current_name] = " ".join(s.strip() for s in current_lines).strip() + + for line in lines[start:]: + if not line.strip(): + continue + indent = len(line) - len(line.lstrip()) + # A new top-level section ends the Args block. 
+ if indent <= section_indent and line.strip().endswith(":"): + break + if arg_indent is None: + arg_indent = indent + if indent == arg_indent: + _flush() + current_lines = [] + match = name_pattern.match(line.strip()) + if match: + current_name = match.group(1) + tail = match.group(2).strip() + if tail: + current_lines.append(tail) + else: + current_name = None + elif current_name is not None and indent > arg_indent: + current_lines.append(line.strip()) + _flush() + return descriptions + + +def _print_schema(class_name: str, model: str, schema: list[dict[str, Any]]) -> None: + print(f"{class_name} ({model}) inputs:") + for entry in schema: + tag = "required" if entry["required"] else f"optional, default={entry['default']!r}" + print(f" {entry['name']} ({tag})") + if entry["type_hint"]: + print(f" type: {entry['type_hint']}") + if entry["description"]: + print(f" desc: {entry['description']}") class DescribeCommand(BaseDiffusersCLICommand): diff --git a/src/diffusers/commands/diffusers_cli.py b/src/diffusers/commands/diffusers_cli.py index 09a3a8ab03b7..deca219d90f4 100644 --- a/src/diffusers/commands/diffusers_cli.py +++ b/src/diffusers/commands/diffusers_cli.py @@ -15,6 +15,7 @@ from argparse import ArgumentParser +from ._output import OutputFormat, out from .custom_blocks import CustomBlocksCommand from .describe import DescribeCommand from .env import EnvironmentCommand @@ -25,9 +26,19 @@ def main(): parser = ArgumentParser( prog="diffusers-cli", - usage="\n diffusers-cli [options]", + usage="\n diffusers-cli [--format ] [options]", ) parser._optionals.title = "General Options" + parser.add_argument( + "--format", + choices=[m.value for m in OutputFormat], + default=OutputFormat.AUTO.value, + help=( + "Output format. 'auto' (default) picks 'agent' when an AI coding agent is detected " + "(via CLAUDECODE/CURSOR_AI/AIDER_AI_CONTEXT/... env vars) and 'human' otherwise. " + "Must appear before the subcommand." 
+ ), + ) commands_parser = parser.add_subparsers(title="Commands", metavar="") # Register commands @@ -40,6 +51,8 @@ def main(): # Let's go args = parser.parse_args() + out.set_mode(OutputFormat(args.format)) + if not hasattr(args, "func"): parser.print_help() exit(1) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index 6501984fe2ff..3421f72d3920 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -30,6 +30,8 @@ from diffusers.utils import load_image from . import BaseDiffusersCLICommand +from ._common import try_fetch_config +from ._output import OutputFormat, out # --------------------------------------------------------------------------- @@ -102,6 +104,15 @@ def _add_loading_arguments(parser: ArgumentParser) -> None: parser.add_argument("--revision", default=None, help="Model revision (branch, tag, or commit SHA).") parser.add_argument("--token", default=None, help="Hugging Face token for gated/private models.") parser.add_argument("--trust-remote-code", action="store_true", help="Allow custom code from the Hub.") + parser.add_argument( + "--lora", + default=None, + help=( + "JSON object describing a LoRA adapter to attach after the pipeline loads. " + 'Shape: {"lora_id": "", "lora_scale": }. ' + 'Example: \'{"lora_id": "alvdansen/littletinies", "lora_scale": 0.8}\'.' 
+ ), + ) def _add_optimization_arguments(parser: ArgumentParser) -> None: @@ -244,7 +255,7 @@ def _map_to_device(pipeline: Any, args: Namespace, device: str) -> Any: pipeline.enable_group_offload( onload_device=torch.device(device), offload_type="leaf_level", - use_stream=device.startswith("cuda"), + use_stream=True, ) return pipeline @@ -264,8 +275,11 @@ def _set_attention_backend(pipeline: Any, backend: str) -> None: return try: module.set_attention_backend(backend) - except (ValueError, ImportError, RuntimeError): - pass + except (ValueError, ImportError, RuntimeError) as e: + raise SystemExit( + f"Failed to set attention backend {backend!r}: {type(e).__name__}: {e}. " + f"Allowed backends: {', '.join(ATTENTION_BACKEND_CHOICES)}." + ) from e def _enable_context_parallel(pipeline: Any) -> None: @@ -275,15 +289,6 @@ def _enable_context_parallel(pipeline: Any) -> None: raise SystemExit("--context-parallel requires a torch build with distributed support.") if not torch.distributed.is_initialized(): - # torchrun sets RANK/WORLD_SIZE/LOCAL_RANK/MASTER_* env vars but does not call - # init_process_group on our behalf — do it here. If those env vars are absent the - # process wasn't launched under torchrun, so point the user at the right command. - if "LOCAL_RANK" not in os.environ: - raise SystemExit( - "--context-parallel requires torch.distributed to be initialized. " - "Launch the CLI under torchrun, e.g.: " - "`torchrun --nproc-per-node=N -m diffusers.commands.diffusers_cli generate ...`." - ) # Hybrid backend: ulysses_anything's per-rank size coordination wants Gloo on CPU # (avoids H2D/D2H for a tiny int tensor); the main attention all-to-all stays on NCCL. 
torch.distributed.init_process_group(backend="cpu:gloo,cuda:nccl") @@ -341,189 +346,42 @@ def _load_pipeline(args: Namespace, modular: bool) -> Any: return pipeline pipeline = _map_to_device(pipeline, args, _resolve_device(args.device)) _apply_optimizations(pipeline, args) + _load_lora(pipeline, args) return pipeline -# --------------------------------------------------------------------------- -# Modular pipeline detection + introspection -# --------------------------------------------------------------------------- - +def _load_lora(pipeline: Any, args: Namespace) -> None: + """Attach a LoRA adapter from a JSON spec like ``{"lora_id": "...", "lora_scale": 0.8}``.""" + if not args.lora: + return + try: + spec = json.loads(args.lora) + except json.JSONDecodeError as e: + raise SystemExit(f"--lora must be valid JSON: {e}") from e + if not isinstance(spec, dict): + raise SystemExit("--lora must decode to a JSON object.") + lora_id = spec.get("lora_id") + if not lora_id: + raise SystemExit("--lora must include a 'lora_id' field.") + if not hasattr(pipeline, "load_lora_weights"): + raise SystemExit(f"{type(pipeline).__name__} does not support LoRA loading.") -def _try_fetch_config(args: Namespace, filename: str) -> Optional[str]: - """Try to resolve ``filename`` for ``args.model`` (local path or Hub repo). 
None if absent.""" - local = Path(args.model) - if local.exists(): - candidate = local / filename - return str(candidate) if candidate.exists() else None + pipeline.load_lora_weights(lora_id, adapter_name="default") + scale = spec.get("lora_scale") + if scale is not None and hasattr(pipeline, "set_adapters"): + pipeline.set_adapters(["default"], adapter_weights=[float(scale)]) - from huggingface_hub import hf_hub_download - from huggingface_hub.utils import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError - try: - return hf_hub_download(args.model, filename, revision=args.revision, token=args.token) - except (EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError): - return None +# --------------------------------------------------------------------------- +# Modular pipeline detection + introspection +# --------------------------------------------------------------------------- def _is_modular_repo(args: Namespace) -> bool: """Detect by trying ``DiffusionPipeline.config_name`` first; modular iff that's absent.""" from diffusers import DiffusionPipeline - return _try_fetch_config(args, DiffusionPipeline.config_name) is None - - -def _describe(args: Namespace) -> None: - """Print the pipeline's input schema. - - Tries ``DiffusionPipeline.config_name`` (= ``model_index.json``) first; if present, introspects the declared - pipeline class's ``__call__`` signature. Otherwise falls back to ``ModularPipelineBlocks.from_pretrained`` and - reads the block-declared ``inputs``. No weights downloaded either way. - """ - import inspect - - import diffusers - - standard_index = _try_fetch_config(args, diffusers.DiffusionPipeline.config_name) - - if standard_index is not None: - with open(standard_index) as f: - index = json.load(f) - class_name = index.get("_class_name") - if class_name is None: - raise SystemExit( - f"{diffusers.DiffusionPipeline.config_name} for {args.model!r} has no `_class_name` field." 
- ) - pipeline_cls = getattr(diffusers, class_name, None) - if pipeline_cls is None: - raise SystemExit( - f"Pipeline class {class_name!r} declared in {diffusers.DiffusionPipeline.config_name} " - "is not exported by the installed diffusers." - ) - sig = inspect.signature(pipeline_cls.__call__) - descriptions = _parse_docstring_args(pipeline_cls.__call__.__doc__) if args.verbose else {} - schema: list[dict[str, Any]] = [] - for name, param in sig.parameters.items(): - if name == "self": - continue - if param.kind in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD): - continue - has_default = param.default is not inspect.Parameter.empty - schema.append( - { - "name": name, - "type_hint": str(param.annotation) if param.annotation is not inspect.Parameter.empty else None, - "default": param.default if has_default else None, - "required": not has_default, - "description": descriptions.get(name, ""), - } - ) - else: - kwargs: dict[str, Any] = {"trust_remote_code": args.trust_remote_code} - if args.revision: - kwargs["revision"] = args.revision - if args.token: - kwargs["token"] = args.token - try: - blocks = diffusers.ModularPipelineBlocks.from_pretrained(args.model, **kwargs) - except Exception as e: - raise SystemExit( - f"Could not describe {args.model!r}: no {diffusers.DiffusionPipeline.config_name} and " - f"loading as a modular pipeline failed ({type(e).__name__}: {e}). " - "Is this a diffusers pipeline repo? Pass --trust-remote-code if it ships custom block code." 
- ) from e - class_name = type(blocks).__name__ - schema = [ - { - "name": p.name, - "type_hint": str(p.type_hint) if p.type_hint is not None else None, - "default": p.default, - "required": p.required, - "description": p.description, - } - for p in blocks.inputs - ] - - if args.json: - payload = { - "task": "describe", - "model": args.model, - "pipeline_class": class_name, - "inputs": schema, - } - json.dump(payload, sys.stdout, default=str) - sys.stdout.write("\n") - return - - _print_schema(class_name, args.model, schema) - - -def _parse_docstring_args(docstring: Optional[str]) -> dict[str, str]: - """Extract per-argument descriptions from a Google-style ``Args:`` block. - - Returns a ``{name: description}`` mapping. Best-effort — unrecognised formats just yield an empty dict rather than - raising. - """ - if not docstring: - return {} - - import re - - lines = docstring.expandtabs().splitlines() - start = None - section_indent = 0 - for i, line in enumerate(lines): - if line.strip() in ("Args:", "Arguments:", "Parameters:"): - start = i + 1 - section_indent = len(line) - len(line.lstrip()) - break - if start is None: - return {} - - descriptions: dict[str, str] = {} - current_name: Optional[str] = None - current_lines: list[str] = [] - arg_indent: Optional[int] = None - name_pattern = re.compile(r"^(\w+)\s*(?:\([^)]*\))?\s*:?\s*(.*)$") - - def _flush() -> None: - if current_name and current_lines: - descriptions[current_name] = " ".join(s.strip() for s in current_lines).strip() - - for line in lines[start:]: - if not line.strip(): - continue - indent = len(line) - len(line.lstrip()) - # A new top-level section ends the Args block. 
- if indent <= section_indent and line.strip().endswith(":"): - break - if arg_indent is None: - arg_indent = indent - if indent == arg_indent: - _flush() - current_lines = [] - match = name_pattern.match(line.strip()) - if match: - current_name = match.group(1) - tail = match.group(2).strip() - if tail: - current_lines.append(tail) - else: - current_name = None - elif current_name is not None and indent > arg_indent: - current_lines.append(line.strip()) - _flush() - return descriptions - - -def _print_schema(class_name: str, model: str, schema: list[dict[str, Any]]) -> None: - print(f"{class_name} ({model}) inputs:") - for entry in schema: - tag = "required" if entry["required"] else f"optional, default={entry['default']!r}" - print(f" {entry['name']} ({tag})") - if entry["type_hint"]: - print(f" type: {entry['type_hint']}") - if entry["description"]: - print(f" desc: {entry['description']}") + return try_fetch_config(args, DiffusionPipeline.config_name) is None # --------------------------------------------------------------------------- @@ -835,8 +693,8 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: def _job_timing(api: Any, job_id: str, namespace: Optional[str]) -> dict[str, Optional[float]]: """Return queue/run/total wallclock seconds for ``job_id`` from inspect_job timestamps. - inspect_job sometimes returns finished_at=None for a few seconds after the container exits - while HF Jobs propagates the terminal state; retry briefly so we don't miss run/total. + inspect_job sometimes returns finished_at=None for a few seconds after the container exits while HF Jobs propagates + the terminal state; retry briefly so we don't miss run/total. """ import time @@ -922,18 +780,10 @@ def _download_job_artifacts(api: Any, bucket_id: str, run_id: str, output: Optio def _format_result(args: Namespace, payload: dict[str, Any]) -> None: - """Print either a human-friendly summary or JSON, depending on --json.""" + """Route the result payload through ``out``. 
``--json`` escalates the mode regardless of --format.""" if args.json: - json.dump(payload, sys.stdout, default=str) - sys.stdout.write("\n") - return - - outputs = payload.get("outputs", []) - if outputs: - for path in outputs: - print(path) - else: - print(payload) + out.set_mode(OutputFormat.JSON) + out.result(payload.get("task", "done"), **payload) # --------------------------------------------------------------------------- From 55e1c1433cd781814b40a894ba930ac0289499a6 Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 15 Jun 2026 17:13:48 +0530 Subject: [PATCH 20/30] update --- src/diffusers/commands/_common.py | 44 ++++++++ src/diffusers/commands/_output.py | 182 ++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 src/diffusers/commands/_common.py create mode 100644 src/diffusers/commands/_output.py diff --git a/src/diffusers/commands/_common.py b/src/diffusers/commands/_common.py new file mode 100644 index 000000000000..df242628841d --- /dev/null +++ b/src/diffusers/commands/_common.py @@ -0,0 +1,44 @@ +# Copyright 2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Shared helpers used by multiple ``diffusers-cli`` subcommands. + +Anything imported by more than one command file lives here so command modules stay standalone — no cross-command +imports between e.g. ``describe`` and ``generate``. 
+""" + +from __future__ import annotations + +from argparse import Namespace +from pathlib import Path +from typing import Optional + + +def try_fetch_config(args: Namespace, filename: str) -> Optional[str]: + """Resolve ``filename`` for ``args.model`` (local path or Hub repo). Return None if absent. + + Used by ``generate`` (to detect modular vs standard pipelines) and ``describe`` (to read the pipeline class for + schema introspection) — no weights are downloaded, only the small index file. + """ + local = Path(args.model) + if local.exists(): + candidate = local / filename + return str(candidate) if candidate.exists() else None + + from huggingface_hub import hf_hub_download + from huggingface_hub.utils import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError + + try: + return hf_hub_download(args.model, filename, revision=args.revision, token=args.token) + except (EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError): + return None diff --git a/src/diffusers/commands/_output.py b/src/diffusers/commands/_output.py new file mode 100644 index 000000000000..1200737ec696 --- /dev/null +++ b/src/diffusers/commands/_output.py @@ -0,0 +1,182 @@ +# Copyright 2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Dual-audience output sink for ``diffusers-cli``. + +Every subcommand routes user-visible output through the singleton ``out``. 
The mode is one of ``human`` (default for +terminals), ``agent`` (auto-selected when an AI coding agent is detected), ``json`` (machine-parseable), or ``quiet`` +(first value per record). The set of methods on ``out`` covers the shapes our commands actually produce — free-form +text, key/value results, structured dicts, and tabular schemas — so leaf commands never branch on ``args.json`` +themselves. +""" + +from __future__ import annotations + +import json +import os +import sys +from enum import Enum +from typing import Any, Optional, Sequence + + +# Environment variables set by known AI coding agents. Presence of any one triggers AGENT mode +# under `--format auto`. Subset of the huggingface_hub harness registry — extend as needed. +_AGENT_ENV_VARS = ( + "CLAUDECODE", # Claude Code + "CLAUDE_CODE", # alt spelling + "CURSOR_AI", # Cursor + "AIDER_AI_CONTEXT", # Aider + "GH_COPILOT_AGENT", # GitHub Copilot Agent +) + + +def is_agent() -> bool: + """Return True if the process appears to be invoked by an AI coding agent.""" + return any(os.environ.get(v) for v in _AGENT_ENV_VARS) + + +class OutputFormat(str, Enum): + AUTO = "auto" + HUMAN = "human" + AGENT = "agent" + JSON = "json" + QUIET = "quiet" + + +class Output: + """Singleton output sink. Resolve mode once at startup, then call ``out.``.""" + + mode: OutputFormat + + def __init__(self) -> None: + self.set_mode(OutputFormat.AUTO) + + def set_mode(self, mode: OutputFormat) -> None: + """Set the active output mode. AUTO resolves to AGENT or HUMAN via ``is_agent()``.""" + if mode == OutputFormat.AUTO: + mode = OutputFormat.AGENT if is_agent() else OutputFormat.HUMAN + self.mode = mode + + # ------------------------------------------------------------------ stdout + + def text(self, msg: str) -> None: + """Free-form line. 
Suppressed in QUIET; printed plain in every other mode.""" + if self.mode == OutputFormat.QUIET: + return + print(msg) + + def dict(self, data: dict[str, Any]) -> None: + """Structured object — JSON in every mode (indented for HUMAN, compact otherwise). + + Use for payloads that don't decompose cleanly into key/value pairs (e.g. describe schemas). + """ + if self.mode == OutputFormat.QUIET: + return + indent = 2 if self.mode == OutputFormat.HUMAN else None + print(json.dumps(data, indent=indent, default=str)) + + def result(self, message: str, **data: Any) -> None: + """Success summary. + + - HUMAN: ``message`` followed by `` key: value`` lines. + - AGENT: ``key=value`` pairs space-separated on one line (TSV-ish, parser-friendly). + - JSON: compact JSON of ``data``. + - QUIET: first non-None value. + """ + if self.mode == OutputFormat.HUMAN: + print(message) + for k, v in data.items(): + if v is not None: + print(f" {k}: {v}") + elif self.mode == OutputFormat.AGENT: + parts = [f"{k}={v}" for k, v in data.items() if v is not None] + print(" ".join(parts) if parts else message) + elif self.mode == OutputFormat.JSON: + print(json.dumps(data, default=str)) + elif self.mode == OutputFormat.QUIET: + for v in data.values(): + if v is not None: + print(v) + return + + def table( + self, + items: Sequence[dict[str, Any]], + *, + headers: Optional[list[str]] = None, + id_key: Optional[str] = None, + ) -> None: + """Tabular data — HUMAN gets padded columns, AGENT gets TSV, JSON gets the list, QUIET gets id_key. + + Headers default to the keys of the first item. 
+ """ + if not items: + if self.mode in (OutputFormat.HUMAN, OutputFormat.AGENT): + print("No results.") + elif self.mode == OutputFormat.JSON: + print("[]") + return + + if headers is None: + headers = list(items[0].keys()) + + if self.mode == OutputFormat.JSON: + print(json.dumps(list(items), default=str)) + return + + if self.mode == OutputFormat.QUIET: + key = id_key or headers[0] + for item in items: + value = item.get(key) + if value is not None: + print(value) + return + + rows = [[_cell(item.get(h)) for h in headers] for item in items] + if self.mode == OutputFormat.AGENT: + print("\t".join(headers)) + for row in rows: + print("\t".join(row)) + return + + # HUMAN: pad each column to its widest cell for readable alignment. + widths = [max(len(h), *(len(r[i]) for r in rows)) for i, h in enumerate(headers)] + print(" ".join(h.ljust(widths[i]) for i, h in enumerate(headers))) + for row in rows: + print(" ".join(c.ljust(widths[i]) for i, c in enumerate(row))) + + # ------------------------------------------------------------------ stderr + + def hint(self, message: str) -> None: + """Next-step suggestion. Always goes to stderr so it never pollutes parseable stdout.""" + if self.mode == OutputFormat.QUIET: + return + print(f"Hint: {message}", file=sys.stderr) + + def warning(self, message: str) -> None: + """Non-fatal warning — stderr, every mode.""" + print(f"Warning: {message}", file=sys.stderr) + + def error(self, message: str) -> None: + """Error — stderr, every mode.""" + print(f"Error: {message}", file=sys.stderr) + + +def _cell(value: Any) -> str: + if value is None: + return "" + return str(value) + + +# Module-level singleton imported by every subcommand. 
+out = Output() From 6ba7a3fd50223385fb32ce6e58e39651f4395cd5 Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 15 Jun 2026 17:17:19 +0530 Subject: [PATCH 21/30] update --- src/diffusers/commands/generate.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index 3421f72d3920..523e3a55e6c8 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -87,7 +87,17 @@ # Namespace keys that control *how* a remote job runs locally, not what runs # inside the container. They are stripped when forwarding argv to the container. HF_JOBS_KEYS = frozenset( - {"remote", "flavor", "timeout", "dependencies", "namespace", "no_wait", "poll_interval", "func"} + { + "remote", + "flavor", + "timeout", + "dependencies", + "namespace", + "no_wait", + "poll_interval", + "func", + "format", # top-level --format is a local rendering flag; never forward to the container + } ) From 6f02aedc823af55980b6df0ca979924e68f69c11 Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 15 Jun 2026 17:23:44 +0530 Subject: [PATCH 22/30] update --- src/diffusers/commands/generate.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index 523e3a55e6c8..e60e088da79f 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -186,7 +186,11 @@ def _add_remote_arguments(parser: ArgumentParser) -> None: default="a10g-small", help="HF Jobs hardware flavor for --remote (e.g. a10g-small, a100-large, cpu-basic).", ) - parser.add_argument("--timeout", default=None, help="HF Jobs timeout for --remote (e.g. 30m, 2h).") + parser.add_argument( + "--timeout", + default="10m", + help="HF Jobs timeout for --remote (e.g. 30m, 2h). 
Defaults to 10m.", + ) parser.add_argument( "--dependencies", action="append", @@ -247,9 +251,12 @@ def _resolve_device(name: Optional[str]) -> str: if local_rank is not None: torch.cuda.set_device(int(local_rank)) return f"cuda:{local_rank}" + return "cuda" + if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): return "mps" + return "cpu" From 889f6460314d80eb9e060302b49c8d8d054cf4ff Mon Sep 17 00:00:00 2001 From: DN6 Date: Mon, 15 Jun 2026 17:55:55 +0530 Subject: [PATCH 23/30] update --- .ai/skills/diffusers-cli/SKILL.md | 149 +++++++----------------------- 1 file changed, 34 insertions(+), 115 deletions(-) diff --git a/.ai/skills/diffusers-cli/SKILL.md b/.ai/skills/diffusers-cli/SKILL.md index ae9c34cdc712..b2aedd8fca16 100644 --- a/.ai/skills/diffusers-cli/SKILL.md +++ b/.ai/skills/diffusers-cli/SKILL.md @@ -3,138 +3,57 @@ name: diffusers-cli description: > Use when the user wants to run a diffusers pipeline from a terminal (one-off generation, batch jobs, smoke-testing a new model), submit jobs to HF Jobs - hardware via `--remote`, or introspect an unknown pipeline's input schema - before calling it. Prefer this over writing ad-hoc Python scripts for - generation tasks. + hardware via `--remote`, introspect a pipeline's input schema before + calling it, or attach a LoRA at inference time. Prefer this over writing + ad-hoc Python scripts for generation tasks. --- ## Overview -`diffusers-cli` is the shipped CLI in `src/diffusers/commands/`. Three subcommands -matter for agentic use: +`diffusers-cli` is the shipped CLI in `src/diffusers/commands/`. Subcommands relevant to agentic use: -| Command | Purpose | -| --- | --- | -| `generate` | Run any `DiffusionPipeline` or `ModularPipeline` by forwarding `--pipeline-kwargs` verbatim. Saves output by sniffing its runtime type. | -| `describe` | Print the input schema (kwarg names + types + defaults + docstring) for a pipeline repo. 
**No weights downloaded** — only `model_index.json` (or `modular_model_index.json`) is fetched. | -| `custom_blocks` | Package a local `ModularPipelineBlocks` subclass for the Hub. | +| Command | Purpose | +| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `generate` | Run any `DiffusionPipeline` or `ModularPipeline`. Forwards `--pipeline-kwargs` verbatim, saves output by sniffing its runtime type, optionally runs on HF Jobs via `--remote`. | +| `describe` | Print the input schema for a pipeline repo (kwarg names, types, defaults, descriptions). **No weights downloaded** — only the small index file. | +| `custom_blocks` | Package a local `ModularPipelineBlocks` subclass for the Hub. | +| `env` | Print versions of diffusers + torch + transformers + accelerate + safetensors + CUDA + GPU info. Use when investigating environment issues, dtype/precision support, or building bug reports. | -`env` (system info) and `fp16_safetensors` (deprecated) also exist but aren't -relevant to inference. +`fp16_safetensors` is also shipped but deprecated and not relevant to inference. -## The describe → generate flow +## When to read which file -For any model you haven't called before, run `describe` first to learn its -input contract, then `generate` with the right `--pipeline-kwargs`: +Most agentic work goes through `generate`. Read the matching reference file before constructing a command: -```bash -# 1. Discover what kwargs the pipeline takes (no weight download) -diffusers-cli describe --model black-forest-labs/FLUX.1-dev --json - -# 2. 
Run it -diffusers-cli generate \ - --model black-forest-labs/FLUX.1-dev \ - --pipeline-kwargs '{"prompt": "a cat", "num_inference_steps": 30}' \ - --dtype bf16 -``` - -`describe`'s `--json` output is machine-readable: a list of `{name, type_hint, -default, required, description}` entries. Use `--verbose` to additionally parse -the `__call__` docstring's `Args:` block for descriptions on standard pipelines. - -## Standard vs modular detection - -`generate` auto-detects which kind of pipeline it's calling: - -1. If `model_index.json` exists on the repo → `DiffusionPipeline.from_pretrained` path -2. Otherwise → `ModularPipeline.from_pretrained` path +- **[`generate.md`](generate.md)** — full reference for `diffusers-cli generate`. Covers `--pipeline-kwargs` + semantics and the shell-quoting gotcha, LoRA via `--lora`, optimization flags (`--dtype`, `--cpu-offload`, + `--attention-backend`, `--vae-tiling/slicing`), output handling and `--push-to` bucket uploads, the full + `--remote` HF Jobs flow (image, container command, log streaming, timing payload, artifact download), and + context parallel (`--context-parallel`) for both local-torchrun and `--remote` paths. -You don't need to tell it which. Modular repos must pass `--trust-remote-code` -if they ship custom block code. - -## `--pipeline-kwargs` semantics - -A JSON object passed straight through to `pipeline(**kwargs)`. 
String values at -known image-input keys (`image`, `mask_image`, `control_image`, -`ip_adapter_image`, `image_2`) are auto-loaded as PIL images, so you can pass -URLs or local paths directly: +The other commands are small enough that `diffusers-cli --help` is the canonical reference: ```bash -diffusers-cli generate \ - --model stabilityai/stable-diffusion-xl-refiner-1.0 \ - --pipeline-kwargs '{ - "image": "https://example.com/cat.png", - "prompt": "a photorealistic cat", - "strength": 0.6 - }' +diffusers-cli describe --help +diffusers-cli custom_blocks --help +diffusers-cli env --help ``` -**Shell-quoting gotcha**: the JSON must be on one line (or use `\` to -line-continue). A literal newline inside the single-quoted argument lands as a -raw control char inside the string and breaks `json.loads`. - -## Output handling - -`generate` sniffs the pipeline return type and saves accordingly: - -- `PIL.Image` / list of them → `outputs/generate-.png` -- Frame sequence (≥2 PILs or ndarrays) → `outputs/generate-0.mp4` (uses `--fps`, default 8) -- Numpy audio array → `outputs/generate-0.wav` (uses `--sampling-rate`) -- Anything else → JSON dump - -Override the destination with `--output ` (file or directory). - -Use `--push-to /` to upload outputs to an HF bucket after saving. -The bucket is created if it doesn't exist; objects land under -`/`. - -## Remote execution (`--remote`) - -Adds `--remote` to submit the same call as a Hugging Face Job: - -```bash -diffusers-cli generate \ - --model black-forest-labs/FLUX.1-dev \ - --pipeline-kwargs '{"prompt": "a cat"}' \ - --remote --flavor a100-large -``` - -What happens: - -1. Token is read from `args.token` or `huggingface_hub.get_token()`. -2. A bucket (`/jobs-artifacts` by default) is auto-created. -3. Job is submitted to HF Jobs via `run_job` with the pytorch image - (`pytorch/pytorch:2.10.0-cuda12.8-cudnn9-runtime`) so torch + CUDA are - preinstalled. -4. 
Container runs `uv pip install --system --break-system-packages - && diffusers-cli generate ...` — only ~50 MB of deps install - because torch already lives in the image's site-packages. -5. The CLI streams the container's logs to stderr until the job terminates, - then downloads any files the job uploaded to the bucket under its `run_id` - prefix. -6. A timing breakdown (`queued_seconds`, `run_seconds`, `total_seconds`) is - printed and added to the JSON payload. - -Use `--no-wait` to submit and immediately return the job id without streaming -logs. Use `--namespace` to run under a different account. - -## `--json` machine-readable mode - -All subcommands accept `--json` to emit a single JSON object on stdout instead -of human-readable text. Use this when an agent needs to parse the result — -output paths, timing, pushed-bucket URIs, etc. - ## When NOT to use this skill -- Multi-stage workflows where you need intermediate tensor manipulation between - pipelines → write Python. +- Multi-stage workflows where you need intermediate tensor manipulation between pipelines → write Python. - Training or fine-tuning → CLI only covers inference. -- Anything requiring custom `device_map`, `quantization_config`, or other - low-level loader knobs not exposed by the CLI flags → write Python. +- Anything requiring custom `device_map`, `quantization_config`, or other low-level loader knobs not exposed by + the CLI flags → write Python. ## Verifying the CLI is installed -The console entry point lives in `pyproject.toml` (`diffusers-cli = -"diffusers.commands.diffusers_cli:main"`). If `diffusers-cli` is not on PATH -after `pip install -e .`, reinstall with `pip install -e . --force-reinstall ---no-deps` and check `which diffusers-cli`. +The console entry point is registered in `pyproject.toml` (`diffusers-cli = +"diffusers.commands.diffusers_cli:main"`). If `diffusers-cli` is not on PATH after `pip install -e .`, reinstall +with `pip install -e . 
--force-reinstall --no-deps` and check `which diffusers-cli`. If the installed binary is +missing recent features (e.g. you see `unrecognized arguments: --lora`), reinstall. + +## Full user guide + +For a non-agent overview with troubleshooting and tips, see [`DIFFUSERS_CLI.md`](../../../DIFFUSERS_CLI.md) at +the repo root. From ab70d692713eb728898d91117dafbd4e20fbee23 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 16 Jun 2026 12:46:10 +0530 Subject: [PATCH 24/30] pdate --- .ai/skills/diffusers-cli/generate.md | 182 +++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) create mode 100644 .ai/skills/diffusers-cli/generate.md diff --git a/.ai/skills/diffusers-cli/generate.md b/.ai/skills/diffusers-cli/generate.md new file mode 100644 index 000000000000..e71ba56f5590 --- /dev/null +++ b/.ai/skills/diffusers-cli/generate.md @@ -0,0 +1,182 @@ +# `diffusers-cli generate` — reference + +Full surface for `diffusers-cli generate`. Use this file as the source of truth when constructing a `generate` +invocation. The top-level [`SKILL.md`](SKILL.md) covers when to use the CLI; this file covers how. + +## The describe → generate flow + +For any model you haven't called before, run `describe` first to learn its input contract, then `generate` with +the right `--pipeline-kwargs`: + +```bash +# 1. Discover what kwargs the pipeline takes (no weight download) +diffusers-cli --format json describe --model black-forest-labs/FLUX.2-klein-9B + +# 2. Run it +diffusers-cli generate \ + --model black-forest-labs/FLUX.2-klein-9B \ + --pipeline-kwargs '{"prompt": "Make the cats fur grey", "image": "https://blobcdn.same.energy/a/d0/58/d058b51c2329b0ea4057e9f12cd9a1da36347e34"}' \ + --dtype bf16 +``` + +`describe --format json` emits a `{task, model, pipeline_class, inputs[]}` payload where each input is +`{name, type_hint, default, required, description}`. + +## Standard vs modular detection + +`generate` auto-detects which kind of pipeline it's calling: + +1. 
If `model_index.json` exists on the repo → `DiffusionPipeline.from_pretrained` path. +2. Otherwise → `ModularPipeline.from_pretrained` path. + +You don't need to tell it which. Modular repos must pass `--trust-remote-code` if they ship custom block code. + +## `--pipeline-kwargs` semantics + +A JSON object passed straight through to `pipeline(**kwargs)`. String values at known image-input keys (`image`, +`mask_image`, `control_image`, `ip_adapter_image`, `image_2`) are auto-loaded as PIL images, so you can pass URLs +or local paths directly: + +```bash +diffusers-cli generate \ + --model black-forest-labs/FLUX.2-klein-9B \ + --pipeline-kwargs '{"image": "https://example.com/cat.png", "prompt": "make the fur grey", "strength": 0.6}' +``` + +**Shell-quoting gotcha**: the JSON must be on one line (or use `\` to line-continue). A literal newline inside the +single-quoted argument lands as a raw control char inside the string and breaks `json.loads`. + +## LoRA adapters (`--lora`) + +Attach a LoRA after the pipeline loads via a JSON spec: + +```bash +diffusers-cli generate \ + --model black-forest-labs/FLUX.2-klein-9B \ + --pipeline-kwargs '{"prompt": "a tiny grey cat"}' \ + --lora '{"lora_id": "alvdansen/littletinies", "lora_scale": 0.8}' +``` + +Calls `pipeline.load_lora_weights(<lora_id>, adapter_name="default")` and, if `lora_scale` is present, +`pipeline.set_adapters(["default"], adapter_weights=[<lora_scale>])`. Errors clearly if the pipeline doesn't support +LoRA or `lora_id` is missing. + +## Optimization flags + +- `--dtype {auto, bf16, fp16, fp32, …}` — pipeline weight dtype. `bf16` is the right default for modern DiTs on + A100/H100. +- `--cpu-offload {model, group}` — `model` uses `enable_model_cpu_offload`, `group` uses + `enable_group_offload(offload_type="leaf_level", use_stream=True)`. Use `group` to fit a 9B+ model on a single A100. +- `--attention-backend {default, flash_hub, flash_varlen_hub, flash_4_hub, sage_hub}` — hub-hosted kernels, + auto-downloaded on first use. 
Failures (kernel not available, CUDA arch mismatch, network) raise a clear + `SystemExit` listing the alternatives instead of silently reverting to the default. +- `--vae-tiling` / `--vae-slicing` — lower peak VAE decode VRAM. +- `--context-parallel` — Ulysses-style context parallelism on a DiT. See [Context parallel](#context-parallel) below. + +`disable_mmap=True` is always passed to `from_pretrained` — sequential reads are faster than mmap page-faults on +most filesystems. + +## Output handling + +`generate` sniffs the pipeline return type and saves accordingly: + +- `PIL.Image` / list of them → `outputs/generate-<i>.png` +- Frame sequence (≥2 PILs or ndarrays) → `outputs/generate-0.mp4` (uses `--fps`, default 8) +- Numpy audio array → `outputs/generate-0.wav` (uses `--sampling-rate`) +- Anything else → JSON dump + +Override the destination with `--output <path>` (file or directory). + +Use `--push-to <namespace>/<bucket>` to upload outputs to an HF bucket after saving. The bucket is created if it +doesn't exist; objects land under `/`. + +## Remote execution (`--remote`) + +Add `--remote` to submit the same call as a Hugging Face Job: + +```bash +diffusers-cli generate \ + --model black-forest-labs/FLUX.2-klein-9B \ + --pipeline-kwargs '{"prompt": "Make the cats fur grey", "image": "https://blobcdn.same.energy/a/d0/58/d058b51c2329b0ea4057e9f12cd9a1da36347e34"}' \ + --remote --flavor a100-large \ + --dtype bf16 \ + --cpu-offload group +``` + +What happens: + +1. Token is read from `args.token` or `huggingface_hub.get_token()`. +2. A bucket (`/jobs-artifacts` by default) is auto-created. +3. Job is submitted via `run_job` (not `run_uv_job` — needed to honor the image) with image + `pytorch/pytorch:2.10.0-cuda12.8-cudnn9-runtime` (torch 2.10 + CUDA 12.8, matches HF Jobs host driver max of + CUDA 12.9). +4. Container runs: + ``` + sh -c "uv pip install --system --break-system-packages && diffusers-cli generate ..." 
+ ``` + Only `diffusers`-tarball + `accelerate` + `transformers` + `safetensors` are installed inline (~50 MB instead + of ~3 GB) because torch+CUDA come from the image. `--break-system-packages` bypasses PEP 668 in the image's + system Python. +5. Container logs stream to stderr; on completion the CLI downloads any files the job uploaded to the bucket + under its `run_id` prefix into `./outputs/`. +6. A timing breakdown (`queued_seconds`, `run_seconds`, `total_seconds`) is printed and included in the JSON + payload. + +Flags: + +- `--flavor ` — HF Jobs hardware (e.g. `a10g-small`, `a100-large`, `4xa100-large`). +- `--timeout ` — max wallclock (e.g. `30m`, `2h`). Defaults to `10m`. +- `--dependencies ` — extra pip deps (repeatable). +- `--namespace ` — run under a different account. +- `--no-wait` — submit, return job id, don't stream logs. +- `--push-to ` — override the artifact bucket id. + +## Context parallel + +`--context-parallel` enables Ulysses CP on a DiT-based pipeline. **Locally** the user must launch via torchrun: + +```bash +torchrun --nproc-per-node=2 -m diffusers.commands.diffusers_cli generate \ + --model black-forest-labs/FLUX.2-klein-9B \ + --pipeline-kwargs '{"prompt": "Make the cats fur grey"}' \ + --dtype bf16 \ + --context-parallel +``` + +**Remotely** the CLI handles the torchrun wrapping — just pass `--context-parallel` to a `--remote` invocation on +a multi-GPU flavor: + +```bash +diffusers-cli generate \ + --model black-forest-labs/FLUX.2-klein-9B \ + --pipeline-kwargs '{"prompt": "Make the cats fur grey", "image": "https://blobcdn.same.energy/a/d0/58/d058b51c2329b0ea4057e9f12cd9a1da36347e34"}' \ + --remote --flavor 4xa100-large \ + --dtype bf16 \ + --context-parallel +``` + +Inside the container, CP swaps the entrypoint to `torchrun --nproc-per-node=gpu -m +diffusers.commands.diffusers_cli`, initializes a hybrid process group (`cpu:gloo,cuda:nccl` — NCCL for the +attention all-to-all, Gloo for `ulysses_anything`'s per-rank size 
coordination), pins each rank to +`cuda:{LOCAL_RANK}`, and gates output saving/printing to rank 0 only. + +**Memory note**: CP shards the sequence, **not the weights**. Every rank still holds the full transformer. Wins +are wall-clock attention speedup and headroom for very long sequences, not "fit a model that doesn't fit." For +weight sharding you'd want TP or FSDP — not exposed in the CLI yet. + +CP is DiT-only. UNet pipelines raise a clear error directing you to a DiT pipeline (FLUX, SD3, HunyuanDiT, +AuraFlow, …). + +## Output mode (`--format`) + +The CLI auto-detects when running under an AI coding agent (Claude Code, Cursor, Aider, GH Copilot Agent — via +`CLAUDECODE`, `CLAUDE_CODE`, `CURSOR_AI`, `AIDER_AI_CONTEXT`, `GH_COPILOT_AGENT`) and switches output to **agent +mode** automatically — TSV tables, `key=value` results, compact JSON dicts, no progress bars. + +Override explicitly with `--format {auto, human, agent, json, quiet}` placed **before** the subcommand: + +```bash +diffusers-cli --format json generate --model --pipeline-kwargs '...' +``` + +The legacy `--json` flag on `generate` still works as a shortcut for `--format json`. 
From af8cbf40c1fca385cea1a66b9473f440cc67b547 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 16 Jun 2026 16:33:40 +0530 Subject: [PATCH 25/30] update --- .ai/skills/diffusers-cli/SKILL.md | 19 ++++-- .ai/skills/diffusers-cli/generate.md | 2 +- src/diffusers/commands/_output.py | 36 +++-------- src/diffusers/commands/custom_blocks.py | 2 + src/diffusers/commands/describe.py | 41 ++++-------- src/diffusers/commands/diffusers_cli.py | 2 +- src/diffusers/commands/generate.py | 83 +++++++++++++++---------- 7 files changed, 88 insertions(+), 97 deletions(-) diff --git a/.ai/skills/diffusers-cli/SKILL.md b/.ai/skills/diffusers-cli/SKILL.md index b2aedd8fca16..cf70620044d7 100644 --- a/.ai/skills/diffusers-cli/SKILL.md +++ b/.ai/skills/diffusers-cli/SKILL.md @@ -19,8 +19,6 @@ description: > | `custom_blocks` | Package a local `ModularPipelineBlocks` subclass for the Hub. | | `env` | Print versions of diffusers + torch + transformers + accelerate + safetensors + CUDA + GPU info. Use when investigating environment issues, dtype/precision support, or building bug reports. | -`fp16_safetensors` is also shipped but deprecated and not relevant to inference. - ## When to read which file Most agentic work goes through `generate`. Read the matching reference file before constructing a command: @@ -53,7 +51,18 @@ The console entry point is registered in `pyproject.toml` (`diffusers-cli = with `pip install -e . --force-reinstall --no-deps` and check `which diffusers-cli`. If the installed binary is missing recent features (e.g. you see `unrecognized arguments: --lora`), reinstall. -## Full user guide +## Output formats + +`--format {auto, human, agent, json}` (top-level flag, must appear before the subcommand): + +- **`human`** — plain-text indented output for terminals (default when not running under an agent harness). No ANSI color. +- **`agent`** — TSV tables and `key=value` lines. 
Auto-selected when an agent env var is present + (`CLAUDECODE`, `CLAUDE_CODE`, `CODEX_SANDBOX`, `CURSOR_AI`, `AIDER_AI_CONTEXT`, `GH_COPILOT_AGENT`, + `AI_AGENT`). Token-cheap for LLM agents to read. +- **`json`** — compact JSON. Use for programmatic parsing (scripts, services) where type fidelity and nested + structures matter. + +`stdout` carries data; `stderr` carries hints/warnings/progress — parseable output is never polluted. -For a non-agent overview with troubleshooting and tips, see [`DIFFUSERS_CLI.md`](../../../DIFFUSERS_CLI.md) at -the repo root. +Rule of thumb: `--format json` for scripts that will `json.loads()` the output, otherwise leave it on +auto-detect (`agent` for LLMs, `human` for terminals). diff --git a/.ai/skills/diffusers-cli/generate.md b/.ai/skills/diffusers-cli/generate.md index e71ba56f5590..ba64cae017c9 100644 --- a/.ai/skills/diffusers-cli/generate.md +++ b/.ai/skills/diffusers-cli/generate.md @@ -173,7 +173,7 @@ The CLI auto-detects when running under an AI coding agent (Claude Code, Cursor, `CLAUDECODE`, `CLAUDE_CODE`, `CURSOR_AI`, `AIDER_AI_CONTEXT`, `GH_COPILOT_AGENT`) and switches output to **agent mode** automatically — TSV tables, `key=value` results, compact JSON dicts, no progress bars. -Override explicitly with `--format {auto, human, agent, json, quiet}` placed **before** the subcommand: +Override explicitly with `--format {auto, human, agent, json}` placed **before** the subcommand: ```bash diffusers-cli --format json generate --model --pipeline-kwargs '...' diff --git a/src/diffusers/commands/_output.py b/src/diffusers/commands/_output.py index 1200737ec696..5c4d91e1d86c 100644 --- a/src/diffusers/commands/_output.py +++ b/src/diffusers/commands/_output.py @@ -14,10 +14,9 @@ """Dual-audience output sink for ``diffusers-cli``. Every subcommand routes user-visible output through the singleton ``out``. 
The mode is one of ``human`` (default for -terminals), ``agent`` (auto-selected when an AI coding agent is detected), ``json`` (machine-parseable), or ``quiet`` -(first value per record). The set of methods on ``out`` covers the shapes our commands actually produce — free-form -text, key/value results, structured dicts, and tabular schemas — so leaf commands never branch on ``args.json`` -themselves. +terminals), ``agent`` (auto-selected when an AI coding agent is detected), or ``json`` (machine-parseable). The set of +methods on ``out`` covers the shapes our commands actually produce — free-form text, key/value results, structured +dicts, and tabular schemas — so leaf commands never branch on ``args.json`` themselves. """ from __future__ import annotations @@ -30,10 +29,11 @@ # Environment variables set by known AI coding agents. Presence of any one triggers AGENT mode -# under `--format auto`. Subset of the huggingface_hub harness registry — extend as needed. +# under `--format auto`. _AGENT_ENV_VARS = ( "CLAUDECODE", # Claude Code "CLAUDE_CODE", # alt spelling + "CODEX_SANDBOX", # Codex "CURSOR_AI", # Cursor "AIDER_AI_CONTEXT", # Aider "GH_COPILOT_AGENT", # GitHub Copilot Agent @@ -50,7 +50,6 @@ class OutputFormat(str, Enum): HUMAN = "human" AGENT = "agent" JSON = "json" - QUIET = "quiet" class Output: @@ -70,9 +69,7 @@ def set_mode(self, mode: OutputFormat) -> None: # ------------------------------------------------------------------ stdout def text(self, msg: str) -> None: - """Free-form line. Suppressed in QUIET; printed plain in every other mode.""" - if self.mode == OutputFormat.QUIET: - return + """Free-form line. Printed plain in every mode.""" print(msg) def dict(self, data: dict[str, Any]) -> None: @@ -80,8 +77,6 @@ def dict(self, data: dict[str, Any]) -> None: Use for payloads that don't decompose cleanly into key/value pairs (e.g. describe schemas). 
""" - if self.mode == OutputFormat.QUIET: - return indent = 2 if self.mode == OutputFormat.HUMAN else None print(json.dumps(data, indent=indent, default=str)) @@ -91,7 +86,6 @@ def result(self, message: str, **data: Any) -> None: - HUMAN: ``message`` followed by `` key: value`` lines. - AGENT: ``key=value`` pairs space-separated on one line (TSV-ish, parser-friendly). - JSON: compact JSON of ``data``. - - QUIET: first non-None value. """ if self.mode == OutputFormat.HUMAN: print(message) @@ -103,20 +97,14 @@ def result(self, message: str, **data: Any) -> None: print(" ".join(parts) if parts else message) elif self.mode == OutputFormat.JSON: print(json.dumps(data, default=str)) - elif self.mode == OutputFormat.QUIET: - for v in data.values(): - if v is not None: - print(v) - return def table( self, items: Sequence[dict[str, Any]], *, headers: Optional[list[str]] = None, - id_key: Optional[str] = None, ) -> None: - """Tabular data — HUMAN gets padded columns, AGENT gets TSV, JSON gets the list, QUIET gets id_key. + """Tabular data — HUMAN gets padded columns, AGENT gets TSV, JSON gets the list. Headers default to the keys of the first item. """ @@ -134,14 +122,6 @@ def table( print(json.dumps(list(items), default=str)) return - if self.mode == OutputFormat.QUIET: - key = id_key or headers[0] - for item in items: - value = item.get(key) - if value is not None: - print(value) - return - rows = [[_cell(item.get(h)) for h in headers] for item in items] if self.mode == OutputFormat.AGENT: print("\t".join(headers)) @@ -159,8 +139,6 @@ def table( def hint(self, message: str) -> None: """Next-step suggestion. 
Always goes to stderr so it never pollutes parseable stdout.""" - if self.mode == OutputFormat.QUIET: - return print(f"Hint: {message}", file=sys.stderr) def warning(self, message: str) -> None: diff --git a/src/diffusers/commands/custom_blocks.py b/src/diffusers/commands/custom_blocks.py index bc0889376a95..10a17ee60b8c 100644 --- a/src/diffusers/commands/custom_blocks.py +++ b/src/diffusers/commands/custom_blocks.py @@ -82,6 +82,8 @@ def run(self): ) child_class, parent_class = out[0][0], out[0][1] + # dynamically get the custom block and initialize it to call `save_pretrained` in the current directory. + # the user is responsible for running it, so I guess that is safe? module_name = f"__dynamic__{self.block_module_name.stem}" spec = importlib.util.spec_from_file_location(module_name, str(self.block_module_name)) module = importlib.util.module_from_spec(spec) diff --git a/src/diffusers/commands/describe.py b/src/diffusers/commands/describe.py index 8dd240014540..95dbf13459af 100644 --- a/src/diffusers/commands/describe.py +++ b/src/diffusers/commands/describe.py @@ -102,18 +102,19 @@ def _describe(args: Namespace) -> None: for p in blocks.inputs ] - if args.json: - out.set_mode(OutputFormat.JSON) - - if out.mode in (OutputFormat.JSON, OutputFormat.AGENT): - # Agents get the structured schema (full payload for JSON, the inputs table for AGENT). 
- if out.mode == OutputFormat.JSON: - out.dict({"task": "describe", "model": args.model, "pipeline_class": class_name, "inputs": schema}) - else: - out.table(schema, headers=["name", "required", "type_hint", "default", "description"]) - return - - _print_schema(class_name, args.model, schema) + if out.mode == OutputFormat.JSON: + out.dict({"task": "describe", "model": args.model, "pipeline_class": class_name, "inputs": schema}) + elif out.mode == OutputFormat.AGENT: + out.table(schema, headers=["name", "required", "type_hint", "default", "description"]) + else: + out.text(f"{class_name} ({args.model}) inputs:") + for entry in schema: + tag = "required" if entry["required"] else f"optional, default={entry['default']!r}" + out.text(f" {entry['name']} ({tag})") + if entry["type_hint"]: + out.text(f" type: {entry['type_hint']}") + if entry["description"]: + out.text(f" desc: {entry['description']}") def _parse_docstring_args(docstring: Optional[str]) -> dict[str, str]: @@ -174,17 +175,6 @@ def _flush() -> None: return descriptions -def _print_schema(class_name: str, model: str, schema: list[dict[str, Any]]) -> None: - print(f"{class_name} ({model}) inputs:") - for entry in schema: - tag = "required" if entry["required"] else f"optional, default={entry['default']!r}" - print(f" {entry['name']} ({tag})") - if entry["type_hint"]: - print(f" type: {entry['type_hint']}") - if entry["description"]: - print(f" desc: {entry['description']}") - - class DescribeCommand(BaseDiffusersCLICommand): task = "describe" @@ -227,11 +217,6 @@ def register_subcommand(subparsers: _SubParsersAction) -> None: "the equivalent field for standard pipelines by parsing the Google-style Args: block." 
), ) - parser.add_argument( - "--json", - action="store_true", - help="Emit a machine-readable JSON summary on stdout.", - ) parser.set_defaults(func=DescribeCommand) def __init__(self, args: Namespace): diff --git a/src/diffusers/commands/diffusers_cli.py b/src/diffusers/commands/diffusers_cli.py index deca219d90f4..8deb98c9916b 100644 --- a/src/diffusers/commands/diffusers_cli.py +++ b/src/diffusers/commands/diffusers_cli.py @@ -28,7 +28,7 @@ def main(): prog="diffusers-cli", usage="\n diffusers-cli [--format ] [options]", ) - parser._optionals.title = "General Options" + parser._optionals.title = "Options" parser.add_argument( "--format", choices=[m.value for m in OutputFormat], diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index e60e088da79f..2591dae200a1 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -31,7 +31,7 @@ from . import BaseDiffusersCLICommand from ._common import try_fetch_config -from ._output import OutputFormat, out +from ._output import out # --------------------------------------------------------------------------- @@ -41,13 +41,19 @@ DEFAULT_OUTPUT_DIR = "outputs" DTYPE_CHOICES = ("auto", "float16", "fp16", "bfloat16", "bf16", "float32", "fp32") CPU_OFFLOAD_CHOICES = ("model", "group") -ATTENTION_BACKEND_CHOICES = ( - "default", - "flash_hub", - "flash_varlen_hub", - "flash_4_hub", - "sage_hub", -) + + +def _hub_attention_backends() -> tuple[str, ...]: + """Hub-hosted attention backends sourced from ``_HUB_KERNELS_REGISTRY``. + + Single source of truth: if the registry grows or shrinks, the CLI choices follow. + """ + from diffusers.models.attention_dispatch import _HUB_KERNELS_REGISTRY + + return tuple(sorted(backend.value for backend in _HUB_KERNELS_REGISTRY)) + + +ATTENTION_BACKEND_CHOICES = ("default", *_hub_attention_backends()) # Keys whose string value should be resolved via ``diffusers.utils.load_image`` # before being passed to the pipeline call. 
@@ -72,6 +78,8 @@ "accelerate", "transformers", "safetensors", + "sentencepiece", # required by several text-encoder tokenizers (T5, LLaMA, …) + "ftfy", # required by older CLIP text-encoder paths ) # Base container image — provides torch + CUDA so ``uv pip install --system`` @@ -172,7 +180,6 @@ def _add_output_arguments(parser: ArgumentParser) -> None: "When --remote is set, defaults to /jobs-artifacts." ), ) - parser.add_argument("--json", action="store_true", help="Emit a machine-readable JSON summary on stdout.") def _add_remote_arguments(parser: ArgumentParser) -> None: @@ -241,23 +248,20 @@ def _resolve_dtype(name: Optional[str]): def _resolve_device(name: Optional[str]) -> str: if name: return name - import torch - if torch.cuda.is_available(): - # Under torchrun, LOCAL_RANK identifies this process's assigned GPU. - # Without this pin every rank falls back to cuda:0 and OOMs because the - # whole pipeline gets replicated onto a single device. + from diffusers.utils.torch_utils import torch_device + + # Under torchrun, LOCAL_RANK identifies this process's assigned GPU. Without this + # pin every rank falls back to cuda:0 and OOMs as the pipeline replicates onto a + # single device. Only applies to cuda — torch_device already handles npu/xpu/mps/etc. + if torch_device == "cuda": local_rank = os.environ.get("LOCAL_RANK") if local_rank is not None: + import torch + torch.cuda.set_device(int(local_rank)) return f"cuda:{local_rank}" - - return "cuda" - - if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - return "mps" - - return "cpu" + return torch_device def _map_to_device(pipeline: Any, args: Namespace, device: str) -> Any: @@ -515,9 +519,13 @@ def _as_audio_arrays(value: Any): def _save_audio_arrays(audios, sampling_rate: int, args: Namespace, task: str) -> list[str]: - """Write each numpy audio array to a 16-bit PCM WAV at ``sampling_rate`` Hz.""" + """Write each numpy audio array to a 16-bit PCM WAV at ``sampling_rate`` Hz. 
+ + Uses the stdlib ``wave`` module so no scipy dependency is required. + """ + import wave + import numpy as np - from scipy.io.wavfile import write as wavfile_write paths = _default_output_paths(task, len(audios), args.output, ext="wav") saved: list[str] = [] @@ -525,9 +533,21 @@ def _save_audio_arrays(audios, sampling_rate: int, args: Namespace, task: str) - data = np.asarray(audio) if data.dtype.kind == "f": data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16) - if data.ndim > 1 and data.shape[0] < data.shape[-1]: - data = data.T # (channels, samples) → (samples, channels) for scipy. - wavfile_write(str(path), sampling_rate, data) + else: + data = data.astype(np.int16) + if data.ndim == 1: + n_channels = 1 + else: + # Heuristic: shorter axis is channels (interleaved layout for `wave` is + # samples × channels, so transpose if needed). + if data.shape[0] < data.shape[-1]: + data = data.T + n_channels = data.shape[1] + with wave.open(str(path), "wb") as w: + w.setnchannels(n_channels) + w.setsampwidth(2) # 16-bit PCM + w.setframerate(sampling_rate) + w.writeframes(data.tobytes()) saved.append(str(path)) return saved @@ -690,7 +710,7 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: } if args.no_wait: - _format_result(args, payload) + _format_result(payload) return True print( @@ -703,7 +723,7 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: payload["job_status"] = final_status payload["timing"] = _job_timing(api, job.id, args.namespace) payload["outputs"] = _download_job_artifacts(api, args.push_to, run_id, args.output) - _format_result(args, payload) + _format_result(payload) return True @@ -796,10 +816,8 @@ def _download_job_artifacts(api: Any, bucket_id: str, run_id: str, output: Optio # --------------------------------------------------------------------------- -def _format_result(args: Namespace, payload: dict[str, Any]) -> None: - """Route the result payload through ``out``. 
``--json`` escalates the mode regardless of --format.""" - if args.json: - out.set_mode(OutputFormat.JSON) +def _format_result(payload: dict[str, Any]) -> None: + """Route the result payload through the output sink.""" out.result(payload.get("task", "done"), **payload) @@ -885,7 +903,6 @@ def run(self) -> None: pushed = _push_outputs(self.args, saved, self.task) _format_result( - self.args, { "task": self.task, "model": self.args.model, From b50dae1ac2b32bc958d96a7ab2c9dd6c918ad501 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 16 Jun 2026 17:07:38 +0530 Subject: [PATCH 26/30] update --- src/diffusers/commands/_common.py | 3 +-- src/diffusers/commands/_output.py | 4 ++-- src/diffusers/commands/describe.py | 14 ++++++------- src/diffusers/commands/generate.py | 33 +++++++++++++++++------------- 4 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/diffusers/commands/_common.py b/src/diffusers/commands/_common.py index df242628841d..bd95b3f88969 100644 --- a/src/diffusers/commands/_common.py +++ b/src/diffusers/commands/_common.py @@ -21,10 +21,9 @@ from argparse import Namespace from pathlib import Path -from typing import Optional -def try_fetch_config(args: Namespace, filename: str) -> Optional[str]: +def try_fetch_config(args: Namespace, filename: str) -> str | None: """Resolve ``filename`` for ``args.model`` (local path or Hub repo). Return None if absent. Used by ``generate`` (to detect modular vs standard pipelines) and ``describe`` (to read the pipeline class for diff --git a/src/diffusers/commands/_output.py b/src/diffusers/commands/_output.py index 5c4d91e1d86c..c155d789990f 100644 --- a/src/diffusers/commands/_output.py +++ b/src/diffusers/commands/_output.py @@ -25,7 +25,7 @@ import os import sys from enum import Enum -from typing import Any, Optional, Sequence +from typing import Any, Sequence # Environment variables set by known AI coding agents. 
Presence of any one triggers AGENT mode @@ -102,7 +102,7 @@ def table( self, items: Sequence[dict[str, Any]], *, - headers: Optional[list[str]] = None, + headers: list[str] | None = None, ) -> None: """Tabular data — HUMAN gets padded columns, AGENT gets TSV, JSON gets the list. diff --git a/src/diffusers/commands/describe.py b/src/diffusers/commands/describe.py index 95dbf13459af..5a9514961211 100644 --- a/src/diffusers/commands/describe.py +++ b/src/diffusers/commands/describe.py @@ -21,9 +21,11 @@ from __future__ import annotations +import inspect import json +import re from argparse import ArgumentParser, Namespace, _SubParsersAction -from typing import Any, Optional +from typing import Any from . import BaseDiffusersCLICommand from ._common import try_fetch_config @@ -37,8 +39,6 @@ def _describe(args: Namespace) -> None: pipeline class's ``__call__`` signature. Otherwise falls back to ``ModularPipelineBlocks.from_pretrained`` and reads the block-declared ``inputs``. No weights downloaded either way. """ - import inspect - import diffusers model_index = try_fetch_config(args, diffusers.DiffusionPipeline.config_name) @@ -117,7 +117,7 @@ def _describe(args: Namespace) -> None: out.text(f" desc: {entry['description']}") -def _parse_docstring_args(docstring: Optional[str]) -> dict[str, str]: +def _parse_docstring_args(docstring: str | None) -> dict[str, str]: """Extract per-argument descriptions from a Google-style ``Args:`` block. Returns a ``{name: description}`` mapping. 
Best-effort — unrecognised formats just yield an empty dict rather than @@ -126,8 +126,6 @@ def _parse_docstring_args(docstring: Optional[str]) -> dict[str, str]: if not docstring: return {} - import re - lines = docstring.expandtabs().splitlines() start = None section_indent = 0 @@ -140,9 +138,9 @@ def _parse_docstring_args(docstring: Optional[str]) -> dict[str, str]: return {} descriptions: dict[str, str] = {} - current_name: Optional[str] = None + current_name: str | None = None current_lines: list[str] = [] - arg_indent: Optional[int] = None + arg_indent: int | None = None name_pattern = re.compile(r"^(\w+)\s*(?:\([^)]*\))?\s*:?\s*(.*)$") def _flush() -> None: diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index 2591dae200a1..c86cd28df730 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -25,7 +25,7 @@ import sys from argparse import ArgumentParser, Namespace, _SubParsersAction from pathlib import Path -from typing import Any, Optional +from typing import Any from diffusers.utils import load_image @@ -80,6 +80,7 @@ def _hub_attention_backends() -> tuple[str, ...]: "safetensors", "sentencepiece", # required by several text-encoder tokenizers (T5, LLaMA, …) "ftfy", # required by older CLIP text-encoder paths + "kernels", # required by hub-hosted attention backends (flash_hub, sage_hub, …) ) # Base container image — provides torch + CUDA so ``uv pip install --system`` @@ -105,6 +106,7 @@ def _hub_attention_backends() -> tuple[str, ...]: "poll_interval", "func", "format", # top-level --format is a local rendering flag; never forward to the container + "device", # local device pin; container auto-detects its own (cuda:0 or LOCAL_RANK) } ) @@ -227,7 +229,7 @@ def _add_remote_arguments(parser: ArgumentParser) -> None: # --------------------------------------------------------------------------- -def _resolve_dtype(name: Optional[str]): +def _resolve_dtype(name: str | None): if name in 
(None, "auto"): return "auto" import torch @@ -245,7 +247,7 @@ def _resolve_dtype(name: Optional[str]): return mapping[name] -def _resolve_device(name: Optional[str]) -> str: +def _resolve_device(name: str | None) -> str: if name: return name @@ -281,7 +283,7 @@ def _map_to_device(pipeline: Any, args: Namespace, device: str) -> Any: return pipeline -def _denoiser(pipeline: Any) -> Optional[Any]: +def _denoiser(pipeline: Any) -> Any | None: """Return the pipeline's denoiser submodule (transformer or unet) or None.""" for attr in ("transformer", "unet"): module = getattr(pipeline, attr, None) @@ -410,7 +412,7 @@ def _is_modular_repo(args: Namespace) -> bool: # --------------------------------------------------------------------------- -def _parse_pipeline_kwargs(raw: Optional[str]) -> dict[str, Any]: +def _parse_pipeline_kwargs(raw: str | None) -> dict[str, Any]: if not raw: return {} try: @@ -430,7 +432,7 @@ def _resolve_image_inputs(call_kwargs: dict[str, Any]) -> None: call_kwargs[key] = load_image(value) -def _get_generator(seed: Optional[int], device: str): +def _get_generator(seed: int | None, device: str): if seed is None: return None import torch @@ -456,7 +458,7 @@ def _result_to_savable(result: Any) -> Any: # --------------------------------------------------------------------------- -def _default_output_paths(task: str, num: int, explicit: Optional[str], ext: str) -> list[Path]: +def _default_output_paths(task: str, num: int, explicit: str | None, ext: str) -> list[Path]: if explicit is None: base = Path(DEFAULT_OUTPUT_DIR) base.mkdir(parents=True, exist_ok=True) @@ -583,7 +585,7 @@ def _save_output(value: Any, args: Namespace, task: str) -> list[str]: # --------------------------------------------------------------------------- -def _push_outputs(args: Namespace, saved_paths: list[str], task: str) -> Optional[dict[str, Any]]: +def _push_outputs(args: Namespace, saved_paths: list[str], task: str) -> dict[str, Any] | None: """Upload ``saved_paths`` to the 
``--push-to`` bucket. Returns a summary or None.""" if not args.push_to: return None @@ -636,6 +638,9 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: if not args.remote: return False + if args.device is not None: + out.warning(f"--device {args.device!r} is ignored with --remote; the container auto-detects its GPU.") + print( f"[diffusers-cli] preparing remote {task!r} job on flavor={args.flavor!r}...", file=sys.stderr, @@ -727,7 +732,7 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: return True -def _job_timing(api: Any, job_id: str, namespace: Optional[str]) -> dict[str, Optional[float]]: +def _job_timing(api: Any, job_id: str, namespace: str | None) -> dict[str, float | None]: """Return queue/run/total wallclock seconds for ``job_id`` from inspect_job timestamps. inspect_job sometimes returns finished_at=None for a few seconds after the container exits while HF Jobs propagates @@ -742,7 +747,7 @@ def _job_timing(api: Any, job_id: str, namespace: Optional[str]) -> dict[str, Op time.sleep(1.0) info = api.inspect_job(job_id=job_id, namespace=namespace) - def _delta(start, end) -> Optional[float]: + def _delta(start, end) -> float | None: return (end - start).total_seconds() if (start is not None and end is not None) else None timing = { @@ -756,7 +761,7 @@ def _delta(start, end) -> Optional[float]: return timing -def _wait_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval: float) -> str: +def _wait_for_job(api: Any, job_id: str, namespace: str | None, poll_interval: float) -> str: """Stream container logs to stderr until the job terminates; return the final stage.""" fetch = getattr(api, "fetch_job_logs", None) if fetch is not None: @@ -770,12 +775,12 @@ def _wait_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval return _poll_for_job(api, job_id, namespace, poll_interval) -def _poll_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval: float) -> str: +def _poll_for_job(api: 
Any, job_id: str, namespace: str | None, poll_interval: float) -> str: """Heartbeat-style fallback when ``fetch_job_logs`` isn't available.""" import time terminal = {"COMPLETED", "CANCELED", "ERROR", "DELETED"} - last_stage: Optional[str] = None + last_stage: str | None = None while True: info = api.inspect_job(job_id=job_id, namespace=namespace) stage = str(info.status.stage) if info.status else "UNKNOWN" @@ -792,7 +797,7 @@ def _poll_for_job(api: Any, job_id: str, namespace: Optional[str], poll_interval time.sleep(poll_interval) -def _download_job_artifacts(api: Any, bucket_id: str, run_id: str, output: Optional[str]) -> list[str]: +def _download_job_artifacts(api: Any, bucket_id: str, run_id: str, output: str | None) -> list[str]: """Download every file under ``/`` from ``bucket_id`` into a local directory.""" from huggingface_hub import BucketFile From 1d6f5b315e58ce6ddb821934aec8db2a17ffeb65 Mon Sep 17 00:00:00 2001 From: DN6 Date: Tue, 16 Jun 2026 17:45:35 +0530 Subject: [PATCH 27/30] update --- .ai/skills/diffusers-cli/generate.md | 23 ++++------ src/diffusers/commands/_output.py | 60 +++++++++---------------- src/diffusers/commands/custom_blocks.py | 15 +++++++ src/diffusers/commands/describe.py | 15 +++++++ src/diffusers/commands/generate.py | 27 +++++++++-- 5 files changed, 83 insertions(+), 57 deletions(-) diff --git a/.ai/skills/diffusers-cli/generate.md b/.ai/skills/diffusers-cli/generate.md index ba64cae017c9..4ba9738ba94e 100644 --- a/.ai/skills/diffusers-cli/generate.md +++ b/.ai/skills/diffusers-cli/generate.md @@ -105,21 +105,14 @@ diffusers-cli generate \ What happens: -1. Token is read from `args.token` or `huggingface_hub.get_token()`. -2. A bucket (`/jobs-artifacts` by default) is auto-created. -3. Job is submitted via `run_job` (not `run_uv_job` — needed to honor the image) with image - `pytorch/pytorch:2.10.0-cuda12.8-cudnn9-runtime` (torch 2.10 + CUDA 12.8, matches HF Jobs host driver max of - CUDA 12.9). -4. 
Container runs: - ``` - sh -c "uv pip install --system --break-system-packages && diffusers-cli generate ..." - ``` - Only `diffusers`-tarball + `accelerate` + `transformers` + `safetensors` are installed inline (~50 MB instead - of ~3 GB) because torch+CUDA come from the image. `--break-system-packages` bypasses PEP 668 in the image's - system Python. -5. Container logs stream to stderr; on completion the CLI downloads any files the job uploaded to the bucket - under its `run_id` prefix into `./outputs/`. -6. A timing breakdown (`queued_seconds`, `run_seconds`, `total_seconds`) is printed and included in the JSON +1. Your HF token is picked up (from `--token` or your login). +2. A bucket (`/jobs-artifacts` by default) is created if it doesn't exist. +3. The job runs in a pytorch container that already has torch + CUDA preinstalled. Only the small Python + deps (`diffusers`, `accelerate`, `transformers`, `safetensors`) are installed at container start — about + 50 MB instead of 3 GB. +4. Container logs stream to your terminal. When the job finishes, the CLI downloads every file the job + uploaded to the bucket under its `run_id` prefix into `./outputs/`. +5. A timing breakdown (`queued_seconds`, `run_seconds`, `total_seconds`) is printed and included in the JSON payload. Flags: diff --git a/src/diffusers/commands/_output.py b/src/diffusers/commands/_output.py index c155d789990f..5b08e2c909e4 100644 --- a/src/diffusers/commands/_output.py +++ b/src/diffusers/commands/_output.py @@ -11,25 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Dual-audience output sink for ``diffusers-cli``. +"""Output formatting for ``diffusers-cli``. -Every subcommand routes user-visible output through the singleton ``out``. 
The mode is one of ``human`` (default for -terminals), ``agent`` (auto-selected when an AI coding agent is detected), or ``json`` (machine-parseable). The set of -methods on ``out`` covers the shapes our commands actually produce — free-form text, key/value results, structured -dicts, and tabular schemas — so leaf commands never branch on ``args.json`` themselves. +Commands print through the singleton ``out`` instead of calling ``print`` directly. ``out`` picks the right format +(human, agent, or json) based on the top-level ``--format`` flag, so commands don't have to check the mode themselves. """ from __future__ import annotations import json import os -import sys from enum import Enum from typing import Any, Sequence -# Environment variables set by known AI coding agents. Presence of any one triggers AGENT mode -# under `--format auto`. +# Environment variables set by known AI coding agents. If any of these is set, `--format auto` +# picks AGENT mode instead of HUMAN. _AGENT_ENV_VARS = ( "CLAUDECODE", # Claude Code "CLAUDE_CODE", # alt spelling @@ -41,7 +38,7 @@ def is_agent() -> bool: - """Return True if the process appears to be invoked by an AI coding agent.""" + """Return True if the CLI is being run by an AI coding agent.""" return any(os.environ.get(v) for v in _AGENT_ENV_VARS) @@ -53,7 +50,7 @@ class OutputFormat(str, Enum): class Output: - """Singleton output sink. Resolve mode once at startup, then call ``out.``.""" + """Picks the print format for each method based on the active mode (human / agent / json).""" mode: OutputFormat @@ -61,7 +58,7 @@ def __init__(self) -> None: self.set_mode(OutputFormat.AUTO) def set_mode(self, mode: OutputFormat) -> None: - """Set the active output mode. AUTO resolves to AGENT or HUMAN via ``is_agent()``.""" + """Set the active output mode. 
AUTO becomes AGENT or HUMAN based on is_agent().""" if mode == OutputFormat.AUTO: mode = OutputFormat.AGENT if is_agent() else OutputFormat.HUMAN self.mode = mode @@ -69,23 +66,20 @@ def set_mode(self, mode: OutputFormat) -> None: # ------------------------------------------------------------------ stdout def text(self, msg: str) -> None: - """Free-form line. Printed plain in every mode.""" + """Print a line of text. Same in every mode.""" print(msg) def dict(self, data: dict[str, Any]) -> None: - """Structured object — JSON in every mode (indented for HUMAN, compact otherwise). - - Use for payloads that don't decompose cleanly into key/value pairs (e.g. describe schemas). - """ + """Print a dict as JSON. Indented for HUMAN, compact for AGENT and JSON.""" indent = 2 if self.mode == OutputFormat.HUMAN else None print(json.dumps(data, indent=indent, default=str)) def result(self, message: str, **data: Any) -> None: - """Success summary. + """Print a result summary. - - HUMAN: ``message`` followed by `` key: value`` lines. - - AGENT: ``key=value`` pairs space-separated on one line (TSV-ish, parser-friendly). - - JSON: compact JSON of ``data``. + - HUMAN: the message line followed by `` key: value`` lines. + - AGENT: ``key=value`` pairs separated by spaces on one line. + - JSON: compact JSON of the data dict. """ if self.mode == OutputFormat.HUMAN: print(message) @@ -104,9 +98,13 @@ def table( *, headers: list[str] | None = None, ) -> None: - """Tabular data — HUMAN gets padded columns, AGENT gets TSV, JSON gets the list. + """Print a list of dicts as a table. - Headers default to the keys of the first item. + - HUMAN: columns padded so each column lines up. + - AGENT: tab-separated values, one row per line. + - JSON: the list itself as a JSON array. + + ``headers`` defaults to the keys of the first item. 
""" if not items: if self.mode in (OutputFormat.HUMAN, OutputFormat.AGENT): @@ -129,26 +127,12 @@ def table( print("\t".join(row)) return - # HUMAN: pad each column to its widest cell for readable alignment. + # HUMAN: pad each column to its widest cell so they line up. widths = [max(len(h), *(len(r[i]) for r in rows)) for i, h in enumerate(headers)] print(" ".join(h.ljust(widths[i]) for i, h in enumerate(headers))) for row in rows: print(" ".join(c.ljust(widths[i]) for i, c in enumerate(row))) - # ------------------------------------------------------------------ stderr - - def hint(self, message: str) -> None: - """Next-step suggestion. Always goes to stderr so it never pollutes parseable stdout.""" - print(f"Hint: {message}", file=sys.stderr) - - def warning(self, message: str) -> None: - """Non-fatal warning — stderr, every mode.""" - print(f"Warning: {message}", file=sys.stderr) - - def error(self, message: str) -> None: - """Error — stderr, every mode.""" - print(f"Error: {message}", file=sys.stderr) - def _cell(value: Any) -> str: if value is None: @@ -156,5 +140,5 @@ def _cell(value: Any) -> str: return str(value) -# Module-level singleton imported by every subcommand. +# Shared instance imported by every subcommand. 
out = Output() diff --git a/src/diffusers/commands/custom_blocks.py b/src/diffusers/commands/custom_blocks.py index 10a17ee60b8c..324978c83d3a 100644 --- a/src/diffusers/commands/custom_blocks.py +++ b/src/diffusers/commands/custom_blocks.py @@ -38,10 +38,25 @@ def conversion_command_factory(args: Namespace): class CustomBlocksCommand(BaseDiffusersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): + from argparse import RawDescriptionHelpFormatter + + epilog = ( + "Examples\n" + " $ diffusers-cli custom_blocks\n" + " $ diffusers-cli custom_blocks --block_module_name my_block.py\n" + " $ diffusers-cli custom_blocks --block_module_name my_block.py --block_class_name MyDenoiseBlock\n" + "\n" + "Learn more\n" + " Use `diffusers-cli --help` for more information about a command.\n" + " Read the documentation at https://huggingface.co/docs/diffusers\n" + ) + conversion_parser = parser.add_parser( "custom_blocks", help="Package a local ModularPipelineBlocks subclass for the Hub.", usage="\n diffusers-cli custom_blocks [options]", + epilog=epilog, + formatter_class=RawDescriptionHelpFormatter, ) conversion_parser._optionals.title = "Options" conversion_parser.add_argument( diff --git a/src/diffusers/commands/describe.py b/src/diffusers/commands/describe.py index 5a9514961211..12b894dfe9ce 100644 --- a/src/diffusers/commands/describe.py +++ b/src/diffusers/commands/describe.py @@ -178,10 +178,25 @@ class DescribeCommand(BaseDiffusersCLICommand): @staticmethod def register_subcommand(subparsers: _SubParsersAction) -> None: + from argparse import RawDescriptionHelpFormatter + + epilog = ( + "Examples\n" + " $ diffusers-cli describe -m stabilityai/stable-diffusion-xl-base-1.0\n" + " $ diffusers-cli describe -m black-forest-labs/FLUX.1-dev --verbose\n" + " $ diffusers-cli --format json describe -m stabilityai/stable-diffusion-xl-base-1.0\n" + "\n" + "Learn more\n" + " Use `diffusers-cli --help` for more information about a command.\n" + " Read the 
documentation at https://huggingface.co/docs/diffusers\n" + ) + parser: ArgumentParser = subparsers.add_parser( "describe", help="Print the input schema for a diffusers pipeline repo. No weights downloaded.", usage="\n diffusers-cli describe [options]", + epilog=epilog, + formatter_class=RawDescriptionHelpFormatter, ) parser._optionals.title = "Options" parser.add_argument( diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index c86cd28df730..8f5d0d5c824b 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -106,7 +106,6 @@ def _hub_attention_backends() -> tuple[str, ...]: "poll_interval", "func", "format", # top-level --format is a local rendering flag; never forward to the container - "device", # local device pin; container auto-detects its own (cuda:0 or LOCAL_RANK) } ) @@ -638,9 +637,6 @@ def _maybe_submit_remote(args: Namespace, task: str) -> bool: if not args.remote: return False - if args.device is not None: - out.warning(f"--device {args.device!r} is ignored with --remote; the container auto-detects its GPU.") - print( f"[diffusers-cli] preparing remote {task!r} job on flavor={args.flavor!r}...", file=sys.stderr, @@ -836,10 +832,33 @@ class GenerateCommand(BaseDiffusersCLICommand): @staticmethod def register_subcommand(subparsers: _SubParsersAction) -> None: + from argparse import RawDescriptionHelpFormatter + + epilog = ( + "Examples\n" + " $ diffusers-cli generate -m black-forest-labs/FLUX.1-dev --dtype bf16 \\\n" + ' --pipeline-kwargs \'{"prompt": "a cat on the moon"}\'\n' + " $ diffusers-cli generate -m black-forest-labs/FLUX.1-dev --dtype bf16 \\\n" + ' --pipeline-kwargs \'{"prompt": "make the fur grey", "image": "https://example.com/cat.png"}\'\n' + " $ diffusers-cli generate -m black-forest-labs/FLUX.1-dev --dtype bf16 \\\n" + ' --pipeline-kwargs \'{"prompt": "a tiny cat"}\' \\\n' + ' --lora \'{"lora_id": "alvdansen/littletinies", "lora_scale": 0.8}\'\n' + " $ diffusers-cli 
generate -m black-forest-labs/FLUX.1-dev --dtype bf16 \\\n" + ' --pipeline-kwargs \'{"prompt": "a cat"}\' --remote --flavor a100-large\n' + " $ diffusers-cli generate -m black-forest-labs/FLUX.1-dev --dtype bf16 --context-parallel \\\n" + ' --pipeline-kwargs \'{"prompt": "a cat"}\' --remote --flavor 4xa100-large\n' + "\n" + "Learn more\n" + " Use `diffusers-cli --help` for more information about a command.\n" + " Read the documentation at https://huggingface.co/docs/diffusers\n" + ) + parser: ArgumentParser = subparsers.add_parser( "generate", help="Run any diffusers pipeline locally or remotely with HF Jobs.", usage="\n diffusers-cli generate [options]", + epilog=epilog, + formatter_class=RawDescriptionHelpFormatter, ) parser._optionals.title = "Options" _add_loading_arguments(parser) From 043919219956a60cf7f749bcddcf5f65435a0440 Mon Sep 17 00:00:00 2001 From: DN6 Date: Thu, 18 Jun 2026 15:02:53 +0530 Subject: [PATCH 28/30] update --- src/diffusers/commands/generate.py | 61 +++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index 8f5d0d5c824b..e20a9c21d9d8 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -164,6 +164,20 @@ def _add_optimization_arguments(parser: ArgumentParser) -> None: "Requires a DiT-based pipeline and launching the CLI under torchrun with ≥2 GPUs." ), ) + parser.add_argument( + "--compile", + nargs="?", + const="{}", + default=None, + metavar="JSON", + help=( + "torch.compile every denoiser submodule on the pipeline. Accepts an optional JSON " + 'object of kwargs forwarded to ``torch.compile``, e.g. \'{"mode": "max-autotune", ' + '"fullgraph": true}\'. Bare ``--compile`` uses torch defaults. Adds a one-time compilation ' + "cost on the first step but speeds up every subsequent step — worth it for multi-step " + "generation (50+ steps)." 
+ ), + ) def _add_output_arguments(parser: ArgumentParser) -> None: @@ -334,15 +348,52 @@ def _enable_context_parallel(pipeline: Any) -> None: def _apply_optimizations(pipeline: Any, args: Namespace) -> None: - """Apply VAE tiling/slicing, attention backend, and context-parallel toggles.""" - if args.vae_tiling and hasattr(pipeline, "enable_vae_tiling"): - pipeline.enable_vae_tiling() - if args.vae_slicing and hasattr(pipeline, "enable_vae_slicing"): - pipeline.enable_vae_slicing() + """Apply VAE tiling/slicing, attention backend, context-parallel, and torch.compile toggles.""" + vae = getattr(pipeline, "vae", None) + if args.vae_tiling and vae is not None and hasattr(vae, "enable_tiling"): + vae.enable_tiling() + if args.vae_slicing and vae is not None and hasattr(vae, "enable_slicing"): + vae.enable_slicing() if args.attention_backend != "default": _set_attention_backend(pipeline, args.attention_backend) if args.context_parallel: _enable_context_parallel(pipeline) + if args.compile is not None: + _compile_denoiser(pipeline, args.compile) + + +def _compile_denoiser(pipeline: Any, compile_spec: str) -> None: + """Compile every ``transformer*`` and ``unet*`` submodule on the pipeline. + + ``compile_spec`` is the raw JSON string from ``--compile`` (``"{}"`` for bare flag). Decoded into kwargs and + forwarded verbatim to the compile call. + + Prefers regional compilation via ``module.compile_repeated_blocks(**kwargs)`` — only compiles the repeated inner + blocks (the bulk of the compute), much faster first-step latency than compiling the whole module. Falls back to + full ``torch.compile`` if the model doesn't expose ``_repeated_blocks``. 
+ """ + import torch + + try: + compile_kwargs = json.loads(compile_spec) + except json.JSONDecodeError as e: + raise SystemExit(f"--compile must be valid JSON: {e}") from e + if not isinstance(compile_kwargs, dict): + raise SystemExit("--compile must decode to a JSON object.") + + for attr in dir(pipeline): + if not (attr.startswith("transformer") or attr.startswith("unet")): + continue + module = getattr(pipeline, attr, None) + if not isinstance(module, torch.nn.Module): + continue + + if getattr(module, "_repeated_blocks", None): + # Regional compile — only the repeated blocks. Mutates `module` in place. + module.compile_repeated_blocks(**compile_kwargs) + else: + # No regional metadata declared; fall back to compiling the whole module. + setattr(pipeline, attr, torch.compile(module, **compile_kwargs)) def _from_pretrained_kwargs(args: Namespace) -> dict[str, Any]: From bf8fe64b16e789bb69a2c36c807844cd003c34fd Mon Sep 17 00:00:00 2001 From: DN6 Date: Thu, 18 Jun 2026 15:06:16 +0530 Subject: [PATCH 29/30] update --- src/diffusers/commands/generate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/commands/generate.py b/src/diffusers/commands/generate.py index e20a9c21d9d8..da9eadbafe84 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -80,7 +80,6 @@ def _hub_attention_backends() -> tuple[str, ...]: "safetensors", "sentencepiece", # required by several text-encoder tokenizers (T5, LLaMA, …) "ftfy", # required by older CLIP text-encoder paths - "kernels", # required by hub-hosted attention backends (flash_hub, sage_hub, …) ) # Base container image — provides torch + CUDA so ``uv pip install --system`` From 8354f6e09c5e7e1ebc10770a60fa76c8f36fde0a Mon Sep 17 00:00:00 2001 From: DN6 Date: Thu, 18 Jun 2026 17:46:45 +0530 Subject: [PATCH 30/30] update --- src/diffusers/commands/generate.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/diffusers/commands/generate.py 
b/src/diffusers/commands/generate.py index da9eadbafe84..432cf1576679 100644 --- a/src/diffusers/commands/generate.py +++ b/src/diffusers/commands/generate.py @@ -166,15 +166,16 @@ def _add_optimization_arguments(parser: ArgumentParser) -> None: parser.add_argument( "--compile", nargs="?", - const="{}", + const='{"mode": "max-autotune-no-cudagraphs"}', default=None, metavar="JSON", help=( "torch.compile every denoiser submodule on the pipeline. Accepts an optional JSON " 'object of kwargs forwarded to ``torch.compile``, e.g. \'{"mode": "max-autotune", ' - '"fullgraph": true}\'. Bare ``--compile`` uses torch defaults. Adds a one-time compilation ' - "cost on the first step but speeds up every subsequent step — worth it for multi-step " - "generation (50+ steps)." + '"fullgraph": true}\'. Bare ``--compile`` uses ``mode=max-autotune-no-cudagraphs`` — ' + "CUDA Graphs break with regional/repeated-block compile because sequential blocks " + "overwrite each other's output buffers. Adds a one-time compilation cost on the first " + "step but speeds up every subsequent step — worth it for multi-step generation (50+ steps)." ), )