amro-alasri committed
Commit 73fe9ee · verified · 1 Parent(s): 6087d3e

Upload folder using huggingface_hub

Files changed (43)
  1. app.py +122 -0
  2. depth_anything_3/api.py +414 -0
  3. depth_anything_3/app/css_and_html.py +594 -0
  4. depth_anything_3/app/gradio_app.py +777 -0
  5. depth_anything_3/app/modules/__init__.py +45 -0
  6. depth_anything_3/app/modules/event_handlers.py +629 -0
  7. depth_anything_3/app/modules/file_handlers.py +304 -0
  8. depth_anything_3/app/modules/model_inference.py +365 -0
  9. depth_anything_3/app/modules/ui_components.py +474 -0
  10. depth_anything_3/app/modules/utils.py +211 -0
  11. depth_anything_3/app/modules/visualization.py +434 -0
  12. depth_anything_3/cfg.py +144 -0
  13. depth_anything_3/cli.py +43 -0
  14. depth_anything_3/model/__init__.py +20 -0
  15. depth_anything_3/model/cam_dec.py +45 -0
  16. depth_anything_3/model/cam_enc.py +80 -0
  17. depth_anything_3/model/da3.py +378 -0
  18. depth_anything_3/model/dinov2/dinov2.py +64 -0
  19. depth_anything_3/model/dinov2/layers/__init__.py +25 -0
  20. depth_anything_3/model/dinov2/layers/attention.py +100 -0
  21. depth_anything_3/model/dinov2/layers/block.py +143 -0
  22. depth_anything_3/model/dinov2/layers/drop_path.py +35 -0
  23. depth_anything_3/model/dinov2/layers/layer_scale.py +31 -0
  24. depth_anything_3/model/dinov2/layers/mlp.py +40 -0
  25. depth_anything_3/model/dinov2/layers/patch_embed.py +94 -0
  26. depth_anything_3/model/dinov2/layers/rope.py +200 -0
  27. depth_anything_3/model/dinov2/layers/swiglu_ffn.py +62 -0
  28. depth_anything_3/model/dinov2/vision_transformer.py +437 -0
  29. depth_anything_3/model/dpt.py +457 -0
  30. depth_anything_3/model/dualdpt.py +488 -0
  31. depth_anything_3/model/gs_adapter.py +200 -0
  32. depth_anything_3/model/gsdpt.py +133 -0
  33. depth_anything_3/model/utils/attention.py +109 -0
  34. depth_anything_3/model/utils/block.py +81 -0
  35. depth_anything_3/model/utils/gs_renderer.py +340 -0
  36. depth_anything_3/model/utils/head_utils.py +230 -0
  37. depth_anything_3/model/utils/transform.py +208 -0
  38. depth_anything_3/registry.py +50 -0
  39. depth_anything_3/services/__init__.py +24 -0
  40. depth_anything_3/services/backend.py +538 -0
  41. depth_anything_3/services/gallery.py +562 -0
  42. depth_anything_3/services/inference_service.py +225 -0
  43. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,122 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Hugging Face Spaces App for Depth Anything 3.
+
+This app uses the @spaces.GPU decorator to dynamically allocate GPU resources
+for model inference on Hugging Face Spaces.
+"""
+
+import os
+import spaces
+from depth_anything_3.app.gradio_app import DepthAnything3App
+from depth_anything_3.app.modules.model_inference import ModelInference
+
+# Apply @spaces.GPU decorator to run_inference method
+# This ensures GPU operations happen in isolated subprocess
+# Model loading and inference will occur in GPU subprocess, not main process
+original_run_inference = ModelInference.run_inference
+
+@spaces.GPU(duration=120)  # Request GPU for up to 120 seconds per inference
+def gpu_run_inference(self, *args, **kwargs):
+    """
+    GPU-accelerated inference with Spaces decorator.
+
+    This function runs in a GPU subprocess where:
+    - Model is loaded and moved to GPU (safe)
+    - CUDA operations are allowed
+    - All CUDA tensors are moved to CPU before return (for pickle safety)
+    """
+    return original_run_inference(self, *args, **kwargs)
+
+# Replace the original method with the GPU-decorated version
+ModelInference.run_inference = gpu_run_inference
+
+# Initialize and launch the app
+if __name__ == "__main__":
+    # Configure directories for Hugging Face Spaces
+    model_dir = os.environ.get("DA3_MODEL_DIR", "depth-anything/DA3NESTED-GIANT-LARGE")
+    workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "workspace/gradio")
+    gallery_dir = os.environ.get("DA3_GALLERY_DIR", "workspace/gallery")
+
+    # Create directories if they don't exist
+    os.makedirs(workspace_dir, exist_ok=True)
+    os.makedirs(gallery_dir, exist_ok=True)
+
+    # Initialize the app
+    app = DepthAnything3App(
+        model_dir=model_dir,
+        workspace_dir=workspace_dir,
+        gallery_dir=gallery_dir
+    )
+
+    # Check if examples directory exists
+    examples_dir = os.path.join(workspace_dir, "examples")
+    examples_exist = os.path.exists(examples_dir)
+
+    # Check if caching is enabled via environment variable (default: True if examples exist)
+    # Allow disabling via environment variable: DA3_CACHE_EXAMPLES=false
+    cache_examples_env = os.environ.get("DA3_CACHE_EXAMPLES", "").lower()
+    if cache_examples_env in ("false", "0", "no"):
+        cache_examples = False
+    elif cache_examples_env in ("true", "1", "yes"):
+        cache_examples = True
+    else:
+        # Default: enable caching if examples directory exists
+        cache_examples = examples_exist
+
+    # Get cache_gs_tag from environment variable (default: "dl3dv")
+    cache_gs_tag = os.environ.get("DA3_CACHE_GS_TAG", "dl3dv")
+
+    # Launch with Spaces-friendly settings
+    print("🚀 Launching Depth Anything 3 on Hugging Face Spaces...")
+    print(f"📦 Model Directory: {model_dir}")
+    print(f"📁 Workspace Directory: {workspace_dir}")
+    print(f"🖼️ Gallery Directory: {gallery_dir}")
+    print(f"💾 Cache Examples: {cache_examples}")
+    if cache_examples:
+        if cache_gs_tag:
+            print(f"🏷️ Cache GS Tag: '{cache_gs_tag}' (scenes matching this tag will use high-res + 3DGS)")
+        else:
+            print("🏷️ Cache GS Tag: None (all scenes will use low-res only)")
+
+    # Pre-cache examples if requested
+    if cache_examples:
+        print("\n" + "=" * 60)
+        print("Pre-caching mode enabled")
+        if cache_gs_tag:
+            print(f"Scenes containing '{cache_gs_tag}' will use HIGH-RES + 3DGS")
+            print("Other scenes will use LOW-RES only")
+        else:
+            print("All scenes will use LOW-RES only")
+        print("=" * 60)
+        app.cache_examples(
+            show_cam=True,
+            filter_black_bg=False,
+            filter_white_bg=False,
+            save_percentage=5.0,
+            num_max_points=1000,
+            cache_gs_tag=cache_gs_tag,
+            gs_trj_mode="smooth",
+            gs_video_quality="low",
+        )
+
+    # Launch with minimal, Spaces-compatible configuration
+    # Some parameters may cause routing issues, so we use minimal config
+    app.launch(
+        host="0.0.0.0",  # Required for Spaces
+        port=7860,  # Standard Gradio port
+        share=False  # Not needed on Spaces
+    )
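The file above relies on a small monkey-patching trick: the original method is saved, a `@spaces.GPU`-decorated module-level function forwards to it, and the class attribute is reassigned. Below is a minimal, self-contained sketch of the same pattern with a stub class; `Worker`, `_gpu_run`, and the 60-second duration are illustrative placeholders, not part of this repository.

```python
import spaces


class Worker:
    """Stand-in for ModelInference: pretend `run` needs a GPU."""

    def run(self, x):
        return x * 2


_original_run = Worker.run  # keep a reference to the undecorated method


@spaces.GPU(duration=60)  # GPU is attached only while this call runs
def _gpu_run(self, *args, **kwargs):
    # Executed in the Spaces GPU subprocess; return values are pickled back,
    # so any CUDA tensors should be moved to CPU before returning.
    return _original_run(self, *args, **kwargs)


Worker.run = _gpu_run  # every Worker instance now routes through the wrapper
```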
depth_anything_3/api.py ADDED
@@ -0,0 +1,414 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Depth Anything 3 API module.
+
+This module provides the main API for Depth Anything 3, including model loading,
+inference, and export capabilities. It supports both single and nested model architectures.
+"""
+
+from __future__ import annotations
+
+import time
+from typing import Optional, Sequence
+import numpy as np
+import torch
+import torch.nn as nn
+from huggingface_hub import PyTorchModelHubMixin
+from PIL import Image
+
+from depth_anything_3.cfg import create_object, load_config
+from depth_anything_3.registry import MODEL_REGISTRY
+from depth_anything_3.specs import Prediction
+from depth_anything_3.utils.export import export
+from depth_anything_3.utils.geometry import affine_inverse
+from depth_anything_3.utils.io.input_processor import InputProcessor
+from depth_anything_3.utils.io.output_processor import OutputProcessor
+from depth_anything_3.utils.logger import logger
+from depth_anything_3.utils.pose_align import align_poses_umeyama
+
+torch.backends.cudnn.benchmark = False
+# logger.info("CUDNN Benchmark Disabled")
+
+SAFETENSORS_NAME = "model.safetensors"
+CONFIG_NAME = "config.json"
+
+
+class DepthAnything3(nn.Module, PyTorchModelHubMixin):
+    """
+    Depth Anything 3 main API class.
+
+    This class provides a high-level interface for depth estimation using Depth Anything 3.
+    It supports both single and nested model architectures with metric scaling capabilities.
+
+    Features:
+    - Hugging Face Hub integration via PyTorchModelHubMixin
+    - Support for multiple model presets (vitb, vitg, nested variants)
+    - Automatic mixed precision inference
+    - Export capabilities for various formats (GLB, PLY, NPZ, etc.)
+    - Camera pose estimation and metric depth scaling
+
+    Usage:
+        # Load from Hugging Face Hub
+        model = DepthAnything3.from_pretrained("huggingface/model-name")
+
+        # Or create with specific preset
+        model = DepthAnything3(preset="vitg")
+
+        # Run inference
+        prediction = model.inference(images, export_dir="output", export_format="glb")
+    """
+
+    _commit_hash: str | None = None  # Set by mixin when loading from Hub
+
+    def __init__(self, model_name: str = "da3-large", **kwargs):
+        """
+        Initialize DepthAnything3 with specified preset.
+
+        Args:
+            model_name: The name of the model preset to use.
+                Examples: 'da3-giant', 'da3-large', 'da3metric-large', 'da3nested-giant-large'.
+            **kwargs: Additional keyword arguments (currently unused).
+        """
+        super().__init__()
+        self.model_name = model_name
+
+        # Build the underlying network
+        self.config = load_config(MODEL_REGISTRY[self.model_name])
+        self.model = create_object(self.config)
+        self.model.eval()
+
+        # Initialize processors
+        self.input_processor = InputProcessor()
+        self.output_processor = OutputProcessor()
+
+        # Device management (set by user)
+        self.device = None
+
+    @torch.inference_mode()
+    def forward(
+        self,
+        image: torch.Tensor,
+        extrinsics: torch.Tensor | None = None,
+        intrinsics: torch.Tensor | None = None,
+        export_feat_layers: list[int] | None = None,
+        infer_gs: bool = False,
+    ) -> dict[str, torch.Tensor]:
+        """
+        Forward pass through the model.
+
+        Args:
+            image: Input batch with shape ``(B, N, 3, H, W)`` on the model device.
+            extrinsics: Optional camera extrinsics with shape ``(B, N, 4, 4)``.
+            intrinsics: Optional camera intrinsics with shape ``(B, N, 3, 3)``.
+            export_feat_layers: Layer indices to return intermediate features for.
+
+        Returns:
+            Dictionary containing model predictions
+        """
+        # Determine optimal autocast dtype
+        autocast_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        with torch.no_grad():
+            with torch.autocast(device_type=image.device.type, dtype=autocast_dtype):
+                return self.model(image, extrinsics, intrinsics, export_feat_layers, infer_gs)
+
+    def inference(
+        self,
+        image: list[np.ndarray | Image.Image | str],
+        extrinsics: np.ndarray | None = None,
+        intrinsics: np.ndarray | None = None,
+        align_to_input_ext_scale: bool = True,
+        infer_gs: bool = False,
+        render_exts: np.ndarray | None = None,
+        render_ixts: np.ndarray | None = None,
+        render_hw: tuple[int, int] | None = None,
+        process_res: int = 504,
+        process_res_method: str = "upper_bound_resize",
+        export_dir: str | None = None,
+        export_format: str = "mini_npz",
+        export_feat_layers: Sequence[int] | None = None,
+        # GLB export parameters
+        conf_thresh_percentile: float = 40.0,
+        num_max_points: int = 1_000_000,
+        show_cameras: bool = True,
+        # Feat_vis export parameters
+        feat_vis_fps: int = 15,
+        export_kwargs: Optional[dict] = {},
+    ) -> Prediction:
+        """
+        Run inference on input images.
+
+        Args:
+            image: List of input images (numpy arrays, PIL Images, or file paths)
+            extrinsics: Camera extrinsics (N, 4, 4)
+            intrinsics: Camera intrinsics (N, 3, 3)
+            align_to_input_ext_scale: whether to align the input pose scale to the prediction
+            infer_gs: Enable the 3D Gaussian branch (needed for `gs_ply`/`gs_video` exports)
+            render_exts: Optional render extrinsics for Gaussian video export
+            render_ixts: Optional render intrinsics for Gaussian video export
+            render_hw: Optional render resolution for Gaussian video export
+            process_res: Processing resolution
+            process_res_method: Resize method for processing
+            export_dir: Directory to export results
+            export_format: Export format (mini_npz, npz, glb, ply, gs, gs_video)
+            export_feat_layers: Layer indices to export intermediate features from
+            conf_thresh_percentile: [GLB] Lower percentile for adaptive confidence threshold (default: 40.0)  # noqa: E501
+            num_max_points: [GLB] Maximum number of points in the point cloud (default: 1,000,000)
+            show_cameras: [GLB] Show camera wireframes in the exported scene (default: True)
+            feat_vis_fps: [FEAT_VIS] Frame rate for output video (default: 15)
+            export_kwargs: additional arguments to export functions.
+
+        Returns:
+            Prediction object containing depth maps and camera parameters
+        """
+        if "gs" in export_format:
+            assert infer_gs, "must set `infer_gs=True` to perform gs-related export."
+
+        # Preprocess images
+        imgs_cpu, extrinsics, intrinsics = self._preprocess_inputs(
+            image, extrinsics, intrinsics, process_res, process_res_method
+        )
+
+        # Prepare tensors for model
+        imgs, ex_t, in_t = self._prepare_model_inputs(imgs_cpu, extrinsics, intrinsics)
+
+        # Normalize extrinsics
+        ex_t_norm = self._normalize_extrinsics(ex_t.clone() if ex_t is not None else None)
+
+        # Run model forward pass
+        export_feat_layers = list(export_feat_layers) if export_feat_layers is not None else []
+
+        raw_output = self._run_model_forward(imgs, ex_t_norm, in_t, export_feat_layers, infer_gs)
+
+        # Convert raw output to prediction
+        prediction = self._convert_to_prediction(raw_output)
+
+        # Align prediction to extrinsics
+        prediction = self._align_to_input_extrinsics_intrinsics(
+            extrinsics, intrinsics, prediction, align_to_input_ext_scale
+        )
+
+        # Add processed images for visualization
+        prediction = self._add_processed_images(prediction, imgs_cpu)
+
+        # Export if requested
+        if export_dir is not None:
+
+            if "gs" in export_format:
+                if infer_gs and "gs_video" not in export_format:
+                    export_format = f"{export_format}-gs_video"
+                if "gs_video" in export_format:
+                    if "gs_video" not in export_kwargs:
+                        export_kwargs["gs_video"] = {}
+                    export_kwargs["gs_video"].update(
+                        {
+                            "extrinsics": render_exts,
+                            "intrinsics": render_ixts,
+                            "out_image_hw": render_hw,
+                        }
+                    )
+            # Add GLB export parameters
+            if "glb" in export_format:
+                if "glb" not in export_kwargs:
+                    export_kwargs["glb"] = {}
+                export_kwargs["glb"].update(
+                    {
+                        "conf_thresh_percentile": conf_thresh_percentile,
+                        "num_max_points": num_max_points,
+                        "show_cameras": show_cameras,
+                    }
+                )
+            # Add Feat_vis export parameters
+            if "feat_vis" in export_format:
+                if "feat_vis" not in export_kwargs:
+                    export_kwargs["feat_vis"] = {}
+                export_kwargs["feat_vis"].update(
+                    {
+                        "fps": feat_vis_fps,
+                    }
+                )
+            self._export_results(prediction, export_format, export_dir, **export_kwargs)
+
+        return prediction
+
+    def _preprocess_inputs(
+        self,
+        image: list[np.ndarray | Image.Image | str],
+        extrinsics: np.ndarray | None = None,
+        intrinsics: np.ndarray | None = None,
+        process_res: int = 504,
+        process_res_method: str = "upper_bound_resize",
+    ) -> torch.Tensor:
+        """Preprocess input images using input processor."""
+        start_time = time.time()
+        imgs_cpu, extrinsics, intrinsics = self.input_processor(
+            image,
+            extrinsics.copy() if extrinsics is not None else None,
+            intrinsics.copy() if intrinsics is not None else None,
+            process_res,
+            process_res_method,
+        )
+        end_time = time.time()
+        logger.info(
+            "Processed Images Done taking",
+            end_time - start_time,
+            "seconds. Shape: ",
+            imgs_cpu.shape,
+        )
+        return imgs_cpu, extrinsics, intrinsics
+
+    def _prepare_model_inputs(
+        self,
+        imgs_cpu: torch.Tensor,
+        extrinsics: torch.tensor | None,
+        intrinsics: torch.tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        """Prepare tensors for model input."""
+        device = self._get_model_device()
+
+        # Move images to model device
+        imgs = imgs_cpu.to(device, non_blocking=True)[None].float()
+
+        # Convert camera parameters to tensors
+        ex_t = (
+            extrinsics.to(device, non_blocking=True)[None].float()
+            if extrinsics is not None
+            else None
+        )
+        in_t = (
+            intrinsics.to(device, non_blocking=True)[None].float()
+            if intrinsics is not None
+            else None
+        )
+
+        return imgs, ex_t, in_t
+
+    def _normalize_extrinsics(self, ex_t: torch.Tensor) -> torch.Tensor:
+        """Normalize extrinsics"""
+        if ex_t is None:
+            return None
+        transform = affine_inverse(ex_t[:, :1])
+        ex_t_norm = ex_t @ transform
+        c2ws = affine_inverse(ex_t_norm)
+        translations = c2ws[..., :3, 3]
+        dists = translations.norm(dim=-1)
+        median_dist = torch.median(dists)
+        median_dist = torch.clamp(median_dist, min=1e-1)
+        ex_t_norm[..., :3, 3] = ex_t_norm[..., :3, 3] / median_dist
+        return ex_t_norm
+
+    def _align_to_input_extrinsics_intrinsics(
+        self,
+        extrinsics: torch.Tensor,
+        intrinsics: torch.Tensor,
+        prediction: Prediction,
+        align_to_input_ext_scale: bool = True,
+        ransac_view_thresh: int = 10,
+    ) -> Prediction:
+        """Align depth map to input extrinsics"""
+        if extrinsics is None:
+            return prediction
+        prediction.intrinsics = intrinsics.numpy()
+        _, _, scale, aligned_extrinsics = align_poses_umeyama(
+            prediction.extrinsics,
+            extrinsics.numpy(),
+            ransac=len(extrinsics) >= ransac_view_thresh,
+            return_aligned=True,
+            random_state=42,
+        )
+        if align_to_input_ext_scale:
+            prediction.extrinsics = extrinsics[..., :3, :].numpy()
+            prediction.depth /= scale
+        else:
+            prediction.extrinsics = aligned_extrinsics
+        return prediction
+
+    def _run_model_forward(
+        self,
+        imgs: torch.Tensor,
+        ex_t: torch.Tensor | None,
+        in_t: torch.Tensor | None,
+        export_feat_layers: Sequence[int] | None = None,
+        infer_gs: bool = False,
+    ) -> dict[str, torch.Tensor]:
+        """Run model forward pass."""
+        device = imgs.device
+        need_sync = device.type == "cuda"
+        if need_sync:
+            torch.cuda.synchronize(device)
+        start_time = time.time()
+        feat_layers = list(export_feat_layers) if export_feat_layers is not None else None
+        output = self.forward(imgs, ex_t, in_t, feat_layers, infer_gs)
+        if need_sync:
+            torch.cuda.synchronize(device)
+        end_time = time.time()
+        logger.info(f"Model Forward Pass Done. Time: {end_time - start_time} seconds")
+        return output
+
+    def _convert_to_prediction(self, raw_output: dict[str, torch.Tensor]) -> Prediction:
+        """Convert raw model output to Prediction object."""
+        start_time = time.time()
+        output = self.output_processor(raw_output)
+        end_time = time.time()
+        logger.info(f"Conversion to Prediction Done. Time: {end_time - start_time} seconds")
+        return output
+
+    def _add_processed_images(self, prediction: Prediction, imgs_cpu: torch.Tensor) -> Prediction:
+        """Add processed images to prediction for visualization."""
+        # Convert from (N, 3, H, W) to (N, H, W, 3) and denormalize
+        processed_imgs = imgs_cpu.permute(0, 2, 3, 1).cpu().numpy()  # (N, H, W, 3)
+
+        # Denormalize from ImageNet normalization
+        mean = np.array([0.485, 0.456, 0.406])
+        std = np.array([0.229, 0.224, 0.225])
+        processed_imgs = processed_imgs * std + mean
+        processed_imgs = np.clip(processed_imgs, 0, 1)
+        processed_imgs = (processed_imgs * 255).astype(np.uint8)
+
+        prediction.processed_images = processed_imgs
+        return prediction
+
+    def _export_results(
+        self, prediction: Prediction, export_format: str, export_dir: str, **kwargs
+    ) -> None:
+        """Export results to specified format and directory."""
+        start_time = time.time()
+        export(prediction, export_format, export_dir, **kwargs)
+        end_time = time.time()
+        logger.info(f"Export Results Done. Time: {end_time - start_time} seconds")
+
+    def _get_model_device(self) -> torch.device:
+        """
+        Get the device where the model is located.
+
+        Returns:
+            Device where the model parameters are located
+
+        Raises:
+            ValueError: If no tensors are found in the model
+        """
+        if self.device is not None:
+            return self.device
+
+        # Find device from parameters
+        for param in self.parameters():
+            self.device = param.device
+            return param.device
+
+        # Find device from buffers
+        for buffer in self.buffers():
+            self.device = buffer.device
+            return buffer.device
+
+        raise ValueError("No tensor found in model")
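Putting the pieces of this file together, a typical call from user code looks roughly like the sketch below. The checkpoint id matches the default used by app.py; the image paths and output directory are placeholders, while the method names and keyword arguments follow the signatures defined above.

```python
import torch

from depth_anything_3.api import DepthAnything3

# Load weights from the Hub (checkpoint id taken from app.py's default).
model = DepthAnything3.from_pretrained("depth-anything/DA3NESTED-GIANT-LARGE")
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

prediction = model.inference(
    ["frames/000.jpg", "frames/001.jpg", "frames/002.jpg"],  # placeholder paths
    process_res=504,              # default processing resolution
    export_dir="output",          # write results to disk
    export_format="glb",          # point cloud + camera wireframes
    conf_thresh_percentile=40.0,  # GLB confidence filtering
)

print(prediction.depth.shape)       # per-view depth maps
print(prediction.extrinsics.shape)  # estimated camera poses
```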
depth_anything_3/app/css_and_html.py ADDED
@@ -0,0 +1,594 @@
1
+ # flake8: noqa: E501
2
+
3
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """
18
+ CSS and HTML content for the Depth Anything 3 Gradio application.
19
+ This module contains all the CSS styles and HTML content blocks
20
+ used in the Gradio interface.
21
+ """
22
+
23
+ # CSS Styles for the Gradio interface
24
+ GRADIO_CSS = """
25
+ /* Add Font Awesome CDN with all styles including brands and colors */
26
+ @import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');
27
+
28
+ /* Add custom styles for colored icons */
29
+ .fa-color-blue {
30
+ color: #3b82f6;
31
+ }
32
+
33
+ .fa-color-purple {
34
+ color: #8b5cf6;
35
+ }
36
+
37
+ .fa-color-cyan {
38
+ color: #06b6d4;
39
+ }
40
+
41
+ .fa-color-green {
42
+ color: #10b981;
43
+ }
44
+
45
+ .fa-color-yellow {
46
+ color: #f59e0b;
47
+ }
48
+
49
+ .fa-color-red {
50
+ color: #ef4444;
51
+ }
52
+
53
+ .link-btn {
54
+ display: inline-flex;
55
+ align-items: center;
56
+ gap: 8px;
57
+ text-decoration: none;
58
+ padding: 12px 24px;
59
+ border-radius: 50px;
60
+ font-weight: 500;
61
+ transition: all 0.3s ease;
62
+ }
63
+
64
+ /* Dark mode tech theme */
65
+ @media (prefers-color-scheme: dark) {
66
+ html, body {
67
+ background: #1e293b;
68
+ color: #ffffff;
69
+ }
70
+
71
+ .gradio-container {
72
+ background: #1e293b;
73
+ color: #ffffff;
74
+ }
75
+
76
+ .link-btn {
77
+ background: rgba(255, 255, 255, 0.2);
78
+ color: white;
79
+ backdrop-filter: blur(10px);
80
+ border: 1px solid rgba(255, 255, 255, 0.3);
81
+ }
82
+
83
+ .link-btn:hover {
84
+ background: rgba(255, 255, 255, 0.3);
85
+ transform: translateY(-2px);
86
+ box-shadow: 0 8px 25px rgba(0, 0, 0, 0.2);
87
+ }
88
+
89
+ .tech-bg {
90
+ background: linear-gradient(135deg, #0f172a, #1e293b); /* Darker colors */
91
+ position: relative;
92
+ overflow: hidden;
93
+ }
94
+
95
+ .tech-bg::before {
96
+ content: '';
97
+ position: absolute;
98
+ top: 0;
99
+ left: 0;
100
+ right: 0;
101
+ bottom: 0;
102
+ background:
103
+ radial-gradient(circle at 20% 80%, rgba(59, 130, 246, 0.15) 0%, transparent 50%), /* Reduced opacity */
104
+ radial-gradient(circle at 80% 20%, rgba(139, 92, 246, 0.15) 0%, transparent 50%), /* Reduced opacity */
105
+ radial-gradient(circle at 40% 40%, rgba(18, 194, 233, 0.1) 0%, transparent 50%); /* Reduced opacity */
106
+ animation: techPulse 8s ease-in-out infinite;
107
+ }
108
+
109
+ .gradio-container .panel,
110
+ .gradio-container .block,
111
+ .gradio-container .form {
112
+ background: rgba(0, 0, 0, 0.3);
113
+ border: 1px solid rgba(59, 130, 246, 0.2);
114
+ border-radius: 10px;
115
+ }
116
+
117
+ .gradio-container * {
118
+ color: #ffffff;
119
+ }
120
+
121
+ .gradio-container label {
122
+ color: #e0e0e0;
123
+ }
124
+
125
+ .gradio-container .markdown {
126
+ color: #e0e0e0;
127
+ }
128
+ }
129
+
130
+ /* Light mode tech theme */
131
+ @media (prefers-color-scheme: light) {
132
+ html, body {
133
+ background: #ffffff;
134
+ color: #1e293b;
135
+ }
136
+
137
+ .gradio-container {
138
+ background: #ffffff;
139
+ color: #1e293b;
140
+ }
141
+
142
+ .tech-bg {
143
+ background: linear-gradient(135deg, #ffffff, #f1f5f9);
144
+ position: relative;
145
+ overflow: hidden;
146
+ }
147
+
148
+ .link-btn {
149
+ background: rgba(59, 130, 246, 0.15);
150
+ color: var(--body-text-color);
151
+ border: 1px solid rgba(59, 130, 246, 0.3);
152
+ }
153
+
154
+ .link-btn:hover {
155
+ background: rgba(59, 130, 246, 0.25);
156
+ transform: translateY(-2px);
157
+ box-shadow: 0 8px 25px rgba(59, 130, 246, 0.2);
158
+ }
159
+
160
+ .tech-bg::before {
161
+ content: '';
162
+ position: absolute;
163
+ top: 0;
164
+ left: 0;
165
+ right: 0;
166
+ bottom: 0;
167
+ background:
168
+ radial-gradient(circle at 20% 80%, rgba(59, 130, 246, 0.1) 0%, transparent 50%),
169
+ radial-gradient(circle at 80% 20%, rgba(139, 92, 246, 0.1) 0%, transparent 50%),
170
+ radial-gradient(circle at 40% 40%, rgba(18, 194, 233, 0.08) 0%, transparent 50%);
171
+ animation: techPulse 8s ease-in-out infinite;
172
+ }
173
+
174
+ .gradio-container .panel,
175
+ .gradio-container .block,
176
+ .gradio-container .form {
177
+ background: rgba(255, 255, 255, 0.8);
178
+ border: 1px solid rgba(59, 130, 246, 0.3);
179
+ border-radius: 10px;
180
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
181
+ }
182
+
183
+ .gradio-container * {
184
+ color: #1e293b;
185
+ }
186
+
187
+ .gradio-container label {
188
+ color: #334155;
189
+ }
190
+
191
+ .gradio-container .markdown {
192
+ color: #334155;
193
+ }
194
+ }
195
+
196
+
197
+
198
+
199
+ @keyframes techPulse {
200
+ 0%, 100% { opacity: 0.5; }
201
+ 50% { opacity: 0.8; }
202
+ }
203
+
204
+ /* Custom log with tech gradient */
205
+ .custom-log * {
206
+ font-style: italic;
207
+ font-size: 22px !important;
208
+ background: linear-gradient(135deg, #3b82f6, #8b5cf6);
209
+ background-size: 400% 400%;
210
+ -webkit-background-clip: text;
211
+ background-clip: text;
212
+ font-weight: bold !important;
213
+ color: transparent !important;
214
+ text-align: center !important;
215
+ animation: techGradient 3s ease infinite;
216
+ }
217
+
218
+ @keyframes techGradient {
219
+ 0% { background-position: 0% 50%; }
220
+ 50% { background-position: 100% 50%; }
221
+ 100% { background-position: 0% 50%; }
222
+ }
223
+
224
+ @keyframes metricPulse {
225
+ 0%, 100% { background-position: 0% 50%; }
226
+ 50% { background-position: 100% 50%; }
227
+ }
228
+
229
+ @keyframes pointcloudPulse {
230
+ 0%, 100% { background-position: 0% 50%; }
231
+ 50% { background-position: 100% 50%; }
232
+ }
233
+
234
+ @keyframes camerasPulse {
235
+ 0%, 100% { background-position: 0% 50%; }
236
+ 50% { background-position: 100% 50%; }
237
+ }
238
+
239
+ @keyframes gaussiansPulse {
240
+ 0%, 100% { background-position: 0% 50%; }
241
+ 50% { background-position: 100% 50%; }
242
+ }
243
+
244
+ /* Special colors for key terms - Global styles */
245
+ .metric-text {
246
+ background: linear-gradient(45deg, #ff6b6b, #ff8e53, #ff6b6b);
247
+ background-size: 200% 200%;
248
+ -webkit-background-clip: text;
249
+ background-clip: text;
250
+ color: transparent !important;
251
+ animation: metricPulse 2s ease-in-out infinite;
252
+ font-weight: 700;
253
+ text-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
254
+ }
255
+
256
+ .pointcloud-text {
257
+ background: linear-gradient(45deg, #4ecdc4, #44a08d, #4ecdc4);
258
+ background-size: 200% 200%;
259
+ -webkit-background-clip: text;
260
+ background-clip: text;
261
+ color: transparent !important;
262
+ animation: pointcloudPulse 2.5s ease-in-out infinite;
263
+ font-weight: 700;
264
+ text-shadow: 0 0 10px rgba(78, 205, 196, 0.5);
265
+ }
266
+
267
+ .cameras-text {
268
+ background: linear-gradient(45deg, #667eea, #764ba2, #667eea);
269
+ background-size: 200% 200%;
270
+ -webkit-background-clip: text;
271
+ background-clip: text;
272
+ color: transparent !important;
273
+ animation: camerasPulse 3s ease-in-out infinite;
274
+ font-weight: 700;
275
+ text-shadow: 0 0 10px rgba(102, 126, 234, 0.5);
276
+ }
277
+
278
+ .gaussians-text {
279
+ background: linear-gradient(45deg, #f093fb, #f5576c, #f093fb);
280
+ background-size: 200% 200%;
281
+ -webkit-background-clip: text;
282
+ background-clip: text;
283
+ color: transparent !important;
284
+ animation: gaussiansPulse 2.2s ease-in-out infinite;
285
+ font-weight: 700;
286
+ text-shadow: 0 0 10px rgba(240, 147, 251, 0.5);
287
+ }
288
+
289
+ .example-log * {
290
+ font-style: italic;
291
+ font-size: 16px !important;
292
+ background: linear-gradient(135deg, #3b82f6, #8b5cf6);
293
+ -webkit-background-clip: text;
294
+ background-clip: text;
295
+ color: transparent !important;
296
+ }
297
+
298
+ #my_radio .wrap {
299
+ display: flex;
300
+ flex-wrap: nowrap;
301
+ justify-content: center;
302
+ align-items: center;
303
+ }
304
+
305
+ #my_radio .wrap label {
306
+ display: flex;
307
+ width: 50%;
308
+ justify-content: center;
309
+ align-items: center;
310
+ margin: 0;
311
+ padding: 10px 0;
312
+ box-sizing: border-box;
313
+ }
314
+
315
+ /* Align navigation buttons with dropdown bottom */
316
+ .navigation-row {
317
+ display: flex !important;
318
+ align-items: flex-end !important;
319
+ gap: 8px !important;
320
+ }
321
+
322
+ .navigation-row > div:nth-child(1),
323
+ .navigation-row > div:nth-child(3) {
324
+ align-self: flex-end !important;
325
+ }
326
+
327
+ .navigation-row > div:nth-child(2) {
328
+ flex: 1 !important;
329
+ }
330
+
331
+ /* Make thumbnails clickable with pointer cursor */
332
+ .clickable-thumbnail img {
333
+ cursor: pointer !important;
334
+ }
335
+
336
+ .clickable-thumbnail:hover img {
337
+ cursor: pointer !important;
338
+ opacity: 0.8;
339
+ transition: opacity 0.3s ease;
340
+ }
341
+
342
+ /* Make thumbnail containers narrower horizontally */
343
+ .clickable-thumbnail {
344
+ padding: 5px 2px !important;
345
+ margin: 0 2px !important;
346
+ }
347
+
348
+ .clickable-thumbnail .image-container {
349
+ margin: 0 !important;
350
+ padding: 0 !important;
351
+ }
352
+
353
+ .scene-info {
354
+ text-align: center !important;
355
+ padding: 5px 2px !important;
356
+ margin: 0 !important;
357
+ }
358
+ """
359
+
360
+
361
+ def get_header_html(logo_base64=None):
362
+ """
363
+ Generate the main header HTML with logo and title.
364
+
365
+ Args:
366
+ logo_base64 (str, optional): Base64 encoded logo image
367
+
368
+ Returns:
369
+ str: HTML string for the header
370
+ """
371
+ return """
372
+ <div class="tech-bg" style="text-align: center; margin-bottom: 5px; padding: 40px 20px; border-radius: 15px; position: relative; overflow: hidden;">
373
+ <div style="position: relative; z-index: 2;">
374
+ <h1 style="margin: 0; font-size: 3.5em; font-weight: 700;
375
+ background: linear-gradient(135deg, #3b82f6, #8b5cf6);
376
+ background-size: 400% 400%;
377
+ -webkit-background-clip: text;
378
+ background-clip: text;
379
+ color: transparent;
380
+ animation: techGradient 3s ease infinite;
381
+ text-shadow: 0 0 30px rgba(59, 130, 246, 0.5);
382
+ letter-spacing: 2px;">
383
+ Depth Anything 3
384
+ </h1>
385
+ <p style="margin: 15px 0 0 0; font-size: 2.16em; font-weight: 300;" class="header-subtitle">
386
+ Recovering the Visual Space from Any Views
387
+ </p>
388
+ <div style="margin-top: 20px;">
389
+ <!-- Revert buttons to original inline styles -->
390
+ <a href="https://depth-anything-3.github.io" target="_blank" class="link-btn">
391
+ <i class="fas fa-globe" style="margin-right: 8px;"></i> Project Page
392
+ </a>
393
+ <a href="https://arxiv.org/abs/2511.10647" target="_blank" class="link-btn">
394
+ <i class="fas fa-file-pdf" style="margin-right: 8px;"></i> Paper
395
+ </a>
396
+ <a href="https://github.com/ByteDance-Seed/Depth-Anything-3" target="_blank" class="link-btn">
397
+ <i class="fab fa-github" style="margin-right: 8px;"></i> Code
398
+ </a>
399
+ </div>
400
+ </div>
401
+ </div>
402
+
403
+ <style>
404
+ /* Ensure tech-bg class is properly applied in dark mode */
405
+ @media (prefers-color-scheme: dark) {
406
+ .header-subtitle {
407
+ color: #cbd5e1;
408
+ }
409
+ /* Increase priority to ensure background color is properly applied */
410
+ .tech-bg {
411
+ background: linear-gradient(135deg, #0f172a, #1e293b) !important;
412
+ }
413
+ }
414
+
415
+ @media (prefers-color-scheme: light) {
416
+ .header-subtitle {
417
+ color: #475569;
418
+ }
419
+ /* Also add explicit background color for light mode */
420
+ .tech-bg {
421
+ background: linear-gradient(135deg, rgba(59, 130, 246, 0.1) 0%, rgba(139, 92, 246, 0.1) 100%) !important;
422
+ }
423
+ }
424
+ </style>
425
+ """
426
+
427
+
428
+ def get_description_html():
429
+ """
430
+ Generate the main description and getting started HTML.
431
+
432
+ Returns:
433
+ str: HTML string for the description
434
+ """
435
+ return """
436
+ <div class="description-container" style="padding: 25px; border-radius: 15px; margin: 0 0 20px 0;">
437
+ <h2 class="description-title" style="margin-top: 0; font-size: 1.6em; text-align: center;">
438
+ <i class="fas fa-bullseye fa-color-red" style="margin-right: 8px;"></i> What This Demo Does
439
+ </h2>
440
+ <div class="description-content" style="padding: 20px; border-radius: 10px; margin: 15px 0; text-align: center;">
441
+ <p class="description-main" style="line-height: 1.6; margin: 0; font-size: 1.45em;">
442
+ <strong>Upload images or videos</strong> → <strong>Get <span class="metric-text">Metric</span> <span class="pointcloud-text">Point Clouds</span>, <span class="cameras-text">Cameras</span> and <span class="gaussians-text">Novel Views</span></strong> → <strong>Explore in 3D</strong>
443
+ </p>
444
+ </div>
445
+
446
+ <div style="text-align: center; margin-top: 15px;">
447
+ <p class="description-tip" style="font-style: italic; margin: 0;">
448
+ <i class="fas fa-lightbulb fa-color-yellow" style="margin-right: 8px;"></i> <strong>Tip:</strong> Landscape-oriented images or videos are preferred for best 3D recovering.
449
+ </p>
450
+ </div>
451
+ </div>
452
+
453
+ <style>
454
+ @media (prefers-color-scheme: dark) {
455
+ .description-container {
456
+ background: linear-gradient(135deg, rgba(59, 130, 246, 0.1) 0%, rgba(139, 92, 246, 0.1) 100%);
457
+ border: 1px solid rgba(59, 130, 246, 0.2);
458
+ }
459
+ .description-title { color: #3b82f6; }
460
+ .description-content { background: rgba(0, 0, 0, 0.3); }
461
+ .description-main { color: #e0e0e0; }
462
+ .description-text { color: #cbd5e1; }
463
+ .description-tip { color: #cbd5e1; }
464
+ }
465
+
466
+ @media (prefers-color-scheme: light) {
467
+ .description-container {
468
+ background: linear-gradient(135deg, rgba(59, 130, 246, 0.05) 0%, rgba(139, 92, 246, 0.05) 100%);
469
+ border: 1px solid rgba(59, 130, 246, 0.3);
470
+ }
471
+ .description-title { color: #3b82f6; }
472
+ .description-content { background: transparent; }
473
+ .description-main { color: #1e293b; }
474
+ .description-text { color: #475569; }
475
+ .description-tip { color: #475569; }
476
+ }
477
+ </style>
478
+ """
479
+
480
+
481
+ def get_acknowledgements_html():
482
+ """
483
+ Generate the acknowledgements section HTML.
484
+
485
+ Returns:
486
+ str: HTML string for the acknowledgements
487
+ """
488
+ return """
489
+ <div style="background: linear-gradient(135deg, rgba(59, 130, 246, 0.1) 0%, rgba(139, 92, 246, 0.1) 100%);
490
+ padding: 25px; border-radius: 15px; margin: 20px 0; border: 1px solid rgba(59, 130, 246, 0.2);">
491
+ <h3 style="color: #3b82f6; margin-top: 0; text-align: center; font-size: 1.4em;">
492
+ <i class="fas fa-trophy fa-color-yellow" style="margin-right: 8px;"></i> Research Credits & Acknowledgments
493
+ </h3>
494
+
495
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin: 15px 0;">
496
+ <!-- Original Research Section (Left) -->
497
+ <div style="text-align: center;">
498
+ <h4 style="color: #8b5cf6; margin: 10px 0;"><i class="fas fa-flask fa-color-green" style="margin-right: 8px;"></i> Original Research</h4>
499
+ <p style="color: #e0e0e0; margin: 5px 0;">
500
+ <a href="https://depth-anything-3.github.io" target="_blank"
501
+ style="color: #3b82f6; text-decoration: none; font-weight: 600;">
502
+ Depth Anything 3
503
+ </a>
504
+ </p>
505
+ </div>
506
+
507
+ <!-- Previous Versions Section (Right) -->
508
+ <div style="text-align: center;">
509
+ <h4 style="color: #8b5cf6; margin: 10px 0;"><i class="fas fa-history fa-color-blue" style="margin-right: 8px;"></i> Previous Versions</h4>
510
+ <div style="display: flex; flex-direction: row; gap: 15px; justify-content: center; align-items: center;">
511
+ <p style="color: #e0e0e0; margin: 0;">
512
+ <a href="https://huggingface.co/spaces/LiheYoung/Depth-Anything" target="_blank"
513
+ style="color: #3b82f6; text-decoration: none; font-weight: 600;">
514
+ Depth-Anything
515
+ </a>
516
+ </p>
517
+ <span style="color: #e0e0e0;">•</span>
518
+ <p style="color: #e0e0e0; margin: 0;">
519
+ <a href="https://huggingface.co/spaces/depth-anything/Depth-Anything-V2" target="_blank"
520
+ style="color: #3b82f6; text-decoration: none; font-weight: 600;">
521
+ Depth-Anything-V2
522
+ </a>
523
+ </p>
524
+ </div>
525
+ </div>
526
+ </div>
527
+
528
+ <!-- HF Demo Adapted from - Centered at the bottom of the whole block -->
529
+ <div style="margin-top: 20px; padding-top: 15px; border-top: 1px solid rgba(59, 130, 246, 0.3); text-align: center;">
530
+ <p style="color: #a0a0a0; font-size: 0.9em; margin: 0;">
531
+ <i class="fas fa-code-branch fa-color-gray" style="margin-right: 5px;"></i> HF demo adapted from <a href="https://huggingface.co/spaces/facebook/map-anything" target="_blank" style="color: inherit; text-decoration: none;">Map Anything</a>
532
+ </p>
533
+ </div>
534
+ </div>
535
+ """
536
+
537
+
538
+ def get_gradio_theme():
539
+ """
540
+ Get the configured Gradio theme with adaptive tech colors.
541
+
542
+ Returns:
543
+ gr.themes.Base: Configured Gradio theme
544
+ """
545
+ import gradio as gr
546
+
547
+ return gr.themes.Base(
548
+ primary_hue=gr.themes.Color(
549
+ c50="#eff6ff",
550
+ c100="#dbeafe",
551
+ c200="#bfdbfe",
552
+ c300="#93c5fd",
553
+ c400="#60a5fa",
554
+ c500="#3b82f6",
555
+ c600="#2563eb",
556
+ c700="#1d4ed8",
557
+ c800="#1e40af",
558
+ c900="#1e3a8a",
559
+ c950="#172554",
560
+ ),
561
+ secondary_hue=gr.themes.Color(
562
+ c50="#f5f3ff",
563
+ c100="#ede9fe",
564
+ c200="#ddd6fe",
565
+ c300="#c4b5fd",
566
+ c400="#a78bfa",
567
+ c500="#8b5cf6",
568
+ c600="#7c3aed",
569
+ c700="#6d28d9",
570
+ c800="#5b21b6",
571
+ c900="#4c1d95",
572
+ c950="#2e1065",
573
+ ),
574
+ neutral_hue=gr.themes.Color(
575
+ c50="#f8fafc",
576
+ c100="#f1f5f9",
577
+ c200="#e2e8f0",
578
+ c300="#cbd5e1",
579
+ c400="#94a3b8",
580
+ c500="#64748b",
581
+ c600="#475569",
582
+ c700="#334155",
583
+ c800="#1e293b",
584
+ c900="#0f172a",
585
+ c950="#020617",
586
+ ),
587
+ )
588
+
589
+
590
+ # Measure tab instructions HTML
591
+ MEASURE_INSTRUCTIONS_HTML = """
592
+ ### Click points on the image to compute distance.
593
+ > <i class="fas fa-triangle-exclamation fa-color-red" style="margin-right: 5px;"></i> Metric scale estimation is difficult on aerial/drone images.
594
+ """
depth_anything_3/app/gradio_app.py ADDED
@@ -0,0 +1,777 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Refactored Gradio App for Depth Anything 3.
17
+
18
+ This is the main application file that orchestrates all components.
19
+ The original functionality has been split into modular components for better maintainability.
20
+ """
21
+
22
+ import argparse
23
+ import os
24
+ from functools import lru_cache
25
+ from typing import Any, Dict, List
26
+ import gradio as gr
27
+
28
+ from depth_anything_3.app.css_and_html import GRADIO_CSS, get_gradio_theme
29
+ from depth_anything_3.app.modules.event_handlers import EventHandlers
30
+ from depth_anything_3.app.modules.ui_components import UIComponents
31
+
32
+ # Set environment variables
33
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
34
+
35
+
36
+ class DepthAnything3App:
37
+ """
38
+ Main application class for Depth Anything 3 Gradio app.
39
+ """
40
+
41
+ def __init__(self, model_dir: str = None, workspace_dir: str = None, gallery_dir: str = None):
42
+ """
43
+ Initialize the application.
44
+
45
+ Args:
46
+ model_dir: Path to the model directory
47
+ workspace_dir: Path to the workspace directory
48
+ gallery_dir: Path to the gallery directory
49
+ """
50
+ self.model_dir = model_dir
51
+ self.workspace_dir = workspace_dir
52
+ self.gallery_dir = gallery_dir
53
+
54
+ # Set environment variables for directories
55
+ if self.model_dir:
56
+ os.environ["DA3_MODEL_DIR"] = self.model_dir
57
+ if self.workspace_dir:
58
+ os.environ["DA3_WORKSPACE_DIR"] = self.workspace_dir
59
+ if self.gallery_dir:
60
+ os.environ["DA3_GALLERY_DIR"] = self.gallery_dir
61
+
62
+ self.event_handlers = EventHandlers()
63
+ self.ui_components = UIComponents()
64
+
65
+ def cache_examples(
66
+ self,
67
+ show_cam: bool = True,
68
+ filter_black_bg: bool = False,
69
+ filter_white_bg: bool = False,
70
+ save_percentage: float = 20.0,
71
+ num_max_points: int = 1000,
72
+ cache_gs_tag: str = "",
73
+ gs_trj_mode: str = "smooth",
74
+ gs_video_quality: str = "low",
75
+ ) -> None:
76
+ """
77
+ Pre-cache all example scenes at startup.
78
+
79
+ Args:
80
+ show_cam: Whether to show camera in visualization
81
+ filter_black_bg: Whether to filter black background
82
+ filter_white_bg: Whether to filter white background
83
+ save_percentage: Filter percentage for point cloud
84
+ num_max_points: Maximum number of points
85
+ cache_gs_tag: Tag to match scene names for high-res+3DGS caching (e.g., "dl3dv")
86
+ gs_trj_mode: Trajectory mode for 3DGS
87
+ gs_video_quality: Video quality for 3DGS
88
+ """
89
+ from depth_anything_3.app.modules.utils import get_scene_info
90
+
91
+ examples_dir = os.path.join(self.workspace_dir, "examples")
92
+ if not os.path.exists(examples_dir):
93
+ print(f"Examples directory not found: {examples_dir}")
94
+ return
95
+
96
+ scenes = get_scene_info(examples_dir)
97
+ if not scenes:
98
+ print("No example scenes found to cache.")
99
+ return
100
+
101
+ print(f"\n{'='*60}")
102
+ print(f"Caching {len(scenes)} example scenes...")
103
+ print(f"{'='*60}\n")
104
+
105
+ for i, scene in enumerate(scenes, 1):
106
+ scene_name = scene["name"]
107
+
108
+ # Check if scene name matches the gs tag for high-res+3DGS caching
109
+ use_high_res_gs = cache_gs_tag and cache_gs_tag.lower() in scene_name.lower()
110
+
111
+ if use_high_res_gs:
112
+ print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (HIGH-RES + 3DGS)")
113
+ print(f" - Number of images: {scene['num_images']}")
114
+ print(f" - Matched tag: '{cache_gs_tag}' - using high_res + 3DGS")
115
+ else:
116
+ print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (LOW-RES)")
117
+ print(f" - Number of images: {scene['num_images']}")
118
+
119
+ try:
120
+ # Load example scene
121
+ _, target_dir, _, _, _, _, _, _, _ = self.event_handlers.load_example_scene(
122
+ scene_name
123
+ )
124
+
125
+ if target_dir and target_dir != "None":
126
+ # Run reconstruction with appropriate settings
127
+ print(" - Running reconstruction...")
128
+ result = self.event_handlers.gradio_demo(
129
+ target_dir=target_dir,
130
+ show_cam=show_cam,
131
+ filter_black_bg=filter_black_bg,
132
+ filter_white_bg=filter_white_bg,
133
+ process_res_method="high_res" if use_high_res_gs else "low_res",
134
+ selected_first_frame="",
135
+ save_percentage=save_percentage,
136
+ num_max_points=num_max_points,
137
+ infer_gs=use_high_res_gs,
138
+ gs_trj_mode=gs_trj_mode,
139
+ gs_video_quality=gs_video_quality,
140
+ )
141
+
142
+ # Check if successful
143
+ if result[0] is not None: # reconstruction_output
144
+ print(f" ✓ Scene '{scene_name}' cached successfully")
145
+ else:
146
+ print(f" ✗ Scene '{scene_name}' caching failed: {result[1]}")
147
+ else:
148
+ print(f" ✗ Scene '{scene_name}' loading failed")
149
+
150
+ except Exception as e:
151
+ print(f" ✗ Error caching scene '{scene_name}': {str(e)}")
152
+
153
+ print()
154
+
155
+ print("=" * 60)
156
+ print("Example scene caching completed!")
157
+ print("=" * 60 + "\n")
158
+
159
+ def create_app(self) -> gr.Blocks:
160
+ """
161
+ Create and configure the Gradio application.
162
+
163
+ Returns:
164
+ Configured Gradio Blocks interface
165
+ """
166
+
167
+ # Initialize theme
168
+ def get_theme():
169
+ return get_gradio_theme()
170
+
171
+ with gr.Blocks(theme=get_theme(), css=GRADIO_CSS) as demo:
172
+ # State variables for the tabbed interface
173
+ is_example = gr.Textbox(label="is_example", visible=False, value="None")
174
+ processed_data_state = gr.State(value=None)
175
+ measure_points_state = gr.State(value=[])
176
+ selected_first_frame_state = gr.State(value="")
177
+ selected_image_index_state = gr.State(value=0) # Track selected image index
178
+ # current_view_index = gr.State(value=0) # noqa: F841 Track current view index
179
+
180
+ # Header and description
181
+ self.ui_components.create_header_section()
182
+ self.ui_components.create_description_section()
183
+
184
+ target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")
185
+
186
+ # Main content area
187
+ with gr.Row():
188
+ with gr.Column(scale=2):
189
+ # Upload section
190
+ (
191
+ input_video,
192
+ s_time_interval,
193
+ input_images,
194
+ image_gallery,
195
+ select_first_frame_btn,
196
+ ) = self.ui_components.create_upload_section()
197
+
198
+ with gr.Column(scale=4):
199
+ with gr.Column():
200
+ # gr.Markdown("**Metric 3D Reconstruction (Point Cloud and Camera Poses)**")
201
+ # Reconstruction control section (buttons) - moved below tabs
202
+
203
+ log_output = gr.Markdown(
204
+ "Please upload a video or images, then click Reconstruct.",
205
+ elem_classes=["custom-log"],
206
+ )
207
+
208
+ # Tabbed interface
209
+ with gr.Tabs():
210
+ with gr.Tab("Point Cloud & Cameras"):
211
+ reconstruction_output = (
212
+ self.ui_components.create_3d_viewer_section()
213
+ )
214
+
215
+ with gr.Tab("Metric Depth"):
216
+ (
217
+ prev_measure_btn,
218
+ measure_view_selector,
219
+ next_measure_btn,
220
+ measure_image,
221
+ measure_depth_image,
222
+ measure_text,
223
+ ) = self.ui_components.create_measure_section()
224
+
225
+ with gr.Tab("3DGS Rendered Novel Views"):
226
+ gs_video, gs_info = self.ui_components.create_nvs_video()
227
+
228
+ # Inference control section (before inference)
229
+ (process_res_method_dropdown, infer_gs) = (
230
+ self.ui_components.create_inference_control_section()
231
+ )
232
+
233
+ # Display control section - includes 3DGS options, buttons, and Visualization Options # noqa: E501
234
+ (
235
+ show_cam,
236
+ filter_black_bg,
237
+ filter_white_bg,
238
+ save_percentage,
239
+ num_max_points,
240
+ gs_trj_mode,
241
+ gs_video_quality,
242
+ submit_btn,
243
+ clear_btn,
244
+ ) = self.ui_components.create_display_control_section()
245
+
246
+ # bind visibility of gs_trj_mode to infer_gs
247
+ infer_gs.change(
248
+ fn=lambda checked: (
249
+ gr.update(visible=checked),
250
+ gr.update(visible=checked),
251
+ gr.update(visible=checked),
252
+ gr.update(visible=(not checked)),
253
+ ),
254
+ inputs=infer_gs,
255
+ outputs=[gs_trj_mode, gs_video_quality, gs_video, gs_info],
256
+ )
257
+
258
+ # Example scenes section
259
+ gr.Markdown("## Example Scenes")
260
+
261
+ scenes = self.ui_components.create_example_scenes_section()
262
+ scene_components = self.ui_components.create_example_scene_grid(scenes)
263
+
264
+ # Set up event handlers
265
+ self._setup_event_handlers(
266
+ demo,
267
+ is_example,
268
+ processed_data_state,
269
+ measure_points_state,
270
+ target_dir_output,
271
+ input_video,
272
+ input_images,
273
+ s_time_interval,
274
+ image_gallery,
275
+ reconstruction_output,
276
+ log_output,
277
+ show_cam,
278
+ filter_black_bg,
279
+ filter_white_bg,
280
+ process_res_method_dropdown,
281
+ save_percentage,
282
+ submit_btn,
283
+ clear_btn,
284
+ num_max_points,
285
+ infer_gs,
286
+ select_first_frame_btn,
287
+ selected_first_frame_state,
288
+ selected_image_index_state,
289
+ measure_view_selector,
290
+ measure_image,
291
+ measure_depth_image,
292
+ measure_text,
293
+ prev_measure_btn,
294
+ next_measure_btn,
295
+ scenes,
296
+ scene_components,
297
+ gs_video,
298
+ gs_info,
299
+ gs_trj_mode,
300
+ gs_video_quality,
301
+ )
302
+
303
+ # Acknowledgements
304
+ self.ui_components.create_acknowledgements_section()
305
+
306
+ return demo
307
+
308
+ def _setup_event_handlers(
309
+ self,
310
+ demo: gr.Blocks,
311
+ is_example: gr.Textbox,
312
+ processed_data_state: gr.State,
313
+ measure_points_state: gr.State,
314
+ target_dir_output: gr.Textbox,
315
+ input_video: gr.Video,
316
+ input_images: gr.File,
317
+ s_time_interval: gr.Slider,
318
+ image_gallery: gr.Gallery,
319
+ reconstruction_output: gr.Model3D,
320
+ log_output: gr.Markdown,
321
+ show_cam: gr.Checkbox,
322
+ filter_black_bg: gr.Checkbox,
323
+ filter_white_bg: gr.Checkbox,
324
+ process_res_method_dropdown: gr.Dropdown,
325
+ save_percentage: gr.Slider,
326
+ submit_btn: gr.Button,
327
+ clear_btn: gr.ClearButton,
328
+ num_max_points: gr.Slider,
329
+ infer_gs: gr.Checkbox,
330
+ select_first_frame_btn: gr.Button,
331
+ selected_first_frame_state: gr.State,
332
+ selected_image_index_state: gr.State,
333
+ measure_view_selector: gr.Dropdown,
334
+ measure_image: gr.Image,
335
+ measure_depth_image: gr.Image,
336
+ measure_text: gr.Markdown,
337
+ prev_measure_btn: gr.Button,
338
+ next_measure_btn: gr.Button,
339
+ scenes: List[Dict[str, Any]],
340
+ scene_components: List[gr.Image],
341
+ gs_video: gr.Video,
342
+ gs_info: gr.Markdown,
343
+ gs_trj_mode: gr.Dropdown,
344
+ gs_video_quality: gr.Dropdown,
345
+ ) -> None:
346
+ """
347
+ Set up all event handlers for the application.
348
+
349
+ Args:
350
+ demo: Gradio Blocks interface
351
+ All other arguments: Gradio components to connect
352
+ """
353
+ # Configure clear button
354
+ clear_btn.add(
355
+ [
356
+ input_video,
357
+ input_images,
358
+ reconstruction_output,
359
+ log_output,
360
+ target_dir_output,
361
+ image_gallery,
362
+ gs_video,
363
+ ]
364
+ )
365
+
366
+ # Main reconstruction button
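+ # The chained .then() calls below run in order: clear the 3D viewer, show a
+ # progress message, run the reconstruction, and finally reset is_example to
+ # "False" so later visualization updates treat the result as user data.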
367
+ submit_btn.click(
368
+ fn=self.event_handlers.clear_fields, inputs=[], outputs=[reconstruction_output]
369
+ ).then(fn=self.event_handlers.update_log, inputs=[], outputs=[log_output]).then(
370
+ fn=self.event_handlers.gradio_demo,
371
+ inputs=[
372
+ target_dir_output,
373
+ show_cam,
374
+ filter_black_bg,
375
+ filter_white_bg,
376
+ process_res_method_dropdown,
377
+ selected_first_frame_state,
378
+ save_percentage,
379
+ # pass num_max_points
380
+ num_max_points,
381
+ infer_gs,
382
+ gs_trj_mode,
383
+ gs_video_quality,
384
+ ],
385
+ outputs=[
386
+ reconstruction_output,
387
+ log_output,
388
+ processed_data_state,
389
+ measure_image,
390
+ measure_depth_image,
391
+ measure_text,
392
+ measure_view_selector,
393
+ gs_video,
394
+ gs_video, # gs_video visibility
395
+ gs_info, # gs_info visibility
396
+ ],
397
+ ).then(
398
+ fn=lambda: "False",
399
+ inputs=[],
400
+ outputs=[is_example], # set is_example to "False"
401
+ )
402
+
403
+ # Real-time visualization updates
404
+ self._setup_visualization_handlers(
405
+ show_cam,
406
+ filter_black_bg,
407
+ filter_white_bg,
408
+ process_res_method_dropdown,
409
+ target_dir_output,
410
+ is_example,
411
+ reconstruction_output,
412
+ log_output,
413
+ )
414
+
415
+ # File upload handlers
416
+ input_video.change(
417
+ fn=self.event_handlers.handle_uploads,
418
+ inputs=[input_video, input_images, s_time_interval],
419
+ outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
420
+ )
421
+ input_images.change(
422
+ fn=self.event_handlers.handle_uploads,
423
+ inputs=[input_video, input_images, s_time_interval],
424
+ outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
425
+ )
426
+
427
+ # Image gallery click handler (for selecting first frame)
428
+ def handle_image_selection(evt: gr.SelectData):
429
+ if evt is None or evt.index is None:
430
+ return "No image selected", 0
431
+ selected_index = evt.index
432
+ return f"Selected image {selected_index} as potential first frame", selected_index
433
+
434
+ image_gallery.select(
435
+ fn=handle_image_selection,
436
+ outputs=[log_output, selected_image_index_state],
437
+ )
438
+
439
+ # Select first frame handler
440
+ select_first_frame_btn.click(
441
+ fn=self.event_handlers.select_first_frame,
442
+ inputs=[image_gallery, selected_image_index_state],
443
+ outputs=[image_gallery, log_output, selected_first_frame_state],
444
+ )
445
+
446
+ # Navigation handlers
447
+ self._setup_navigation_handlers(
448
+ prev_measure_btn,
449
+ next_measure_btn,
450
+ measure_view_selector,
451
+ measure_image,
452
+ measure_depth_image,
453
+ measure_points_state,
454
+ processed_data_state,
455
+ )
456
+
457
+ # Measurement handler
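+ # Gradio injects the click's gr.SelectData as the type-annotated `event`
+ # argument of EventHandlers.measure, in addition to the three declared inputs.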
458
+ measure_image.select(
459
+ fn=self.event_handlers.measure,
460
+ inputs=[processed_data_state, measure_points_state, measure_view_selector],
461
+ outputs=[measure_image, measure_depth_image, measure_points_state, measure_text],
462
+ )
463
+
464
+ # Example scene handlers
465
+ self._setup_example_scene_handlers(
466
+ scenes,
467
+ scene_components,
468
+ reconstruction_output,
469
+ target_dir_output,
470
+ image_gallery,
471
+ log_output,
472
+ is_example,
473
+ processed_data_state,
474
+ measure_view_selector,
475
+ measure_image,
476
+ measure_depth_image,
477
+ gs_video,
478
+ gs_info,
479
+ )
480
+
481
+ def _setup_visualization_handlers(
482
+ self,
483
+ show_cam: gr.Checkbox,
484
+ filter_black_bg: gr.Checkbox,
485
+ filter_white_bg: gr.Checkbox,
486
+ process_res_method_dropdown: gr.Dropdown,
487
+ target_dir_output: gr.Textbox,
488
+ is_example: gr.Textbox,
489
+ reconstruction_output: gr.Model3D,
490
+ log_output: gr.Markdown,
491
+ ) -> None:
492
+ """Set up visualization update handlers."""
493
+ # Common inputs for visualization updates
494
+ viz_inputs = [
495
+ target_dir_output,
496
+ show_cam,
497
+ is_example,
498
+ filter_black_bg,
499
+ filter_white_bg,
500
+ process_res_method_dropdown,
501
+ ]
502
+
503
+ # Set up change handlers for all visualization controls
504
+ for component in [show_cam, filter_black_bg, filter_white_bg]:
505
+ component.change(
506
+ fn=self.event_handlers.update_visualization,
507
+ inputs=viz_inputs,
508
+ outputs=[reconstruction_output, log_output],
509
+ )
510
+
511
+ def _setup_navigation_handlers(
512
+ self,
513
+ prev_measure_btn: gr.Button,
514
+ next_measure_btn: gr.Button,
515
+ measure_view_selector: gr.Dropdown,
516
+ measure_image: gr.Image,
517
+ measure_depth_image: gr.Image,
518
+ measure_points_state: gr.State,
519
+ processed_data_state: gr.State,
520
+ ) -> None:
521
+ """Set up navigation handlers for measure tab."""
522
+ # Measure tab navigation
523
+ prev_measure_btn.click(
524
+ fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
525
+ processed_data, current_selector, -1
526
+ ),
527
+ inputs=[processed_data_state, measure_view_selector],
528
+ outputs=[
529
+ measure_view_selector,
530
+ measure_image,
531
+ measure_depth_image,
532
+ measure_points_state,
533
+ ],
534
+ )
535
+
536
+ next_measure_btn.click(
537
+ fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
538
+ processed_data, current_selector, 1
539
+ ),
540
+ inputs=[processed_data_state, measure_view_selector],
541
+ outputs=[
542
+ measure_view_selector,
543
+ measure_image,
544
+ measure_depth_image,
545
+ measure_points_state,
546
+ ],
547
+ )
548
+
549
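+ # The dropdown holds labels like "View 3"; split()[1] extracts the 1-based
+ # index and the -1 converts it to the 0-based view index expected by
+ # update_measure_view. An empty selection clears the measure panels.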
+ measure_view_selector.change(
550
+ fn=lambda processed_data, selector_value: (
551
+ self.event_handlers.update_measure_view(
552
+ processed_data, int(selector_value.split()[1]) - 1
553
+ )
554
+ if selector_value
555
+ else (None, None, [])
556
+ ),
557
+ inputs=[processed_data_state, measure_view_selector],
558
+ outputs=[measure_image, measure_depth_image, measure_points_state],
559
+ )
560
+
561
+ def _setup_example_scene_handlers(
562
+ self,
563
+ scenes: List[Dict[str, Any]],
564
+ scene_components: List[gr.Image],
565
+ reconstruction_output: gr.Model3D,
566
+ target_dir_output: gr.Textbox,
567
+ image_gallery: gr.Gallery,
568
+ log_output: gr.Markdown,
569
+ is_example: gr.Textbox,
570
+ processed_data_state: gr.State,
571
+ measure_view_selector: gr.Dropdown,
572
+ measure_image: gr.Image,
573
+ measure_depth_image: gr.Image,
574
+ gs_video: gr.Video,
575
+ gs_info: gr.Markdown,
576
+ ) -> None:
577
+ """Set up example scene handlers."""
578
+
579
+ # Cache for example scene loading (in-memory cache for faster access)
580
+ _example_cache = {}
581
+
582
+ def load_and_update_measure(name):
583
+ # Check cache first
584
+ if name in _example_cache:
585
+ print(f"✅ Using cached result for example scene: {name}")
586
+ return _example_cache[name]
587
+
588
+ # Load example scene
589
+ result = self.event_handlers.load_example_scene(name)
590
+ # result = (reconstruction_output, target_dir, image_paths, log_message, processed_data, measure_view_selector, gs_video, gs_video_vis, gs_info_vis) # noqa: E501
591
+
592
+ # Update measure view if processed_data is available
593
+ measure_img = None
594
+ measure_depth = None
595
+ if result[4] is not None: # processed_data exists
596
+ measure_img, measure_depth, _ = (
597
+ self.event_handlers.visualization_handler.update_measure_view(result[4], 0)
598
+ )
599
+
600
+ final_result = result + ("True", measure_img, measure_depth)
601
+
602
+ # Cache the result (limit cache size to prevent memory issues)
603
+ if len(_example_cache) < 20: # Cache up to 20 scenes
604
+ _example_cache[name] = final_result
605
+ print(f"💾 Cached result for example scene: {name}")
606
+ else:
607
+ print(f"⚠️ Cache full, not caching: {name}")
608
+
609
+ return final_result
610
+
611
+ # Wire each example thumbnail to the loader above. Results are cached both
+ # in-memory (_example_cache, keyed by scene name) and on disk via the
+ # predictions.npz files checked by load_example_scene()
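+ # Note: the default argument (name=scene["name"]) in the lambda below binds
+ # each scene name at definition time; a plain closure over the loop variable
+ # would make every thumbnail load the last scene.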
613
+ for i, scene in enumerate(scenes):
614
+ if i < len(scene_components):
615
+ scene_components[i].select(
616
+ fn=lambda name=scene["name"]: load_and_update_measure(name),
617
+ outputs=[
618
+ reconstruction_output,
619
+ target_dir_output,
620
+ image_gallery,
621
+ log_output,
622
+ processed_data_state,
623
+ measure_view_selector,
624
+ gs_video,
625
+ gs_video, # gs_video_visibility
626
+ gs_info, # gs_info_visibility
627
+ is_example,
628
+ measure_image,
629
+ measure_depth_image,
630
+ ],
631
+ # Note: cache_examples is not a valid parameter for select()
632
+ # Caching is handled by file-based cache in load_example_scene()
633
+ # which checks for predictions.npz files
634
+ )
635
+
636
+ def launch(self, host: str = "127.0.0.1", port: int = 7860, **kwargs) -> None:
637
+ """
638
+ Launch the application.
639
+
640
+ Args:
641
+ host: Host address to bind to
642
+ port: Port number to bind to
643
+ **kwargs: Additional arguments for demo.launch()
644
+ """
645
+ demo = self.create_app()
646
+ # Configure launch settings for Spaces compatibility
647
+ # Use minimal config to avoid routing issues
648
+ demo.queue(max_size=20).launch(
649
+ show_error=True,
650
+ ssr_mode=False,
651
+ server_name=host,
652
+ server_port=port,
653
+ **kwargs
654
+ )
655
+
656
+
657
+ def main():
658
+ """Main function to run the application."""
659
+ parser = argparse.ArgumentParser(
660
+ description="Depth Anything 3 Gradio Application",
661
+ formatter_class=argparse.RawDescriptionHelpFormatter,
662
+ epilog="""
663
+ Examples:
664
+ # Basic usage
665
+ python gradio_app.py --help
666
+ python gradio_app.py --host 0.0.0.0 --port 8080
667
+ python gradio_app.py --model-dir /path/to/model --workspace-dir /path/to/workspace
668
+
669
+ # Cache examples at startup (all low-res)
670
+ python gradio_app.py --cache-examples
671
+
672
+ # Cache with selective high-res+3DGS for scenes matching tag
673
+ python gradio_app.py --cache-examples --cache-gs-tag dl3dv
674
+ # This will use high-res + 3DGS for scenes containing "dl3dv" in their name,
675
+ # and low-res only for other scenes
676
+ """,
677
+ )
678
+
679
+ # Server configuration
680
+ parser.add_argument(
681
+ "--host", default="127.0.0.1", help="Host address to bind to (default: 127.0.0.1)"
682
+ )
683
+ parser.add_argument(
684
+ "--port", type=int, default=7860, help="Port number to bind to (default: 7860)"
685
+ )
686
+
687
+ # Directory configuration
688
+ parser.add_argument(
689
+ "--model-dir",
690
+ default="depth-anything/DA3NESTED-GIANT-LARGE",
691
+ help="Path to the model directory (default: depth-anything/DA3NESTED-GIANT-LARGE)",
692
+ )
693
+ parser.add_argument(
694
+ "--workspace-dir",
695
+ default="workspace/gradio", # noqa: E501
696
+ help="Path to the workspace directory (default: workspace/gradio)", # noqa: E501
697
+ )
698
+ parser.add_argument(
699
+ "--gallery-dir",
700
+ default="workspace/gallery",
701
+ help="Path to the gallery directory (default: workspace/gallery)", # noqa: E501
702
+ )
703
+
704
+ # Additional Gradio options
705
+ parser.add_argument("--share", action="store_true", help="Create a public link for the app")
706
+ parser.add_argument("--debug", action="store_true", help="Enable debug mode")
707
+
708
+ # Example caching options
709
+ parser.add_argument(
710
+ "--cache-examples",
711
+ action="store_true",
712
+ help="Pre-cache all example scenes at startup for faster loading",
713
+ )
714
+ parser.add_argument(
715
+ "--cache-gs-tag",
716
+ type=str,
717
+ default="",
718
+ help="Tag to match scene names for high-res+3DGS caching (e.g., 'dl3dv'). Scenes containing this tag will use high_res and infer_gs=True; others will use low_res only.", # noqa: E501
719
+ )
720
+
721
+ args = parser.parse_args()
722
+
723
+ # Create directories if they don't exist
724
+ os.makedirs(args.workspace_dir, exist_ok=True)
725
+ os.makedirs(args.gallery_dir, exist_ok=True)
726
+
727
+ # Initialize and launch the application
728
+ app = DepthAnything3App(
729
+ model_dir=args.model_dir, workspace_dir=args.workspace_dir, gallery_dir=args.gallery_dir
730
+ )
731
+
732
+ # Prepare launch arguments
733
+ launch_kwargs = {"share": args.share, "debug": args.debug}
734
+
735
+ print("Starting Depth Anything 3 Gradio App...")
736
+ print(f"Host: {args.host}")
737
+ print(f"Port: {args.port}")
738
+ print(f"Model Directory: {args.model_dir}")
739
+ print(f"Workspace Directory: {args.workspace_dir}")
740
+ print(f"Gallery Directory: {args.gallery_dir}")
741
+ print(f"Share: {args.share}")
742
+ print(f"Debug: {args.debug}")
743
+ print(f"Cache Examples: {args.cache_examples}")
744
+ if args.cache_examples:
745
+ if args.cache_gs_tag:
746
+ print(
747
+ f"Cache GS Tag: '{args.cache_gs_tag}' (scenes matching this tag will use high-res + 3DGS)" # noqa: E501
748
+ ) # noqa: E501
749
+ else:
750
+ print("Cache GS Tag: None (all scenes will use low-res only)")
751
+
752
+ # Pre-cache examples if requested
753
+ if args.cache_examples:
754
+ print("\n" + "=" * 60)
755
+ print("Pre-caching mode enabled")
756
+ if args.cache_gs_tag:
757
+ print(f"Scenes containing '{args.cache_gs_tag}' will use HIGH-RES + 3DGS")
758
+ print("Other scenes will use LOW-RES only")
759
+ else:
760
+ print("All scenes will use LOW-RES only")
761
+ print("=" * 60)
762
+ app.cache_examples(
763
+ show_cam=True,
764
+ filter_black_bg=False,
765
+ filter_white_bg=False,
766
+ save_percentage=5.0,
767
+ num_max_points=1000,
768
+ cache_gs_tag=args.cache_gs_tag,
769
+ gs_trj_mode="smooth",
770
+ gs_video_quality="low",
771
+ )
772
+
773
+ app.launch(host=args.host, port=args.port, **launch_kwargs)
774
+
775
+
776
+ if __name__ == "__main__":
777
+ main()
depth_anything_3/app/modules/__init__.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Modules package for Depth Anything 3 Gradio app.
17
+
18
+ This package contains all the modular components for the Gradio application.
19
+ """
20
+
21
+ from depth_anything_3.app.modules.event_handlers import EventHandlers
22
+ from depth_anything_3.app.modules.file_handlers import FileHandler
23
+ from depth_anything_3.app.modules.model_inference import ModelInference
24
+ from depth_anything_3.app.modules.ui_components import UIComponents
25
+ from depth_anything_3.app.modules.utils import (
26
+ cleanup_memory,
27
+ create_depth_visualization,
28
+ get_logo_base64,
29
+ get_scene_info,
30
+ save_to_gallery_func,
31
+ )
32
+ from depth_anything_3.app.modules.visualization import VisualizationHandler
33
+
34
+ __all__ = [
35
+ "ModelInference",
36
+ "FileHandler",
37
+ "VisualizationHandler",
38
+ "EventHandlers",
39
+ "UIComponents",
40
+ "create_depth_visualization",
41
+ "save_to_gallery_func",
42
+ "get_scene_info",
43
+ "cleanup_memory",
44
+ "get_logo_base64",
45
+ ]
depth_anything_3/app/modules/event_handlers.py ADDED
@@ -0,0 +1,629 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Event handling module for Depth Anything 3 Gradio app.
17
+
18
+ This module handles all event callbacks and user interactions.
19
+ """
20
+
21
+ import os
22
+ import time
23
+ from glob import glob
24
+ from typing import Any, Dict, List, Optional, Tuple
25
+ import gradio as gr
26
+ import numpy as np
27
+ import torch
28
+
29
+ from depth_anything_3.app.modules.file_handlers import FileHandler
30
+ from depth_anything_3.app.modules.model_inference import ModelInference
31
+ from depth_anything_3.app.modules.utils import cleanup_memory
32
+ from depth_anything_3.app.modules.visualization import VisualizationHandler
33
+
34
+
35
+ class EventHandlers:
36
+ """
37
+ Handles all event callbacks and user interactions for the Gradio app.
38
+ """
39
+
40
+ def __init__(self):
41
+ """Initialize the event handlers."""
42
+ self.model_inference = ModelInference()
43
+ self.file_handler = FileHandler()
44
+ self.visualization_handler = VisualizationHandler()
45
+
46
+ def clear_fields(self) -> None:
47
+ """
48
+ Clear the 3D viewer output before starting a new reconstruction run.
49
+ """
50
+ return None
51
+
52
+ def update_log(self) -> str:
53
+ """
54
+ Display a quick log message while waiting.
55
+ """
56
+ return "Loading and Reconstructing..."
57
+
58
+ def save_current_visualization(
59
+ self,
60
+ target_dir: str,
61
+ save_percentage: float,
62
+ show_cam: bool,
63
+ filter_black_bg: bool,
64
+ filter_white_bg: bool,
65
+ processed_data: Optional[Dict],
66
+ scene_name: str = "",
67
+ ) -> str:
68
+ """
69
+ Save current visualization results to gallery with specified save percentage.
70
+
71
+ Args:
72
+ target_dir: Directory containing results
73
+ save_percentage: Percentage of points to save (0-100)
74
+ show_cam: Whether to show cameras
75
+ filter_black_bg: Whether to filter black background
76
+ filter_white_bg: Whether to filter white background
77
+ processed_data: Processed data from reconstruction
+ scene_name: Optional name used as a prefix for the saved gallery entry
78
+
79
+ Returns:
80
+ Status message
81
+ """
82
+ if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
83
+ return "No reconstruction available. Please run 'Reconstruct' first."
84
+
85
+ if processed_data is None:
86
+ return "No processed data available. Please run 'Reconstruct' first."
87
+
88
+ try:
89
+ # Add debug information
90
+ print("[DEBUG] save_current_visualization called with:")
91
+ print(f" target_dir: {target_dir}")
92
+ print(f" save_percentage: {save_percentage}")
93
+ print(f" show_cam: {show_cam}")
94
+ print(f" filter_black_bg: {filter_black_bg}")
95
+ print(f" filter_white_bg: {filter_white_bg}")
96
+ print(f" processed_data: {processed_data is not None}")
97
+
98
+ # Import the gallery save function
99
+ # Create gallery name with user input or auto-generated
100
+ import datetime
101
+
102
+ from .utils import save_to_gallery_func
103
+
104
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
105
+ if scene_name and scene_name.strip():
106
+ gallery_name = f"{scene_name.strip()}_{timestamp}_pct{save_percentage:.0f}"
107
+ else:
108
+ gallery_name = f"save_{timestamp}_pct{save_percentage:.0f}"
109
+
110
+ print(f"[DEBUG] Saving to gallery with name: {gallery_name}")
111
+
112
+ # Save entire process folder to gallery
113
+ success, message = save_to_gallery_func(
114
+ target_dir=target_dir, processed_data=processed_data, gallery_name=gallery_name
115
+ )
116
+
117
+ if success:
118
+ print(f"[DEBUG] Gallery save completed successfully: {message}")
119
+ return (
120
+ "Successfully saved to gallery!\n"
121
+ f"Gallery name: {gallery_name}\n"
122
+ f"Save percentage: {save_percentage}%\n"
123
+ f"Show cameras: {show_cam}\n"
124
+ f"Filter black bg: {filter_black_bg}\n"
125
+ f"Filter white bg: {filter_white_bg}\n\n"
126
+ f"{message}"
127
+ )
128
+ else:
129
+ print(f"[DEBUG] Gallery save failed: {message}")
130
+ return f"Failed to save to gallery: {message}"
131
+
132
+ except Exception as e:
133
+ return f"Error saving visualization: {str(e)}"
134
+
135
+ def gradio_demo(
136
+ self,
137
+ target_dir: str,
138
+ show_cam: bool = True,
139
+ filter_black_bg: bool = False,
140
+ filter_white_bg: bool = False,
141
+ process_res_method: str = "upper_bound_resize",
142
+ selected_first_frame: str = "",
143
+ save_percentage: float = 30.0,
144
+ num_max_points: int = 1_000_000,
145
+ infer_gs: bool = False,
146
+ gs_trj_mode: str = "extend",
147
+ gs_video_quality: str = "high",
148
+ ) -> Tuple[
149
+ Optional[str],
150
+ str,
151
+ Optional[Dict],
152
+ Optional[np.ndarray],
153
+ Optional[np.ndarray],
154
+ str,
155
+ gr.Dropdown,
156
+ Optional[str], # gs video path
157
+ gr.update, # gs video visibility update
158
+ gr.update, # gs info visibility update
159
+ ]:
160
+ """
161
+ Perform reconstruction using the already-created target_dir/images.
162
+
163
+ Args:
164
+ target_dir: Directory containing images
165
+ show_cam: Whether to show camera
166
+ filter_black_bg: Whether to filter black background
167
+ filter_white_bg: Whether to filter white background
168
+ process_res_method: Method for resizing input images
169
+ selected_first_frame: Selected first frame filename
170
+ save_percentage: Percentage of points to save (0-100)
+ num_max_points: Maximum number of points to keep, in thousands (converted to an absolute count before export)
+ infer_gs: Whether to infer 3D Gaussian Splatting
+ gs_trj_mode: Camera trajectory mode for the 3DGS video ("extend" or "smooth")
+ gs_video_quality: Quality setting for the rendered 3DGS video
+
+ Returns:
+ Tuple of (glb_path, log_message, processed_data, measure_image,
+ measure_depth_image, measure_text, measure_view_selector, gs_video_path,
+ gs_video visibility update, gs_info visibility update)
174
+ """
175
+ if not os.path.isdir(target_dir) or target_dir == "None":
176
+ return (
177
+ None,
178
+ "No valid target directory found. Please upload first.",
179
+ None,
180
+ None,
181
+ None,
182
+ "",
183
+ None,
184
+ None,
185
+ gr.update(visible=False), # gs_video
186
+ gr.update(visible=True), # gs_info
187
+ )
188
+
189
+ start_time = time.time()
190
+ cleanup_memory()
191
+
192
+ # Get image files for logging
193
+ target_dir_images = os.path.join(target_dir, "images")
194
+ all_files = (
195
+ sorted(os.listdir(target_dir_images)) if os.path.isdir(target_dir_images) else []
196
+ )
197
+
198
+ print("Running DepthAnything3 model...")
199
+ print(f"Selected first frame: {selected_first_frame}")
200
+
201
+ # Validate selected_first_frame against current image list
202
+ if selected_first_frame and target_dir_images:
203
+ current_files = (
204
+ sorted(os.listdir(target_dir_images)) if os.path.isdir(target_dir_images) else []
205
+ )
206
+ if selected_first_frame not in current_files:
207
+ print(
208
+ f"Selected first frame '{selected_first_frame}' not found in "
209
+ "current images. Using default order."
210
+ )
211
+ selected_first_frame = "" # Reset to use default order
212
+
213
+ with torch.no_grad():
214
+ prediction, processed_data = self.model_inference.run_inference(
215
+ target_dir,
216
+ process_res_method=process_res_method,
217
+ show_camera=show_cam,
218
+ selected_first_frame=selected_first_frame,
219
+ save_percentage=save_percentage,
220
+ num_max_points=int(num_max_points * 1000), # Convert K to actual count
221
+ infer_gs=infer_gs,
222
+ gs_trj_mode=gs_trj_mode,
223
+ gs_video_quality=gs_video_quality,
224
+ )
225
+
226
+ # The GLB file is already generated by the API
227
+ glbfile = os.path.join(target_dir, "scene.glb")
228
+
229
+ # Handle 3DGS video based on infer_gs flag
230
+ gsvideo_path = None
231
+ gs_video_visible = False
232
+ gs_info_visible = True
233
+
234
+ if infer_gs:
235
+ try:
236
+ gsvideo_path = sorted(glob(os.path.join(target_dir, "gs_video", "*.mp4")))[-1]
237
+ gs_video_visible = True
238
+ gs_info_visible = False
239
+ except IndexError:
240
+ gsvideo_path = None
241
+ print("3DGS video not found, but infer_gs was enabled")
242
+
243
+ # Cleanup
244
+ cleanup_memory()
245
+
246
+ end_time = time.time()
247
+ print(f"Total time: {end_time - start_time:.2f} seconds")
248
+ log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."
249
+
250
+ # Populate visualization tabs with processed data
251
+ depth_vis, measure_img, measure_depth_vis, measure_pts = (
252
+ self.visualization_handler.populate_visualization_tabs(processed_data)
253
+ )
254
+
255
+ # Update view selectors based on available views
256
+ depth_selector, measure_selector = self.visualization_handler.update_view_selectors(
257
+ processed_data
258
+ )
259
+
260
+ return (
261
+ glbfile,
262
+ log_msg,
263
+ processed_data,
264
+ measure_img, # measure_image
265
+ measure_depth_vis, # measure_depth_image
266
+ "", # measure_text (empty initially)
267
+ measure_selector, # measure_view_selector
268
+ gsvideo_path,
269
+ gr.update(visible=gs_video_visible), # gs_video visibility
270
+ gr.update(visible=gs_info_visible), # gs_info visibility
271
+ )
272
+
273
+ def update_visualization(
274
+ self,
275
+ target_dir: str,
276
+ show_cam: bool,
277
+ is_example: str,
278
+ filter_black_bg: bool = False,
279
+ filter_white_bg: bool = False,
280
+ process_res_method: str = "upper_bound_resize",
281
+ ) -> Tuple[gr.update, str]:
282
+ """
283
+ Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
284
+ and return it for the 3D viewer.
285
+
286
+ Args:
287
+ target_dir: Directory containing results
288
+ show_cam: Whether to show camera
289
+ is_example: Whether this is an example scene
290
+ filter_black_bg: Whether to filter black background
291
+ filter_white_bg: Whether to filter white background
292
+ process_res_method: Method for resizing input images
293
+
294
+ Returns:
295
+ Tuple of (glb_file, log_message)
296
+ """
297
+ if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
298
+ return (
299
+ gr.update(),
300
+ "No reconstruction available. Please click the Reconstruct button first.",
301
+ )
302
+
303
+ # Check if GLB exists (could be cached example or reconstructed scene)
304
+ glbfile = os.path.join(target_dir, "scene.glb")
305
+ if os.path.exists(glbfile):
306
+ return (
307
+ glbfile,
308
+ (
309
+ "Visualization loaded from cache."
310
+ if is_example == "True"
311
+ else "Visualization updated."
312
+ ),
313
+ )
314
+
315
+ # If no GLB but it's an example that hasn't been reconstructed yet
316
+ if is_example == "True":
317
+ return (
318
+ gr.update(),
319
+ "No reconstruction available. Please click the Reconstruct button first.",
320
+ )
321
+
322
+ # For non-examples, check predictions.npz
323
+ predictions_path = os.path.join(target_dir, "predictions.npz")
324
+ if not os.path.exists(predictions_path):
325
+ error_message = (
326
+ f"No reconstruction available at {predictions_path}. "
327
+ "Please run 'Reconstruct' first."
328
+ )
329
+ return gr.update(), error_message
330
+
331
+ loaded = np.load(predictions_path, allow_pickle=True)
332
+ predictions = {key: loaded[key] for key in loaded.keys()} # noqa: F841
333
+
334
+ return (
335
+ glbfile,
336
+ "Visualization updated.",
337
+ )
338
+
339
+ def handle_uploads(
340
+ self,
341
+ input_video: Optional[str],
342
+ input_images: Optional[List],
343
+ s_time_interval: float = 10.0,
344
+ ) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
345
+ """
346
+ Handle file uploads and update gallery.
347
+
348
+ Args:
349
+ input_video: Path to input video file
350
+ input_images: List of input image files
351
+ s_time_interval: Sampling FPS (frames per second) for frame extraction
352
+
353
+ Returns:
354
+ Tuple of (reconstruction_output, target_dir, image_paths, log_message)
355
+ """
356
+ return self.file_handler.update_gallery_on_upload(
357
+ input_video, input_images, s_time_interval
358
+ )
359
+
360
+ def load_example_scene(self, scene_name: str, examples_dir: str = None) -> Tuple[
361
+ Optional[str],
362
+ Optional[str],
363
+ Optional[List],
364
+ str,
365
+ Optional[Dict],
366
+ gr.Dropdown,
367
+ Optional[str],
368
+ gr.update,
369
+ gr.update,
370
+ ]:
371
+ """
372
+ Load a scene from examples directory.
373
+
374
+ Args:
375
+ scene_name: Name of the scene to load
376
+ examples_dir: Path to examples directory (if None, uses workspace_dir/examples)
377
+
378
+ Returns:
379
+ Tuple of (reconstruction_output, target_dir, image_paths, log_message, processed_data, measure_view_selector, gs_video, gs_video_vis, gs_info_vis) # noqa: E501
380
+ """
381
+ if examples_dir is None:
382
+ # Get workspace directory from environment variable
383
+ workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
384
+ examples_dir = os.path.join(workspace_dir, "examples")
385
+
386
+ reconstruction_output, target_dir, image_paths, log_message = (
387
+ self.file_handler.load_example_scene(scene_name, examples_dir)
388
+ )
389
+
390
+ # Try to load cached processed data if available
391
+ processed_data = None
392
+ measure_view_selector = gr.Dropdown(choices=["View 1"], value="View 1")
393
+ gs_video_path = None
394
+ gs_video_visible = False
395
+ gs_info_visible = True
396
+
397
+ if target_dir and target_dir != "None":
398
+ predictions_path = os.path.join(target_dir, "predictions.npz")
399
+ if os.path.exists(predictions_path):
400
+ try:
401
+ # Load predictions from cache
402
+ loaded = np.load(predictions_path, allow_pickle=True)
403
+ predictions = {key: loaded[key] for key in loaded.keys()}
404
+
405
+ # Reconstruct processed_data structure
406
+ num_images = len(predictions.get("images", []))
407
+ processed_data = {}
408
+
409
+ for i in range(num_images):
410
+ processed_data[i] = {
411
+ "image": predictions["images"][i] if "images" in predictions else None,
412
+ "depth": predictions["depths"][i] if "depths" in predictions else None,
413
+ "depth_image": os.path.join(
414
+ target_dir, "depth_vis", f"{i:04d}.jpg" # Fixed: use .jpg not .png
415
+ ),
416
+ "intrinsics": (
417
+ predictions["intrinsics"][i]
418
+ if "intrinsics" in predictions
419
+ and i < len(predictions["intrinsics"])
420
+ else None
421
+ ),
422
+ "mask": None,
423
+ }
424
+
425
+ # Update measure view selector
426
+ choices = [f"View {i + 1}" for i in range(num_images)]
427
+ measure_view_selector = gr.Dropdown(choices=choices, value=choices[0])
428
+
429
+ except Exception as e:
430
+ print(f"Error loading cached data: {e}")
431
+
432
+ # Check for cached 3DGS video
433
+ gs_video_dir = os.path.join(target_dir, "gs_video")
434
+ if os.path.exists(gs_video_dir):
435
+ try:
436
+ from glob import glob
437
+
438
+ gs_videos = sorted(glob(os.path.join(gs_video_dir, "*.mp4")))
439
+ if gs_videos:
440
+ gs_video_path = gs_videos[-1]
441
+ gs_video_visible = True
442
+ gs_info_visible = False
443
+ print(f"Loaded cached 3DGS video: {gs_video_path}")
444
+ except Exception as e:
445
+ print(f"Error loading cached 3DGS video: {e}")
446
+
447
+ return (
448
+ reconstruction_output,
449
+ target_dir,
450
+ image_paths,
451
+ log_message,
452
+ processed_data,
453
+ measure_view_selector,
454
+ gs_video_path,
455
+ gr.update(visible=gs_video_visible),
456
+ gr.update(visible=gs_info_visible),
457
+ )
458
+
459
+ def navigate_depth_view(
460
+ self,
461
+ processed_data: Optional[Dict[int, Dict[str, Any]]],
462
+ current_selector: str,
463
+ direction: int,
464
+ ) -> Tuple[str, Optional[str]]:
465
+ """
466
+ Navigate depth view.
467
+
468
+ Args:
469
+ processed_data: Processed data dictionary
470
+ current_selector: Current selector value
471
+ direction: Direction to navigate
472
+
473
+ Returns:
474
+ Tuple of (new_selector_value, depth_vis)
475
+ """
476
+ return self.visualization_handler.navigate_depth_view(
477
+ processed_data, current_selector, direction
478
+ )
479
+
480
+ def update_depth_view(
481
+ self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
482
+ ) -> Optional[str]:
483
+ """
484
+ Update depth view for a specific view index.
485
+
486
+ Args:
487
+ processed_data: Processed data dictionary
488
+ view_index: Index of the view to update
489
+
490
+ Returns:
491
+ Path to depth visualization image or None
492
+ """
493
+ return self.visualization_handler.update_depth_view(processed_data, view_index)
494
+
495
+ def navigate_measure_view(
496
+ self,
497
+ processed_data: Optional[Dict[int, Dict[str, Any]]],
498
+ current_selector: str,
499
+ direction: int,
500
+ ) -> Tuple[str, Optional[np.ndarray], Optional[np.ndarray], List]:
501
+ """
502
+ Navigate measure view.
503
+
504
+ Args:
505
+ processed_data: Processed data dictionary
506
+ current_selector: Current selector value
507
+ direction: Direction to navigate
508
+
509
+ Returns:
510
+ Tuple of (new_selector_value, measure_image, depth_right_half, measure_points)
511
+ """
512
+ return self.visualization_handler.navigate_measure_view(
513
+ processed_data, current_selector, direction
514
+ )
515
+
516
+ def update_measure_view(
517
+ self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
518
+ ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
519
+ """
520
+ Update measure view for a specific view index.
521
+
522
+ Args:
523
+ processed_data: Processed data dictionary
524
+ view_index: Index of the view to update
525
+
526
+ Returns:
527
+ Tuple of (measure_image, depth_right_half, measure_points)
528
+ """
529
+ return self.visualization_handler.update_measure_view(processed_data, view_index)
530
+
531
+ def measure(
532
+ self,
533
+ processed_data: Optional[Dict[int, Dict[str, Any]]],
534
+ measure_points: List,
535
+ current_view_selector: str,
536
+ event: gr.SelectData,
537
+ ) -> List:
538
+ """
539
+ Handle measurement on images.
540
+
541
+ Args:
542
+ processed_data: Processed data dictionary
543
+ measure_points: List of current measure points
544
+ current_view_selector: Current view selector value
545
+ event: Gradio select event
546
+
547
+ Returns:
548
+ List of [image, depth_right_half, measure_points, text]
549
+ """
550
+ return self.visualization_handler.measure(
551
+ processed_data, measure_points, current_view_selector, event
552
+ )
553
+
554
+ def select_first_frame(
555
+ self, image_gallery: List, selected_index: int = 0
556
+ ) -> Tuple[List, str, str]:
557
+ """
558
+ Select the first frame from the image gallery.
559
+
560
+ Args:
561
+ image_gallery: List of images in the gallery
562
+ selected_index: Index of the selected image (default: 0)
563
+
564
+ Returns:
565
+ Tuple of (updated_image_gallery, log_message, selected_frame_path)
566
+ """
567
+ try:
568
+ if not image_gallery or len(image_gallery) == 0:
569
+ return image_gallery, "No images available to select as first frame.", ""
570
+
571
+ # Handle None or invalid selected_index
572
+ if (
573
+ selected_index is None
574
+ or selected_index < 0
575
+ or selected_index >= len(image_gallery)
576
+ ):
577
+ print(f"Invalid selected_index: {selected_index}, using default: 0")
+ selected_index = 0
579
+
580
+ # Get the selected image based on index
581
+ selected_image = image_gallery[selected_index]
582
+ print(f"Selected image index: {selected_index}")
583
+ print(f"Total images: {len(image_gallery)}")
584
+
585
+ # Extract the file path from the selected image
586
+ selected_frame_path = ""
587
+ print(f"Selected image type: {type(selected_image)}")
588
+ print(f"Selected image: {selected_image}")
589
+
590
+ if isinstance(selected_image, tuple):
591
+ # Gradio Gallery returns tuple (path, None)
592
+ selected_frame_path = selected_image[0]
593
+ elif isinstance(selected_image, str):
594
+ selected_frame_path = selected_image
595
+ elif hasattr(selected_image, "name"):
596
+ selected_frame_path = selected_image.name
597
+ elif isinstance(selected_image, dict):
598
+ if "name" in selected_image:
599
+ selected_frame_path = selected_image["name"]
600
+ elif "path" in selected_image:
601
+ selected_frame_path = selected_image["path"]
602
+ elif "src" in selected_image:
603
+ selected_frame_path = selected_image["src"]
604
+ else:
605
+ # Try to convert to string
606
+ selected_frame_path = str(selected_image)
607
+
608
+ print(f"Extracted path: {selected_frame_path}")
609
+
610
+ # Extract filename from the path for matching
611
+ import os
612
+
613
+ selected_filename = os.path.basename(selected_frame_path)
614
+ print(f"Selected filename: {selected_filename}")
615
+
616
+ # Move the selected image to the front
617
+ updated_gallery = [selected_image] + [
618
+ img for img in image_gallery if img != selected_image
619
+ ]
620
+
621
+ log_message = (
622
+ f"Selected frame: {selected_filename}. "
623
+ f"Moved to first position. Total frames: {len(updated_gallery)}"
624
+ )
625
+ return updated_gallery, log_message, selected_filename
626
+
627
+ except Exception as e:
628
+ print(f"Error selecting first frame: {e}")
629
+ return image_gallery, f"Error selecting first frame: {e}", ""
depth_anything_3/app/modules/file_handlers.py ADDED
@@ -0,0 +1,304 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ File handling module for Depth Anything 3 Gradio app.
17
+
18
+ This module handles file uploads, video processing, and file operations.
19
+ """
20
+
21
+ import os
22
+ import shutil
23
+ import time
24
+ from datetime import datetime
25
+ from typing import List, Optional, Tuple
26
+ import cv2
27
+ from PIL import Image
28
+ from pillow_heif import register_heif_opener
29
+
30
+ register_heif_opener()
31
+
32
+
33
+ class FileHandler:
34
+ """
35
+ Handles file uploads and processing for the Gradio app.
36
+ """
37
+
38
+ def __init__(self):
39
+ """Initialize the file handler."""
40
+
41
+ def handle_uploads(
42
+ self,
43
+ input_video: Optional[str],
44
+ input_images: Optional[List],
45
+ s_time_interval: float = 10.0,
46
+ ) -> Tuple[str, List[str]]:
47
+ """
48
+ Create a new 'target_dir' + 'images' subfolder, and place user-uploaded
49
+ images or extracted frames from video into it.
50
+
51
+ Args:
52
+ input_video: Path to input video file
53
+ input_images: List of input image files
54
+ s_time_interval: Sampling FPS (frames per second) for frame extraction
55
+
56
+ Returns:
57
+ Tuple of (target_dir, image_paths)
58
+ """
59
+ start_time = time.time()
60
+
61
+ # Get workspace directory from environment variable or use default
62
+ workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
63
+ if not os.path.exists(workspace_dir):
64
+ os.makedirs(workspace_dir)
65
+
66
+ # Create input_images subdirectory
67
+ input_images_dir = os.path.join(workspace_dir, "input_images")
68
+ if not os.path.exists(input_images_dir):
69
+ os.makedirs(input_images_dir)
70
+
71
+ # Create a unique folder name within input_images
72
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
73
+ target_dir = os.path.join(input_images_dir, f"session_{timestamp}")
74
+ target_dir_images = os.path.join(target_dir, "images")
75
+
76
+ # Clean up if somehow that folder already exists
77
+ if os.path.exists(target_dir):
78
+ shutil.rmtree(target_dir)
79
+ os.makedirs(target_dir)
80
+ os.makedirs(target_dir_images)
81
+
82
+ image_paths = []
83
+
84
+ # Handle images
85
+ if input_images is not None:
86
+ image_paths.extend(self._process_images(input_images, target_dir_images))
87
+
88
+ # Handle video
89
+ if input_video is not None:
90
+ image_paths.extend(
91
+ self._process_video(input_video, target_dir_images, s_time_interval)
92
+ )
93
+
94
+ # Sort final images for gallery
95
+ image_paths = sorted(image_paths)
96
+
97
+ end_time = time.time()
98
+ print(f"Files copied to {target_dir_images}; took {end_time - start_time:.3f} seconds")
99
+ return target_dir, image_paths
100
+
101
+ def _process_images(self, input_images: List, target_dir_images: str) -> List[str]:
102
+ """
103
+ Process uploaded images.
104
+
105
+ Args:
106
+ input_images: List of input image files
107
+ target_dir_images: Target directory for images
108
+
109
+ Returns:
110
+ List of processed image paths
111
+ """
112
+ image_paths = []
113
+
114
+ for file_data in input_images:
115
+ if isinstance(file_data, dict) and "name" in file_data:
116
+ file_path = file_data["name"]
117
+ else:
118
+ file_path = file_data
119
+
120
+ # Check if the file is a HEIC image
121
+ file_ext = os.path.splitext(file_path)[1].lower()
122
+ if file_ext in [".heic", ".heif"]:
123
+ # Convert HEIC to JPEG for better gallery compatibility
124
+ try:
125
+ with Image.open(file_path) as img:
126
+ # Convert to RGB if necessary (HEIC can have different color modes)
127
+ if img.mode not in ("RGB", "L"):
128
+ img = img.convert("RGB")
129
+
130
+ # Create JPEG filename
131
+ base_name = os.path.splitext(os.path.basename(file_path))[0]
132
+ dst_path = os.path.join(target_dir_images, f"{base_name}.jpg")
133
+
134
+ # Save as JPEG with high quality
135
+ img.save(dst_path, "JPEG", quality=95)
136
+ image_paths.append(dst_path)
137
+ print(
138
+ f"Converted HEIC to JPEG: {os.path.basename(file_path)} -> "
139
+ f"{os.path.basename(dst_path)}"
140
+ )
141
+ except Exception as e:
142
+ print(f"Error converting HEIC file {file_path}: {e}")
143
+ # Fall back to copying as is
144
+ dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
145
+ shutil.copy(file_path, dst_path)
146
+ image_paths.append(dst_path)
147
+ else:
148
+ # Regular image files - copy as is
149
+ dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
150
+ shutil.copy(file_path, dst_path)
151
+ image_paths.append(dst_path)
152
+
153
+ return image_paths
154
+
155
+ def _process_video(
156
+ self, input_video: str, target_dir_images: str, s_time_interval: float
157
+ ) -> List[str]:
158
+ """
159
+ Process video file and extract frames.
160
+
161
+ Args:
162
+ input_video: Path to input video file
163
+ target_dir_images: Target directory for extracted frames
164
+ s_time_interval: Sampling FPS (frames per second) for frame extraction
165
+
166
+ Returns:
167
+ List of extracted frame paths
168
+ """
169
+ image_paths = []
170
+
171
+ if isinstance(input_video, dict) and "name" in input_video:
172
+ video_path = input_video["name"]
173
+ else:
174
+ video_path = input_video
175
+
176
+ vs = cv2.VideoCapture(video_path)
177
+ fps = vs.get(cv2.CAP_PROP_FPS)
178
+ frame_interval = max(1, int(fps / s_time_interval)) # Convert FPS to frame interval
179
+
180
+ count = 0
181
+ video_frame_num = 0
182
+ while True:
183
+ gotit, frame = vs.read()
184
+ if not gotit:
185
+ break
186
+ count += 1
187
+ if count % frame_interval == 0:
188
+ image_path = os.path.join(target_dir_images, f"{video_frame_num:06}.png")
189
+ cv2.imwrite(image_path, frame)
190
+ image_paths.append(image_path)
191
+ video_frame_num += 1
192
+
193
+ return image_paths
194
+
195
+ def update_gallery_on_upload(
196
+ self,
197
+ input_video: Optional[str],
198
+ input_images: Optional[List],
199
+ s_time_interval: float = 10.0,
200
+ ) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
201
+ """
202
+ Handle file uploads and update gallery.
203
+
204
+ Args:
205
+ input_video: Path to input video file
206
+ input_images: List of input image files
207
+ s_time_interval: Sampling FPS (frames per second) for frame extraction
208
+
209
+ Returns:
210
+ Tuple of (reconstruction_output, target_dir, image_paths, log_message)
211
+ """
212
+ if not input_video and not input_images:
213
+ return None, None, None, None
214
+
215
+ target_dir, image_paths = self.handle_uploads(input_video, input_images, s_time_interval)
216
+ return (
217
+ None,
218
+ target_dir,
219
+ image_paths,
220
+ "Upload complete. Click 'Reconstruct' to begin 3D processing.",
221
+ )
222
+
223
+ def load_example_scene(
224
+ self, scene_name: str, examples_dir: str = "examples"
225
+ ) -> Tuple[Optional[str], Optional[str], Optional[List], str]:
226
+ """
227
+ Load a scene from examples directory.
228
+
229
+ Args:
230
+ scene_name: Name of the scene to load
231
+ examples_dir: Path to examples directory
232
+
233
+ Returns:
234
+ Tuple of (reconstruction_output, target_dir, image_paths, log_message)
235
+ """
236
+ from depth_anything_3.app.modules.utils import get_scene_info
237
+
238
+ scenes = get_scene_info(examples_dir)
239
+
240
+ # Find the selected scene
241
+ selected_scene = None
242
+ for scene in scenes:
243
+ if scene["name"] == scene_name:
244
+ selected_scene = scene
245
+ break
246
+
247
+ if selected_scene is None:
248
+ return None, None, None, "Scene not found"
249
+
250
+ # Use fixed directory name for examples (not timestamp-based)
251
+ workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
252
+ input_images_dir = os.path.join(workspace_dir, "input_images")
253
+ if not os.path.exists(input_images_dir):
254
+ os.makedirs(input_images_dir)
255
+
256
+ # Create a fixed folder name based on scene name
257
+ target_dir = os.path.join(input_images_dir, f"example_{scene_name}")
258
+ target_dir_images = os.path.join(target_dir, "images")
259
+
260
+ # Check if already cached (GLB file exists)
261
+ glb_path = os.path.join(target_dir, "scene.glb")
262
+ is_cached = os.path.exists(glb_path)
263
+
264
+ # Create directory if it doesn't exist
265
+ if not os.path.exists(target_dir):
266
+ os.makedirs(target_dir)
267
+ os.makedirs(target_dir_images)
268
+
269
+ # Copy images if directory is new or empty
270
+ if not os.path.exists(target_dir_images) or len(os.listdir(target_dir_images)) == 0:
271
+ os.makedirs(target_dir_images, exist_ok=True)
272
+ image_paths = []
273
+ for file_path in selected_scene["image_files"]:
274
+ dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
275
+ shutil.copy(file_path, dst_path)
276
+ image_paths.append(dst_path)
277
+ else:
278
+ # Use existing images
279
+ image_paths = sorted(
280
+ [
281
+ os.path.join(target_dir_images, f)
282
+ for f in os.listdir(target_dir_images)
283
+ if f.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"))
284
+ ]
285
+ )
286
+
287
+ # Return cached GLB if available
288
+ if is_cached:
289
+ return (
290
+ glb_path, # Return cached reconstruction
291
+ target_dir, # Set target directory
292
+ image_paths, # Set gallery
293
+ f"Loaded cached scene '{scene_name}' with {selected_scene['num_images']} images.",
294
+ )
295
+ else:
296
+ return (
297
+ None, # No cached reconstruction
298
+ target_dir, # Set target directory
299
+ image_paths, # Set gallery
300
+ (
301
+ f"Loaded scene '{scene_name}' with {selected_scene['num_images']} images. "
302
+ "Click 'Reconstruct' to begin 3D processing."
303
+ ),
304
+ )
depth_anything_3/app/modules/model_inference.py ADDED
@@ -0,0 +1,365 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Model inference module for Depth Anything 3 Gradio app.
17
+
18
+ This module handles all model-related operations including inference,
19
+ data processing, and result preparation.
20
+ """
21
+
22
+ import gc
23
+ import glob
24
+ import os
25
+ from typing import Any, Dict, Optional, Tuple
26
+ import numpy as np
27
+ import torch
28
+
29
+ from depth_anything_3.api import DepthAnything3
30
+ from depth_anything_3.utils.export.glb import export_to_glb
31
+ from depth_anything_3.utils.export.gs import export_to_gs_video
32
+
33
+
34
+ # Global cache for model (safe in GPU subprocess with @spaces.GPU)
35
+ # Each subprocess gets its own copy of this global variable
36
+ _MODEL_CACHE = None
37
+
38
+
39
+ class ModelInference:
40
+ """
41
+ Handles model inference and data processing for Depth Anything 3.
42
+ """
43
+
44
+ def __init__(self):
45
+ """Initialize the model inference handler.
46
+
47
+ Note: Do not store model in instance variable to avoid
48
+ cross-process state issues with @spaces.GPU decorator.
49
+ """
50
+ # No instance variables - model cached in global variable
51
+ pass
52
+
53
+ def initialize_model(self, device: str = "cuda"):
54
+ """
55
+ Initialize the DepthAnything3 model using global cache.
56
+
57
+ Optimization: Load model to CPU first, then move to GPU when needed.
58
+ This is faster than reloading from disk each time.
59
+
60
+ This uses a global variable which is safe because @spaces.GPU
61
+ runs in isolated subprocess, each with its own global namespace.
62
+
63
+ Args:
64
+ device: Device to run inference on (will move model to this device)
65
+
66
+ Returns:
67
+ Model instance ready for inference on specified device
68
+ """
69
+ global _MODEL_CACHE
70
+
71
+ if _MODEL_CACHE is None:
72
+ # First time loading in this subprocess
73
+ # Load to CPU first (faster than loading directly to GPU)
74
+ model_dir = os.environ.get(
75
+ "DA3_MODEL_DIR", "depth-anything/DA3NESTED-GIANT-LARGE"
76
+ )
77
+ print(f"🔄 Loading model from {model_dir} to CPU...")
78
+ print(" (Model files are cached on disk)")
79
+ _MODEL_CACHE = DepthAnything3.from_pretrained(model_dir)
80
+ # Load to CPU first (faster, and allows reuse)
81
+ _MODEL_CACHE = _MODEL_CACHE.to("cpu")
82
+ _MODEL_CACHE.eval()
83
+ print("✅ Model loaded to CPU memory (cached in subprocess)")
84
+
85
+ # Move to target device for inference
86
+ if device != "cpu" and next(_MODEL_CACHE.parameters()).device.type != device:
87
+ print(f"🚀 Moving model from {next(_MODEL_CACHE.parameters()).device} to {device}...")
88
+ _MODEL_CACHE = _MODEL_CACHE.to(device)
89
+ print(f"✅ Model ready on {device}")
90
+ elif device == "cpu":
91
+ # Already on CPU or requested CPU
92
+ pass
93
+
94
+ return _MODEL_CACHE
95
+
96
+ def run_inference(
97
+ self,
98
+ target_dir: str,
99
+ filter_black_bg: bool = False,
100
+ filter_white_bg: bool = False,
101
+ process_res_method: str = "upper_bound_resize",
102
+ show_camera: bool = True,
103
+ selected_first_frame: Optional[str] = None,
104
+ save_percentage: float = 30.0,
105
+ num_max_points: int = 1_000_000,
106
+ infer_gs: bool = False,
107
+ gs_trj_mode: str = "extend",
108
+ gs_video_quality: str = "high",
109
+ ) -> Tuple[Any, Dict[int, Dict[str, Any]]]:
110
+ """
111
+ Run DepthAnything3 model inference on images.
112
+
113
+ Args:
114
+ target_dir: Directory containing images
115
+ apply_mask: Whether to apply mask for ambiguous depth classes
116
+ mask_edges: Whether to mask edges
117
+ filter_black_bg: Whether to filter black background
118
+ filter_white_bg: Whether to filter white background
119
+ process_res_method: Method for resizing input images
120
+ show_camera: Whether to show camera in 3D view
121
+ selected_first_frame: Selected first frame filename
122
+ save_percentage: Percentage of points to save (0-100)
123
+ infer_gs: Whether to infer 3D Gaussian Splatting
124
+
125
+ Returns:
126
+ Tuple of (prediction, processed_data)
127
+ """
128
+ print(f"Processing images from {target_dir}")
129
+
130
+ # Device check
131
+ device = "cuda" if torch.cuda.is_available() else "cpu"
132
+ device = torch.device(device)
133
+
134
+ # Initialize model if needed - get model instance (not stored in self)
135
+ model = self.initialize_model(device)
136
+
137
+ # Get image paths
138
+ print("Loading images...")
139
+ image_folder_path = os.path.join(target_dir, "images")
140
+ all_image_paths = sorted(glob.glob(os.path.join(image_folder_path, "*")))
141
+
142
+ # Filter for image files
143
+ image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
144
+ all_image_paths = [
145
+ path
146
+ for path in all_image_paths
147
+ if any(path.lower().endswith(ext) for ext in image_extensions)
148
+ ]
149
+
150
+ print(f"Found {len(all_image_paths)} images")
151
+ print(f"All image paths: {all_image_paths}")
152
+
153
+ # Apply first frame selection logic
154
+ if selected_first_frame:
155
+ # Find the image with matching filename
156
+ selected_path = None
157
+ for path in all_image_paths:
158
+ if os.path.basename(path) == selected_first_frame:
159
+ selected_path = path
160
+ break
161
+
162
+ if selected_path:
163
+ # Move selected frame to the front
164
+ image_paths = [selected_path] + [
165
+ path for path in all_image_paths if path != selected_path
166
+ ]
167
+ print(f"User selected first frame: {selected_first_frame} -> {selected_path}")
168
+ print(f"Reordered image paths: {image_paths}")
169
+ else:
170
+ # Use default order if no match found
171
+ image_paths = all_image_paths
172
+ print(
173
+ f"Selected frame '{selected_first_frame}' not found in image paths. "
174
+ "Using default order."
175
+ )
176
+ first_frame_display = image_paths[0] if image_paths else "No images"
177
+ print(f"Using default order (first frame): {first_frame_display}")
178
+ else:
179
+ # Use default order (sorted)
180
+ image_paths = all_image_paths
181
+ first_frame_display = image_paths[0] if image_paths else "No images"
182
+ print(f"Using default order (first frame): {first_frame_display}")
183
+
184
+ if len(image_paths) == 0:
185
+ raise ValueError("No images found. Check your upload.")
186
+
187
+ # Map UI options to actual method names
188
+ method_mapping = {"high_res": "lower_bound_resize", "low_res": "upper_bound_resize"}
189
+ actual_method = method_mapping.get(process_res_method, "upper_bound_crop")
190
+
191
+ # Run model inference
192
+ print(f"Running inference with method: {actual_method}")
193
+ with torch.no_grad():
194
+ prediction = model.inference(
195
+ image_paths, export_dir=None, process_res_method=actual_method, infer_gs=infer_gs
196
+ )
198
+ export_to_glb(
199
+ prediction,
200
+ filter_black_bg=filter_black_bg,
201
+ filter_white_bg=filter_white_bg,
202
+ export_dir=target_dir,
203
+ show_cameras=show_camera,
204
+ conf_thresh_percentile=save_percentage,
205
+ num_max_points=int(num_max_points),
206
+ )
207
+
208
+ # export to gs video if needed
209
+ if infer_gs:
210
+ mode_mapping = {"extend": "extend", "smooth": "interpolate_smooth"}
211
+ print(f"GS mode: {gs_trj_mode}; Backend mode: {mode_mapping[gs_trj_mode]}")
212
+ export_to_gs_video(
213
+ prediction,
214
+ export_dir=target_dir,
215
+ chunk_size=4,
216
+ trj_mode=mode_mapping.get(gs_trj_mode, "extend"),
217
+ enable_tqdm=True,
218
+ vis_depth="hcat",
219
+ video_quality=gs_video_quality,
220
+ )
221
+
222
+ # Save predictions.npz for caching metric depth data
223
+ self._save_predictions_cache(target_dir, prediction)
224
+
225
+ # Process results
226
+ processed_data = self._process_results(target_dir, prediction, image_paths)
227
+
228
+ # CRITICAL: Move all CUDA tensors to CPU before returning
229
+ # This prevents CUDA initialization in main process during unpickling
230
+ prediction = self._move_prediction_to_cpu(prediction)
231
+
232
+ # Clean up
233
+ torch.cuda.empty_cache()
234
+
235
+ return prediction, processed_data
236
+
237
+ def _save_predictions_cache(self, target_dir: str, prediction: Any) -> None:
238
+ """
239
+ Save predictions data to predictions.npz for caching.
240
+
241
+ Args:
242
+ target_dir: Directory to save the cache
243
+ prediction: Model prediction object
244
+ """
245
+ try:
246
+ output_file = os.path.join(target_dir, "predictions.npz")
247
+
248
+ # Build save dict with prediction data
249
+ save_dict = {}
250
+
251
+ # Save processed images if available
252
+ if prediction.processed_images is not None:
253
+ save_dict["images"] = prediction.processed_images
254
+
255
+ # Save depth data
256
+ if prediction.depth is not None:
257
+ save_dict["depths"] = np.round(prediction.depth, 6)
258
+
259
+ # Save confidence if available
260
+ if prediction.conf is not None:
261
+ save_dict["conf"] = np.round(prediction.conf, 2)
262
+
263
+ # Save camera parameters
264
+ if prediction.extrinsics is not None:
265
+ save_dict["extrinsics"] = prediction.extrinsics
266
+ if prediction.intrinsics is not None:
267
+ save_dict["intrinsics"] = prediction.intrinsics
268
+
269
+ # Save to file
270
+ np.savez_compressed(output_file, **save_dict)
271
+ print(f"Saved predictions cache to: {output_file}")
272
+
273
+ except Exception as e:
274
+ print(f"Warning: Failed to save predictions cache: {e}")
275
+
276
+ def _process_results(
277
+ self, target_dir: str, prediction: Any, image_paths: list
278
+ ) -> Dict[int, Dict[str, Any]]:
279
+ """
280
+ Process model results into structured data.
281
+
282
+ Args:
283
+ target_dir: Directory containing results
284
+ prediction: Model prediction object
285
+ image_paths: List of input image paths
286
+
287
+ Returns:
288
+ Dictionary containing processed data for each view
289
+ """
290
+ processed_data = {}
291
+
292
+ # Read generated depth visualization files
293
+ depth_vis_dir = os.path.join(target_dir, "depth_vis")
294
+
295
+ if os.path.exists(depth_vis_dir):
296
+ depth_files = sorted(glob.glob(os.path.join(depth_vis_dir, "*.jpg")))
297
+ for i, depth_file in enumerate(depth_files):
298
+ # Use processed images directly from API
299
+ processed_image = None
300
+ if prediction.processed_images is not None and i < len(
301
+ prediction.processed_images
302
+ ):
303
+ processed_image = prediction.processed_images[i]
304
+
305
+ processed_data[i] = {
306
+ "depth_image": depth_file,
307
+ "image": processed_image,
308
+ "original_image_path": image_paths[i] if i < len(image_paths) else None,
309
+ "depth": prediction.depth[i] if i < len(prediction.depth) else None,
310
+ "intrinsics": (
311
+ prediction.intrinsics[i]
312
+ if prediction.intrinsics is not None and i < len(prediction.intrinsics)
313
+ else None
314
+ ),
315
+ "mask": None, # No mask information available
316
+ }
317
+
318
+ return processed_data
319
+
320
+ def _move_prediction_to_cpu(self, prediction: Any) -> Any:
321
+ """
322
+ Move all CUDA tensors in prediction to CPU for safe pickling.
323
+
324
+ This is REQUIRED for HF Spaces with @spaces.GPU decorator to avoid
325
+ CUDA initialization in the main process during unpickling.
326
+
327
+ Args:
328
+ prediction: Prediction object that may contain CUDA tensors
329
+
330
+ Returns:
331
+ Prediction object with all tensors moved to CPU
332
+ """
333
+ # Move gaussians tensors to CPU
334
+ if hasattr(prediction, 'gaussians') and prediction.gaussians is not None:
335
+ gaussians = prediction.gaussians
336
+
337
+ # Move each tensor attribute to CPU
338
+ tensor_attrs = ['means', 'scales', 'rotations', 'harmonics', 'opacities']
339
+ for attr in tensor_attrs:
340
+ if hasattr(gaussians, attr):
341
+ tensor = getattr(gaussians, attr)
342
+ if isinstance(tensor, torch.Tensor) and tensor.is_cuda:
343
+ setattr(gaussians, attr, tensor.cpu())
344
+ print(f" ✓ Moved gaussians.{attr} to CPU")
345
+
346
+ # Move any tensors in aux dict to CPU
347
+ if hasattr(prediction, 'aux') and prediction.aux is not None:
348
+ for key, value in list(prediction.aux.items()):
349
+ if isinstance(value, torch.Tensor) and value.is_cuda:
350
+ prediction.aux[key] = value.cpu()
351
+ print(f" ✓ Moved aux['{key}'] to CPU")
352
+ elif isinstance(value, dict):
353
+ # Recursively handle nested dicts
354
+ for k, v in list(value.items()):
355
+ if isinstance(v, torch.Tensor) and v.is_cuda:
356
+ value[k] = v.cpu()
357
+ print(f" ✓ Moved aux['{key}']['{k}'] to CPU")
358
+
359
+ return prediction
360
+
361
+ def cleanup(self) -> None:
362
+ """Clean up GPU memory."""
363
+ if torch.cuda.is_available():
364
+ torch.cuda.empty_cache()
365
+ gc.collect()
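
The `_move_prediction_to_cpu` step above exists because results returned from a `@spaces.GPU` worker are pickled back to the main process, which must not initialize CUDA. A minimal sketch of the same pattern for generic nested containers (the `detach_to_cpu` helper is illustrative and not part of this module):

# Illustrative helper mirroring the "move everything to CPU before pickling"
# pattern of _move_prediction_to_cpu above; not part of this module.
import torch

def detach_to_cpu(obj):
    """Recursively move any CUDA tensors inside dicts/lists/tuples to CPU."""
    if isinstance(obj, torch.Tensor):
        return obj.detach().cpu() if obj.is_cuda else obj
    if isinstance(obj, dict):
        return {k: detach_to_cpu(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(detach_to_cpu(v) for v in obj)
    return obj  # plain Python objects pass through unchanged

payload = {"depth": torch.rand(2, 14, 14), "aux": [torch.rand(3)]}
payload = detach_to_cpu(payload)  # now safe to pickle across processes
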
depth_anything_3/app/modules/ui_components.py ADDED
@@ -0,0 +1,474 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ UI components module for Depth Anything 3 Gradio app.
17
+
18
+ This module contains UI component definitions and layout functions.
19
+ """
20
+
21
+ import os
22
+ from typing import Any, Dict, List, Tuple
23
+ import gradio as gr
24
+
25
+ from depth_anything_3.app.modules.utils import get_logo_base64, get_scene_info
26
+
27
+
28
+ class UIComponents:
29
+ """
30
+ Handles UI component creation and layout for the Gradio app.
31
+ """
32
+
33
+ def __init__(self):
34
+ """Initialize the UI components handler."""
35
+
36
+ def create_upload_section(self) -> Tuple[gr.Video, gr.Slider, gr.File, gr.Gallery, gr.Button]:
37
+ """
38
+ Create the upload section with video, images, and gallery components.
39
+
40
+ Returns:
41
+ A tuple of Gradio components: (input_video, s_time_interval, input_images,
42
+ image_gallery, select_first_frame_btn).
43
+ """
44
+ input_video = gr.Video(label="Upload Video", interactive=True)
45
+ s_time_interval = gr.Slider(
46
+ minimum=0.1,
47
+ maximum=60,
48
+ value=10,
49
+ step=0.1,
50
+ label="Sampling FPS (Frames Per Second)",
51
+ interactive=True,
52
+ visible=True,
53
+ )
54
+ input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)
55
+ image_gallery = gr.Gallery(
56
+ label="Preview",
57
+ columns=4,
58
+ height="300px",
59
+ show_download_button=True,
60
+ object_fit="contain",
61
+ preview=True,
62
+ interactive=False,
63
+ )
64
+
65
+ # Select first frame button (moved below image gallery)
66
+ select_first_frame_btn = gr.Button("Select First Frame", scale=1)
67
+
68
+ return input_video, s_time_interval, input_images, image_gallery, select_first_frame_btn
69
+
70
+ def create_3d_viewer_section(self) -> gr.Model3D:
71
+ """
72
+ Create the 3D viewer component.
73
+
74
+ Returns:
75
+ 3D model viewer component
76
+ """
77
+ return gr.Model3D(
78
+ height=520,
79
+ zoom_speed=0.5,
80
+ pan_speed=0.5,
81
+ clear_color=[0.0, 0.0, 0.0, 0.0],
82
+ key="persistent_3d_viewer",
83
+ elem_id="reconstruction_3d_viewer",
84
+ )
85
+
86
+ def create_nvs_video(self) -> Tuple[gr.Video, gr.Markdown]:
87
+ """
88
+ Create the 3DGS rendered video display component and info message.
89
+
90
+ Returns:
91
+ Tuple of (video component, info message component)
92
+ """
93
+ with gr.Column():
94
+ gs_info = gr.Markdown(
95
+ (
96
+ "‼️ **3D Gaussian Splatting rendering is currently DISABLED.** <br><br><br>"
97
+ "To render novel views from 3DGS, "
98
+ "enable **Infer 3D Gaussian Splatting** below. <br>"
99
+ "Next, in **Visualization Options**, "
100
+ "*optionally* configure the **rendering trajectory** (default: smooth) "
101
+ "and **video quality** (default: low), "
102
+ "then click **Reconstruct**."
103
+ ),
104
+ visible=True,
105
+ height=520,
106
+ )
107
+ gs_video = gr.Video(
108
+ height=520,
109
+ label="3DGS Rendered NVS Video (depth shown for reference only)",
110
+ interactive=False,
111
+ visible=False,
112
+ )
113
+ return gs_video, gs_info
114
+
115
+ def create_depth_section(self) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image]:
116
+ """
117
+ Create the depth visualization section.
118
+
119
+ Returns:
120
+ A tuple of (prev_depth_btn, depth_view_selector, next_depth_btn, depth_map)
121
+ """
122
+ with gr.Row(elem_classes=["navigation-row"]):
123
+ prev_depth_btn = gr.Button("◀ Previous", size="sm", scale=1)
124
+ depth_view_selector = gr.Dropdown(
125
+ choices=["View 1"],
126
+ value="View 1",
127
+ label="Select View",
128
+ scale=2,
129
+ interactive=True,
130
+ allow_custom_value=True,
131
+ )
132
+ next_depth_btn = gr.Button("Next ▶", size="sm", scale=1)
133
+ depth_map = gr.Image(
134
+ type="numpy",
135
+ label="Colorized Depth Map",
136
+ format="png",
137
+ interactive=False,
138
+ )
139
+
140
+ return prev_depth_btn, depth_view_selector, next_depth_btn, depth_map
141
+
142
+ def create_measure_section(
143
+ self,
144
+ ) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image, gr.Image, gr.Markdown]:
145
+ """
146
+ Create the measurement section.
147
+
148
+ Returns:
149
+ A tuple of (prev_measure_btn, measure_view_selector, next_measure_btn, measure_image,
150
+ measure_depth_image, measure_text)
151
+ """
152
+ from depth_anything_3.app.css_and_html import MEASURE_INSTRUCTIONS_HTML
153
+
154
+ gr.Markdown(MEASURE_INSTRUCTIONS_HTML)
155
+ with gr.Row(elem_classes=["navigation-row"]):
156
+ prev_measure_btn = gr.Button("◀ Previous", size="sm", scale=1)
157
+ measure_view_selector = gr.Dropdown(
158
+ choices=["View 1"],
159
+ value="View 1",
160
+ label="Select View",
161
+ scale=2,
162
+ interactive=True,
163
+ allow_custom_value=True,
164
+ )
165
+ next_measure_btn = gr.Button("Next ▶", size="sm", scale=1)
166
+ with gr.Row():
167
+ measure_image = gr.Image(
168
+ type="numpy",
169
+ show_label=False,
170
+ format="webp",
171
+ interactive=False,
172
+ sources=[],
173
+ label="RGB Image",
174
+ scale=1,
175
+ height=275,
176
+ )
177
+ measure_depth_image = gr.Image(
178
+ type="numpy",
179
+ show_label=False,
180
+ format="webp",
181
+ interactive=False,
182
+ sources=[],
183
+ label="Depth Visualization (Right Half)",
184
+ scale=1,
185
+ height=275,
186
+ )
187
+ gr.Markdown(
188
+ "**Note:** Images have been adjusted to model processing size. "
189
+ "Click two points on the RGB image to measure distance."
190
+ )
191
+ measure_text = gr.Markdown("")
192
+
193
+ return (
194
+ prev_measure_btn,
195
+ measure_view_selector,
196
+ next_measure_btn,
197
+ measure_image,
198
+ measure_depth_image,
199
+ measure_text,
200
+ )
201
+
202
+ def create_inference_control_section(self) -> Tuple[gr.Dropdown, gr.Checkbox]:
203
+ """
204
+ Create the inference control section (before inference).
205
+
206
+ Returns:
207
+ Tuple of (process_res_method_dropdown, infer_gs)
208
+ """
209
+ with gr.Row():
210
+ process_res_method_dropdown = gr.Dropdown(
211
+ choices=["high_res", "low_res"],
212
+ value="low_res",
213
+ label="Image Processing Method",
214
+ info="low_res for much more images",
215
+ scale=1,
216
+ )
217
218
+ infer_gs = gr.Checkbox(
219
+ label="Infer 3D Gaussian Splatting",
220
+ value=False,
221
+ info=(
222
+ 'Enable novel view rendering from 3DGS (<i class="fas fa-triangle-exclamation '
223
+ 'fa-color-red"></i> requires extra processing time)'
224
+ ),
225
+ scale=1,
226
+ )
227
+
228
+ return (process_res_method_dropdown, infer_gs)
229
+
230
+ def create_display_control_section(
231
+ self,
232
+ ) -> Tuple[
233
+ gr.Checkbox,
234
+ gr.Checkbox,
235
+ gr.Checkbox,
236
+ gr.Slider,
237
+ gr.Slider,
238
+ gr.Dropdown,
239
+ gr.Dropdown,
240
+ gr.Button,
241
+ gr.ClearButton,
242
+ ]:
243
+ """
244
+ Create the display control section (options for visualization).
245
+
246
+ Returns:
247
+ Tuple of display control components including buttons
248
+ """
249
+ with gr.Column():
250
+ # 3DGS options at the top
251
+ with gr.Row():
252
+ gs_trj_mode = gr.Dropdown(
253
+ choices=["smooth", "extend"],
254
+ value="smooth",
255
+ label=("Rendering trajectory for 3DGS viewpoints (requires n_views ≥ 2)"),
256
+ info=("'smooth' for view interpolation; 'extend' for longer trajectory"),
257
+ visible=False, # initially hidden
258
+ )
259
+ gs_video_quality = gr.Dropdown(
260
+ choices=["low", "medium", "high"],
261
+ value="low",
262
+ label=("Video quality for 3DGS rendered outputs"),
263
+ info=("'low' for faster loading speed; 'high' for better visual quality"),
264
+ visible=False, # initially hidden
265
+ )
266
+
267
+ # Reconstruct and Clear buttons (before Visualization Options)
268
+ with gr.Row():
269
+ submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
270
+ clear_btn = gr.ClearButton(scale=1)
271
+
272
+ gr.Markdown("### Visualization Options: (Click Reconstruct to update)")
273
+ show_cam = gr.Checkbox(label="Show Camera", value=True)
274
+ filter_black_bg = gr.Checkbox(label="Filter Black Background", value=False)
275
+ filter_white_bg = gr.Checkbox(label="Filter White Background", value=False)
276
+ save_percentage = gr.Slider(
277
+ minimum=0,
278
+ maximum=100,
279
+ value=10,
280
+ step=1,
281
+ label="Filter Percentage",
282
+ info="Confidence Threshold (%): Higher values filter more points.",
283
+ )
284
+ num_max_points = gr.Slider(
285
+ minimum=1000,
286
+ maximum=100000,
287
+ value=1000,
288
+ step=1000,
289
+ label="Max Points (K points)",
290
+ info="Maximum number of points to export to GLB (in thousands)",
291
+ )
292
+
293
+ return (
294
+ show_cam,
295
+ filter_black_bg,
296
+ filter_white_bg,
297
+ save_percentage,
298
+ num_max_points,
299
+ gs_trj_mode,
300
+ gs_video_quality,
301
+ submit_btn,
302
+ clear_btn,
303
+ )
304
+
305
+ def create_control_section(
306
+ self,
307
+ ) -> Tuple[
308
+ gr.Button,
309
+ gr.ClearButton,
310
+ gr.Dropdown,
311
+ gr.Checkbox,
312
+ gr.Checkbox,
313
+ gr.Checkbox,
314
+ gr.Checkbox,
315
+ gr.Checkbox,
316
+ gr.Dropdown,
317
+ gr.Checkbox,
318
+ gr.Textbox,
319
+ ]:
320
+ """
321
+ Create the control section with buttons and options.
322
+
323
+ Returns:
324
+ Tuple of control components
325
+ """
326
+ with gr.Row():
327
+ submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
328
+ clear_btn = gr.ClearButton(
329
+ scale=1,
330
+ )
331
+
332
+ with gr.Row():
333
+ frame_filter = gr.Dropdown(
334
+ choices=["All"], value="All", label="Show Points from Frame"
335
+ )
336
+ with gr.Column():
337
+ gr.Markdown("### Visualization Option: (Click Reconstruct to update)")
338
+ show_cam = gr.Checkbox(label="Show Camera", value=True)
339
+ show_mesh = gr.Checkbox(label="Show Mesh", value=True)
340
+ filter_black_bg = gr.Checkbox(label="Filter Black Background", value=False)
341
+ filter_white_bg = gr.Checkbox(label="Filter White Background", value=False)
342
+ gr.Markdown("### Reconstruction Options: (updated on next run)")
343
+ apply_mask_checkbox = gr.Checkbox(
344
+ label="Apply mask for predicted ambiguous depth classes & edges",
345
+ value=True,
346
+ )
347
+ process_res_method_dropdown = gr.Dropdown(
348
+ choices=[
349
+ "upper_bound_resize",
350
+ "upper_bound_crop",
351
+ "lower_bound_resize",
352
+ "lower_bound_crop",
353
+ ],
354
+ value="upper_bound_resize",
355
+ label="Image Processing Method",
356
+ info="Method for resizing input images",
357
+ )
358
+ save_to_gallery_checkbox = gr.Checkbox(
359
+ label="Save to Gallery",
360
+ value=False,
361
+ info="Save current reconstruction results to gallery directory",
362
+ )
363
+ gallery_name_input = gr.Textbox(
364
+ label="Gallery Name",
365
+ placeholder="Enter a name for the gallery folder",
366
+ value="",
367
+ info="Leave empty for auto-generated name with timestamp",
368
+ )
369
+
370
+ return (
371
+ submit_btn,
372
+ clear_btn,
373
+ frame_filter,
374
+ show_cam,
375
+ show_mesh,
376
+ filter_black_bg,
377
+ filter_white_bg,
378
+ apply_mask_checkbox,
379
+ process_res_method_dropdown,
380
+ save_to_gallery_checkbox,
381
+ gallery_name_input,
382
+ )
383
+
384
+ def create_example_scenes_section(self) -> List[Dict[str, Any]]:
385
+ """
386
+ Create the example scenes section.
387
+
388
+ Returns:
389
+ List of scene information dictionaries
390
+ """
391
+ # Get workspace directory from environment variable
392
+ workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
393
+ examples_dir = os.path.join(workspace_dir, "examples")
394
+
395
+ # Get scene information
396
+ scenes = get_scene_info(examples_dir)
397
+
398
+ return scenes
399
+
400
+ def create_example_scene_grid(self, scenes: List[Dict[str, Any]]) -> List[gr.Image]:
401
+ """
402
+ Create the example scene grid.
403
+
404
+ Args:
405
+ scenes: List of scene information dictionaries
406
+
407
+ Returns:
408
+ List of scene image components
409
+ """
410
+ scene_components = []
411
+
412
+ if scenes:
413
+ for i in range(0, len(scenes), 4): # Process 4 scenes per row
414
+ with gr.Row():
415
+ for j in range(4):
416
+ scene_idx = i + j
417
+ if scene_idx < len(scenes):
418
+ scene = scenes[scene_idx]
419
+ with gr.Column(scale=1, elem_classes=["clickable-thumbnail"]):
420
+ # Clickable thumbnail
421
+ scene_img = gr.Image(
422
+ value=scene["thumbnail"],
423
+ height=150,
424
+ interactive=False,
425
+ show_label=False,
426
+ elem_id=f"scene_thumb_{scene['name']}",
427
+ sources=[],
428
+ )
429
+ scene_components.append(scene_img)
430
+
431
+ # Scene name and image count as text below thumbnail
432
+ gr.Markdown(
433
+ f"**{scene['name']}** \n {scene['num_images']} images",
434
+ elem_classes=["scene-info"],
435
+ )
436
+ else:
437
+ # Empty column to maintain grid structure
438
+ with gr.Column(scale=1):
439
+ pass
440
+
441
+ return scene_components
442
+
443
+ def create_header_section(self) -> gr.HTML:
444
+ """
445
+ Create the header section with logo and title.
446
+
447
+ Returns:
448
+ Header HTML component
449
+ """
450
+ from depth_anything_3.app.css_and_html import get_header_html
451
+
452
+ return gr.HTML(get_header_html(get_logo_base64()))
453
+
454
+ def create_description_section(self) -> gr.HTML:
455
+ """
456
+ Create the description section.
457
+
458
+ Returns:
459
+ Description HTML component
460
+ """
461
+ from depth_anything_3.app.css_and_html import get_description_html
462
+
463
+ return gr.HTML(get_description_html())
464
+
465
+ def create_acknowledgements_section(self) -> gr.HTML:
466
+ """
467
+ Create the acknowledgements section.
468
+
469
+ Returns:
470
+ Acknowledgements HTML component
471
+ """
472
+ from depth_anything_3.app.css_and_html import get_acknowledgements_html
473
+
474
+ return gr.HTML(get_acknowledgements_html())
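
These factory methods are meant to be composed inside a `gr.Blocks` layout; the actual wiring lives in `gradio_app.py`. A rough sketch assuming only a subset of the components (the layout shown is illustrative, not the real app):

# Rough assembly sketch; the real layout and event wiring live in gradio_app.py
# and may differ from this.
import gradio as gr
from depth_anything_3.app.modules.ui_components import UIComponents

ui = UIComponents()
with gr.Blocks() as demo:
    ui.create_header_section()
    with gr.Row():
        with gr.Column():
            (input_video, s_time_interval, input_images,
             image_gallery, select_first_frame_btn) = ui.create_upload_section()
        with gr.Column():
            viewer_3d = ui.create_3d_viewer_section()
    process_res_method_dropdown, infer_gs = ui.create_inference_control_section()

# demo.launch()  # event handlers would be attached before launching
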
depth_anything_3/app/modules/utils.py ADDED
@@ -0,0 +1,211 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Utility functions for Depth Anything 3 Gradio app.
17
+
18
+ This module contains helper functions for data processing, visualization,
19
+ and file operations.
20
+ """
21
+
22
+ import gc
23
+ import json
24
+ import os
25
+ import shutil
26
+ from datetime import datetime
27
+ from typing import Any, Dict, List, Optional, Tuple
28
+ import numpy as np
29
+ import torch
30
+
31
+
32
+ def create_depth_visualization(depth: np.ndarray) -> Optional[np.ndarray]:
33
+ """
34
+ Create a colored depth visualization.
35
+
36
+ Args:
37
+ depth: Depth array
38
+
39
+ Returns:
40
+ Colored depth visualization or None
41
+ """
42
+ if depth is None:
43
+ return None
44
+
45
+ # Normalize depth to 0-1 range
46
+ depth_min = depth[depth > 0].min() if (depth > 0).any() else 0
47
+ depth_max = depth.max()
48
+
49
+ if depth_max <= depth_min:
50
+ return None
51
+
52
+ # Normalize depth
53
+ depth_norm = (depth - depth_min) / (depth_max - depth_min)
54
+ depth_norm = np.clip(depth_norm, 0, 1)
55
+
56
+ # Apply colormap (using matplotlib's viridis colormap)
57
+ import matplotlib.cm as cm
58
+
59
+ # Convert to colored image
60
+ depth_colored = cm.viridis(depth_norm)[:, :, :3] # Remove alpha channel
61
+ depth_colored = (depth_colored * 255).astype(np.uint8)
62
+
63
+ return depth_colored
64
+
65
+
66
+ def save_to_gallery_func(
67
+ target_dir: str, processed_data: Dict[int, Dict[str, Any]], gallery_name: Optional[str] = None
68
+ ) -> Tuple[bool, str]:
69
+ """
70
+ Save the current reconstruction results to the gallery directory.
71
+
72
+ Args:
73
+ target_dir: Source directory containing reconstruction results
74
+ processed_data: Processed data dictionary
75
+ gallery_name: Name for the gallery folder
76
+
77
+ Returns:
78
+ Tuple of (success, message)
79
+ """
80
+ try:
81
+ # Get gallery directory from environment variable or use default
82
+ gallery_dir = os.environ.get(
83
+ "DA3_GALLERY_DIR",
84
+ "workspace/gallery",
85
+ )
86
+ if not os.path.exists(gallery_dir):
87
+ os.makedirs(gallery_dir)
88
+
89
+ # Use provided name or create a unique name
90
+ if gallery_name is None or gallery_name.strip() == "":
91
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
92
+ gallery_name = f"reconstruction_{timestamp}"
93
+
94
+ gallery_path = os.path.join(gallery_dir, gallery_name)
95
+
96
+ # Check if directory already exists
97
+ if os.path.exists(gallery_path):
98
+ return False, f"Save failed: folder '{gallery_name}' already exists"
99
+
100
+ # Create the gallery directory
101
+ os.makedirs(gallery_path, exist_ok=True)
102
+
103
+ # Copy GLB file
104
+ glb_source = os.path.join(target_dir, "scene.glb")
105
+ glb_dest = os.path.join(gallery_path, "scene.glb")
106
+ if os.path.exists(glb_source):
107
+ shutil.copy2(glb_source, glb_dest)
108
+
109
+ # Copy depth visualization images
110
+ depth_vis_dir = os.path.join(target_dir, "depth_vis")
111
+ if os.path.exists(depth_vis_dir):
112
+ gallery_depth_vis = os.path.join(gallery_path, "depth_vis")
113
+ shutil.copytree(depth_vis_dir, gallery_depth_vis)
114
+
115
+ # Copy original images
116
+ images_source = os.path.join(target_dir, "images")
117
+ if os.path.exists(images_source):
118
+ gallery_images = os.path.join(gallery_path, "images")
119
+ shutil.copytree(images_source, gallery_images)
120
+
121
+ # Copy the scene preview image if it was generated
+ scene_preview_source = os.path.join(target_dir, "scene.jpg")
+ scene_preview_dest = os.path.join(gallery_path, "scene.jpg")
+ if os.path.exists(scene_preview_source):
+ shutil.copy2(scene_preview_source, scene_preview_dest)
124
+
125
+ # Save metadata
126
+ metadata = {
127
+ "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
128
+ "num_images": len(processed_data) if processed_data else 0,
129
+ "gallery_name": gallery_name,
130
+ }
131
+
132
+ with open(os.path.join(gallery_path, "metadata.json"), "w") as f:
133
+ json.dump(metadata, f, indent=2)
134
+
135
+ print(f"Saved reconstruction to gallery: {gallery_path}")
136
+ return True, f"Save successful: saved to {gallery_path}"
137
+
138
+ except Exception as e:
139
+ print(f"Error saving to gallery: {e}")
140
+ return False, f"Save failed: {str(e)}"
141
+
142
+
143
+ def get_scene_info(examples_dir: str) -> List[Dict[str, Any]]:
144
+ """
145
+ Get information about scenes in the examples directory.
146
+
147
+ Args:
148
+ examples_dir: Path to examples directory
149
+
150
+ Returns:
151
+ List of scene information dictionaries
152
+ """
153
+ import glob
154
+
155
+ scenes = []
156
+ if not os.path.exists(examples_dir):
157
+ return scenes
158
+
159
+ for scene_folder in sorted(os.listdir(examples_dir)):
160
+ scene_path = os.path.join(examples_dir, scene_folder)
161
+ if os.path.isdir(scene_path):
162
+ # Find all image files in the scene folder
163
+ image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff", "*.tif"]
164
+ image_files = []
165
+ for ext in image_extensions:
166
+ image_files.extend(glob.glob(os.path.join(scene_path, ext)))
167
+ image_files.extend(glob.glob(os.path.join(scene_path, ext.upper())))
168
+
169
+ if image_files:
170
+ # Sort images and get the first one for thumbnail
171
+ image_files = sorted(image_files)
172
+ first_image = image_files[0]
173
+ num_images = len(image_files)
174
+
175
+ scenes.append(
176
+ {
177
+ "name": scene_folder,
178
+ "path": scene_path,
179
+ "thumbnail": first_image,
180
+ "num_images": num_images,
181
+ "image_files": image_files,
182
+ }
183
+ )
184
+
185
+ return scenes
186
+
187
+
188
+ def cleanup_memory() -> None:
189
+ """Clean up GPU memory and garbage collect."""
190
+ gc.collect()
191
+ if torch.cuda.is_available():
192
+ torch.cuda.empty_cache()
193
+
194
+
195
+ def get_logo_base64() -> Optional[str]:
196
+ """
197
+ Convert WAI logo to base64 for embedding in HTML.
198
+
199
+ Returns:
200
+ Base64 encoded logo string or None
201
+ """
202
+ import base64
203
+
204
+ logo_path = "examples/WAI-Logo/wai_logo.png"
205
+ try:
206
+ with open(logo_path, "rb") as img_file:
207
+ img_data = img_file.read()
208
+ base64_str = base64.b64encode(img_data).decode()
209
+ return f"data:image/png;base64,{base64_str}"
210
+ except FileNotFoundError:
211
+ return None
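
A quick sketch of how `create_depth_visualization` and `get_scene_info` can be used on their own (the depth values and the examples path are placeholders):

# Illustrative usage only; the examples path is the default workspace location
# and may not exist in a given checkout.
import numpy as np
from depth_anything_3.app.modules.utils import create_depth_visualization, get_scene_info

depth = np.random.uniform(0.5, 5.0, size=(240, 320)).astype(np.float32)
colored = create_depth_visualization(depth)  # (240, 320, 3) uint8, or None on degenerate input
print(None if colored is None else colored.shape)

scenes = get_scene_info("gradio_workspace/examples")  # [] if the folder is missing
for scene in scenes:
    print(scene["name"], scene["num_images"])
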
depth_anything_3/app/modules/visualization.py ADDED
@@ -0,0 +1,434 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Visualization module for Depth Anything 3 Gradio app.
17
+
18
+ This module handles visualization updates, navigation, and measurement functionality.
19
+ """
20
+
21
+ import os
22
+ from typing import Any, Dict, List, Optional, Tuple
23
+ import cv2
24
+ import gradio as gr
25
+ import numpy as np
26
+
27
+
28
+ class VisualizationHandler:
29
+ """
30
+ Handles visualization updates and navigation for the Gradio app.
31
+ """
32
+
33
+ def __init__(self):
34
+ """Initialize the visualization handler."""
35
+
36
+ def update_view_selectors(
37
+ self, processed_data: Optional[Dict[int, Dict[str, Any]]]
38
+ ) -> Tuple[gr.Dropdown, gr.Dropdown]:
39
+ """
40
+ Update view selector dropdowns based on available views.
41
+
42
+ Args:
43
+ processed_data: Processed data dictionary
44
+
45
+ Returns:
46
+ Tuple of (depth_view_selector, measure_view_selector)
47
+ """
48
+ if processed_data is None or len(processed_data) == 0:
49
+ choices = ["View 1"]
50
+ else:
51
+ num_views = len(processed_data)
52
+ choices = [f"View {i + 1}" for i in range(num_views)]
53
+
54
+ return (
55
+ gr.Dropdown(choices=choices, value=choices[0]), # depth_view_selector
56
+ gr.Dropdown(choices=choices, value=choices[0]), # measure_view_selector
57
+ )
58
+
59
+ def get_view_data_by_index(
60
+ self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
61
+ ) -> Optional[Dict[str, Any]]:
62
+ """
63
+ Get view data by index, handling bounds.
64
+
65
+ Args:
66
+ processed_data: Processed data dictionary
67
+ view_index: Index of the view to get
68
+
69
+ Returns:
70
+ View data dictionary or None
71
+ """
72
+ if processed_data is None or len(processed_data) == 0:
73
+ return None
74
+
75
+ view_keys = list(processed_data.keys())
76
+ if view_index < 0 or view_index >= len(view_keys):
77
+ view_index = 0
78
+
79
+ return processed_data[view_keys[view_index]]
80
+
81
+ def update_depth_view(
82
+ self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
83
+ ) -> Optional[str]:
84
+ """
85
+ Update depth view for a specific view index.
86
+
87
+ Args:
88
+ processed_data: Processed data dictionary
89
+ view_index: Index of the view to update
90
+
91
+ Returns:
92
+ Path to depth visualization image or None
93
+ """
94
+ view_data = self.get_view_data_by_index(processed_data, view_index)
95
+ if view_data is None or view_data.get("depth_image") is None:
96
+ return None
97
+
98
+ # Return the depth visualization image directly
99
+ return view_data["depth_image"]
100
+
101
+ def navigate_depth_view(
102
+ self,
103
+ processed_data: Optional[Dict[int, Dict[str, Any]]],
104
+ current_selector_value: str,
105
+ direction: int,
106
+ ) -> Tuple[str, Optional[str]]:
107
+ """
108
+ Navigate depth view (direction: -1 for previous, +1 for next).
109
+
110
+ Args:
111
+ processed_data: Processed data dictionary
112
+ current_selector_value: Current selector value
113
+ direction: Direction to navigate (-1 for previous, +1 for next)
114
+
115
+ Returns:
116
+ Tuple of (new_selector_value, depth_vis)
117
+ """
118
+ if processed_data is None or len(processed_data) == 0:
119
+ return "View 1", None
120
+
121
+ # Parse current view number
122
+ try:
123
+ current_view = int(current_selector_value.split()[1]) - 1
124
+ except: # noqa
125
+ current_view = 0
126
+
127
+ num_views = len(processed_data)
128
+ new_view = (current_view + direction) % num_views
129
+
130
+ new_selector_value = f"View {new_view + 1}"
131
+ depth_vis = self.update_depth_view(processed_data, new_view)
132
+
133
+ return new_selector_value, depth_vis
134
+
135
+ def update_measure_view(
136
+ self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
137
+ ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
138
+ """
139
+ Update measure view for a specific view index.
140
+
141
+ Args:
142
+ processed_data: Processed data dictionary
143
+ view_index: Index of the view to update
144
+
145
+ Returns:
146
+ Tuple of (measure_image, depth_right_half, measure_points)
147
+ """
148
+ view_data = self.get_view_data_by_index(processed_data, view_index)
149
+ if view_data is None:
150
+ return None, None, [] # image, depth_right_half, measure_points
151
+
152
+ # Get the processed (resized) image
153
+ if "image" in view_data and view_data["image"] is not None:
154
+ image = view_data["image"].copy()
155
+ else:
156
+ return None, None, []
157
+
158
+ # Ensure image is in uint8 format
159
+ if image.dtype != np.uint8:
160
+ if image.max() <= 1.0:
161
+ image = (image * 255).astype(np.uint8)
162
+ else:
163
+ image = image.astype(np.uint8)
164
+
165
+ # Extract right half of the depth visualization (pure depth part)
166
+ depth_image_path = view_data.get("depth_image", None)
167
+ depth_right_half = None
168
+
169
+ if depth_image_path and os.path.exists(depth_image_path):
170
+ try:
171
+ # Load the combined depth visualization image
172
+ depth_combined = cv2.imread(depth_image_path)
173
+ depth_combined = cv2.cvtColor(depth_combined, cv2.COLOR_BGR2RGB)
174
+ if depth_combined is not None:
175
+ height, width = depth_combined.shape[:2]
176
+ # Extract right half (depth visualization part)
177
+ depth_right_half = depth_combined[:, width // 2 :]
178
+ except Exception as e:
179
+ print(f"Error extracting depth right half: {e}")
180
+
181
+ return image, depth_right_half, []
182
+
183
+ def navigate_measure_view(
184
+ self,
185
+ processed_data: Optional[Dict[int, Dict[str, Any]]],
186
+ current_selector_value: str,
187
+ direction: int,
188
+ ) -> Tuple[str, Optional[np.ndarray], Optional[np.ndarray], List]:
189
+ """
190
+ Navigate measure view (direction: -1 for previous, +1 for next).
191
+
192
+ Args:
193
+ processed_data: Processed data dictionary
194
+ current_selector_value: Current selector value
195
+ direction: Direction to navigate (-1 for previous, +1 for next)
196
+
197
+ Returns:
198
+ Tuple of (new_selector_value, measure_image, depth_right_half, measure_points)
199
+ """
200
+ if processed_data is None or len(processed_data) == 0:
201
+ return "View 1", None, None, []
202
+
203
+ # Parse current view number
204
+ try:
205
+ current_view = int(current_selector_value.split()[1]) - 1
206
+ except: # noqa
207
+ current_view = 0
208
+
209
+ num_views = len(processed_data)
210
+ new_view = (current_view + direction) % num_views
211
+
212
+ new_selector_value = f"View {new_view + 1}"
213
+ measure_image, depth_right_half, measure_points = self.update_measure_view(
214
+ processed_data, new_view
215
+ )
216
+
217
+ return new_selector_value, measure_image, depth_right_half, measure_points
218
+
219
+ def populate_visualization_tabs(
220
+ self, processed_data: Optional[Dict[int, Dict[str, Any]]]
221
+ ) -> Tuple[Optional[str], Optional[np.ndarray], Optional[np.ndarray], List]:
222
+ """
223
+ Populate the depth and measure tabs with processed data.
224
+
225
+ Args:
226
+ processed_data: Processed data dictionary
227
+
228
+ Returns:
229
+ Tuple of (depth_vis, measure_img, depth_right_half, measure_points)
230
+ """
231
+ if processed_data is None or len(processed_data) == 0:
232
+ return None, None, None, []
233
+
234
+ # Use update function to get depth visualization
235
+ depth_vis = self.update_depth_view(processed_data, 0)
236
+ measure_img, depth_right_half, _ = self.update_measure_view(processed_data, 0)
237
+
238
+ return depth_vis, measure_img, depth_right_half, []
239
+
240
+ def reset_measure(
241
+ self, processed_data: Optional[Dict[int, Dict[str, Any]]]
242
+ ) -> Tuple[Optional[np.ndarray], List, str]:
243
+ """
244
+ Reset measure points.
245
+
246
+ Args:
247
+ processed_data: Processed data dictionary
248
+
249
+ Returns:
250
+ Tuple of (image, measure_points, text)
251
+ """
252
+ if processed_data is None or len(processed_data) == 0:
253
+ return None, [], ""
254
+
255
+ # Return the first view image
256
+ first_view = list(processed_data.values())[0]
257
+ return first_view["image"], [], ""
258
+
259
+ def measure(
260
+ self,
261
+ processed_data: Optional[Dict[int, Dict[str, Any]]],
262
+ measure_points: List,
263
+ current_view_selector: str,
264
+ event: gr.SelectData,
265
+ ) -> List:
266
+ """
267
+ Handle measurement on images.
268
+
269
+ Args:
270
+ processed_data: Processed data dictionary
271
+ measure_points: List of current measure points
272
+ current_view_selector: Current view selector value
273
+ event: Gradio select event
274
+
275
+ Returns:
276
+ List of [image, depth_right_half, measure_points, text]
277
+ """
278
+ try:
279
+ print(f"Measure function called with selector: {current_view_selector}")
280
+
281
+ if processed_data is None or len(processed_data) == 0:
282
+ return [None, [], "No data available"]
283
+
284
+ # Use the currently selected view instead of always using the first view
285
+ try:
286
+ current_view_index = int(current_view_selector.split()[1]) - 1
287
+ except: # noqa
288
+ current_view_index = 0
289
+
290
+ print(f"Using view index: {current_view_index}")
291
+
292
+ # Get view data safely
293
+ if current_view_index < 0 or current_view_index >= len(processed_data):
294
+ current_view_index = 0
295
+
296
+ view_keys = list(processed_data.keys())
297
+ current_view = processed_data[view_keys[current_view_index]]
298
+
299
+ if current_view is None:
300
+ return [None, [], "No view data available"]
301
+
302
+ point2d = event.index[0], event.index[1]
303
+ print(f"Clicked point: {point2d}")
304
+
305
+ measure_points.append(point2d)
306
+
307
+ # Get image and depth visualization
308
+ image, depth_right_half, _ = self.update_measure_view(
309
+ processed_data, current_view_index
310
+ )
311
+ if image is None:
312
+ return [None, [], "No image available"]
313
+
314
+ image = image.copy()
315
+
316
+ # Ensure image is in uint8 format for proper cv2 operations
317
+ try:
318
+ if image.dtype != np.uint8:
319
+ if image.max() <= 1.0:
320
+ # Image is in [0, 1] range, convert to [0, 255]
321
+ image = (image * 255).astype(np.uint8)
322
+ else:
323
+ # Image is already in [0, 255] range
324
+ image = image.astype(np.uint8)
325
+ except Exception as e:
326
+ print(f"Image conversion error: {e}")
327
+ return [None, [], f"Image conversion error: {e}"]
328
+
329
+ # Draw circles for points
330
+ try:
331
+ for p in measure_points:
332
+ if 0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]:
333
+ image = cv2.circle(image, p, radius=5, color=(255, 0, 0), thickness=2)
334
+ except Exception as e:
335
+ print(f"Drawing error: {e}")
336
+ return [None, [], f"Drawing error: {e}"]
337
+
338
+ # Get depth information from processed_data
339
+ depth_text = ""
340
+ try:
341
+ for i, p in enumerate(measure_points):
342
+ if (
343
+ current_view["depth"] is not None
344
+ and 0 <= p[1] < current_view["depth"].shape[0]
345
+ and 0 <= p[0] < current_view["depth"].shape[1]
346
+ ):
347
+ d = current_view["depth"][p[1], p[0]]
348
+ depth_text += f"- **P{i + 1} depth: {d:.2f}m**\n"
349
+ else:
350
+ depth_text += f"- **P{i + 1}: Click position ({p[0]}, {p[1]}) - No depth information**\n" # noqa: E501
351
+ except Exception as e:
352
+ print(f"Depth text error: {e}")
353
+ depth_text = f"Error computing depth: {e}\n"
354
+
355
+ if len(measure_points) == 2:
356
+ try:
357
+ point1, point2 = measure_points
358
+ # Draw line
359
+ if (
360
+ 0 <= point1[0] < image.shape[1]
361
+ and 0 <= point1[1] < image.shape[0]
362
+ and 0 <= point2[0] < image.shape[1]
363
+ and 0 <= point2[1] < image.shape[0]
364
+ ):
365
+ image = cv2.line(image, point1, point2, color=(255, 0, 0), thickness=2)
366
+
367
+ # Compute 3D distance using depth information and camera intrinsics
368
+ distance_text = "- **Distance: Unable to calculate 3D distance**"
369
+ if (
370
+ current_view["depth"] is not None
371
+ and 0 <= point1[1] < current_view["depth"].shape[0]
372
+ and 0 <= point1[0] < current_view["depth"].shape[1]
373
+ and 0 <= point2[1] < current_view["depth"].shape[0]
374
+ and 0 <= point2[0] < current_view["depth"].shape[1]
375
+ ):
376
+ try:
377
+ # Get depth values at the two points
378
+ d1 = current_view["depth"][point1[1], point1[0]]
379
+ d2 = current_view["depth"][point2[1], point2[0]]
380
+
381
+ # Convert 2D pixel coordinates to 3D world coordinates
382
+ if current_view["intrinsics"] is not None:
383
+ # Get camera intrinsics
384
+ K = current_view["intrinsics"] # 3x3 intrinsic matrix
385
+ fx, fy = K[0, 0], K[1, 1] # focal lengths
386
+ cx, cy = K[0, 2], K[1, 2] # principal point
387
+
388
+ # Convert pixel coordinates to normalized camera coordinates
389
+ # Point 1: (u1, v1) -> (x1, y1, z1)
390
+ u1, v1 = point1[0], point1[1]
391
+ x1 = (u1 - cx) * d1 / fx
392
+ y1 = (v1 - cy) * d1 / fy
393
+ z1 = d1
394
+
395
+ # Point 2: (u2, v2) -> (x2, y2, z2)
396
+ u2, v2 = point2[0], point2[1]
397
+ x2 = (u2 - cx) * d2 / fx
398
+ y2 = (v2 - cy) * d2 / fy
399
+ z2 = d2
400
+
401
+ # Calculate 3D Euclidean distance
402
+ p1_3d = np.array([x1, y1, z1])
403
+ p2_3d = np.array([x2, y2, z2])
404
+ distance_3d = np.linalg.norm(p1_3d - p2_3d)
405
+
406
+ distance_text = f"- **Distance: {distance_3d:.2f}m**"
407
+ else:
408
+ # Fallback to simplified calculation if no intrinsics
409
+ pixel_distance = np.sqrt(
410
+ (point1[0] - point2[0]) ** 2 + (point1[1] - point2[1]) ** 2
411
+ )
412
+ avg_depth = (d1 + d2) / 2
413
+ scale_factor = avg_depth / 1000 # Rough scaling factor
414
+ estimated_3d_distance = pixel_distance * scale_factor
415
+ distance_text = f"- **Distance: {estimated_3d_distance:.2f}m (estimated, no intrinsics)**" # noqa: E501
416
+
417
+ except Exception as e:
418
+ print(f"Distance computation error: {e}")
419
+ distance_text = f"- **Distance computation error: {e}**"
420
+
421
+ measure_points = []
422
+ text = depth_text + distance_text
423
+ print(f"Measurement complete: {text}")
424
+ return [image, depth_right_half, measure_points, text]
425
+ except Exception as e:
426
+ print(f"Final measurement error: {e}")
427
+ return [None, [], f"Measurement error: {e}"]
428
+ else:
429
+ print(f"Single point measurement: {depth_text}")
430
+ return [image, depth_right_half, measure_points, depth_text]
431
+
432
+ except Exception as e:
433
+ print(f"Overall measure function error: {e}")
434
+ return [None, [], f"Measure function error: {e}"]
depth_anything_3/cfg.py ADDED
@@ -0,0 +1,144 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Configuration utility functions
17
+ """
18
+
19
+ import importlib
20
+ from pathlib import Path
21
+ from typing import Any, Callable, List, Union
22
+ from omegaconf import DictConfig, ListConfig, OmegaConf
23
+
24
+ try:
25
+ OmegaConf.register_new_resolver("eval", eval)
26
+ except Exception as e:
27
+ # if eval is not available, we can just pass
28
+ print(f"Error registering eval resolver: {e}")
29
+
30
+
31
+ def load_config(path: str, argv: List[str] = None) -> Union[DictConfig, ListConfig]:
32
+ """
33
+ Load a configuration. Will resolve inheritance.
34
+ Supports both file paths and module paths (e.g., depth_anything_3.configs.giant).
35
+ """
36
+ # Check if path is a module path (contains dots but no slashes and doesn't end with .yaml)
37
+ if "." in path and "/" not in path and not path.endswith(".yaml"):
38
+ # It's a module path, load from package resources
39
+ path_parts = path.split(".")[1:]
40
+ config_path = Path(__file__).resolve().parent
41
+ for part in path_parts:
42
+ config_path = config_path.joinpath(part)
43
+ config_path = config_path.with_suffix(".yaml")
44
+ config = OmegaConf.load(str(config_path))
45
+ else:
46
+ # It's a file path (absolute, relative, or with .yaml extension)
47
+ config = OmegaConf.load(path)
48
+
49
+ if argv is not None:
50
+ config_argv = OmegaConf.from_dotlist(argv)
51
+ config = OmegaConf.merge(config, config_argv)
52
+ config = resolve_recursive(config, resolve_inheritance)
53
+ return config
54
+
55
+
56
+ def resolve_recursive(
57
+ config: Any,
58
+ resolver: Callable[[Union[DictConfig, ListConfig]], Union[DictConfig, ListConfig]],
59
+ ) -> Any:
60
+ config = resolver(config)
61
+ if isinstance(config, DictConfig):
62
+ for k in config.keys():
63
+ v = config.get(k)
64
+ if isinstance(v, (DictConfig, ListConfig)):
65
+ config[k] = resolve_recursive(v, resolver)
66
+ if isinstance(config, ListConfig):
67
+ for i in range(len(config)):
68
+ v = config.get(i)
69
+ if isinstance(v, (DictConfig, ListConfig)):
70
+ config[i] = resolve_recursive(v, resolver)
71
+ return config
72
+
73
+
74
+ def resolve_inheritance(config: Union[DictConfig, ListConfig]) -> Any:
75
+ """
76
+ Recursively resolve inheritance if the config contains:
77
+ __inherit__: path/to/parent.yaml or a ListConfig of such paths.
78
+ """
79
+ if isinstance(config, DictConfig):
80
+ inherit = config.pop("__inherit__", None)
81
+
82
+ if inherit:
83
+ inherit_list = inherit if isinstance(inherit, ListConfig) else [inherit]
84
+
85
+ parent_config = None
86
+ for parent_path in inherit_list:
87
+ assert isinstance(parent_path, str)
88
+ parent_config = (
89
+ load_config(parent_path)
90
+ if parent_config is None
91
+ else OmegaConf.merge(parent_config, load_config(parent_path))
92
+ )
93
+
94
+ if len(config.keys()) > 0:
95
+ config = OmegaConf.merge(parent_config, config)
96
+ else:
97
+ config = parent_config
98
+ return config
99
+
100
+
101
+ def import_item(path: str, name: str) -> Any:
102
+ """
103
+ Import a python item. Example: import_item("path.to.file", "MyClass") -> MyClass
104
+ """
105
+ return getattr(importlib.import_module(path), name)
106
+
107
+
108
+ def create_object(config: DictConfig) -> Any:
109
+ """
110
+ Create an object from config.
111
+ The config is expected to contain the following:
112
+ __object__:
113
+ path: path.to.module
114
+ name: MyClass
115
+ args: as_config | as_params (default to as_config)
116
+ """
117
+ config = DictConfig(config)
118
+ item = import_item(
119
+ path=config.__object__.path,
120
+ name=config.__object__.name,
121
+ )
122
+ args = config.__object__.get("args", "as_config")
123
+ if args == "as_config":
124
+ return item(config)
125
+ if args == "as_params":
126
+ config = OmegaConf.to_object(config)
127
+ config.pop("__object__")
128
+ return item(**config)
129
+ raise NotImplementedError(f"Unknown args type: {args}")
130
+
131
+
132
+ def create_dataset(path: str, *args, **kwargs) -> Any:
133
+ """
134
+ Create a dataset. Requires the file to contain a "create_dataset" function.
135
+ """
136
+ return import_item(path, "create_dataset")(*args, **kwargs)
137
+
138
+
139
+ def to_dict_recursive(config_obj):
140
+ if isinstance(config_obj, DictConfig):
141
+ return {k: to_dict_recursive(v) for k, v in config_obj.items()}
142
+ elif isinstance(config_obj, ListConfig):
143
+ return [to_dict_recursive(item) for item in config_obj]
144
+ return config_obj
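
As an illustration of the `__inherit__` / `__object__` conventions documented above, a hypothetical pair of config files and their use with `load_config` and `create_object` might look like this (the file names and the `torch.nn.Identity` target are examples, not part of the repository):

# Hypothetical configs demonstrating the __inherit__ / __object__ conventions;
# file names and the torch.nn.Identity target are illustrative only.
from pathlib import Path
from depth_anything_3.cfg import create_object, load_config

Path("parent.yaml").write_text("model:\n  hidden_dim: 256\n")
Path("child.yaml").write_text(
    "__inherit__: parent.yaml\n"
    "model:\n"
    "  __object__:\n"
    "    path: torch.nn\n"
    "    name: Identity\n"
    "    args: as_config\n"
)

cfg = load_config("child.yaml", argv=["model.hidden_dim=128"])  # dotlist override
layer = create_object(cfg.model)  # imports torch.nn.Identity and instantiates it
print(cfg.model.hidden_dim, type(layer).__name__)  # 128 Identity
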
depth_anything_3/cli.py ADDED
@@ -0,0 +1,43 @@
1
+ # flake8: noqa: E402
2
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Refactored Depth Anything 3 CLI
17
+ Clean, modular command-line interface
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import os
23
+ import typer
24
+
25
+ from depth_anything_3.services import start_server
26
+ from depth_anything_3.services.gallery import gallery as gallery_main
27
+ from depth_anything_3.services.inference_service import run_inference
28
+ from depth_anything_3.services.input_handlers import (
29
+ ColmapHandler,
30
+ ImageHandler,
31
+ ImagesHandler,
32
+ InputHandler,
33
+ VideoHandler,
34
+ parse_export_feat,
35
+ )
36
+ from depth_anything_3.utils.constants import DEFAULT_EXPORT_DIR, DEFAULT_GALLERY_DIR, DEFAULT_GRADIO_DIR, DEFAULT_MODEL
37
+
38
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
39
+
40
+ app = typer.Typer(help="Depth Anything 3 - Video depth estimation CLI", add_completion=False)
41
+
42
+
43
+ #
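
The diff is cut off right after the `app = typer.Typer(...)` definition, which is where the subcommands get registered. A hypothetical sketch of registering a command on such an app (the `process` command and its options are illustrative, not the actual DA3 CLI surface):

# Hypothetical example of registering a command on a typer.Typer app;
# the actual Depth Anything 3 commands live in cli.py and may differ.
import typer

app = typer.Typer(help="Example CLI", add_completion=False)

@app.command()
def process(
    input_path: str = typer.Argument(..., help="Image folder or video file"),
    export_dir: str = typer.Option("exports", help="Where to write results"),
):
    """Run inference on INPUT_PATH and write results to EXPORT_DIR."""
    typer.echo(f"Processing {input_path} -> {export_dir}")

if __name__ == "__main__":
    app()
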
depth_anything_3/model/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from depth_anything_3.model.da3 import DepthAnything3Net, NestedDepthAnything3Net
16
+
17
+ __export__ = [
18
+ NestedDepthAnything3Net,
19
+ DepthAnything3Net,
20
+ ]
depth_anything_3/model/cam_dec.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+
18
+
19
+ class CameraDec(nn.Module):
20
+ def __init__(self, dim_in=1536):
21
+ super().__init__()
22
+ output_dim = dim_in
23
+ self.backbone = nn.Sequential(
24
+ nn.Linear(output_dim, output_dim),
25
+ nn.ReLU(),
26
+ nn.Linear(output_dim, output_dim),
27
+ nn.ReLU(),
28
+ )
29
+ self.fc_t = nn.Linear(output_dim, 3)
30
+ self.fc_qvec = nn.Linear(output_dim, 4)
31
+ self.fc_fov = nn.Sequential(nn.Linear(output_dim, 2), nn.ReLU())
32
+
33
+ def forward(self, feat, camera_encoding=None, *args, **kwargs):
34
+ B, N = feat.shape[:2]
35
+ feat = feat.reshape(B * N, -1)
36
+ feat = self.backbone(feat)
37
+ out_t = self.fc_t(feat.float()).reshape(B, N, 3)
38
+ if camera_encoding is None:
39
+ out_qvec = self.fc_qvec(feat.float()).reshape(B, N, 4)
40
+ out_fov = self.fc_fov(feat.float()).reshape(B, N, 2)
41
+ else:
42
+ out_qvec = camera_encoding[..., 3:7]
43
+ out_fov = camera_encoding[..., -2:]
44
+ pose_enc = torch.cat([out_t, out_qvec, out_fov], dim=-1)
45
+ return pose_enc
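
`CameraDec` maps per-view feature vectors to a 9-D pose encoding (3 translation + 4 quaternion + 2 field-of-view values). A quick shape check, assuming random features:

# Shape sketch for CameraDec; batch/view sizes are arbitrary.
import torch
from depth_anything_3.model.cam_dec import CameraDec

decoder = CameraDec(dim_in=1536)
feat = torch.randn(2, 4, 1536)   # (batch, views, feature_dim)
pose_enc = decoder(feat)         # translation (3) + quaternion (4) + fov (2)
print(pose_enc.shape)            # torch.Size([2, 4, 9])
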
depth_anything_3/model/cam_enc.py ADDED
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch.nn as nn
16
+
17
+ from depth_anything_3.model.utils.attention import Mlp
18
+ from depth_anything_3.model.utils.block import Block
19
+ from depth_anything_3.model.utils.transform import extri_intri_to_pose_encoding
20
+ from depth_anything_3.utils.geometry import affine_inverse
21
+
22
+
23
+ class CameraEnc(nn.Module):
24
+ """
25
+ CameraEnc encodes known camera parameters (extrinsics and intrinsics) into token representations.
+
+ The parameters are converted to a pose encoding, projected by an MLP, and refined by a trunk of transformer blocks.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ dim_out: int = 1024,
33
+ dim_in: int = 9,
34
+ trunk_depth: int = 4,
35
+ target_dim: int = 9,
36
+ num_heads: int = 16,
37
+ mlp_ratio: int = 4,
38
+ init_values: float = 0.01,
39
+ **kwargs,
40
+ ):
41
+ super().__init__()
42
+ self.target_dim = target_dim
43
+ self.trunk_depth = trunk_depth
44
+ self.trunk = nn.Sequential(
45
+ *[
46
+ Block(
47
+ dim=dim_out,
48
+ num_heads=num_heads,
49
+ mlp_ratio=mlp_ratio,
50
+ init_values=init_values,
51
+ )
52
+ for _ in range(trunk_depth)
53
+ ]
54
+ )
55
+ self.token_norm = nn.LayerNorm(dim_out)
56
+ self.trunk_norm = nn.LayerNorm(dim_out)
57
+ self.pose_branch = Mlp(
58
+ in_features=dim_in,
59
+ hidden_features=dim_out // 2,
60
+ out_features=dim_out,
61
+ drop=0,
62
+ )
63
+
64
+ def forward(
65
+ self,
66
+ ext,
67
+ ixt,
68
+ image_size,
69
+ ) -> tuple:
70
+ c2ws = affine_inverse(ext)
71
+ pose_encoding = extri_intri_to_pose_encoding(
72
+ c2ws,
73
+ ixt,
74
+ image_size,
75
+ )
76
+ pose_tokens = self.pose_branch(pose_encoding)
77
+ pose_tokens = self.token_norm(pose_tokens)
78
+ pose_tokens = self.trunk(pose_tokens)
79
+ pose_tokens = self.trunk_norm(pose_tokens)
80
+ return pose_tokens
depth_anything_3/model/da3.py ADDED
@@ -0,0 +1,378 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ from addict import Dict
20
+ from omegaconf import DictConfig, OmegaConf
21
+
22
+ from depth_anything_3.cfg import create_object
23
+ from depth_anything_3.model.utils.transform import pose_encoding_to_extri_intri
24
+ from depth_anything_3.utils.alignment import (
25
+ apply_metric_scaling,
26
+ compute_alignment_mask,
27
+ compute_sky_mask,
28
+ least_squares_scale_scalar,
29
+ sample_tensor_for_quantile,
30
+ set_sky_regions_to_max_depth,
31
+ )
32
+ from depth_anything_3.utils.geometry import affine_inverse, as_homogeneous, map_pdf_to_opacity
33
+
34
+
35
+ def _wrap_cfg(cfg_obj):
36
+ return OmegaConf.create(cfg_obj)
37
+
38
+
39
+ class DepthAnything3Net(nn.Module):
40
+ """
41
+ Depth Anything 3 network for depth estimation and camera pose estimation.
42
+
43
+ This network consists of:
44
+ - Backbone: DinoV2 feature extractor
45
+ - Head: DPT or DualDPT for depth prediction
46
+ - Optional camera decoders for pose estimation
47
+ - Optional GSDPT for 3DGS prediction
48
+
49
+ Args:
50
+ net, head, cam_dec, cam_enc, gs_head, gs_adapter: sub-modules (or their configuration dicts) for the backbone, prediction head, camera decoder/encoder, and optional 3DGS head/adapter
51
+
52
+ Returns:
53
+ Dictionary containing:
54
+ - depth: Predicted depth map (B, H, W)
55
+ - depth_conf: Depth confidence map (B, H, W)
56
+ - extrinsics: Camera extrinsics (B, N, 4, 4)
57
+ - intrinsics: Camera intrinsics (B, N, 3, 3)
58
+ - gaussians: 3D Gaussian Splats (world space), type: model.gs_adapter.Gaussians
59
+ - aux: Auxiliary features for specified layers
60
+ """
61
+
62
+ # Patch size for feature extraction
63
+ PATCH_SIZE = 14
64
+
65
+ def __init__(self, net, head, cam_dec=None, cam_enc=None, gs_head=None, gs_adapter=None):
66
+ """
67
+ Initialize DepthAnything3Net with given yaml-initialized configuration.
68
+ """
69
+ super().__init__()
70
+ self.backbone = net if isinstance(net, nn.Module) else create_object(_wrap_cfg(net))
71
+ self.head = head if isinstance(head, nn.Module) else create_object(_wrap_cfg(head))
72
+ self.cam_dec, self.cam_enc = None, None
73
+ if cam_dec is not None:
74
+ self.cam_dec = (
75
+ cam_dec if isinstance(cam_dec, nn.Module) else create_object(_wrap_cfg(cam_dec))
76
+ )
77
+ self.cam_enc = (
78
+ cam_enc if isinstance(cam_enc, nn.Module) else create_object(_wrap_cfg(cam_enc))
79
+ )
80
+ self.gs_adapter, self.gs_head = None, None
81
+ if gs_head is not None and gs_adapter is not None:
82
+ self.gs_adapter = (
83
+ gs_adapter
84
+ if isinstance(gs_adapter, nn.Module)
85
+ else create_object(_wrap_cfg(gs_adapter))
86
+ )
87
+ gs_out_dim = self.gs_adapter.d_in + 1
88
+ if isinstance(gs_head, nn.Module):
89
+ assert (
90
+ gs_head.out_dim == gs_out_dim
91
+ ), f"gs_head.out_dim should be {gs_out_dim}, got {gs_head.out_dim}"
92
+ self.gs_head = gs_head
93
+ else:
94
+ assert (
95
+ gs_head["output_dim"] == gs_out_dim
96
+ ), f"gs_head output_dim should set to {gs_out_dim}, got {gs_head['output_dim']}"
97
+ self.gs_head = create_object(_wrap_cfg(gs_head))
98
+
99
+ def forward(
100
+ self,
101
+ x: torch.Tensor,
102
+ extrinsics: torch.Tensor | None = None,
103
+ intrinsics: torch.Tensor | None = None,
104
+ export_feat_layers: list[int] | None = [],
105
+ infer_gs: bool = False,
106
+ ) -> Dict[str, torch.Tensor]:
107
+ """
108
+ Forward pass through the network.
109
+
110
+ Args:
111
+ x: Input images (B, N, 3, H, W)
112
+ extrinsics: Optional camera extrinsics (B, N, 4, 4); when provided, they are encoded into camera tokens
113
+ intrinsics: Optional camera intrinsics (B, N, 3, 3); used together with extrinsics
114
+ export_feat_layers: List of backbone layer indices to export auxiliary features from
115
+
116
+ Returns:
117
+ Dictionary containing predictions and auxiliary features
118
+ """
119
+ # Extract features using backbone
120
+ if extrinsics is not None:
121
+ with torch.autocast(device_type=x.device.type, enabled=False):
122
+ cam_token = self.cam_enc(extrinsics, intrinsics, x.shape[-2:])
123
+ else:
124
+ cam_token = None
125
+
126
+ feats, aux_feats = self.backbone(
127
+ x, cam_token=cam_token, export_feat_layers=export_feat_layers
128
+ )
129
+ # feats = [[item for item in feat] for feat in feats]
130
+ H, W = x.shape[-2], x.shape[-1]
131
+
132
+ # Process features through depth head
133
+ with torch.autocast(device_type=x.device.type, enabled=False):
134
+ output = self._process_depth_head(feats, H, W)
135
+ output = self._process_camera_estimation(feats, H, W, output)
136
+ if infer_gs:
137
+ output = self._process_gs_head(feats, H, W, output, x, extrinsics, intrinsics)
138
+
139
+ # Extract auxiliary features if requested
140
+ output.aux = self._extract_auxiliary_features(aux_feats, export_feat_layers, H, W)
141
+
142
+ return output
143
+
144
+ def _process_depth_head(
145
+ self, feats: list[torch.Tensor], H: int, W: int
146
+ ) -> Dict[str, torch.Tensor]:
147
+ """Process features through the depth prediction head."""
148
+ return self.head(feats, H, W, patch_start_idx=0)
149
+
150
+ def _process_camera_estimation(
151
+ self, feats: list[torch.Tensor], H: int, W: int, output: Dict[str, torch.Tensor]
152
+ ) -> Dict[str, torch.Tensor]:
153
+ """Process camera pose estimation if camera decoder is available."""
154
+ if self.cam_dec is not None:
155
+ pose_enc = self.cam_dec(feats[-1][1])
156
+ # Remove ray information as it's not needed for pose estimation
157
+ if "ray" in output:
158
+ del output.ray
159
+ if "ray_conf" in output:
160
+ del output.ray_conf
161
+
162
+ # Convert pose encoding to extrinsics and intrinsics
163
+ c2w, ixt = pose_encoding_to_extri_intri(pose_enc, (H, W))
164
+ output.extrinsics = affine_inverse(c2w)
165
+ output.intrinsics = ixt
166
+
167
+ return output
168
+
169
+ def _process_gs_head(
170
+ self,
171
+ feats: list[torch.Tensor],
172
+ H: int,
173
+ W: int,
174
+ output: Dict[str, torch.Tensor],
175
+ in_images: torch.Tensor,
176
+ extrinsics: torch.Tensor | None = None,
177
+ intrinsics: torch.Tensor | None = None,
178
+ ) -> Dict[str, torch.Tensor]:
179
+ """Process 3DGS parameters estimation if 3DGS head is available."""
180
+ if self.gs_head is None or self.gs_adapter is None:
181
+ return output
182
+ assert output.get("depth", None) is not None, "must provide MV depth for the GS head."
183
+
184
+ # if GT camera poses are provided, use them
185
+ if extrinsics is not None and intrinsics is not None:
186
+ ctx_extr = extrinsics
187
+ ctx_intr = intrinsics
188
+ else:
189
+ ctx_extr = output.get("extrinsics", None)
190
+ ctx_intr = output.get("intrinsics", None)
191
+ assert (
192
+ ctx_extr is not None and ctx_intr is not None
193
+ ), "must process camera info first if GT is not available"
194
+ gt_extr = extrinsics
195
+ # homo the extr if needed
196
+ ctx_extr = as_homogeneous(ctx_extr)
197
+ if gt_extr is not None:
198
+ gt_extr = as_homogeneous(gt_extr)
199
+
200
+ # forward through the gs_dpt head to get 'camera space' parameters
201
+ gs_outs = self.gs_head(
202
+ feats=feats,
203
+ H=H,
204
+ W=W,
205
+ patch_start_idx=0,
206
+ images=in_images,
207
+ )
208
+ raw_gaussians = gs_outs.raw_gs
209
+ densities = gs_outs.raw_gs_conf
210
+
211
+ # convert to 'world space' 3DGS parameters; ready to export and render
212
+ # gt_extr could be None, and will be used to align the pose scale if available
213
+ gs_world = self.gs_adapter(
214
+ extrinsics=ctx_extr,
215
+ intrinsics=ctx_intr,
216
+ depths=output.depth,
217
+ opacities=map_pdf_to_opacity(densities),
218
+ raw_gaussians=raw_gaussians,
219
+ image_shape=(H, W),
220
+ gt_extrinsics=gt_extr,
221
+ )
222
+ output.gaussians = gs_world
223
+
224
+ return output
225
+
226
+ def _extract_auxiliary_features(
227
+ self, feats: list[torch.Tensor], feat_layers: list[int], H: int, W: int
228
+ ) -> Dict[str, torch.Tensor]:
229
+ """Extract auxiliary features from specified layers."""
230
+ aux_features = Dict()
231
+ assert len(feats) == len(feat_layers)
232
+ for feat, feat_layer in zip(feats, feat_layers):
233
+ # Reshape features to spatial dimensions
234
+ feat_reshaped = feat.reshape(
235
+ [
236
+ feat.shape[0],
237
+ feat.shape[1],
238
+ H // self.PATCH_SIZE,
239
+ W // self.PATCH_SIZE,
240
+ feat.shape[-1],
241
+ ]
242
+ )
243
+ aux_features[f"feat_layer_{feat_layer}"] = feat_reshaped
244
+
245
+ return aux_features
246
+
247
+
248
+ class NestedDepthAnything3Net(nn.Module):
249
+ """
250
+ Nested Depth Anything 3 network with metric scaling capabilities.
251
+
252
+ This network combines two DepthAnything3Net branches:
253
+ - Main branch: Standard depth estimation
254
+ - Metric branch: Metric depth estimation for scaling alignment
255
+
256
+ The network performs depth alignment using least squares scaling
257
+ and handles sky region masking for improved depth estimation.
258
+
259
+ Args:
260
+ anyview: Configuration for the main (any-view) depth estimation branch
261
+ metric: Configuration for the metric depth branch
262
+ """
263
+
264
+ def __init__(self, anyview: DictConfig, metric: DictConfig):
265
+ """
266
+ Initialize NestedDepthAnything3Net with two branches.
267
+
268
+ Args:
269
+ anyview: Configuration for the main (any-view) depth estimation branch
270
+ metric: Configuration for the metric depth branch
271
+ """
272
+ super().__init__()
273
+ self.da3 = create_object(anyview)
274
+ self.da3_metric = create_object(metric)
275
+
276
+ def forward(
277
+ self,
278
+ x: torch.Tensor,
279
+ extrinsics: torch.Tensor | None = None,
280
+ intrinsics: torch.Tensor | None = None,
281
+ export_feat_layers: list[int] | None = [],
282
+ infer_gs: bool = False,
283
+ ) -> Dict[str, torch.Tensor]:
284
+ """
285
+ Forward pass through both branches with metric scaling alignment.
286
+
287
+ Args:
288
+ x: Input images (B, N, 3, H, W)
289
+ extrinsics: Optional camera extrinsics (B, N, 4, 4); forwarded to the main branch
290
+ intrinsics: Optional camera intrinsics (B, N, 3, 3); forwarded to the main branch
291
+ export_feat_layers: List of backbone layer indices to export auxiliary features from
292
+ infer_gs: Whether to also predict 3D Gaussian parameters
293
+
294
+ Returns:
295
+ Dictionary containing aligned depth predictions and camera parameters
296
+ """
297
+ # Get predictions from both branches
298
+ output = self.da3(
299
+ x, extrinsics, intrinsics, export_feat_layers=export_feat_layers, infer_gs=infer_gs
300
+ )
301
+ metric_output = self.da3_metric(x, infer_gs=infer_gs)
302
+
303
+ # Apply metric scaling and alignment
304
+ output = self._apply_metric_scaling(output, metric_output)
305
+ output = self._apply_depth_alignment(output, metric_output)
306
+ output = self._handle_sky_regions(output, metric_output)
307
+
308
+ return output
309
+
310
+ def _apply_metric_scaling(
311
+ self, output: Dict[str, torch.Tensor], metric_output: Dict[str, torch.Tensor]
312
+ ) -> Dict[str, torch.Tensor]:
313
+ """Apply metric scaling to the metric depth output."""
314
+ # Scale metric depth based on camera intrinsics
315
+ metric_output.depth = apply_metric_scaling(
316
+ metric_output.depth,
317
+ output.intrinsics,
318
+ )
319
+ return output
320
+
321
+ def _apply_depth_alignment(
322
+ self, output: Dict[str, torch.Tensor], metric_output: Dict[str, torch.Tensor]
323
+ ) -> Dict[str, torch.Tensor]:
324
+ """Apply depth alignment using least squares scaling."""
325
+ # Compute non-sky mask
326
+ non_sky_mask = compute_sky_mask(metric_output.sky, threshold=0.3)
327
+
328
+ # Ensure we have enough non-sky pixels
329
+ assert non_sky_mask.sum() > 10, "Insufficient non-sky pixels for alignment"
330
+
331
+ # Sample depth confidence for quantile computation
332
+ depth_conf_ns = output.depth_conf[non_sky_mask]
333
+ depth_conf_sampled = sample_tensor_for_quantile(depth_conf_ns, max_samples=100000)
334
+ median_conf = torch.quantile(depth_conf_sampled, 0.5)
335
+
336
+ # Compute alignment mask
337
+ align_mask = compute_alignment_mask(
338
+ output.depth_conf, non_sky_mask, output.depth, metric_output.depth, median_conf
339
+ )
340
+
341
+ # Compute scale factor using least squares
342
+ valid_depth = output.depth[align_mask]
343
+ valid_metric_depth = metric_output.depth[align_mask]
344
+ scale_factor = least_squares_scale_scalar(valid_metric_depth, valid_depth)
345
+
346
+ # Apply scaling to depth and extrinsics
347
+ output.depth *= scale_factor
348
+ output.extrinsics[:, :, :3, 3] *= scale_factor
349
+ output.is_metric = 1
350
+ output.scale_factor = scale_factor.item()
351
+
352
+ return output
353
+
354
+ def _handle_sky_regions(
355
+ self,
356
+ output: Dict[str, torch.Tensor],
357
+ metric_output: Dict[str, torch.Tensor],
358
+ sky_depth_def: float = 200.0,
359
+ ) -> Dict[str, torch.Tensor]:
360
+ """Handle sky regions by setting them to maximum depth."""
361
+ non_sky_mask = compute_sky_mask(metric_output.sky, threshold=0.3)
362
+
363
+ # Compute maximum depth for non-sky regions
364
+ # Use sampling to safely compute quantile on large tensors
365
+ non_sky_depth = output.depth[non_sky_mask]
366
+ if non_sky_depth.numel() > 100000:
367
+ idx = torch.randint(0, non_sky_depth.numel(), (100000,), device=non_sky_depth.device)
368
+ sampled_depth = non_sky_depth[idx]
369
+ else:
370
+ sampled_depth = non_sky_depth
371
+ non_sky_max = min(torch.quantile(sampled_depth, 0.99), sky_depth_def)
372
+
373
+ # Set sky regions to maximum depth and high confidence
374
+ output.depth, output.depth_conf = set_sky_regions_to_max_depth(
375
+ output.depth, output.depth_conf, non_sky_mask, max_depth=non_sky_max
376
+ )
377
+
378
+ return output
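The alignment above relies on a closed-form least-squares scale between the relative and the metric depth. A minimal sketch of that scalar fit, assuming least_squares_scale_scalar(reference, target) solves min_s || s * target - reference ||^2 (the actual helper lives in depth_anything_3.utils.alignment and may differ in details):

import torch

def least_squares_scale_sketch(reference: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    # Closed-form minimizer: s = <reference, target> / <target, target>.
    return (reference * target).sum() / (target * target).sum().clamp_min(1e-8)

relative_depth = torch.rand(10_000) + 0.1   # stand-in for the main branch's depth
metric_depth = 2.5 * relative_depth         # stand-in for the metric branch's depth
scale = least_squares_scale_sketch(metric_depth, relative_depth)
print(float(scale))  # ~2.5; the real code applies it to both the depth map and camera translations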
depth_anything_3/model/dinov2/dinov2.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+
11
+ from typing import List
12
+ import torch.nn as nn
13
+
14
+ from depth_anything_3.model.dinov2.vision_transformer import (
15
+ vit_base,
16
+ vit_giant2,
17
+ vit_large,
18
+ vit_small,
19
+ )
20
+
21
+
22
+ class DinoV2(nn.Module):
23
+ def __init__(
24
+ self,
25
+ name: str,
26
+ out_layers: List[int],
27
+ alt_start: int = -1,
28
+ qknorm_start: int = -1,
29
+ rope_start: int = -1,
30
+ cat_token: bool = True,
31
+ **kwargs,
32
+ ):
33
+ super().__init__()
34
+ assert name in {"vits", "vitb", "vitl", "vitg"}
35
+ self.name = name
36
+ self.out_layers = out_layers
37
+ self.alt_start = alt_start
38
+ self.qknorm_start = qknorm_start
39
+ self.rope_start = rope_start
40
+ self.cat_token = cat_token
41
+ encoder_map = {
42
+ "vits": vit_small,
43
+ "vitb": vit_base,
44
+ "vitl": vit_large,
45
+ "vitg": vit_giant2,
46
+ }
47
+ encoder_fn = encoder_map[self.name]
48
+ ffn_layer = "swiglufused" if self.name == "vitg" else "mlp"
49
+ self.pretrained = encoder_fn(
50
+ img_size=518,
51
+ patch_size=14,
52
+ ffn_layer=ffn_layer,
53
+ alt_start=alt_start,
54
+ qknorm_start=qknorm_start,
55
+ rope_start=rope_start,
56
+ cat_token=cat_token,
57
+ )
58
+
59
+ def forward(self, x, **kwargs):
60
+ return self.pretrained.get_intermediate_layers(
61
+ x,
62
+ self.out_layers,
63
+ **kwargs,
64
+ )
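A hypothetical instantiation of the wrapper above (the layer indices are arbitrary picks for a 24-block ViT-L and would normally come from the model preset; weights here are randomly initialized):

import torch
from depth_anything_3.model.dinov2.dinov2 import DinoV2

backbone = DinoV2(name="vitl", out_layers=[4, 11, 17, 23])
images = torch.randn(1, 2, 3, 518, 518)  # (B scenes, S views, C, H, W); H and W multiples of 14
feats, aux = backbone(images)            # feats: one (patch_tokens, camera_token) pair per tapped layer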
depth_anything_3/model/dinov2/layers/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # from .attention import MemEffAttention
8
+ from .block import Block
9
+ from .layer_scale import LayerScale
10
+ from .mlp import Mlp
11
+ from .patch_embed import PatchEmbed
12
+ from .rope import PositionGetter, RotaryPositionEmbedding2D
13
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
14
+
15
+ __all__ = [
16
+ "Mlp",
17
+ "PatchEmbed",
18
+ "SwiGLUFFN",
19
+ "SwiGLUFFNFused",
20
+ "Block",
21
+ # "MemEffAttention",
22
+ "LayerScale",
23
+ "PositionGetter",
24
+ "RotaryPositionEmbedding2D",
25
+ ]
depth_anything_3/model/dinov2/layers/attention.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+ import torch.nn.functional as F
13
+ from torch import Tensor, nn
14
+
15
+ logger = logging.getLogger("dinov2")
16
+
17
+
18
+ class Attention(nn.Module):
19
+ def __init__(
20
+ self,
21
+ dim: int,
22
+ num_heads: int = 8,
23
+ qkv_bias: bool = False,
24
+ proj_bias: bool = True,
25
+ attn_drop: float = 0.0,
26
+ proj_drop: float = 0.0,
27
+ norm_layer: nn.Module = nn.LayerNorm,
28
+ qk_norm: bool = False,
29
+ fused_attn: bool = True, # use F.scaled_dot_product_attention or not
30
+ rope=None,
31
+ ) -> None:
32
+ super().__init__()
33
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
34
+ self.num_heads = num_heads
35
+ head_dim = dim // num_heads
36
+ self.scale = head_dim**-0.5
37
+ self.fused_attn = fused_attn
38
+
39
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
40
+ self.q_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
41
+ self.k_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
42
+ self.attn_drop = nn.Dropout(attn_drop)
43
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
44
+ self.proj_drop = nn.Dropout(proj_drop)
45
+ self.rope = rope
46
+
47
+ def forward(self, x: Tensor, pos=None, attn_mask=None) -> Tensor:
48
+ B, N, C = x.shape
49
+ qkv = (
50
+ self.qkv(x)
51
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
52
+ .permute(2, 0, 3, 1, 4)
53
+ )
54
+ q, k, v = qkv[0], qkv[1], qkv[2]
55
+ q, k = self.q_norm(q), self.k_norm(k)
56
+ if self.rope is not None and pos is not None:
57
+ q = self.rope(q, pos)
58
+ k = self.rope(k, pos)
59
+ if self.fused_attn:
60
+ x = F.scaled_dot_product_attention(
61
+ q,
62
+ k,
63
+ v,
64
+ dropout_p=self.attn_drop.p if self.training else 0.0,
65
+ attn_mask=(
66
+ (attn_mask)[:, None].repeat(1, self.num_heads, 1, 1)
67
+ if attn_mask is not None
68
+ else None
69
+ ),
70
+ )
71
+ else:
72
+ q = q * self.scale
73
+ attn = q @ k.transpose(-2, -1)
74
+ attn = attn.softmax(dim=-1)
75
+ attn = self.attn_drop(attn)
76
+ x = attn @ v
77
+
78
+ x = x.transpose(1, 2).reshape(B, N, C)
79
+ x = self.proj(x)
80
+ x = self.proj_drop(x)
81
+ return x
82
+
83
+ def _forward(self, x: Tensor) -> Tensor:
84
+ B, N, C = x.shape
85
+ qkv = (
86
+ self.qkv(x)
87
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
88
+ .permute(2, 0, 3, 1, 4)
89
+ )
90
+
91
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
92
+ attn = q @ k.transpose(-2, -1)
93
+
94
+ attn = attn.softmax(dim=-1)
95
+ attn = self.attn_drop(attn)
96
+
97
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
98
+ x = self.proj(x)
99
+ x = self.proj_drop(x)
100
+ return x
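A quick shape check for the attention layer above (values are arbitrary; with the default fused_attn=True the forward path dispatches to F.scaled_dot_product_attention):

import torch
from depth_anything_3.model.dinov2.layers.attention import Attention

attn = Attention(dim=64, num_heads=8, qkv_bias=True)
tokens = torch.randn(2, 16, 64)   # (batch, tokens, channels)
out = attn(tokens)                # rotary embedding is skipped because pos is None
assert out.shape == tokens.shape  # (2, 16, 64)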
depth_anything_3/model/dinov2/layers/block.py ADDED
@@ -0,0 +1,143 @@
1
+ # flake8: noqa: F821
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # References:
9
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
10
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
11
+
12
+ import logging
13
+ from typing import Callable, Optional
14
+ import torch
15
+ from torch import Tensor, nn
16
+
17
+ from .attention import Attention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+ logger = logging.getLogger("dinov2")
23
+ XFORMERS_AVAILABLE = True
24
+
25
+
26
+ class Block(nn.Module):
27
+ def __init__(
28
+ self,
29
+ dim: int,
30
+ num_heads: int,
31
+ mlp_ratio: float = 4.0,
32
+ qkv_bias: bool = False,
33
+ proj_bias: bool = True,
34
+ ffn_bias: bool = True,
35
+ drop: float = 0.0,
36
+ attn_drop: float = 0.0,
37
+ init_values=None,
38
+ drop_path: float = 0.0,
39
+ act_layer: Callable[..., nn.Module] = nn.GELU,
40
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
41
+ attn_class: Callable[..., nn.Module] = Attention,
42
+ ffn_layer: Callable[..., nn.Module] = Mlp,
43
+ qk_norm: bool = False,
44
+ rope=None,
45
+ ln_eps: float = 1e-6,
46
+ ) -> None:
47
+ super().__init__()
48
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
49
+ self.norm1 = norm_layer(dim, eps=ln_eps)
50
+ self.attn = attn_class(
51
+ dim,
52
+ num_heads=num_heads,
53
+ qkv_bias=qkv_bias,
54
+ proj_bias=proj_bias,
55
+ attn_drop=attn_drop,
56
+ proj_drop=drop,
57
+ qk_norm=qk_norm,
58
+ rope=rope,
59
+ )
60
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
61
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
62
+
63
+ self.norm2 = norm_layer(dim, eps=ln_eps)
64
+ mlp_hidden_dim = int(dim * mlp_ratio)
65
+ self.mlp = ffn_layer(
66
+ in_features=dim,
67
+ hidden_features=mlp_hidden_dim,
68
+ act_layer=act_layer,
69
+ drop=drop,
70
+ bias=ffn_bias,
71
+ )
72
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
73
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
74
+
75
+ self.sample_drop_ratio = drop_path
76
+
77
+ def forward(self, x: Tensor, pos=None, attn_mask=None) -> Tensor:
78
+ def attn_residual_func(x: Tensor, pos=None, attn_mask=None) -> Tensor:
79
+ return self.ls1(self.attn(self.norm1(x), pos=pos, attn_mask=attn_mask))
80
+
81
+ def ffn_residual_func(x: Tensor) -> Tensor:
82
+ return self.ls2(self.mlp(self.norm2(x)))
83
+
84
+ if self.training and self.sample_drop_ratio > 0.1:
85
+ # the overhead is compensated only for a drop path rate larger than 0.1
86
+ x = drop_add_residual_stochastic_depth(
87
+ x,
88
+ residual_func=attn_residual_func,
89
+ sample_drop_ratio=self.sample_drop_ratio,
90
+ pos=pos,
91
+ )
92
+ x = drop_add_residual_stochastic_depth(
93
+ x,
94
+ residual_func=ffn_residual_func,
95
+ sample_drop_ratio=self.sample_drop_ratio,
96
+ )
97
+ elif self.training and self.sample_drop_ratio > 0.0:
98
+ x = x + self.drop_path1(attn_residual_func(x, pos=pos, attn_mask=attn_mask))
99
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
100
+ else:
101
+ x = x + attn_residual_func(x, pos=pos, attn_mask=attn_mask)
102
+ x = x + ffn_residual_func(x)
103
+ return x
104
+
105
+
106
+ def drop_add_residual_stochastic_depth(
107
+ x: Tensor,
108
+ residual_func: Callable[[Tensor], Tensor],
109
+ sample_drop_ratio: float = 0.0,
110
+ pos: Optional[Tensor] = None,
111
+ ) -> Tensor:
112
+ # 1) extract subset using permutation
113
+ b, n, d = x.shape
114
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
115
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
116
+ x_subset = x[brange]
117
+
118
+ # 2) apply residual_func to get residual
119
+ if pos is not None:
120
+ # if necessary, apply rope to the subset
121
+ pos = pos[brange]
122
+ residual = residual_func(x_subset, pos=pos)
123
+ else:
124
+ residual = residual_func(x_subset)
125
+
126
+ x_flat = x.flatten(1)
127
+ residual = residual.flatten(1)
128
+
129
+ residual_scale_factor = b / sample_subset_size
130
+
131
+ # 3) add the residual
132
+ x_plus_residual = torch.index_add(
133
+ x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor
134
+ )
135
+ return x_plus_residual.view_as(x)
136
+
137
+
138
+ def get_branges_scales(x, sample_drop_ratio=0.0):
139
+ b, n, d = x.shape
140
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
141
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
142
+ residual_scale_factor = b / sample_subset_size
143
+ return brange, residual_scale_factor
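The subset trick in drop_add_residual_stochastic_depth keeps the residual unbiased: only part of the batch receives a residual, which is then scaled up accordingly. The book-keeping for a batch of 8 with a drop ratio of 0.25 (numbers are illustrative):

b, sample_drop_ratio = 8, 0.25
sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)  # 6 samples receive a residual
residual_scale_factor = b / sample_subset_size                 # 8 / 6 ~= 1.33
# Each kept sample's residual is scaled by ~1.33, so in expectation every sample
# still receives a full-strength residual update.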
depth_anything_3/model/dinov2/layers/drop_path.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+
12
+ from torch import nn
13
+
14
+
15
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16
+ if drop_prob == 0.0 or not training:
17
+ return x
18
+ keep_prob = 1 - drop_prob
19
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21
+ if keep_prob > 0.0:
22
+ random_tensor.div_(keep_prob)
23
+ output = x * random_tensor
24
+ return output
25
+
26
+
27
+ class DropPath(nn.Module):
28
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29
+
30
+ def __init__(self, drop_prob=None):
31
+ super().__init__()
32
+ self.drop_prob = drop_prob
33
+
34
+ def forward(self, x):
35
+ return drop_path(x, self.drop_prob, self.training)
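The same expectation-preserving rescaling, shown directly for drop_path: kept samples are divided by keep_prob, so the expected output equals the input (a small illustrative snippet):

import torch

x = torch.ones(4, 3)
keep_prob = 0.8
mask = x.new_empty(4, 1).bernoulli_(keep_prob).div_(keep_prob)  # entries are 0 or 1.25
y = x * mask  # dropped rows are zeroed, kept rows are scaled up; E[y] == x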
depth_anything_3/model/dinov2/layers/layer_scale.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 # noqa: E501
8
+
9
+ from typing import Union
10
+ import torch
11
+ from torch import Tensor, nn
12
+
13
+
14
+ class LayerScale(nn.Module):
15
+ def __init__(
16
+ self,
17
+ dim: int,
18
+ init_values: Union[float, Tensor] = 1e-5,
19
+ inplace: bool = False,
20
+ ) -> None:
21
+ super().__init__()
22
+ self.dim = dim
23
+ self.inplace = inplace
24
+ self.init_values = init_values
25
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
29
+
30
+ def extra_repr(self) -> str:
31
+ return f"{self.dim}, init_values={self.init_values}, inplace={self.inplace}"
depth_anything_3/model/dinov2/layers/mlp.py ADDED
@@ -0,0 +1,40 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10
+
11
+
12
+ from typing import Callable, Optional
13
+ from torch import Tensor, nn
14
+
15
+
16
+ class Mlp(nn.Module):
17
+ def __init__(
18
+ self,
19
+ in_features: int,
20
+ hidden_features: Optional[int] = None,
21
+ out_features: Optional[int] = None,
22
+ act_layer: Callable[..., nn.Module] = nn.GELU,
23
+ drop: float = 0.0,
24
+ bias: bool = True,
25
+ ) -> None:
26
+ super().__init__()
27
+ out_features = out_features or in_features
28
+ hidden_features = hidden_features or in_features
29
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
30
+ self.act = act_layer()
31
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
32
+ self.drop = nn.Dropout(drop)
33
+
34
+ def forward(self, x: Tensor) -> Tensor:
35
+ x = self.fc1(x)
36
+ x = self.act(x)
37
+ x = self.drop(x)
38
+ x = self.fc2(x)
39
+ x = self.drop(x)
40
+ return x
depth_anything_3/model/dinov2/layers/patch_embed.py ADDED
@@ -0,0 +1,94 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ from typing import Callable, Optional, Tuple, Union
12
+ import torch.nn as nn
13
+ from torch import Tensor
14
+
15
+
16
+ def make_2tuple(x):
17
+ if isinstance(x, tuple):
18
+ assert len(x) == 2
19
+ return x
20
+
21
+ assert isinstance(x, int)
22
+ return (x, x)
23
+
24
+
25
+ class PatchEmbed(nn.Module):
26
+ """
27
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
28
+
29
+ Args:
30
+ img_size: Image size.
31
+ patch_size: Patch token size.
32
+ in_chans: Number of input image channels.
33
+ embed_dim: Number of linear projection output channels.
34
+ norm_layer: Normalization layer.
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ img_size: Union[int, Tuple[int, int]] = 224,
40
+ patch_size: Union[int, Tuple[int, int]] = 16,
41
+ in_chans: int = 3,
42
+ embed_dim: int = 768,
43
+ norm_layer: Optional[Callable] = None,
44
+ flatten_embedding: bool = True,
45
+ ) -> None:
46
+ super().__init__()
47
+
48
+ image_HW = make_2tuple(img_size)
49
+ patch_HW = make_2tuple(patch_size)
50
+ patch_grid_size = (
51
+ image_HW[0] // patch_HW[0],
52
+ image_HW[1] // patch_HW[1],
53
+ )
54
+
55
+ self.img_size = image_HW
56
+ self.patch_size = patch_HW
57
+ self.patches_resolution = patch_grid_size
58
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
59
+
60
+ self.in_chans = in_chans
61
+ self.embed_dim = embed_dim
62
+
63
+ self.flatten_embedding = flatten_embedding
64
+
65
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
66
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
67
+
68
+ def forward(self, x: Tensor) -> Tensor:
69
+ _, _, H, W = x.shape
70
+ patch_H, patch_W = self.patch_size
71
+
72
+ assert (
73
+ H % patch_H == 0
74
+ ), f"Input image height {H} is not a multiple of patch height {patch_H}"
75
+ assert (
76
+ W % patch_W == 0
77
+ ), f"Input image width {W} is not a multiple of patch width: {patch_W}"
78
+
79
+ x = self.proj(x) # B C H W
80
+ H, W = x.size(2), x.size(3)
81
+ x = x.flatten(2).transpose(1, 2) # B HW C
82
+ x = self.norm(x)
83
+ if not self.flatten_embedding:
84
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
85
+ return x
86
+
87
+ def flops(self) -> float:
88
+ Ho, Wo = self.patches_resolution
89
+ flops = (
90
+ Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
91
+ )
92
+ if self.norm is not None:
93
+ flops += Ho * Wo * self.embed_dim
94
+ return flops
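With the 518x518 inputs and 14x14 patches used elsewhere in this repository, the projection yields 37 x 37 = 1369 tokens per image. A small shape check (the embed_dim value is illustrative):

import torch
from depth_anything_3.model.dinov2.layers.patch_embed import PatchEmbed

embed = PatchEmbed(img_size=518, patch_size=14, in_chans=3, embed_dim=1024)
x = torch.randn(1, 3, 518, 518)
tokens = embed(x)
print(tokens.shape)  # torch.Size([1, 1369, 1024]); 518 // 14 = 37 patches per side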
depth_anything_3/model/dinov2/layers/rope.py ADDED
@@ -0,0 +1,200 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ # Implementation of 2D Rotary Position Embeddings (RoPE).
8
+
9
+ # This module provides a clean implementation of 2D Rotary Position Embeddings,
10
+ # which extends the original RoPE concept to handle 2D spatial positions.
11
+
12
+ # Inspired by:
13
+ # https://github.com/meta-llama/codellama/blob/main/llama/model.py
14
+ # https://github.com/naver-ai/rope-vit
15
+
16
+
17
+ from typing import Dict, Tuple
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+
22
+
23
+ class PositionGetter:
24
+ """Generates and caches 2D spatial positions for patches in a grid.
25
+
26
+ This class efficiently manages the generation of spatial coordinates for patches
27
+ in a 2D grid, caching results to avoid redundant computations.
28
+
29
+ Attributes:
30
+ position_cache: Dictionary storing precomputed position tensors for different
31
+ grid dimensions.
32
+ """
33
+
34
+ def __init__(self):
35
+ """Initializes the position generator with an empty cache."""
36
+ self.position_cache: Dict[Tuple[int, int], torch.Tensor] = {}
37
+
38
+ def __call__(
39
+ self, batch_size: int, height: int, width: int, device: torch.device
40
+ ) -> torch.Tensor:
41
+ """Generates spatial positions for a batch of patches.
42
+
43
+ Args:
44
+ batch_size: Number of samples in the batch.
45
+ height: Height of the grid in patches.
46
+ width: Width of the grid in patches.
47
+ device: Target device for the position tensor.
48
+
49
+ Returns:
50
+ Tensor of shape (batch_size, height*width, 2) containing y,x coordinates
51
+ for each position in the grid, repeated for each batch item.
52
+ """
53
+ if (height, width) not in self.position_cache:
54
+ y_coords = torch.arange(height, device=device)
55
+ x_coords = torch.arange(width, device=device)
56
+ positions = torch.cartesian_prod(y_coords, x_coords)
57
+ self.position_cache[height, width] = positions
58
+
59
+ cached_positions = self.position_cache[height, width]
60
+ return cached_positions.view(1, height * width, 2).expand(batch_size, -1, -1).clone()
61
+
62
+
63
+ class RotaryPositionEmbedding2D(nn.Module):
64
+ """2D Rotary Position Embedding implementation.
65
+
66
+ This module applies rotary position embeddings to input tokens based on their
67
+ 2D spatial positions. It handles the position-dependent rotation of features
68
+ separately for vertical and horizontal dimensions.
69
+
70
+ Args:
71
+ frequency: Base frequency for the position embeddings. Default: 100.0
72
+ scaling_factor: Scaling factor for frequency computation. Default: 1.0
73
+
74
+ Attributes:
75
+ base_frequency: Base frequency for computing position embeddings.
76
+ scaling_factor: Factor to scale the computed frequencies.
77
+ frequency_cache: Cache for storing precomputed frequency components.
78
+ """
79
+
80
+ def __init__(self, frequency: float = 100.0, scaling_factor: float = 1.0):
81
+ """Initializes the 2D RoPE module."""
82
+ super().__init__()
83
+ self.base_frequency = frequency
84
+ self.scaling_factor = scaling_factor
85
+ self.frequency_cache: Dict[Tuple, Tuple[torch.Tensor, torch.Tensor]] = {}
86
+
87
+ def _compute_frequency_components(
88
+ self, dim: int, seq_len: int, device: torch.device, dtype: torch.dtype
89
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
90
+ """Computes frequency components for rotary embeddings.
91
+
92
+ Args:
93
+ dim: Feature dimension (must be even).
94
+ seq_len: Maximum sequence length.
95
+ device: Target device for computations.
96
+ dtype: Data type for the computed tensors.
97
+
98
+ Returns:
99
+ Tuple of (cosine, sine) tensors for frequency components.
100
+ """
101
+ cache_key = (dim, seq_len, device, dtype)
102
+ if cache_key not in self.frequency_cache:
103
+ # Compute frequency bands
104
+ exponents = torch.arange(0, dim, 2, device=device).float() / dim
105
+ inv_freq = 1.0 / (self.base_frequency**exponents)
106
+
107
+ # Generate position-dependent frequencies
108
+ positions = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
109
+ angles = torch.einsum("i,j->ij", positions, inv_freq)
110
+
111
+ # Compute and cache frequency components
112
+ angles = angles.to(dtype)
113
+ angles = torch.cat((angles, angles), dim=-1)
114
+ cos_components = angles.cos().to(dtype)
115
+ sin_components = angles.sin().to(dtype)
116
+ self.frequency_cache[cache_key] = (cos_components, sin_components)
117
+
118
+ return self.frequency_cache[cache_key]
119
+
120
+ @staticmethod
121
+ def _rotate_features(x: torch.Tensor) -> torch.Tensor:
122
+ """Performs feature rotation by splitting and recombining feature dimensions.
123
+
124
+ Args:
125
+ x: Input tensor to rotate.
126
+
127
+ Returns:
128
+ Rotated feature tensor.
129
+ """
130
+ feature_dim = x.shape[-1]
131
+ x1, x2 = x[..., : feature_dim // 2], x[..., feature_dim // 2 :]
132
+ return torch.cat((-x2, x1), dim=-1)
133
+
134
+ def _apply_1d_rope(
135
+ self,
136
+ tokens: torch.Tensor,
137
+ positions: torch.Tensor,
138
+ cos_comp: torch.Tensor,
139
+ sin_comp: torch.Tensor,
140
+ ) -> torch.Tensor:
141
+ """Applies 1D rotary position embeddings along one dimension.
142
+
143
+ Args:
144
+ tokens: Input token features.
145
+ positions: Position indices.
146
+ cos_comp: Cosine components for rotation.
147
+ sin_comp: Sine components for rotation.
148
+
149
+ Returns:
150
+ Tokens with applied rotary position embeddings.
151
+ """
152
+ # Embed positions with frequency components
153
+ cos = F.embedding(positions, cos_comp)[:, None, :, :]
154
+ sin = F.embedding(positions, sin_comp)[:, None, :, :]
155
+ # Apply rotation
156
+ return (tokens * cos) + (self._rotate_features(tokens) * sin)
157
+
158
+ def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
159
+ """Applies 2D rotary position embeddings to input tokens.
160
+
161
+ Args:
162
+ tokens: Input tensor of shape (batch_size, n_heads, n_tokens, dim).
163
+ The feature dimension (dim) must be divisible by 4.
164
+ positions: Position tensor of shape (batch_size, n_tokens, 2) containing
165
+ the y and x coordinates for each token.
166
+
167
+ Returns:
168
+ Tensor of same shape as input with applied 2D rotary position embeddings.
169
+
170
+ Raises:
171
+ AssertionError: If input dimensions are invalid or positions are malformed.
172
+ """
173
+ # Validate inputs
174
+ assert tokens.size(-1) % 2 == 0, "Feature dimension must be even"
175
+ assert (
176
+ positions.ndim == 3 and positions.shape[-1] == 2
177
+ ), "Positions must have shape (batch_size, n_tokens, 2)"
178
+
179
+ # Compute feature dimension for each spatial direction
180
+ feature_dim = tokens.size(-1) // 2
181
+
182
+ # Get frequency components
183
+ max_position = int(positions.max()) + 1
184
+ cos_comp, sin_comp = self._compute_frequency_components(
185
+ feature_dim, max_position, tokens.device, tokens.dtype
186
+ )
187
+
188
+ # Split features for vertical and horizontal processing
189
+ vertical_features, horizontal_features = tokens.chunk(2, dim=-1)
190
+
191
+ # Apply RoPE separately for each dimension
192
+ vertical_features = self._apply_1d_rope(
193
+ vertical_features, positions[..., 0], cos_comp, sin_comp
194
+ )
195
+ horizontal_features = self._apply_1d_rope(
196
+ horizontal_features, positions[..., 1], cos_comp, sin_comp
197
+ )
198
+
199
+ # Combine processed features
200
+ return torch.cat((vertical_features, horizontal_features), dim=-1)
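A hypothetical usage sketch of the two classes above; the head dimension must be divisible by 4 because the features are split once per spatial axis and once more for the rotation pair:

import torch
from depth_anything_3.model.dinov2.layers.rope import PositionGetter, RotaryPositionEmbedding2D

rope = RotaryPositionEmbedding2D(frequency=100.0)
positions = PositionGetter()(batch_size=2, height=4, width=6, device=torch.device("cpu"))

q = torch.randn(2, 8, 4 * 6, 64)  # (batch, heads, tokens, head_dim); 64 % 4 == 0
q_rotated = rope(q, positions)    # same shape, with spatial position baked into the features
assert q_rotated.shape == q.shape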
depth_anything_3/model/dinov2/layers/swiglu_ffn.py ADDED
@@ -0,0 +1,62 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Callable, Optional
8
+ import torch.nn.functional as F
9
+ from torch import Tensor, nn
10
+
11
+
12
+ class SwiGLUFFN(nn.Module):
13
+ def __init__(
14
+ self,
15
+ in_features: int,
16
+ hidden_features: Optional[int] = None,
17
+ out_features: Optional[int] = None,
18
+ act_layer: Callable[..., nn.Module] = None,
19
+ drop: float = 0.0,
20
+ bias: bool = True,
21
+ ) -> None:
22
+ super().__init__()
23
+ out_features = out_features or in_features
24
+ hidden_features = hidden_features or in_features
25
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
26
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
27
+
28
+ def forward(self, x: Tensor) -> Tensor:
29
+ x12 = self.w12(x)
30
+ x1, x2 = x12.chunk(2, dim=-1)
31
+ hidden = F.silu(x1) * x2
32
+ return self.w3(hidden)
33
+
34
+
35
+ try:
36
+ from xformers.ops import SwiGLU
37
+
38
+ XFORMERS_AVAILABLE = True
39
+ except ImportError:
40
+ SwiGLU = SwiGLUFFN
41
+ XFORMERS_AVAILABLE = False
42
+
43
+
44
+ class SwiGLUFFNFused(SwiGLU):
45
+ def __init__(
46
+ self,
47
+ in_features: int,
48
+ hidden_features: Optional[int] = None,
49
+ out_features: Optional[int] = None,
50
+ act_layer: Callable[..., nn.Module] = None,
51
+ drop: float = 0.0,
52
+ bias: bool = True,
53
+ ) -> None:
54
+ out_features = out_features or in_features
55
+ hidden_features = hidden_features or in_features
56
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
57
+ super().__init__(
58
+ in_features=in_features,
59
+ hidden_features=hidden_features,
60
+ out_features=out_features,
61
+ bias=bias,
62
+ )
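The fused variant above shrinks the requested hidden width by two thirds (SwiGLU uses two input projections instead of one) and rounds up to a multiple of 8. For the usual mlp_ratio=4 setting at the ViT-giant width this works out to:

embed_dim, mlp_ratio = 1536, 4
hidden_features = embed_dim * mlp_ratio                     # 6144 requested
fused_hidden = (int(hidden_features * 2 / 3) + 7) // 8 * 8  # 4096 actually used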
depth_anything_3/model/dinov2/vision_transformer.py ADDED
@@ -0,0 +1,437 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ import math
11
+ from typing import Callable, List, Sequence, Tuple, Union
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.utils.checkpoint
16
+ from einops import rearrange
17
+
18
+ from depth_anything_3.utils.logger import logger
19
+
20
+ from .layers import LayerScale # noqa: F401
21
+ from .layers import Mlp # noqa: F401
22
+ from .layers import ( # noqa: F401
23
+ Block,
24
+ PatchEmbed,
25
+ PositionGetter,
26
+ RotaryPositionEmbedding2D,
27
+ SwiGLUFFNFused,
28
+ )
29
+
30
+ # logger = logging.getLogger("dinov2")
31
+
32
+
33
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
34
+ """
35
+ embed_dim: output dimension for each position
36
+ pos: a list of positions to be encoded: size (M,)
37
+ out: (M, D)
38
+ """
39
+ assert embed_dim % 2 == 0
40
+ omega = np.arange(embed_dim // 2, dtype=float)
41
+ omega /= embed_dim / 2.0
42
+ omega = 1.0 / 10000**omega # (D/2,)
43
+
44
+ pos = pos.reshape(-1) # (M,)
45
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
46
+
47
+ emb_sin = np.sin(out) # (M, D/2)
48
+ emb_cos = np.cos(out) # (M, D/2)
49
+
50
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
51
+ return emb
52
+
53
+
54
+ def named_apply(
55
+ fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False
56
+ ) -> nn.Module:
57
+ if not depth_first and include_root:
58
+ fn(module=module, name=name)
59
+ for child_name, child_module in module.named_children():
60
+ child_name = ".".join((name, child_name)) if name else child_name
61
+ named_apply(
62
+ fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True
63
+ )
64
+ if depth_first and include_root:
65
+ fn(module=module, name=name)
66
+ return module
67
+
68
+
69
+ class BlockChunk(nn.ModuleList):
70
+ def forward(self, x):
71
+ for b in self:
72
+ x = b(x)
73
+ return x
74
+
75
+
76
+ class DinoVisionTransformer(nn.Module):
77
+ def __init__(
78
+ self,
79
+ img_size=224,
80
+ patch_size=16,
81
+ in_chans=3,
82
+ embed_dim=768,
83
+ depth=12,
84
+ num_heads=12,
85
+ mlp_ratio=4.0,
86
+ qkv_bias=True,
87
+ ffn_bias=True,
88
+ proj_bias=True,
89
+ drop_path_rate=0.0,
90
+ drop_path_uniform=False,
91
+ init_values=1.0, # for layerscale: None or 0 => no layerscale
92
+ embed_layer=PatchEmbed,
93
+ act_layer=nn.GELU,
94
+ block_fn=Block,
95
+ ffn_layer="mlp",
96
+ block_chunks=1,
97
+ num_register_tokens=0,
98
+ interpolate_antialias=False,
99
+ interpolate_offset=0.1,
100
+ alt_start=-1,
101
+ qknorm_start=-1,
102
+ rope_start=-1,
103
+ rope_freq=100,
104
+ plus_cam_token=False,
105
+ cat_token=True,
106
+ ):
107
+ """
108
+ Args:
109
+ img_size (int, tuple): input image size
110
+ patch_size (int, tuple): patch size
111
+ in_chans (int): number of input channels
112
+ embed_dim (int): embedding dimension
113
+ depth (int): depth of transformer
114
+ num_heads (int): number of attention heads
115
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
116
+ qkv_bias (bool): enable bias for qkv if True
117
+ proj_bias (bool): enable bias for proj in attn if True
118
+ ffn_bias (bool): enable bias for ffn if True
119
+ drop_path_rate (float): stochastic depth rate
120
+ init_values (float): layer-scale init values
121
+ embed_layer (nn.Module): patch embedding layer
122
+ act_layer (nn.Module): MLP activation layer
123
+ block_fn (nn.Module): transformer block class
124
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
125
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
126
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
127
+ interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating
128
+ positional embeddings
129
+ interpolate_offset: (float) work-around offset to apply when interpolating
130
+ positional embeddings
131
+ alt_start / qknorm_start / rope_start: (int) block index from which alternating global attention, QK normalization, and 2D RoPE are enabled (-1 disables each)
132
+ """
133
+ super().__init__()
134
+ self.patch_start_idx = 1
135
+ norm_layer = nn.LayerNorm
136
+ self.num_features = self.embed_dim = (
137
+ embed_dim # num_features for consistency with other models
138
+ )
139
+ self.alt_start = alt_start
140
+ self.qknorm_start = qknorm_start
141
+ self.rope_start = rope_start
142
+ self.cat_token = cat_token
143
+ self.num_tokens = 1
144
+ self.n_blocks = depth
145
+ self.num_heads = num_heads
146
+ self.patch_size = patch_size
147
+ self.num_register_tokens = num_register_tokens
148
+ self.interpolate_antialias = interpolate_antialias
149
+ self.interpolate_offset = interpolate_offset
150
+
151
+ self.patch_embed = embed_layer(
152
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim
153
+ )
154
+ num_patches = self.patch_embed.num_patches
155
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
156
+ if self.alt_start != -1:
157
+ self.camera_token = nn.Parameter(torch.randn(1, 2, embed_dim))
158
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
159
+ assert num_register_tokens >= 0
160
+ self.register_tokens = (
161
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim))
162
+ if num_register_tokens
163
+ else None
164
+ )
165
+
166
+ if drop_path_uniform is True:
167
+ dpr = [drop_path_rate] * depth
168
+ else:
169
+ dpr = [
170
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
171
+ ] # stochastic depth decay rule
172
+ if ffn_layer == "mlp":
173
+ logger.info("using MLP layer as FFN")
174
+ ffn_layer = Mlp
175
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
176
+ logger.info("using SwiGLU layer as FFN")
177
+ ffn_layer = SwiGLUFFNFused
178
+ elif ffn_layer == "identity":
179
+ logger.info("using Identity layer as FFN")
180
+
181
+ def f(*args, **kwargs):
182
+ return nn.Identity()
183
+
184
+ ffn_layer = f
185
+ else:
186
+ raise NotImplementedError
187
+
188
+ if self.rope_start != -1:
189
+ self.rope = RotaryPositionEmbedding2D(frequency=rope_freq) if rope_freq > 0 else None
190
+ self.position_getter = PositionGetter() if self.rope is not None else None
191
+ else:
192
+ self.rope = None
193
+ blocks_list = [
194
+ block_fn(
195
+ dim=embed_dim,
196
+ num_heads=num_heads,
197
+ mlp_ratio=mlp_ratio,
198
+ qkv_bias=qkv_bias,
199
+ proj_bias=proj_bias,
200
+ ffn_bias=ffn_bias,
201
+ drop_path=dpr[i],
202
+ norm_layer=norm_layer,
203
+ act_layer=act_layer,
204
+ ffn_layer=ffn_layer,
205
+ init_values=init_values,
206
+ qk_norm=i >= qknorm_start if qknorm_start != -1 else False,
207
+ rope=self.rope if i >= rope_start and rope_start != -1 else None,
208
+ )
209
+ for i in range(depth)
210
+ ]
211
+ self.blocks = nn.ModuleList(blocks_list)
212
+ self.norm = norm_layer(embed_dim)
213
+
214
+ def interpolate_pos_encoding(self, x, w, h):
215
+ previous_dtype = x.dtype
216
+ npatch = x.shape[1] - 1
217
+ N = self.pos_embed.shape[1] - 1
218
+ if npatch == N and w == h:
219
+ return self.pos_embed
220
+ pos_embed = self.pos_embed.float()
221
+ class_pos_embed = pos_embed[:, 0]
222
+ patch_pos_embed = pos_embed[:, 1:]
223
+ dim = x.shape[-1]
224
+ w0 = w // self.patch_size
225
+ h0 = h // self.patch_size
226
+ M = int(math.sqrt(N)) # Recover the number of patches in each dimension
227
+ assert N == M * M
228
+ kwargs = {}
229
+ if self.interpolate_offset:
230
+ # Historical kludge: add a small number to avoid floating point error in the
231
+ # interpolation, see https://github.com/facebookresearch/dino/issues/8
232
+ # Note: still needed for backward-compatibility, the underlying operators are using
233
+ # both output size and scale factors
234
+ sx = float(w0 + self.interpolate_offset) / M
235
+ sy = float(h0 + self.interpolate_offset) / M
236
+ kwargs["scale_factor"] = (sx, sy)
237
+ else:
238
+ # Simply specify an output size instead of a scale factor
239
+ kwargs["size"] = (w0, h0)
240
+ patch_pos_embed = nn.functional.interpolate(
241
+ patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
242
+ mode="bicubic",
243
+ antialias=self.interpolate_antialias,
244
+ **kwargs,
245
+ )
246
+ assert (w0, h0) == patch_pos_embed.shape[-2:]
247
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
248
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
249
+
250
+ def prepare_cls_token(self, B, S):
251
+ cls_token = self.cls_token.expand(B, S, -1)
252
+ cls_token = cls_token.reshape(B * S, -1, self.embed_dim)
253
+ return cls_token
254
+
255
+ def prepare_tokens_with_masks(self, x, masks=None, cls_token=None, **kwargs):
256
+ B, S, nc, w, h = x.shape
257
+ x = rearrange(x, "b s c h w -> (b s) c h w")
258
+ x = self.patch_embed(x)
259
+ if masks is not None:
260
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
261
+ cls_token = self.prepare_cls_token(B, S)
262
+ x = torch.cat((cls_token, x), dim=1)
263
+ x = x + self.interpolate_pos_encoding(x, w, h)
264
+ if self.register_tokens is not None:
265
+ x = torch.cat(
266
+ (
267
+ x[:, :1],
268
+ self.register_tokens.expand(x.shape[0], -1, -1),
269
+ x[:, 1:],
270
+ ),
271
+ dim=1,
272
+ )
273
+ x = rearrange(x, "(b s) n c -> b s n c", b=B, s=S)
274
+ return x
275
+
276
+ def _prepare_rope(self, B, S, H, W, device):
277
+ pos = None
278
+ pos_nodiff = None
279
+ if self.rope is not None:
280
+ pos = self.position_getter(
281
+ B * S, H // self.patch_size, W // self.patch_size, device=device
282
+ )
283
+ pos = rearrange(pos, "(b s) n c -> b s n c", b=B)
284
+ pos_nodiff = torch.zeros_like(pos).to(pos.dtype)
285
+ if self.patch_start_idx > 0:
286
+ pos = pos + 1
287
+ pos_special = torch.zeros(B * S, self.patch_start_idx, 2).to(device).to(pos.dtype)
288
+ pos_special = rearrange(pos_special, "(b s) n c -> b s n c", b=B)
289
+ pos = torch.cat([pos_special, pos], dim=2)
290
+ pos_nodiff = pos_nodiff + 1
291
+ pos_nodiff = torch.cat([pos_special, pos_nodiff], dim=2)
292
+ return pos, pos_nodiff
293
+
294
+ def _get_intermediate_layers_not_chunked(self, x, n=1, export_feat_layers=[], **kwargs):
295
+ B, S, _, H, W = x.shape
296
+ x = self.prepare_tokens_with_masks(x)
297
+ output, total_block_len, aux_output = [], len(self.blocks), []
298
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
299
+ pos, pos_nodiff = self._prepare_rope(B, S, H, W, x.device)
300
+
301
+ for i, blk in enumerate(self.blocks):
302
+ if i < self.rope_start or self.rope is None:
303
+ g_pos, l_pos = None, None
304
+ else:
305
+ g_pos = pos_nodiff
306
+ l_pos = pos
307
+ if self.alt_start != -1 and i == self.alt_start:
308
+ if kwargs.get("cam_token", None) is not None:
309
+ logger.info("Using camera conditions provided by the user")
310
+ cam_token = kwargs.get("cam_token")
311
+ else:
312
+ ref_token = self.camera_token[:, :1].expand(B, -1, -1)
313
+ src_token = self.camera_token[:, 1:].expand(B, S - 1, -1)
314
+ cam_token = torch.cat([ref_token, src_token], dim=1)
315
+ x[:, :, 0] = cam_token
316
+
317
+ if self.alt_start != -1 and i >= self.alt_start and i % 2 == 1:
318
+ x = self.process_attention(
319
+ x, blk, "global", pos=g_pos, attn_mask=kwargs.get("attn_mask", None)
320
+ )
321
+ else:
322
+ x = self.process_attention(x, blk, "local", pos=l_pos)
323
+ local_x = x
324
+
325
+ if i in blocks_to_take:
326
+ out_x = torch.cat([local_x, x], dim=-1) if self.cat_token else x
327
+ output.append((out_x[:, :, 0], out_x))
328
+ if i in export_feat_layers:
329
+ aux_output.append(x)
330
+ return output, aux_output
331
+
332
+ def process_attention(self, x, block, attn_type="global", pos=None, attn_mask=None):
333
+ b, s, n = x.shape[:3]
334
+ if attn_type == "local":
335
+ x = rearrange(x, "b s n c -> (b s) n c")
336
+ if pos is not None:
337
+ pos = rearrange(pos, "b s n c -> (b s) n c")
338
+ elif attn_type == "global":
339
+ x = rearrange(x, "b s n c -> b (s n) c")
340
+ if pos is not None:
341
+ pos = rearrange(pos, "b s n c -> b (s n) c")
342
+ else:
343
+ raise ValueError(f"Invalid attention type: {attn_type}")
344
+
345
+ x = block(x, pos=pos, attn_mask=attn_mask)
346
+
347
+ if attn_type == "local":
348
+ x = rearrange(x, "(b s) n c -> b s n c", b=b, s=s)
349
+ elif attn_type == "global":
350
+ x = rearrange(x, "b (s n) c -> b s n c", b=b, s=s)
351
+ return x
352
+
353
+ def get_intermediate_layers(
354
+ self,
355
+ x: torch.Tensor,
356
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
357
+ export_feat_layers: List[int] = [],
358
+ **kwargs,
359
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
360
+ outputs, aux_outputs = self._get_intermediate_layers_not_chunked(
361
+ x, n, export_feat_layers=export_feat_layers, **kwargs
362
+ )
363
+ camera_tokens = [out[0] for out in outputs]
364
+ if outputs[0][1].shape[-1] == self.embed_dim:
365
+ outputs = [self.norm(out[1]) for out in outputs]
366
+ elif outputs[0][1].shape[-1] == (self.embed_dim * 2):
367
+ outputs = [
368
+ torch.cat(
369
+ [out[1][..., : self.embed_dim], self.norm(out[1][..., self.embed_dim :])],
370
+ dim=-1,
371
+ )
372
+ for out in outputs
373
+ ]
374
+ else:
375
+ raise ValueError(f"Invalid output shape: {outputs[0][1].shape}")
376
+ aux_outputs = [self.norm(out) for out in aux_outputs]
377
+ outputs = [out[..., 1 + self.num_register_tokens :, :] for out in outputs]
378
+ aux_outputs = [out[..., 1 + self.num_register_tokens :, :] for out in aux_outputs]
379
+ return tuple(zip(outputs, camera_tokens)), aux_outputs
380
+
381
+
382
+ def vit_small(patch_size=16, num_register_tokens=0, depth=12, **kwargs):
383
+ model = DinoVisionTransformer(
384
+ patch_size=patch_size,
385
+ embed_dim=384,
386
+ depth=depth,
387
+ num_heads=6,
388
+ mlp_ratio=4,
389
+ # block_fn=partial(Block, attn_class=MemEffAttention),
390
+ num_register_tokens=num_register_tokens,
391
+ **kwargs,
392
+ )
393
+ return model
394
+
395
+
396
+ def vit_base(patch_size=16, num_register_tokens=0, depth=12, **kwargs):
397
+ model = DinoVisionTransformer(
398
+ patch_size=patch_size,
399
+ embed_dim=768,
400
+ depth=depth,
401
+ num_heads=12,
402
+ mlp_ratio=4,
403
+ # block_fn=partial(Block, attn_class=MemEffAttention),
404
+ num_register_tokens=num_register_tokens,
405
+ **kwargs,
406
+ )
407
+ return model
408
+
409
+
410
+ def vit_large(patch_size=16, num_register_tokens=0, depth=24, **kwargs):
411
+ model = DinoVisionTransformer(
412
+ patch_size=patch_size,
413
+ embed_dim=1024,
414
+ depth=depth,
415
+ num_heads=16,
416
+ mlp_ratio=4,
417
+ # block_fn=partial(Block, attn_class=MemEffAttention),
418
+ num_register_tokens=num_register_tokens,
419
+ **kwargs,
420
+ )
421
+ return model
422
+
423
+
424
+ def vit_giant2(patch_size=16, num_register_tokens=0, depth=40, **kwargs):
425
+ """
426
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
427
+ """
428
+ model = DinoVisionTransformer(
429
+ patch_size=patch_size,
430
+ embed_dim=1536,
431
+ depth=depth,
432
+ num_heads=24,
433
+ mlp_ratio=4,
434
+ num_register_tokens=num_register_tokens,
435
+ **kwargs,
436
+ )
437
+ return model
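The local/global alternation in `process_attention` above comes down to how the [B, S, N, C] token grid is flattened before each transformer block: local attention keeps every view separate, while global attention concatenates all views into one long sequence. A minimal illustration with dummy tensors and the same einops patterns (all sizes below are made up for the example):

import torch
from einops import rearrange

B, S, N, C = 2, 4, 261, 1024          # batch, views, tokens per view, token dim (illustrative)
x = torch.randn(B, S, N, C)

# "local": each view attends only to its own tokens
local_in = rearrange(x, "b s n c -> (b s) n c")      # (8, 261, 1024)

# "global": all views share one long token sequence
global_in = rearrange(x, "b s n c -> b (s n) c")     # (2, 1044, 1024)

# after the block, both layouts fold back to the shared [B, S, N, C] shape
local_out = rearrange(local_in, "(b s) n c -> b s n c", b=B, s=S)
global_out = rearrange(global_in, "b (s n) c -> b s n c", b=B, s=S)
assert local_out.shape == global_out.shape == x.shape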
depth_anything_3/model/dpt.py ADDED
@@ -0,0 +1,457 @@
1
+ # flake8: noqa E501
2
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Dict as TyDict
17
+ from typing import List, Sequence, Tuple
18
+ import torch
19
+ import torch.nn as nn
20
+ from addict import Dict
21
+ from einops import rearrange
22
+
23
+ from depth_anything_3.model.utils.head_utils import (
24
+ Permute,
25
+ create_uv_grid,
26
+ custom_interpolate,
27
+ position_grid_to_embed,
28
+ )
29
+
30
+
31
+ class DPT(nn.Module):
32
+ """
33
+ DPT for dense prediction (main head + optional sky head, sky always 1 channel).
34
+
35
+ Returns:
36
+ - Main head:
37
+ * If output_dim>1: { head_name, f"{head_name}_conf" }
38
+ * If output_dim==1: { head_name }
39
+ - Sky head (if use_sky_head=True): { sky_name } # [B, S, 1, H/down_ratio, W/down_ratio]
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ dim_in: int,
45
+ *,
46
+ patch_size: int = 14,
47
+ output_dim: int = 1,
48
+ activation: str = "exp",
49
+ conf_activation: str = "expp1",
50
+ features: int = 256,
51
+ out_channels: Sequence[int] = (256, 512, 1024, 1024),
52
+ pos_embed: bool = False,
53
+ down_ratio: int = 1,
54
+ head_name: str = "depth",
55
+ # ---- sky head (fixed 1 channel) ----
56
+ use_sky_head: bool = True,
57
+ sky_name: str = "sky",
58
+ sky_activation: str = "relu", # 'sigmoid' / 'relu' / 'linear'
59
+ use_ln_for_heads: bool = False, # If needed, apply LayerNorm on intermediate features of both heads
60
+ norm_type: str = "idt", # use to match legacy GS-DPT head, "idt" / "layer"
61
+ fusion_block_inplace: bool = False,
62
+ ) -> None:
63
+ super().__init__()
64
+
65
+ # -------------------- configuration --------------------
66
+ self.patch_size = patch_size
67
+ self.activation = activation
68
+ self.conf_activation = conf_activation
69
+ self.pos_embed = pos_embed
70
+ self.down_ratio = down_ratio
71
+
72
+ # Names
73
+ self.head_main = head_name
74
+ self.sky_name = sky_name
75
+
76
+ # Main head: output dimension and confidence switch
77
+ self.out_dim = output_dim
78
+ self.has_conf = output_dim > 1
79
+
80
+ # Sky head parameters (always 1 channel)
81
+ self.use_sky_head = use_sky_head
82
+ self.sky_activation = sky_activation
83
+
84
+ # Fixed 4 intermediate outputs
85
+ self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)
86
+
87
+ # -------------------- token pre-norm + per-stage projection --------------------
88
+ if norm_type == "layer":
89
+ self.norm = nn.LayerNorm(dim_in)
90
+ elif norm_type == "idt":
91
+ self.norm = nn.Identity()
92
+ else:
93
+ raise Exception(f"Unknown norm_type {norm_type}, should be 'layer' or 'idt'.")
94
+ self.projects = nn.ModuleList(
95
+ [nn.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0) for oc in out_channels]
96
+ )
97
+
98
+ # -------------------- Spatial re-size (align to common scale before fusion) --------------------
99
+ # Design consistent with original: relative to patch grid (x4, x2, x1, /2)
100
+ self.resize_layers = nn.ModuleList(
101
+ [
102
+ nn.ConvTranspose2d(
103
+ out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0
104
+ ),
105
+ nn.ConvTranspose2d(
106
+ out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0
107
+ ),
108
+ nn.Identity(),
109
+ nn.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1),
110
+ ]
111
+ )
112
+
113
+ # -------------------- scratch: stage adapters + main fusion chain --------------------
114
+ self.scratch = _make_scratch(list(out_channels), features, expand=False)
115
+
116
+ # Main fusion chain
117
+ self.scratch.refinenet1 = _make_fusion_block(features, inplace=fusion_block_inplace)
118
+ self.scratch.refinenet2 = _make_fusion_block(features, inplace=fusion_block_inplace)
119
+ self.scratch.refinenet3 = _make_fusion_block(features, inplace=fusion_block_inplace)
120
+ self.scratch.refinenet4 = _make_fusion_block(
121
+ features, has_residual=False, inplace=fusion_block_inplace
122
+ )
123
+
124
+ # Heads (shared neck1; then split into two heads)
125
+ head_features_1 = features
126
+ head_features_2 = 32
127
+ self.scratch.output_conv1 = nn.Conv2d(
128
+ head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
129
+ )
130
+
131
+ ln_seq = (
132
+ [Permute((0, 2, 3, 1)), nn.LayerNorm(head_features_2), Permute((0, 3, 1, 2))]
133
+ if use_ln_for_heads
134
+ else []
135
+ )
136
+
137
+ # Main head
138
+ self.scratch.output_conv2 = nn.Sequential(
139
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
140
+ *ln_seq,
141
+ nn.ReLU(inplace=True),
142
+ nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
143
+ )
144
+
145
+ # Sky head (fixed 1 channel)
146
+ if self.use_sky_head:
147
+ self.scratch.sky_output_conv2 = nn.Sequential(
148
+ nn.Conv2d(
149
+ head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1
150
+ ),
151
+ *ln_seq,
152
+ nn.ReLU(inplace=True),
153
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
154
+ )
155
+
156
+ # -------------------------------------------------------------------------
157
+ # Public forward (supports frame chunking to save memory)
158
+ # -------------------------------------------------------------------------
159
+ def forward(
160
+ self,
161
+ feats: List[torch.Tensor],
162
+ H: int,
163
+ W: int,
164
+ patch_start_idx: int,
165
+ chunk_size: int = 8,
166
+ **kwargs,
167
+ ) -> Dict:
168
+ """
169
+ Args:
170
+ feats: List of 4 entries; each entry is a tuple/list whose first element is a token tensor of shape [B, S, N, C].
171
+ H, W: Original image dimensions
172
+ patch_start_idx: Starting index of patch tokens in sequence (for cropping non-patch tokens)
173
+ chunk_size: Chunk size along time dimension S
174
+
175
+ Returns:
176
+ Dict[str, Tensor]
177
+ """
178
+ B, S, N, C = feats[0][0].shape
179
+ feats = [feat[0].reshape(B * S, N, C) for feat in feats]
180
+
181
+ # update image info, used by the GS-DPT head
182
+ extra_kwargs = {}
183
+ if "images" in kwargs:
184
+ extra_kwargs.update({"images": rearrange(kwargs["images"], "B S ... -> (B S) ...")})
185
+
186
+ if chunk_size is None or chunk_size >= S:
187
+ out_dict = self._forward_impl(feats, H, W, patch_start_idx, **extra_kwargs)
188
+ out_dict = {k: v.view(B, S, *v.shape[1:]) for k, v in out_dict.items()}
189
+ return Dict(out_dict)
190
+
191
+ out_dicts: List[TyDict[str, torch.Tensor]] = []
192
+ for s0 in range(0, S, chunk_size):
193
+ s1 = min(s0 + chunk_size, S)
194
+ kw = {}
195
+ if "images" in extra_kwargs:
196
+ kw.update({"images": extra_kwargs["images"][s0:s1]})
197
+ out_dicts.append(
198
+ self._forward_impl([f[s0:s1] for f in feats], H, W, patch_start_idx, **kw)
199
+ )
200
+ out_dict = {k: torch.cat([od[k] for od in out_dicts], dim=0) for k in out_dicts[0].keys()}
201
+ out_dict = {k: v.view(B, S, *v.shape[1:]) for k, v in out_dict.items()}
202
+ return Dict(out_dict)
203
+
204
+ # -------------------------------------------------------------------------
205
+ # Internal forward (single chunk)
206
+ # -------------------------------------------------------------------------
207
+ def _forward_impl(
208
+ self,
209
+ feats: List[torch.Tensor],
210
+ H: int,
211
+ W: int,
212
+ patch_start_idx: int,
213
+ ) -> TyDict[str, torch.Tensor]:
214
+ B, _, C = feats[0].shape
215
+ ph, pw = H // self.patch_size, W // self.patch_size
216
+ resized_feats = []
217
+ for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
218
+ x = feats[take_idx][:, patch_start_idx:] # [B*S, N_patch, C]
219
+ x = self.norm(x)
220
+ x = x.permute(0, 2, 1).reshape(B, C, ph, pw) # [B*S, C, ph, pw]
221
+
222
+ x = self.projects[stage_idx](x)
223
+ if self.pos_embed:
224
+ x = self._add_pos_embed(x, W, H)
225
+ x = self.resize_layers[stage_idx](x) # Align scale
226
+ resized_feats.append(x)
227
+
228
+ # 2) Fusion pyramid (main branch only)
229
+ fused = self._fuse(resized_feats)
230
+
231
+ # 3) Upsample to target resolution, optionally add position encoding again
232
+ h_out = int(ph * self.patch_size / self.down_ratio)
233
+ w_out = int(pw * self.patch_size / self.down_ratio)
234
+
235
+ fused = self.scratch.output_conv1(fused)
236
+ fused = custom_interpolate(fused, (h_out, w_out), mode="bilinear", align_corners=True)
237
+ if self.pos_embed:
238
+ fused = self._add_pos_embed(fused, W, H)
239
+
240
+ # 4) Shared neck1
241
+ feat = fused
242
+
243
+ # 5) Main head: logits -> activation
244
+ main_logits = self.scratch.output_conv2(feat)
245
+ outs: TyDict[str, torch.Tensor] = {}
246
+ if self.has_conf:
247
+ fmap = main_logits.permute(0, 2, 3, 1)
248
+ pred = self._apply_activation_single(fmap[..., :-1], self.activation)
249
+ conf = self._apply_activation_single(fmap[..., -1], self.conf_activation)
250
+ outs[self.head_main] = pred.squeeze(1)
251
+ outs[f"{self.head_main}_conf"] = conf.squeeze(1)
252
+ else:
253
+ outs[self.head_main] = self._apply_activation_single(
254
+ main_logits, self.activation
255
+ ).squeeze(1)
256
+
257
+ # 6) Sky head (fixed 1 channel)
258
+ if self.use_sky_head:
259
+ sky_logits = self.scratch.sky_output_conv2(feat)
260
+ outs[self.sky_name] = self._apply_sky_activation(sky_logits).squeeze(1)
261
+
262
+ return outs
263
+
264
+ # -------------------------------------------------------------------------
265
+ # Subroutines
266
+ # -------------------------------------------------------------------------
267
+ def _fuse(self, feats: List[torch.Tensor]) -> torch.Tensor:
268
+ """
269
+ 4-layer top-down fusion, returns finest scale features (after fusion, before neck1).
270
+ """
271
+ l1, l2, l3, l4 = feats
272
+
273
+ l1_rn = self.scratch.layer1_rn(l1)
274
+ l2_rn = self.scratch.layer2_rn(l2)
275
+ l3_rn = self.scratch.layer3_rn(l3)
276
+ l4_rn = self.scratch.layer4_rn(l4)
277
+
278
+ # 4 -> 3 -> 2 -> 1
279
+ out = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
280
+ out = self.scratch.refinenet3(out, l3_rn, size=l2_rn.shape[2:])
281
+ out = self.scratch.refinenet2(out, l2_rn, size=l1_rn.shape[2:])
282
+ out = self.scratch.refinenet1(out, l1_rn)
283
+ return out
284
+
285
+ def _apply_activation_single(
286
+ self, x: torch.Tensor, activation: str = "linear"
287
+ ) -> torch.Tensor:
288
+ """
289
+ Apply activation to single channel output, maintaining semantic consistency with value branch in multi-channel case.
290
+ Supports: exp / relu / sigmoid / softplus / tanh / linear / expp1
291
+ """
292
+ act = activation.lower() if isinstance(activation, str) else activation
293
+ if act == "exp":
294
+ return torch.exp(x)
295
+ if act == "expp1":
296
+ return torch.exp(x) + 1
297
+ if act == "expm1":
298
+ return torch.expm1(x)
299
+ if act == "relu":
300
+ return torch.relu(x)
301
+ if act == "sigmoid":
302
+ return torch.sigmoid(x)
303
+ if act == "softplus":
304
+ return torch.nn.functional.softplus(x)
305
+ if act == "tanh":
306
+ return torch.tanh(x)
307
+ # Default linear
308
+ return x
309
+
310
+ def _apply_sky_activation(self, x: torch.Tensor) -> torch.Tensor:
311
+ """
312
+ Sky head activation (fixed 1 channel):
313
+ * 'sigmoid' -> Sigmoid probability map
314
+ * 'relu' -> ReLU positive domain output
315
+ * 'linear' -> Original value (logits)
316
+ """
317
+ act = (
318
+ self.sky_activation.lower()
319
+ if isinstance(self.sky_activation, str)
320
+ else self.sky_activation
321
+ )
322
+ if act == "sigmoid":
323
+ return torch.sigmoid(x)
324
+ if act == "relu":
325
+ return torch.relu(x)
326
+ # 'linear'
327
+ return x
328
+
329
+ def _add_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
330
+ """Simple UV position encoding directly added to feature map."""
331
+ pw, ph = x.shape[-1], x.shape[-2]
332
+ pe = create_uv_grid(pw, ph, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
333
+ pe = position_grid_to_embed(pe, x.shape[1]) * ratio
334
+ pe = pe.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
335
+ return x + pe
336
+
337
+
338
+ # -----------------------------------------------------------------------------
339
+ # Building blocks (preserved, consistent with original)
340
+ # -----------------------------------------------------------------------------
341
+ def _make_fusion_block(
342
+ features: int,
343
+ size: Tuple[int, int] = None,
344
+ has_residual: bool = True,
345
+ groups: int = 1,
346
+ inplace: bool = False,
347
+ ) -> nn.Module:
348
+ return FeatureFusionBlock(
349
+ features=features,
350
+ activation=nn.ReLU(inplace=inplace),
351
+ deconv=False,
352
+ bn=False,
353
+ expand=False,
354
+ align_corners=True,
355
+ size=size,
356
+ has_residual=has_residual,
357
+ groups=groups,
358
+ )
359
+
360
+
361
+ def _make_scratch(
362
+ in_shape: List[int], out_shape: int, groups: int = 1, expand: bool = False
363
+ ) -> nn.Module:
364
+ scratch = nn.Module()
365
+ # Optional expansion by stage
366
+ c1 = out_shape
367
+ c2 = out_shape * (2 if expand else 1)
368
+ c3 = out_shape * (4 if expand else 1)
369
+ c4 = out_shape * (8 if expand else 1)
370
+
371
+ scratch.layer1_rn = nn.Conv2d(in_shape[0], c1, 3, 1, 1, bias=False, groups=groups)
372
+ scratch.layer2_rn = nn.Conv2d(in_shape[1], c2, 3, 1, 1, bias=False, groups=groups)
373
+ scratch.layer3_rn = nn.Conv2d(in_shape[2], c3, 3, 1, 1, bias=False, groups=groups)
374
+ scratch.layer4_rn = nn.Conv2d(in_shape[3], c4, 3, 1, 1, bias=False, groups=groups)
375
+ return scratch
376
+
377
+
378
+ class ResidualConvUnit(nn.Module):
379
+ """Lightweight residual convolution block for fusion"""
380
+
381
+ def __init__(self, features: int, activation: nn.Module, bn: bool, groups: int = 1) -> None:
382
+ super().__init__()
383
+ self.bn = bn
384
+ self.groups = groups
385
+ self.conv1 = nn.Conv2d(features, features, 3, 1, 1, bias=True, groups=groups)
386
+ self.conv2 = nn.Conv2d(features, features, 3, 1, 1, bias=True, groups=groups)
387
+ self.norm1 = None
388
+ self.norm2 = None
389
+ self.activation = activation
390
+ self.skip_add = nn.quantized.FloatFunctional()
391
+
392
+ def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore[override]
393
+ out = self.activation(x)
394
+ out = self.conv1(out)
395
+ if self.norm1 is not None:
396
+ out = self.norm1(out)
397
+
398
+ out = self.activation(out)
399
+ out = self.conv2(out)
400
+ if self.norm2 is not None:
401
+ out = self.norm2(out)
402
+
403
+ return self.skip_add.add(out, x)
404
+
405
+
406
+ class FeatureFusionBlock(nn.Module):
407
+ """Top-down fusion block: (optional) residual merge + upsampling + 1x1 contraction"""
408
+
409
+ def __init__(
410
+ self,
411
+ features: int,
412
+ activation: nn.Module,
413
+ deconv: bool = False,
414
+ bn: bool = False,
415
+ expand: bool = False,
416
+ align_corners: bool = True,
417
+ size: Tuple[int, int] = None,
418
+ has_residual: bool = True,
419
+ groups: int = 1,
420
+ ) -> None:
421
+ super().__init__()
422
+ self.align_corners = align_corners
423
+ self.size = size
424
+ self.has_residual = has_residual
425
+
426
+ self.resConfUnit1 = (
427
+ ResidualConvUnit(features, activation, bn, groups=groups) if has_residual else None
428
+ )
429
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn, groups=groups)
430
+
431
+ out_features = (features // 2) if expand else features
432
+ self.out_conv = nn.Conv2d(features, out_features, 1, 1, 0, bias=True, groups=groups)
433
+ self.skip_add = nn.quantized.FloatFunctional()
434
+
435
+ def forward(self, *xs: torch.Tensor, size: Tuple[int, int] = None) -> torch.Tensor: # type: ignore[override]
436
+ """
437
+ xs:
438
+ - xs[0]: Top branch input
439
+ - xs[1]: Lateral input (can do residual addition with top branch)
440
+ """
441
+ y = xs[0]
442
+ if self.has_residual and len(xs) > 1 and self.resConfUnit1 is not None:
443
+ y = self.skip_add.add(y, self.resConfUnit1(xs[1]))
444
+
445
+ y = self.resConfUnit2(y)
446
+
447
+ # Upsampling
448
+ if (size is None) and (self.size is None):
449
+ up_kwargs = {"scale_factor": 2}
450
+ elif size is None:
451
+ up_kwargs = {"size": self.size}
452
+ else:
453
+ up_kwargs = {"size": size}
454
+
455
+ y = custom_interpolate(y, **up_kwargs, mode="bilinear", align_corners=self.align_corners)
456
+ y = self.out_conv(y)
457
+ return y
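As a quick sanity check of the head's input contract, the sketch below drives `DPT` with random backbone features. The constructor values, the image size, and the single leading camera token are illustrative assumptions (and the snippet presumes the depth_anything_3 package is importable), not settings prescribed by the repository:

import torch
from depth_anything_3.model.dpt import DPT

B, S, C = 1, 2, 1024                    # batch, views, token dim (assumed ViT-L width)
patch_size, patch_start_idx = 14, 1     # one non-patch (camera) token, illustrative
H = W = 16 * patch_size                 # 224 x 224 input
N = (H // patch_size) * (W // patch_size) + patch_start_idx

head = DPT(dim_in=C, patch_size=patch_size, output_dim=2, use_sky_head=True)

# forward() expects 4 entries whose first element is a [B, S, N, C] token tensor
feats = [(torch.randn(B, S, N, C),) for _ in range(4)]
with torch.no_grad():
    out = head(feats, H, W, patch_start_idx)

for k, v in out.items():                # e.g. depth, depth_conf, sky
    print(k, tuple(v.shape))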
depth_anything_3/model/dualdpt.py ADDED
@@ -0,0 +1,488 @@
1
+ # flake8: noqa E501
2
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import List, Sequence, Tuple
17
+ import torch
18
+ import torch.nn as nn
19
+ from addict import Dict
20
+
21
+ from depth_anything_3.model.dpt import _make_fusion_block, _make_scratch
22
+ from depth_anything_3.model.utils.head_utils import (
23
+ Permute,
24
+ create_uv_grid,
25
+ custom_interpolate,
26
+ position_grid_to_embed,
27
+ )
28
+
29
+
30
+ class DualDPT(nn.Module):
31
+ """
32
+ Dual-head DPT for dense prediction with an always-on auxiliary head.
33
+
34
+ Architectural notes:
35
+ - Sky/object branches are removed.
36
+ - `intermediate_layer_idx` is fixed to (0, 1, 2, 3).
37
+ - Auxiliary head has its **own** fusion blocks (no fusion_inplace / no sharing).
38
+ - Auxiliary head is internally multi-level; **only the final level** is returned.
39
+ - Returns a **dict** with keys from `head_names`, e.g.:
40
+ { main_name, f"{main_name}_conf", aux_name, f"{aux_name}_conf" }
41
+ - `feature_only` is fixed to False.
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ dim_in: int,
47
+ *,
48
+ patch_size: int = 14,
49
+ output_dim: int = 2,
50
+ activation: str = "exp",
51
+ conf_activation: str = "expp1",
52
+ features: int = 256,
53
+ out_channels: Sequence[int] = (256, 512, 1024, 1024),
54
+ pos_embed: bool = True,
55
+ down_ratio: int = 1,
56
+ aux_pyramid_levels: int = 4,
57
+ aux_out1_conv_num: int = 5,
58
+ head_names: Tuple[str, str] = ("depth", "ray"),
59
+ ) -> None:
60
+ super().__init__()
61
+
62
+ # -------------------- configuration --------------------
63
+ self.patch_size = patch_size
64
+ self.activation = activation
65
+ self.conf_activation = conf_activation
66
+ self.pos_embed = pos_embed
67
+ self.down_ratio = down_ratio
68
+
69
+ self.aux_levels = aux_pyramid_levels
70
+ self.aux_out1_conv_num = aux_out1_conv_num
71
+
72
+ # names ONLY come from config (no hard-coded strings elsewhere)
73
+ self.head_main, self.head_aux = head_names
74
+
75
+ # Always expect 4 scales; enforce intermediate idx = (0, 1, 2, 3)
76
+ self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)
77
+
78
+ # -------------------- token pre-norm + per-stage projection --------------------
79
+ self.norm = nn.LayerNorm(dim_in)
80
+ self.projects = nn.ModuleList(
81
+ [nn.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0) for oc in out_channels]
82
+ )
83
+
84
+ # -------------------- spatial re-sizers (align to common scale before fusion) --------------------
85
+ # design: stage strides (x4, x2, x1, /2) relative to patch grid to align to a common pivot scale
86
+ self.resize_layers = nn.ModuleList(
87
+ [
88
+ nn.ConvTranspose2d(
89
+ out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0
90
+ ),
91
+ nn.ConvTranspose2d(
92
+ out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0
93
+ ),
94
+ nn.Identity(),
95
+ nn.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1),
96
+ ]
97
+ )
98
+
99
+ # -------------------- scratch: stage adapters + fusion (main & aux are separate) --------------------
100
+ self.scratch = _make_scratch(list(out_channels), features, expand=False)
101
+
102
+ # Main fusion chain (independent)
103
+ self.scratch.refinenet1 = _make_fusion_block(features)
104
+ self.scratch.refinenet2 = _make_fusion_block(features)
105
+ self.scratch.refinenet3 = _make_fusion_block(features)
106
+ self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False)
107
+
108
+ # Primary head neck + head (independent)
109
+ head_features_1 = features
110
+ head_features_2 = 32
111
+ self.scratch.output_conv1 = nn.Conv2d(
112
+ head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
113
+ )
114
+ self.scratch.output_conv2 = nn.Sequential(
115
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
116
+ nn.ReLU(inplace=True),
117
+ nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
118
+ )
119
+
120
+ # Auxiliary fusion chain (completely separate; no sharing, i.e., "fusion_inplace=False")
121
+ self.scratch.refinenet1_aux = _make_fusion_block(features)
122
+ self.scratch.refinenet2_aux = _make_fusion_block(features)
123
+ self.scratch.refinenet3_aux = _make_fusion_block(features)
124
+ self.scratch.refinenet4_aux = _make_fusion_block(features, has_residual=False)
125
+
126
+ # Aux pre-head per level (we will only *return final level*)
127
+ self.scratch.output_conv1_aux = nn.ModuleList(
128
+ [self._make_aux_out1_block(head_features_1) for _ in range(self.aux_levels)]
129
+ )
130
+
131
+ # Aux final projection per level
132
+ use_ln = True
133
+ ln_seq = (
134
+ [Permute((0, 2, 3, 1)), nn.LayerNorm(head_features_2), Permute((0, 3, 1, 2))]
135
+ if use_ln
136
+ else []
137
+ )
138
+ self.scratch.output_conv2_aux = nn.ModuleList(
139
+ [
140
+ nn.Sequential(
141
+ nn.Conv2d(
142
+ head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1
143
+ ),
144
+ *ln_seq,
145
+ nn.ReLU(inplace=True),
146
+ nn.Conv2d(head_features_2, 7, kernel_size=1, stride=1, padding=0),
147
+ )
148
+ for _ in range(self.aux_levels)
149
+ ]
150
+ )
151
+
152
+ # -------------------------------------------------------------------------
153
+ # Public forward (supports frame chunking for memory)
154
+ # -------------------------------------------------------------------------
155
+
156
+ def forward(
157
+ self,
158
+ feats: List[torch.Tensor],
159
+ H: int,
160
+ W: int,
161
+ patch_start_idx: int,
162
+ chunk_size: int = 8,
163
+ ) -> Dict[str, torch.Tensor]:
164
+ """
165
+ Args:
166
+ aggregated_tokens_list: List of 4 tensors [B, S, T, C] from transformer.
167
+ images: [B, S, 3, H, W], in [0, 1].
168
+ patch_start_idx: Patch-token start in the token sequence (to drop non-patch tokens).
169
+ frames_chunk_size: Optional chunking along S for memory.
170
+
171
+ Returns:
172
+ Dict[str, Tensor] with keys based on `head_names`, e.g.:
173
+ self.head_main, f"{self.head_main}_conf",
174
+ self.head_aux, f"{self.head_aux}_conf"
175
+ Shapes:
176
+ main: [B, S, out_dim, H/down_ratio, W/down_ratio]
177
+ main_cf: [B, S, 1, H/down_ratio, W/down_ratio]
178
+ aux: [B, S, 7, H/down_ratio, W/down_ratio]
179
+ aux_cf: [B, S, 1, H/down_ratio, W/down_ratio]
180
+ """
181
+ B, S, N, C = feats[0][0].shape
182
+ feats = [feat[0].reshape(B * S, N, C) for feat in feats]
183
+ if chunk_size is None or chunk_size >= S:
184
+ out_dict = self._forward_impl(feats, H, W, patch_start_idx)
185
+ out_dict = {k: v.reshape(B, S, *v.shape[1:]) for k, v in out_dict.items()}
186
+ return Dict(out_dict)
187
+ out_dicts = []
188
+ for s0 in range(0, S, chunk_size):
189
+ s1 = min(s0 + chunk_size, S)
190
+ out_dict = self._forward_impl(
191
+ [feat[s0:s1] for feat in feats],
192
+ H,
193
+ W,
194
+ patch_start_idx,
195
+ )
196
+ out_dicts.append(out_dict)
197
+ out_dict = {
198
+ k: torch.cat([out_dict[k] for out_dict in out_dicts], dim=0)
199
+ for k in out_dicts[0].keys()
200
+ }
201
+ out_dict = {k: v.view(B, S, *v.shape[1:]) for k, v in out_dict.items()}
202
+ return Dict(out_dict)
203
+
204
+ # -------------------------------------------------------------------------
205
+ # Internal forward (single chunk)
206
+ # -------------------------------------------------------------------------
207
+
208
+ def _forward_impl(
209
+ self,
210
+ feats: List[torch.Tensor],
211
+ H: int,
212
+ W: int,
213
+ patch_start_idx: int,
214
+ ) -> Dict[str, torch.Tensor]:
215
+ B, _, C = feats[0].shape
216
+ ph, pw = H // self.patch_size, W // self.patch_size
217
+ resized_feats = []
218
+ for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
219
+ x = feats[take_idx][:, patch_start_idx:]
220
+ x = self.norm(x)
221
+ x = x.permute(0, 2, 1).reshape(B, C, ph, pw) # [B*S, C, ph, pw]
222
+
223
+ x = self.projects[stage_idx](x)
224
+ if self.pos_embed:
225
+ x = self._add_pos_embed(x, W, H)
226
+ x = self.resize_layers[stage_idx](x) # align scales
227
+ resized_feats.append(x)
228
+
229
+ # 2) Fuse pyramid (main & aux are completely independent)
230
+ fused_main, fused_aux_pyr = self._fuse(resized_feats)
231
+
232
+ # 3) Upsample to target resolution and (optional) add pos-embed again
233
+ h_out = int(ph * self.patch_size / self.down_ratio)
234
+ w_out = int(pw * self.patch_size / self.down_ratio)
235
+
236
+ fused_main = custom_interpolate(
237
+ fused_main, (h_out, w_out), mode="bilinear", align_corners=True
238
+ )
239
+ if self.pos_embed:
240
+ fused_main = self._add_pos_embed(fused_main, W, H)
241
+
242
+ # Primary head: conv1 -> conv2 -> activate
243
+ # fused_main = self.scratch.output_conv1(fused_main)
244
+ main_logits = self.scratch.output_conv2(fused_main)
245
+ fmap = main_logits.permute(0, 2, 3, 1)
246
+ main_pred = self._apply_activation_single(fmap[..., :-1], self.activation)
247
+ main_conf = self._apply_activation_single(fmap[..., -1], self.conf_activation)
248
+
249
+ # Auxiliary head (multi-level inside) -> only last level returned (after activation)
250
+ last_aux = fused_aux_pyr[-1]
251
+ if self.pos_embed:
252
+ last_aux = self._add_pos_embed(last_aux, W, H)
253
+ # neck (per-level pre-conv) then final projection (only for last level)
254
+ # last_aux = self.scratch.output_conv1_aux[-1](last_aux)
255
+ last_aux_logits = self.scratch.output_conv2_aux[-1](last_aux)
256
+ fmap_last = last_aux_logits.permute(0, 2, 3, 1)
257
+ aux_pred = self._apply_activation_single(fmap_last[..., :-1], "linear")
258
+ aux_conf = self._apply_activation_single(fmap_last[..., -1], self.conf_activation)
259
+ return {
260
+ self.head_main: main_pred.squeeze(-1),
261
+ f"{self.head_main}_conf": main_conf,
262
+ self.head_aux: aux_pred,
263
+ f"{self.head_aux}_conf": aux_conf,
264
+ }
265
+
266
+ # -------------------------------------------------------------------------
267
+ # Subroutines
268
+ # -------------------------------------------------------------------------
269
+
270
+ def _fuse(self, feats: List[torch.Tensor]) -> Tuple[torch.Tensor, List[torch.Tensor]]:
271
+ """
272
+ Feature pyramid fusion.
273
+ Returns:
274
+ fused_main: Tensor at finest scale (after refinenet1)
275
+ aux_pyr: List of aux tensors at each level (pre out_conv1_aux)
276
+ """
277
+ l1, l2, l3, l4 = feats
278
+
279
+ l1_rn = self.scratch.layer1_rn(l1)
280
+ l2_rn = self.scratch.layer2_rn(l2)
281
+ l3_rn = self.scratch.layer3_rn(l3)
282
+ l4_rn = self.scratch.layer4_rn(l4)
283
+
284
+ # level 4 -> 3
285
+ out = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
286
+ aux_out = self.scratch.refinenet4_aux(l4_rn, size=l3_rn.shape[2:])
287
+ aux_list: List[torch.Tensor] = []
288
+ if self.aux_levels >= 4:
289
+ aux_list.append(aux_out)
290
+
291
+ # level 3 -> 2
292
+ out = self.scratch.refinenet3(out, l3_rn, size=l2_rn.shape[2:])
293
+ aux_out = self.scratch.refinenet3_aux(aux_out, l3_rn, size=l2_rn.shape[2:])
294
+ if self.aux_levels >= 3:
295
+ aux_list.append(aux_out)
296
+
297
+ # level 2 -> 1
298
+ out = self.scratch.refinenet2(out, l2_rn, size=l1_rn.shape[2:])
299
+ aux_out = self.scratch.refinenet2_aux(aux_out, l2_rn, size=l1_rn.shape[2:])
300
+ if self.aux_levels >= 2:
301
+ aux_list.append(aux_out)
302
+
303
+ # level 1 (final)
304
+ out = self.scratch.refinenet1(out, l1_rn)
305
+ aux_out = self.scratch.refinenet1_aux(aux_out, l1_rn)
306
+ aux_list.append(aux_out)
307
+
308
+ out = self.scratch.output_conv1(out)
309
+ aux_list = [self.scratch.output_conv1_aux[i](aux) for i, aux in enumerate(aux_list)]
310
+
311
+ return out, aux_list
312
+
313
+ def _add_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
314
+ """Simple UV positional embedding added to feature maps."""
315
+ pw, ph = x.shape[-1], x.shape[-2]
316
+ pe = create_uv_grid(pw, ph, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
317
+ pe = position_grid_to_embed(pe, x.shape[1]) * ratio
318
+ pe = pe.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
319
+ return x + pe
320
+
321
+ def _make_aux_out1_block(self, in_ch: int) -> nn.Sequential:
322
+ """Factory for the aux pre-head stack before the final 1x1 projection."""
323
+ if self.aux_out1_conv_num == 5:
324
+ return nn.Sequential(
325
+ nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
326
+ nn.Conv2d(in_ch // 2, in_ch, 3, 1, 1),
327
+ nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
328
+ nn.Conv2d(in_ch // 2, in_ch, 3, 1, 1),
329
+ nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
330
+ )
331
+ if self.aux_out1_conv_num == 3:
332
+ return nn.Sequential(
333
+ nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
334
+ nn.Conv2d(in_ch // 2, in_ch, 3, 1, 1),
335
+ nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
336
+ )
337
+ if self.aux_out1_conv_num == 1:
338
+ return nn.Sequential(nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1))
339
+ raise ValueError(f"aux_out1_conv_num {self.aux_out1_conv_num} not supported")
340
+
341
+ def _apply_activation_single(
342
+ self, x: torch.Tensor, activation: str = "linear"
343
+ ) -> torch.Tensor:
344
+ """
345
+ Apply activation to single channel output, maintaining semantic consistency with value branch in multi-channel case.
346
+ Supports: exp / relu / sigmoid / softplus / tanh / linear / expp1
347
+ """
348
+ act = activation.lower() if isinstance(activation, str) else activation
349
+ if act == "exp":
350
+ return torch.exp(x)
351
+ if act == "expm1":
352
+ return torch.expm1(x)
353
+ if act == "expp1":
354
+ return torch.exp(x) + 1
355
+ if act == "relu":
356
+ return torch.relu(x)
357
+ if act == "sigmoid":
358
+ return torch.sigmoid(x)
359
+ if act == "softplus":
360
+ return torch.nn.functional.softplus(x)
361
+ if act == "tanh":
362
+ return torch.tanh(x)
363
+ # Default linear
364
+ return x
365
+
366
+
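A matching smoke test for `DualDPT`; the point is only that the returned keys come from `head_names` and that the auxiliary branch is channels-last. Constructor values and shapes are the same illustrative assumptions as in the DPT sketch above:

import torch
from depth_anything_3.model.dualdpt import DualDPT

B, S, C = 1, 2, 1024
patch_size, patch_start_idx = 14, 1
H = W = 16 * patch_size
N = (H // patch_size) * (W // patch_size) + patch_start_idx

head = DualDPT(dim_in=C, patch_size=patch_size, output_dim=2, head_names=("depth", "ray"))
feats = [(torch.randn(B, S, N, C),) for _ in range(4)]
with torch.no_grad():
    out = head(feats, H, W, patch_start_idx)

print(sorted(out.keys()))               # ['depth', 'depth_conf', 'ray', 'ray_conf']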
depth_anything_3/model/gs_adapter.py ADDED
@@ -0,0 +1,200 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Optional
16
+ import torch
17
+ from einops import einsum, rearrange, repeat
18
+ from torch import nn
19
+
20
+ from depth_anything_3.model.utils.transform import cam_quat_xyzw_to_world_quat_wxyz
21
+ from depth_anything_3.specs import Gaussians
22
+ from depth_anything_3.utils.geometry import affine_inverse, get_world_rays, sample_image_grid
23
+ from depth_anything_3.utils.pose_align import batch_align_poses_umeyama
24
+ from depth_anything_3.utils.sh_helpers import rotate_sh
25
+
26
+
27
+ class GaussianAdapter(nn.Module):
28
+
29
+ def __init__(
30
+ self,
31
+ sh_degree: int = 0,
32
+ pred_color: bool = False,
33
+ pred_offset_depth: bool = False,
34
+ pred_offset_xy: bool = True,
35
+ gaussian_scale_min: float = 1e-5,
36
+ gaussian_scale_max: float = 30.0,
37
+ ):
38
+ super().__init__()
39
+ self.sh_degree = sh_degree
40
+ self.pred_color = pred_color
41
+ self.pred_offset_depth = pred_offset_depth
42
+ self.pred_offset_xy = pred_offset_xy
43
+ self.gaussian_scale_min = gaussian_scale_min
44
+ self.gaussian_scale_max = gaussian_scale_max
45
+
46
+ # Create a mask for the spherical harmonics coefficients. This ensures that at
47
+ # initialization, the coefficients are biased towards having a large DC
48
+ # component and small view-dependent components.
49
+ if not pred_color:
50
+ self.register_buffer(
51
+ "sh_mask",
52
+ torch.ones((self.d_sh,), dtype=torch.float32),
53
+ persistent=False,
54
+ )
55
+ for degree in range(1, sh_degree + 1):
56
+ self.sh_mask[degree**2 : (degree + 1) ** 2] = 0.1 * 0.25**degree
57
+
58
+ def forward(
59
+ self,
60
+ extrinsics: torch.Tensor, # "*#batch 4 4"
61
+ intrinsics: torch.Tensor, # "*#batch 3 3"
62
+ depths: torch.Tensor, # "*#batch"
63
+ opacities: torch.Tensor, # "*#batch" | "*#batch _"
64
+ raw_gaussians: torch.Tensor, # "*#batch _"
65
+ image_shape: tuple[int, int],
66
+ eps: float = 1e-8,
67
+ gt_extrinsics: Optional[torch.Tensor] = None, # "*#batch 4 4"
68
+ **kwargs,
69
+ ) -> Gaussians:
70
+ device = extrinsics.device
71
+ dtype = raw_gaussians.dtype
72
+ H, W = image_shape
73
+ b, v = raw_gaussians.shape[:2]
74
+
75
+ # get cam2worlds and intr_normed to adapt to 3DGS codebase
76
+ cam2worlds = affine_inverse(extrinsics)
77
+ intr_normed = intrinsics.clone().detach()
78
+ intr_normed[..., 0, :] /= W
79
+ intr_normed[..., 1, :] /= H
80
+
81
+ # 1. compute 3DGS means
82
+ # 1.1) offset the predicted depth if needed
83
+ if self.pred_offset_depth:
84
+ gs_depths = depths + raw_gaussians[..., -1]
85
+ raw_gaussians = raw_gaussians[..., :-1]
86
+ else:
87
+ gs_depths = depths
88
+ # 1.2) align predicted poses with GT if needed
89
+ if gt_extrinsics is not None and not torch.equal(extrinsics, gt_extrinsics):
90
+ try:
91
+ _, _, pose_scales = batch_align_poses_umeyama(
92
+ gt_extrinsics.detach().float(),
93
+ extrinsics.detach().float(),
94
+ )
95
+ except Exception:
96
+ pose_scales = torch.ones_like(extrinsics[:, 0, 0, 0])
97
+ pose_scales = torch.clamp(pose_scales, min=1 / 3.0, max=3.0)
98
+ cam2worlds[:, :, :3, 3] = cam2worlds[:, :, :3, 3] * rearrange(
99
+ pose_scales, "b -> b () ()"
100
+ )
101
+ gs_depths = gs_depths * rearrange(pose_scales, "b -> b () () () ()")
102
+ # 1.3) casting xy in image space
103
+ xy_ray, _ = sample_image_grid((H, W), device)
104
+ xy_ray = xy_ray[None, None, ...].expand(b, v, -1, -1, -1) # b v h w xy
105
+ # offset xy if needed
106
+ if self.pred_offset_xy:
107
+ pixel_size = 1 / torch.tensor((W, H), dtype=xy_ray.dtype, device=device)
108
+ offset_xy = raw_gaussians[..., :2]
109
+ xy_ray = xy_ray + offset_xy * pixel_size
110
+ raw_gaussians = raw_gaussians[..., 2:] # skip the offset_xy
111
+ # 1.4) unproject depth + xy to world ray
112
+ origins, directions = get_world_rays(
113
+ xy_ray,
114
+ repeat(cam2worlds, "b v i j -> b v h w i j", h=H, w=W),
115
+ repeat(intr_normed, "b v i j -> b v h w i j", h=H, w=W),
116
+ )
117
+ gs_means_world = origins + directions * gs_depths[..., None]
118
+ gs_means_world = rearrange(gs_means_world, "b v h w d -> b (v h w) d")
119
+
120
+ # 2. compute other GS attributes
121
+ scales, rotations, sh = raw_gaussians.split((3, 4, 3 * self.d_sh), dim=-1)
122
+
123
+ # 2.1) 3DGS scales
124
+ # make the scale invarient to resolution
125
+ scale_min = self.gaussian_scale_min
126
+ scale_max = self.gaussian_scale_max
127
+ scales = scale_min + (scale_max - scale_min) * scales.sigmoid()
128
+ pixel_size = 1 / torch.tensor((W, H), dtype=dtype, device=device)
129
+ multiplier = self.get_scale_multiplier(intr_normed, pixel_size)
130
+ gs_scales = scales * gs_depths[..., None] * multiplier[..., None, None, None]
131
+ gs_scales = rearrange(gs_scales, "b v h w d -> b (v h w) d")
132
+
133
+ # 2.2) 3DGS quaternion (world space)
134
+ # for historical reasons, the quaternion is predicted in xyzw order, not wxyz
135
+ # Normalize the quaternion features to yield a valid quaternion.
136
+ rotations = rotations / (rotations.norm(dim=-1, keepdim=True) + eps)
137
+ # rotate them to world space
138
+ cam_quat_xyzw = rearrange(rotations, "b v h w c -> b (v h w) c")
139
+ c2w_mat = repeat(
140
+ cam2worlds,
141
+ "b v i j -> b (v h w) i j",
142
+ h=H,
143
+ w=W,
144
+ )
145
+ world_quat_wxyz = cam_quat_xyzw_to_world_quat_wxyz(cam_quat_xyzw, c2w_mat)
146
+ gs_rotations_world = world_quat_wxyz # b (v h w) c
147
+
148
+ # 2.3) 3DGS color / SH coefficient (world space)
149
+ sh = rearrange(sh, "... (xyz d_sh) -> ... xyz d_sh", xyz=3)
150
+ if not self.pred_color:
151
+ sh = sh * self.sh_mask
152
+
153
+ if self.pred_color or self.sh_degree == 0:
154
+ # predict pre-computed color or predict only DC band, no need to transform
155
+ gs_sh_world = sh
156
+ else:
157
+ gs_sh_world = rotate_sh(sh, cam2worlds[:, :, None, None, None, :3, :3])
158
+ gs_sh_world = rearrange(gs_sh_world, "b v h w xyz d_sh -> b (v h w) xyz d_sh")
159
+
160
+ # 2.4) 3DGS opacity
161
+ gs_opacities = rearrange(opacities, "b v h w ... -> b (v h w) ...")
162
+
163
+ return Gaussians(
164
+ means=gs_means_world,
165
+ harmonics=gs_sh_world,
166
+ opacities=gs_opacities,
167
+ scales=gs_scales,
168
+ rotations=gs_rotations_world,
169
+ )
170
+
171
+ def get_scale_multiplier(
172
+ self,
173
+ intrinsics: torch.Tensor, # "*#batch 3 3"
174
+ pixel_size: torch.Tensor, # "*#batch 2"
175
+ multiplier: float = 0.1,
176
+ ) -> torch.Tensor: # " *batch"
177
+ xy_multipliers = multiplier * einsum(
178
+ intrinsics[..., :2, :2].float().inverse().to(intrinsics),
179
+ pixel_size,
180
+ "... i j, j -> ... i",
181
+ )
182
+ return xy_multipliers.sum(dim=-1)
183
+
184
+ @property
185
+ def d_sh(self) -> int:
186
+ return 1 if self.pred_color else (self.sh_degree + 1) ** 2
187
+
188
+ @property
189
+ def d_in(self) -> int:
190
+ # provided as a reference for the GS-DPT head output dimension
191
+ raw_gs_dim = 0
192
+ if self.pred_offset_xy:
193
+ raw_gs_dim += 2
194
+ raw_gs_dim += 3 # scales
195
+ raw_gs_dim += 4 # quaternion
196
+ raw_gs_dim += 3 * self.d_sh # color
197
+ if self.pred_offset_depth:
198
+ raw_gs_dim += 1
199
+
200
+ return raw_gs_dim
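For reference, the per-pixel channel budget `d_in` expected from the Gaussian prediction head follows directly from the flags above. A small worked example using the defaults shown in this class (sh_degree=0, pred_color=False, pred_offset_xy=True, pred_offset_depth=False):

sh_degree = 0
d_sh = (sh_degree + 1) ** 2      # 1: only the DC spherical-harmonics band
d_in = 2 + 3 + 4 + 3 * d_sh      # xy offset + scales + quaternion + SH colors
print(d_sh, d_in)                # 1 12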
depth_anything_3/model/gsdpt.py ADDED
@@ -0,0 +1,133 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Dict as TyDict
16
+ from typing import List, Sequence
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from depth_anything_3.model.dpt import DPT
21
+ from depth_anything_3.model.utils.head_utils import activate_head_gs, custom_interpolate
22
+
23
+
24
+ class GSDPT(DPT):
25
+
26
+ def __init__(
27
+ self,
28
+ dim_in: int,
29
+ patch_size: int = 14,
30
+ output_dim: int = 4,
31
+ activation: str = "linear",
32
+ conf_activation: str = "sigmoid",
33
+ features: int = 256,
34
+ out_channels: Sequence[int] = (256, 512, 1024, 1024),
35
+ pos_embed: bool = True,
36
+ feature_only: bool = False,
37
+ down_ratio: int = 1,
38
+ conf_dim: int = 1,
39
+ norm_type: str = "idt", # use to match legacy GS-DPT head, "idt" / "layer"
40
+ fusion_block_inplace: bool = False,
41
+ ) -> None:
42
+ super().__init__(
43
+ dim_in=dim_in,
44
+ patch_size=patch_size,
45
+ output_dim=output_dim,
46
+ activation=activation,
47
+ conf_activation=conf_activation,
48
+ features=features,
49
+ out_channels=out_channels,
50
+ pos_embed=pos_embed,
51
+ down_ratio=down_ratio,
52
+ head_name="raw_gs",
53
+ use_sky_head=False,
54
+ norm_type=norm_type,
55
+ fusion_block_inplace=fusion_block_inplace,
56
+ )
57
+ self.conf_dim = conf_dim
58
+ if conf_dim and conf_dim > 1:
59
+ assert (
60
+ conf_activation == "linear"
61
+ ), "use linear prediction when using view-dependent opacity"
62
+
63
+ merger_out_dim = features if feature_only else features // 2
64
+ self.images_merger = nn.Sequential(
65
+ nn.Conv2d(3, merger_out_dim // 4, 3, 1, 1), # fewer channels first
66
+ nn.GELU(),
67
+ nn.Conv2d(merger_out_dim // 4, merger_out_dim // 2, 3, 1, 1),
68
+ nn.GELU(),
69
+ nn.Conv2d(merger_out_dim // 2, merger_out_dim, 3, 1, 1),
70
+ nn.GELU(),
71
+ )
72
+
73
+ # -------------------------------------------------------------------------
74
+ # Internal forward (single chunk)
75
+ # -------------------------------------------------------------------------
76
+ def _forward_impl(
77
+ self,
78
+ feats: List[torch.Tensor],
79
+ H: int,
80
+ W: int,
81
+ patch_start_idx: int,
82
+ images: torch.Tensor,
83
+ ) -> TyDict[str, torch.Tensor]:
84
+ B, _, C = feats[0].shape
85
+ ph, pw = H // self.patch_size, W // self.patch_size
86
+ resized_feats = []
87
+ for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
88
+ x = feats[take_idx][:, patch_start_idx:] # [B*S, N_patch, C]
89
+ x = self.norm(x)
90
+ x = x.permute(0, 2, 1).reshape(B, C, ph, pw) # [B*S, C, ph, pw]
91
+
92
+ x = self.projects[stage_idx](x)
93
+ if self.pos_embed:
94
+ x = self._add_pos_embed(x, W, H)
95
+ x = self.resize_layers[stage_idx](x) # Align scale
96
+ resized_feats.append(x)
97
+
98
+ # 2) Fusion pyramid (main branch only)
99
+ fused = self._fuse(resized_feats)
100
+ fused = self.scratch.output_conv1(fused)
101
+
102
+ # 3) Upsample to target resolution, optionally add position encoding again
103
+ h_out = int(ph * self.patch_size / self.down_ratio)
104
+ w_out = int(pw * self.patch_size / self.down_ratio)
105
+
106
+ fused = custom_interpolate(fused, (h_out, w_out), mode="bilinear", align_corners=True)
107
+
108
+ # inject the image information here
109
+ fused = fused + self.images_merger(images)
110
+
111
+ if self.pos_embed:
112
+ fused = self._add_pos_embed(fused, W, H)
113
+
114
+ # 4) Shared neck1
115
+ # feat = self.scratch.output_conv1(fused)
116
+ feat = fused
117
+
118
+ # 5) Main head: logits -> activate_head or single channel activation
119
+ main_logits = self.scratch.output_conv2(feat)
120
+ outs: TyDict[str, torch.Tensor] = {}
121
+ if self.has_conf:
122
+ pred, conf = activate_head_gs(
123
+ main_logits,
124
+ activation=self.activation,
125
+ conf_activation=self.conf_activation,
126
+ conf_dim=self.conf_dim,
127
+ )
128
+ outs[self.head_main] = pred.squeeze(1)
129
+ outs[f"{self.head_main}_conf"] = conf.squeeze(1)
130
+ else:
131
+ outs[self.head_main] = self._apply_activation_single(main_logits).squeeze(1)
132
+
133
+ return outs
depth_anything_3/model/utils/attention.py ADDED
@@ -0,0 +1,109 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 # noqa
16
+
17
+ from typing import Callable, Optional, Union
18
+ import torch
19
+ import torch.nn.functional as F
20
+ from torch import Tensor, nn
21
+
22
+
23
+ class Attention(nn.Module):
24
+ def __init__(
25
+ self,
26
+ dim: int,
27
+ num_heads: int = 8,
28
+ qkv_bias: bool = True,
29
+ proj_bias: bool = True,
30
+ attn_drop: float = 0.0,
31
+ proj_drop: float = 0.0,
32
+ norm_layer: nn.Module = nn.LayerNorm,
33
+ qk_norm: bool = False,
34
+ rope=None,
35
+ ) -> None:
36
+ super().__init__()
37
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
38
+ self.num_heads = num_heads
39
+ self.head_dim = dim // num_heads
40
+ self.scale = self.head_dim**-0.5
41
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
42
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
43
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
44
+ self.attn_drop = nn.Dropout(attn_drop)
45
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
46
+ self.proj_drop = nn.Dropout(proj_drop)
47
+ self.rope = rope
48
+
49
+ def forward(self, x: Tensor, pos=None, attn_mask=None) -> Tensor:
51
+ B, N, C = x.shape
52
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
53
+ q, k, v = qkv.unbind(0)
54
+ q, k = self.q_norm(q), self.k_norm(k)
55
+ q = self.rope(q, pos) if self.rope is not None else q
56
+ k = self.rope(k, pos) if self.rope is not None else k
57
+ x = F.scaled_dot_product_attention(
58
+ q,
59
+ k,
60
+ v,
61
+ dropout_p=self.attn_drop.p if self.training else 0.0,
62
+ attn_mask=attn_mask,
63
+ )
64
+ x = x.transpose(1, 2).reshape(B, N, C)
65
+ x = self.proj(x)
66
+ x = self.proj_drop(x)
67
+ return x
68
+
69
+
70
+ class LayerScale(nn.Module):
71
+ def __init__(
72
+ self,
73
+ dim: int,
74
+ init_values: Union[float, Tensor] = 1e-5,
75
+ inplace: bool = False,
76
+ ) -> None:
77
+ super().__init__()
78
+ self.inplace = inplace
79
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
80
+
81
+ def forward(self, x: Tensor) -> Tensor:
82
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
83
+
84
+
85
+ class Mlp(nn.Module):
86
+ def __init__(
87
+ self,
88
+ in_features: int,
89
+ hidden_features: Optional[int] = None,
90
+ out_features: Optional[int] = None,
91
+ act_layer: Callable[..., nn.Module] = nn.GELU,
92
+ drop: float = 0.0,
93
+ bias: bool = True,
94
+ ) -> None:
95
+ super().__init__()
96
+ out_features = out_features or in_features
97
+ hidden_features = hidden_features or in_features
98
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
99
+ self.act = act_layer()
100
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
101
+ self.drop = nn.Dropout(drop)
102
+
103
+ def forward(self, x: Tensor) -> Tensor:
104
+ x = self.fc1(x)
105
+ x = self.act(x)
106
+ x = self.drop(x)
107
+ x = self.fc2(x)
108
+ x = self.drop(x)
109
+ return x
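A minimal usage sketch of the Attention and Mlp modules above (shapes follow the (B, N, C) layout used in forward; the import path is assumed from this repo's layout, and the hyperparameters are illustrative only):

import torch
from depth_anything_3.model.utils.attention import Attention, Mlp

tokens = torch.randn(2, 196, 384)                 # (B, N, C) dummy token sequence
attn = Attention(dim=384, num_heads=6, qk_norm=True)
mlp = Mlp(in_features=384, hidden_features=4 * 384)

with torch.no_grad():
    out = attn(tokens)                            # self-attention over the N axis, (2, 196, 384)
    out = mlp(out)                                # position-wise MLP, shape preserved
print(out.shape)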
depth_anything_3/model/utils/block.py ADDED
@@ -0,0 +1,81 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import Callable
17
+ from torch import Tensor, nn
18
+
19
+ from .attention import Attention, LayerScale, Mlp
20
+
21
+
22
+ class Block(nn.Module):
23
+ def __init__(
24
+ self,
25
+ dim: int,
26
+ num_heads: int,
27
+ mlp_ratio: float = 4.0,
28
+ qkv_bias: bool = True,
29
+ proj_bias: bool = True,
30
+ ffn_bias: bool = True,
31
+ drop: float = 0.0,
32
+ attn_drop: float = 0.0,
33
+ init_values=None,
34
+ drop_path: float = 0.0,
35
+ act_layer: Callable[..., nn.Module] = nn.GELU,
36
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
37
+ attn_class: Callable[..., nn.Module] = Attention,
38
+ ffn_layer: Callable[..., nn.Module] = Mlp,
39
+ qk_norm: bool = False,
40
+ rope=None,
41
+ ) -> None:
42
+ super().__init__()
43
+
44
+ self.norm1 = norm_layer(dim)
45
+
46
+ self.attn = attn_class(
47
+ dim,
48
+ num_heads=num_heads,
49
+ qkv_bias=qkv_bias,
50
+ proj_bias=proj_bias,
51
+ attn_drop=attn_drop,
52
+ proj_drop=drop,
53
+ qk_norm=qk_norm,
54
+ rope=rope,
55
+ )
56
+
57
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
58
+ self.norm2 = norm_layer(dim)
59
+ mlp_hidden_dim = int(dim * mlp_ratio)
60
+ self.mlp = ffn_layer(
61
+ in_features=dim,
62
+ hidden_features=mlp_hidden_dim,
63
+ act_layer=act_layer,
64
+ drop=drop,
65
+ bias=ffn_bias,
66
+ )
67
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
68
+
69
+ self.sample_drop_ratio = 0.0 # Equivalent to always having drop_path=0
70
+
71
+ def forward(self, x: Tensor, pos=None, attn_mask=None) -> Tensor:
72
+ def attn_residual_func(x: Tensor, pos=None, attn_mask=None) -> Tensor:
73
+ return self.ls1(self.attn(self.norm1(x), pos=pos, attn_mask=attn_mask))
74
+
75
+ def ffn_residual_func(x: Tensor) -> Tensor:
76
+ return self.ls2(self.mlp(self.norm2(x)))
77
+
78
+ # drop_path is fixed at 0 here, so the residuals are applied directly (no stochastic depth)
79
+ x = x + attn_residual_func(x, pos=pos, attn_mask=attn_mask)
80
+ x = x + ffn_residual_func(x)
81
+ return x
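Block is a standard pre-norm transformer block (attention and MLP residuals, each optionally rescaled by LayerScale). A small sketch of stacking a few of them, with illustrative hyperparameters and the import path assumed from this repo's layout:

import torch
from torch import nn
from depth_anything_3.model.utils.block import Block

dim, depth = 256, 4
blocks = nn.ModuleList(
    [Block(dim=dim, num_heads=8, init_values=1e-5, qk_norm=True) for _ in range(depth)]
)

x = torch.randn(1, 100, dim)                      # (B, N, C)
with torch.no_grad():
    for blk in blocks:
        x = blk(x)                                # residual attention + MLP, shape preserved
print(x.shape)                                    # torch.Size([1, 100, 256])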
depth_anything_3/model/utils/gs_renderer.py ADDED
@@ -0,0 +1,340 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from math import isqrt
17
+ from typing import Literal, Optional
18
+ import torch
19
+ from einops import rearrange, repeat
20
+ from tqdm import tqdm
21
+
22
+ from depth_anything_3.specs import Gaussians
23
+ from depth_anything_3.utils.camera_trj_helpers import (
24
+ interpolate_extrinsics,
25
+ interpolate_intrinsics,
26
+ render_dolly_zoom_path,
27
+ render_stabilization_path,
28
+ render_wander_path,
29
+ render_wobble_inter_path,
30
+ )
31
+ from depth_anything_3.utils.geometry import affine_inverse, as_homogeneous, get_fov
32
+ from depth_anything_3.utils.logger import logger
33
+
34
+ try:
35
+ from gsplat import rasterization
36
+ except ImportError:
37
+ logger.warn(
38
+ "Dependency `gsplat` is required for rendering 3DGS. "
39
+ "Install via: pip install git+https://github.com/nerfstudio-project/"
40
+ "gsplat.git@0b4dddf04cb687367602c01196913cde6a743d70"
41
+ )
42
+
43
+
44
+ def render_3dgs(
45
+ extrinsics: torch.Tensor, # "batch_views 4 4", w2c
46
+ intrinsics: torch.Tensor, # "batch_views 3 3", normalized
47
+ image_shape: tuple[int, int],
48
+ gaussian: Gaussians,
49
+ background_color: Optional[torch.Tensor] = None, # "batch_views 3"
50
+ use_sh: bool = True,
51
+ num_view: int = 1,
52
+ color_mode: Literal["RGB+D", "RGB+ED"] = "RGB+D",
53
+ **kwargs,
54
+ ) -> tuple[
55
+ torch.Tensor, # "batch_views 3 height width"
56
+ torch.Tensor, # "batch_views height width"
57
+ ]:
58
+ # extract gaussian params
59
+ gaussian_means = gaussian.means
60
+ gaussian_scales = gaussian.scales
61
+ gaussian_quats = gaussian.rotations
62
+ gaussian_opacities = gaussian.opacities
63
+ gaussian_sh_coefficients = gaussian.harmonics
64
+ b, _, _ = extrinsics.shape
65
+
66
+ if background_color is None:
67
+ background_color = repeat(torch.tensor([0.0, 0.0, 0.0]), "c -> b c", b=b).to(
68
+ gaussian_sh_coefficients
69
+ )
70
+
71
+ if use_sh:
72
+ _, _, _, n = gaussian_sh_coefficients.shape
73
+ degree = isqrt(n) - 1
74
+ shs = rearrange(gaussian_sh_coefficients, "b g xyz n -> b g n xyz").contiguous()
75
+ else: # use color
76
+ shs = (
77
+ gaussian_sh_coefficients.squeeze(-1).sigmoid().contiguous()
78
+ ) # (b, g, c), normed to (0, 1)
79
+
80
+ h, w = image_shape
81
+
82
+ fov_x, fov_y = get_fov(intrinsics).unbind(dim=-1)
83
+ tan_fov_x = (0.5 * fov_x).tan()
84
+ tan_fov_y = (0.5 * fov_y).tan()
85
+ focal_length_x = w / (2 * tan_fov_x)
86
+ focal_length_y = h / (2 * tan_fov_y)
87
+
88
+ view_matrix = extrinsics.float()
89
+
90
+ all_images = []
91
+ all_radii = []
92
+ all_depths = []
93
+ # render view in a batch based, each batch contains one scene
94
+ # assume the Gaussian parameters are originally repeated along the view dim
95
+ batch_scene = b // num_view
96
+
97
+ def index_i_gs_attr(full_attr, idx):
98
+ # return rearrange(full_attr, "(b v) ... -> b v ...", v=num_view)[idx, 0]
99
+ return full_attr[idx]
100
+
101
+ for i in range(batch_scene):
102
+ K = repeat(
103
+ torch.tensor(
104
+ [
105
+ [0, 0, w / 2.0],
106
+ [0, 0, h / 2.0],
107
+ [0, 0, 1],
108
+ ]
109
+ ),
110
+ "i j -> v i j",
111
+ v=num_view,
112
+ ).to(gaussian_means)
113
+ K[:, 0, 0] = focal_length_x.reshape(batch_scene, num_view)[i]
114
+ K[:, 1, 1] = focal_length_y.reshape(batch_scene, num_view)[i]
115
+
116
+ i_means = index_i_gs_attr(gaussian_means, i) # [N, 3]
117
+ i_scales = index_i_gs_attr(gaussian_scales, i)
118
+ i_quats = index_i_gs_attr(gaussian_quats, i)
119
+ i_opacities = index_i_gs_attr(gaussian_opacities, i) # [N,]
120
+ i_colors = index_i_gs_attr(shs, i) # [N, K, 3]
121
+ i_viewmats = rearrange(view_matrix, "(b v) ... -> b v ...", v=num_view)[i] # [v, 4, 4]
122
+ i_backgrounds = rearrange(background_color, "(b v) ... -> b v ...", v=num_view)[
123
+ i
124
+ ] # [v, 3]
125
+
126
+ render_colors, render_alphas, info = rasterization(
127
+ means=i_means,
128
+ quats=i_quats, # [N, 4]
129
+ scales=i_scales, # [N, 3]
130
+ opacities=i_opacities,
131
+ colors=i_colors,
132
+ viewmats=i_viewmats, # [v, 4, 4]
133
+ Ks=K, # [v, 3, 3]
134
+ backgrounds=i_backgrounds,
135
+ render_mode=color_mode,
136
+ width=w,
137
+ height=h,
138
+ packed=False,
139
+ sh_degree=degree if use_sh else None,
140
+ )
141
+ depth = render_colors[..., -1].unbind(dim=0)
142
+
143
+ image = rearrange(render_colors[..., :3], "v h w c -> v c h w").unbind(dim=0)
144
+ radii = info["radii"].unbind(dim=0)
145
+ try:
146
+ info["means2d"].retain_grad() # [1, N, 2]
147
+ except Exception:
148
+ pass
149
+ all_images.extend(image)
150
+ all_depths.extend(depth)
151
+ all_radii.extend(radii)
152
+
153
+ return torch.stack(all_images), torch.stack(all_depths)
154
+
155
+
156
+ def run_renderer_in_chunk_w_trj_mode(
157
+ gaussians: Gaussians,
158
+ extrinsics: torch.Tensor, # world2cam, "batch view 4 4" | "batch view 3 4"
159
+ intrinsics: torch.Tensor, # unnormed intrinsics, "batch view 3 3"
160
+ image_shape: tuple[int, int],
161
+ chunk_size: Optional[int] = 8,
162
+ trj_mode: Literal[
163
+ "original",
164
+ "smooth",
165
+ "interpolate",
166
+ "interpolate_smooth",
167
+ "wander",
168
+ "dolly_zoom",
169
+ "extend",
170
+ "wobble_inter",
171
+ ] = "smooth",
172
+ input_shape: Optional[tuple[int, int]] = None,
173
+ enable_tqdm: Optional[bool] = False,
174
+ **kwargs,
175
+ ) -> tuple[
176
+ torch.Tensor, # color, "batch view 3 height width"
177
+ torch.Tensor, # depth, "batch view height width"
178
+ ]:
179
+ cam2world = affine_inverse(as_homogeneous(extrinsics))
180
+ if input_shape is not None:
181
+ in_h, in_w = input_shape
182
+ else:
183
+ in_h, in_w = image_shape
184
+ intr_normed = intrinsics.clone().detach()
185
+ intr_normed[..., 0, :] /= in_w
186
+ intr_normed[..., 1, :] /= in_h
187
+ if extrinsics.shape[1] <= 1:
188
+ assert trj_mode in [
189
+ "wander",
190
+ "dolly_zoom",
191
+ ], "Please set trj_mode to 'wander' or 'dolly_zoom' when n_views=1"
192
+
193
+ def _smooth_trj_fn_batch(raw_c2ws, k_size=50):
194
+ try:
195
+ smooth_c2ws = torch.stack(
196
+ [render_stabilization_path(c2w_i, k_size) for c2w_i in raw_c2ws],
197
+ dim=0,
198
+ )
199
+ except Exception as e:
200
+ print(f"[DEBUG] Path smoothing failed with error: {e}.")
201
+ smooth_c2ws = raw_c2ws
202
+ return smooth_c2ws
203
+
204
+ # get rendered trj
205
+ if trj_mode == "original":
206
+ tgt_c2w = cam2world
207
+ tgt_intr = intr_normed
208
+ elif trj_mode == "smooth":
209
+ tgt_c2w = _smooth_trj_fn_batch(cam2world)
210
+ tgt_intr = intr_normed
211
+ elif trj_mode in ["interpolate", "interpolate_smooth", "extend"]:
212
+ inter_len = 8
213
+ total_len = (cam2world.shape[1] - 1) * inter_len
214
+ if total_len > 24 * 18: # no more than 18s
215
+ inter_len = max(1, 24 * 10 // (cam2world.shape[1] - 1))
216
+ if total_len < 24 * 2: # no less than 2s
217
+ inter_len = max(1, 24 * 2 // (cam2world.shape[1] - 1))
218
+
219
+ if inter_len > 2:
220
+ t = torch.linspace(0, 1, inter_len, dtype=torch.float32, device=cam2world.device)
221
+ t = (torch.cos(torch.pi * (t + 1)) + 1) / 2
222
+ tgt_c2w_b = []
223
+ tgt_intr_b = []
224
+ for b_idx in range(cam2world.shape[0]):
225
+ tgt_c2w = []
226
+ tgt_intr = []
227
+ for cur_idx in range(cam2world.shape[1] - 1):
228
+ tgt_c2w.append(
229
+ interpolate_extrinsics(
230
+ cam2world[b_idx, cur_idx], cam2world[b_idx, cur_idx + 1], t
231
+ )[(0 if cur_idx == 0 else 1) :]
232
+ )
233
+ tgt_intr.append(
234
+ interpolate_intrinsics(
235
+ intr_normed[b_idx, cur_idx], intr_normed[b_idx, cur_idx + 1], t
236
+ )[(0 if cur_idx == 0 else 1) :]
237
+ )
238
+ tgt_c2w_b.append(torch.cat(tgt_c2w))
239
+ tgt_intr_b.append(torch.cat(tgt_intr))
240
+ tgt_c2w = torch.stack(tgt_c2w_b) # b v 4 4
241
+ tgt_intr = torch.stack(tgt_intr_b) # b v 3 3
242
+ else:
243
+ tgt_c2w = cam2world
244
+ tgt_intr = intr_normed
245
+ if trj_mode in ["interpolate_smooth", "extend"]:
246
+ tgt_c2w = _smooth_trj_fn_batch(tgt_c2w)
247
+ if trj_mode == "extend":
248
+ # apply dolly_zoom and wander in the middle frame
249
+ assert cam2world.shape[0] == 1, "extend only supports for batch_size=1 currently."
250
+ mid_idx = tgt_c2w.shape[1] // 2
251
+ c2w_wd, intr_wd = render_wander_path(
252
+ tgt_c2w[0, mid_idx],
253
+ tgt_intr[0, mid_idx],
254
+ h=in_h,
255
+ w=in_w,
256
+ num_frames=max(36, min(60, mid_idx // 2)),
257
+ max_disp=24.0,
258
+ )
259
+ c2w_dz, intr_dz = render_dolly_zoom_path(
260
+ tgt_c2w[0, mid_idx],
261
+ tgt_intr[0, mid_idx],
262
+ h=in_h,
263
+ w=in_w,
264
+ num_frames=max(36, min(60, mid_idx // 2)),
265
+ )
266
+ tgt_c2w = torch.cat(
267
+ [
268
+ tgt_c2w[:, :mid_idx],
269
+ c2w_wd.unsqueeze(0),
270
+ c2w_dz.unsqueeze(0),
271
+ tgt_c2w[:, mid_idx:],
272
+ ],
273
+ dim=1,
274
+ )
275
+ tgt_intr = torch.cat(
276
+ [
277
+ tgt_intr[:, :mid_idx],
278
+ intr_wd.unsqueeze(0),
279
+ intr_dz.unsqueeze(0),
280
+ tgt_intr[:, mid_idx:],
281
+ ],
282
+ dim=1,
283
+ )
284
+ elif trj_mode in ["wander", "dolly_zoom"]:
285
+ if trj_mode == "wander":
286
+ render_fn = render_wander_path
287
+ extra_kwargs = {"max_disp": 24.0}
288
+ else:
289
+ render_fn = render_dolly_zoom_path
290
+ extra_kwargs = {"D_focus": 30.0, "max_disp": 2.0}
291
+ tgt_c2w = []
292
+ tgt_intr = []
293
+ for b_idx in range(cam2world.shape[0]):
294
+ c2w_i, intr_i = render_fn(
295
+ cam2world[b_idx, 0], intr_normed[b_idx, 0], h=in_h, w=in_w, **extra_kwargs
296
+ )
297
+ tgt_c2w.append(c2w_i)
298
+ tgt_intr.append(intr_i)
299
+ tgt_c2w = torch.stack(tgt_c2w)
300
+ tgt_intr = torch.stack(tgt_intr)
301
+ elif trj_mode == "wobble_inter":
302
+ tgt_c2w, tgt_intr = render_wobble_inter_path(
303
+ cam2world=cam2world,
304
+ intr_normed=intr_normed,
305
+ inter_len=10,
306
+ n_skip=3,
307
+ )
308
+ else:
309
+ raise Exception(f"trj mode [{trj_mode}] is not implemented.")
310
+
311
+ _, v = tgt_c2w.shape[:2]
312
+ tgt_extr = affine_inverse(tgt_c2w)
313
+ if chunk_size is None:
314
+ chunk_size = v
315
+ chunk_size = min(v, chunk_size)
316
+ all_colors = []
317
+ all_depths = []
318
+ for chunk_idx in tqdm(
319
+ range(math.ceil(v / chunk_size)),
320
+ desc="Rendering novel views",
321
+ disable=(not enable_tqdm),
322
+ leave=False,
323
+ ):
324
+ s = int(chunk_idx * chunk_size)
325
+ e = int((chunk_idx + 1) * chunk_size)
326
+ cur_n_view = tgt_extr[:, s:e].shape[1]
327
+ color, depth = render_3dgs(
328
+ extrinsics=rearrange(tgt_extr[:, s:e], "b v ... -> (b v) ..."), # w2c
329
+ intrinsics=rearrange(tgt_intr[:, s:e], "b v ... -> (b v) ..."), # normed
330
+ image_shape=image_shape,
331
+ gaussian=gaussians,
332
+ num_view=cur_n_view,
333
+ **kwargs,
334
+ )
335
+ all_colors.append(rearrange(color, "(b v) ... -> b v ...", v=cur_n_view))
336
+ all_depths.append(rearrange(depth, "(b v) ... -> b v ...", v=cur_n_view))
337
+ all_colors = torch.cat(all_colors, dim=1)
338
+ all_depths = torch.cat(all_depths, dim=1)
339
+
340
+ return all_colors, all_depths
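A hedged sketch of calling render_3dgs directly on a dummy Gaussian cloud. It assumes a CUDA device and the optional gsplat dependency; because the Gaussians container from depth_anything_3.specs is not shown here, a SimpleNamespace stands in for it, exposing only the attributes the function reads (means, scales, rotations, opacities, harmonics), and every value below is a placeholder:

from types import SimpleNamespace
import torch
from depth_anything_3.model.utils.gs_renderer import render_3dgs

device, N, v, h, w = "cuda", 1000, 2, 128, 160
gaussians = SimpleNamespace(
    means=torch.randn(1, N, 3, device=device),                      # (scenes, N, 3)
    scales=torch.full((1, N, 3), 0.01, device=device),
    rotations=torch.tensor([1.0, 0.0, 0.0, 0.0], device=device).repeat(1, N, 1),
    opacities=torch.full((1, N), 0.7, device=device),
    harmonics=torch.rand(1, N, 3, 1, device=device),                # DC term only -> SH degree 0
)
extrinsics = torch.eye(4, device=device).repeat(v, 1, 1)            # world2cam, one per view
extrinsics[:, 2, 3] = 3.0                                           # push the cloud in front of the cameras
intrinsics = torch.tensor(
    [[1.2, 0.0, 0.5], [0.0, 1.2, 0.5], [0.0, 0.0, 1.0]], device=device
).repeat(v, 1, 1)                                                   # normalized K, as the signature requires

color, depth = render_3dgs(extrinsics, intrinsics, (h, w), gaussians, num_view=v)
print(color.shape, depth.shape)                                     # (v, 3, h, w), (v, h, w)

run_renderer_in_chunk_w_trj_mode wraps this same call: it normalizes the intrinsics, generates the target trajectory for the chosen trj_mode, and renders it in chunks.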
depth_anything_3/model/utils/head_utils.py ADDED
@@ -0,0 +1,230 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Tuple, Union
16
+ import torch
17
+ import torch.nn as nn
18
+ import torch.nn.functional as F
19
+
20
+ # -----------------------------------------------------------------------------
21
+ # Activation functions
22
+ # -----------------------------------------------------------------------------
23
+
24
+
25
+ def activate_head_gs(out, activation="norm_exp", conf_activation="expp1", conf_dim=None):
26
+ """
27
+ Process network output to extract GS params and density values.
28
+ Density can be view-dependent when represented as SH coefficients.
29
+
30
+
31
+ Args:
32
+ out: Network output tensor (B, C, H, W)
33
+ activation: Activation type for 3D points
34
+ conf_activation: Activation type for confidence values
35
+
36
+ Returns:
37
+ Tuple of (activated GS parameter tensor, density/confidence tensor)
38
+ """
39
+ # Move channels from last dim to the 4th dimension => (B, H, W, C)
40
+ fmap = out.permute(0, 2, 3, 1) # B,H,W,C expected
41
+
42
+ # Split into xyz (first C-1 channels) and confidence (last channel)
43
+ conf_dim = 1 if conf_dim is None else conf_dim
44
+ xyz = fmap[:, :, :, :-conf_dim]
45
+ conf = fmap[:, :, :, -1] if conf_dim == 1 else fmap[:, :, :, -conf_dim:]
46
+
47
+ if activation == "norm_exp":
48
+ d = xyz.norm(dim=-1, keepdim=True).clamp(min=1e-8)
49
+ xyz_normed = xyz / d
50
+ pts3d = xyz_normed * torch.expm1(d)
51
+ elif activation == "norm":
52
+ pts3d = xyz / xyz.norm(dim=-1, keepdim=True)
53
+ elif activation == "exp":
54
+ pts3d = torch.exp(xyz)
55
+ elif activation == "relu":
56
+ pts3d = F.relu(xyz)
57
+ elif activation == "sigmoid":
58
+ pts3d = torch.sigmoid(xyz)
59
+ elif activation == "linear":
60
+ pts3d = xyz
61
+ else:
62
+ raise ValueError(f"Unknown activation: {activation}")
63
+
64
+ if conf_activation == "expp1":
65
+ conf_out = 1 + conf.exp()
66
+ elif conf_activation == "expp0":
67
+ conf_out = conf.exp()
68
+ elif conf_activation == "sigmoid":
69
+ conf_out = torch.sigmoid(conf)
70
+ elif conf_activation == "linear":
71
+ conf_out = conf
72
+ else:
73
+ raise ValueError(f"Unknown conf_activation: {conf_activation}")
74
+
75
+ return pts3d, conf_out
76
+
77
+
78
+ # -----------------------------------------------------------------------------
79
+ # Other utilities
80
+ # -----------------------------------------------------------------------------
81
+
82
+
83
+ class Permute(nn.Module):
84
+ """nn.Module wrapper around Tensor.permute for cleaner nn.Sequential usage."""
85
+
86
+ dims: Tuple[int, ...]
87
+
88
+ def __init__(self, dims: Tuple[int, ...]) -> None:
89
+ super().__init__()
90
+ self.dims = dims
91
+
92
+ def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore[override]
93
+ return x.permute(*self.dims)
94
+
95
+
96
+ def position_grid_to_embed(
97
+ pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100
98
+ ) -> torch.Tensor:
99
+ """
100
+ Convert 2D position grid (HxWx2) to sinusoidal embeddings (HxWxC)
101
+
102
+ Args:
103
+ pos_grid: Tensor of shape (H, W, 2) containing 2D coordinates
104
+ embed_dim: Output channel dimension for embeddings
105
+
106
+ Returns:
107
+ Tensor of shape (H, W, embed_dim) with positional embeddings
108
+ """
109
+ H, W, grid_dim = pos_grid.shape
110
+ assert grid_dim == 2
111
+ pos_flat = pos_grid.reshape(-1, grid_dim) # Flatten to (H*W, 2)
112
+
113
+ # Process x and y coordinates separately
114
+ emb_x = make_sincos_pos_embed(embed_dim // 2, pos_flat[:, 0], omega_0=omega_0) # [H*W, D/2]
115
+ emb_y = make_sincos_pos_embed(embed_dim // 2, pos_flat[:, 1], omega_0=omega_0) # [H*W, D/2]
116
+
117
+ # Combine and reshape
118
+ emb = torch.cat([emb_x, emb_y], dim=-1) # [H*W, D]
119
+
120
+ return emb.view(H, W, embed_dim) # [H, W, D]
121
+
122
+
123
+ def make_sincos_pos_embed(embed_dim: int, pos: torch.Tensor, omega_0: float = 100) -> torch.Tensor:
124
+ """
125
+ This function generates a 1D positional embedding from a given grid using sine and cosine functions. # noqa
126
+
127
+ Args:
128
+ - embed_dim: The embedding dimension.
129
+ - pos: The position to generate the embedding from.
130
+
131
+ Returns:
132
+ - emb: The generated 1D positional embedding.
133
+ """
134
+ assert embed_dim % 2 == 0
135
+ omega = torch.arange(embed_dim // 2, dtype=torch.double, device=pos.device)
136
+ omega /= embed_dim / 2.0
137
+ omega = 1.0 / omega_0**omega # (D/2,)
138
+
139
+ pos = pos.reshape(-1) # (M,)
140
+ out = torch.einsum("m,d->md", pos, omega) # (M, D/2), outer product
141
+
142
+ emb_sin = torch.sin(out) # (M, D/2)
143
+ emb_cos = torch.cos(out) # (M, D/2)
144
+
145
+ emb = torch.cat([emb_sin, emb_cos], dim=1) # (M, D)
146
+ return emb.float()
147
+
148
+
149
+ # Inspired by https://github.com/microsoft/moge
150
+
151
+
152
+ def create_uv_grid(
153
+ width: int,
154
+ height: int,
155
+ aspect_ratio: float = None,
156
+ dtype: torch.dtype = None,
157
+ device: torch.device = None,
158
+ ) -> torch.Tensor:
159
+ """
160
+ Create a normalized UV grid of shape (width, height, 2).
161
+
162
+ The grid spans horizontally and vertically according to an aspect ratio,
163
+ ensuring the top-left corner is at (-x_span, -y_span) and the bottom-right
164
+ corner is at (x_span, y_span), normalized by the diagonal of the plane.
165
+
166
+ Args:
167
+ width (int): Number of points horizontally.
168
+ height (int): Number of points vertically.
169
+ aspect_ratio (float, optional): Width-to-height ratio. Defaults to width/height.
170
+ dtype (torch.dtype, optional): Data type of the resulting tensor.
171
+ device (torch.device, optional): Device on which the tensor is created.
172
+
173
+ Returns:
174
+ torch.Tensor: A (width, height, 2) tensor of UV coordinates.
175
+ """
176
+ # Derive aspect ratio if not explicitly provided
177
+ if aspect_ratio is None:
178
+ aspect_ratio = float(width) / float(height)
179
+
180
+ # Compute normalized spans for X and Y
181
+ diag_factor = (aspect_ratio**2 + 1.0) ** 0.5
182
+ span_x = aspect_ratio / diag_factor
183
+ span_y = 1.0 / diag_factor
184
+
185
+ # Establish the linspace boundaries
186
+ left_x = -span_x * (width - 1) / width
187
+ right_x = span_x * (width - 1) / width
188
+ top_y = -span_y * (height - 1) / height
189
+ bottom_y = span_y * (height - 1) / height
190
+
191
+ # Generate 1D coordinates
192
+ x_coords = torch.linspace(left_x, right_x, steps=width, dtype=dtype, device=device)
193
+ y_coords = torch.linspace(top_y, bottom_y, steps=height, dtype=dtype, device=device)
194
+
195
+ # Create 2D meshgrid (width x height) and stack into UV
196
+ uu, vv = torch.meshgrid(x_coords, y_coords, indexing="xy")
197
+ uv_grid = torch.stack((uu, vv), dim=-1)
198
+
199
+ return uv_grid
200
+
201
+
202
+ # -----------------------------------------------------------------------------
203
+ # Interpolation (safe interpolation, avoid INT_MAX overflow)
204
+ # -----------------------------------------------------------------------------
205
+ def custom_interpolate(
206
+ x: torch.Tensor,
207
+ size: Union[Tuple[int, int], None] = None,
208
+ scale_factor: Union[float, None] = None,
209
+ mode: str = "bilinear",
210
+ align_corners: bool = True,
211
+ ) -> torch.Tensor:
212
+ """
213
+ Safe interpolation implementation to avoid INT_MAX overflow in torch.nn.functional.interpolate.
214
+ """
215
+ if size is None:
216
+ assert scale_factor is not None, "Either size or scale_factor must be provided."
217
+ size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))
218
+
219
+ INT_MAX = 1610612736
220
+ total = size[0] * size[1] * x.shape[0] * x.shape[1]
221
+
222
+ if total > INT_MAX:
223
+ chunks = torch.chunk(x, chunks=(total // INT_MAX) + 1, dim=0)
224
+ outs = [
225
+ nn.functional.interpolate(c, size=size, mode=mode, align_corners=align_corners)
226
+ for c in chunks
227
+ ]
228
+ return torch.cat(outs, dim=0).contiguous()
229
+
230
+ return nn.functional.interpolate(x, size=size, mode=mode, align_corners=align_corners)
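A short sketch of the helpers above: create_uv_grid builds the diagonal-normalized UV plane, position_grid_to_embed converts it to sin/cos features, and custom_interpolate behaves like F.interpolate but chunks very large batches. Import path and sizes are illustrative:

import torch
from depth_anything_3.model.utils.head_utils import (
    create_uv_grid,
    custom_interpolate,
    position_grid_to_embed,
)

W, H, C = 14, 14, 128
uv = create_uv_grid(W, H, dtype=torch.float32)    # (W, H, 2), normalized by the plane diagonal
pos = position_grid_to_embed(uv, C)               # (W, H, C) sinusoidal embedding
print(uv.shape, pos.shape)

feat = torch.randn(1, C, H, W)
up = custom_interpolate(feat, size=(4 * H, 4 * W), mode="bilinear", align_corners=True)
print(up.shape)                                   # (1, 128, 56, 56)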
depth_anything_3/model/utils/transform.py ADDED
@@ -0,0 +1,208 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+
18
+
19
+ def extri_intri_to_pose_encoding(
20
+ extrinsics,
21
+ intrinsics,
22
+ image_size_hw=None,
23
+ ):
24
+ """Convert camera extrinsics and intrinsics to a compact pose encoding."""
25
+
26
+ # extrinsics: BxSx3x4
27
+ # intrinsics: BxSx3x3
28
+ R = extrinsics[:, :, :3, :3] # BxSx3x3
29
+ T = extrinsics[:, :, :3, 3] # BxSx3
30
+
31
+ quat = mat_to_quat(R)
32
+ # Note the order of h and w here
33
+ H, W = image_size_hw
34
+ fov_h = 2 * torch.atan((H / 2) / intrinsics[..., 1, 1])
35
+ fov_w = 2 * torch.atan((W / 2) / intrinsics[..., 0, 0])
36
+ pose_encoding = torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()
37
+
38
+ return pose_encoding
39
+
40
+
41
+ def pose_encoding_to_extri_intri(
42
+ pose_encoding,
43
+ image_size_hw=None,
44
+ ):
45
+ """Convert a pose encoding back to camera extrinsics and intrinsics."""
46
+
47
+ T = pose_encoding[..., :3]
48
+ quat = pose_encoding[..., 3:7]
49
+ fov_h = pose_encoding[..., 7]
50
+ fov_w = pose_encoding[..., 8]
51
+
52
+ R = quat_to_mat(quat)
53
+ extrinsics = torch.cat([R, T[..., None]], dim=-1)
54
+
55
+ H, W = image_size_hw
56
+ fy = (H / 2.0) / torch.clamp(torch.tan(fov_h / 2.0), 1e-6)
57
+ fx = (W / 2.0) / torch.clamp(torch.tan(fov_w / 2.0), 1e-6)
58
+ intrinsics = torch.zeros(pose_encoding.shape[:2] + (3, 3), device=pose_encoding.device)
59
+ intrinsics[..., 0, 0] = fx
60
+ intrinsics[..., 1, 1] = fy
61
+ intrinsics[..., 0, 2] = W / 2
62
+ intrinsics[..., 1, 2] = H / 2
63
+ intrinsics[..., 2, 2] = 1.0 # Set the homogeneous coordinate to 1
64
+
65
+ return extrinsics, intrinsics
66
+
67
+
68
+ def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor:
69
+ """
70
+ Quaternion Order: XYZW or say ijkr, scalar-last
71
+
72
+ Convert rotations given as quaternions to rotation matrices.
73
+ Args:
74
+ quaternions: quaternions with real part last,
75
+ as tensor of shape (..., 4).
76
+
77
+ Returns:
78
+ Rotation matrices as tensor of shape (..., 3, 3).
79
+ """
80
+ i, j, k, r = torch.unbind(quaternions, -1)
81
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
82
+
83
+ o = torch.stack(
84
+ (
85
+ 1 - two_s * (j * j + k * k),
86
+ two_s * (i * j - k * r),
87
+ two_s * (i * k + j * r),
88
+ two_s * (i * j + k * r),
89
+ 1 - two_s * (i * i + k * k),
90
+ two_s * (j * k - i * r),
91
+ two_s * (i * k - j * r),
92
+ two_s * (j * k + i * r),
93
+ 1 - two_s * (i * i + j * j),
94
+ ),
95
+ -1,
96
+ )
97
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
98
+
99
+
100
+ def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
101
+ """
102
+ Convert rotations given as rotation matrices to quaternions.
103
+
104
+ Args:
105
+ matrix: Rotation matrices as tensor of shape (..., 3, 3).
106
+
107
+ Returns:
108
+ quaternions with real part last, as tensor of shape (..., 4).
109
+ Quaternion Order: XYZW or say ijkr, scalar-last
110
+ """
111
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
112
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
113
+
114
+ batch_dim = matrix.shape[:-2]
115
+ m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
116
+ matrix.reshape(batch_dim + (9,)), dim=-1
117
+ )
118
+
119
+ q_abs = _sqrt_positive_part(
120
+ torch.stack(
121
+ [
122
+ 1.0 + m00 + m11 + m22,
123
+ 1.0 + m00 - m11 - m22,
124
+ 1.0 - m00 + m11 - m22,
125
+ 1.0 - m00 - m11 + m22,
126
+ ],
127
+ dim=-1,
128
+ )
129
+ )
130
+
131
+ quat_by_rijk = torch.stack(
132
+ [
133
+ torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
134
+ torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
135
+ torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
136
+ torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
137
+ ],
138
+ dim=-2,
139
+ )
140
+
141
+ flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
142
+ quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
143
+
144
+ out = quat_candidates[F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :].reshape(
145
+ batch_dim + (4,)
146
+ )
147
+
148
+ out = out[..., [1, 2, 3, 0]]
149
+
150
+ out = standardize_quaternion(out)
151
+
152
+ return out
153
+
154
+
155
+ def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
156
+ """
157
+ Returns torch.sqrt(torch.max(0, x))
158
+ but with a zero subgradient where x is 0.
159
+ """
160
+ ret = torch.zeros_like(x)
161
+ positive_mask = x > 0
162
+ if torch.is_grad_enabled():
163
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
164
+ else:
165
+ ret = torch.where(positive_mask, torch.sqrt(x), ret)
166
+ return ret
167
+
168
+
169
+ def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
170
+ """
171
+ Convert a unit quaternion to a standard form: one in which the real
172
+ part is non negative.
173
+
174
+ Args:
175
+ quaternions: Quaternions with real part last,
176
+ as tensor of shape (..., 4).
177
+
178
+ Returns:
179
+ Standardized quaternions as tensor of shape (..., 4).
180
+ """
181
+ return torch.where(quaternions[..., 3:4] < 0, -quaternions, quaternions)
182
+
183
+
184
+ def cam_quat_xyzw_to_world_quat_wxyz(cam_quat_xyzw, c2w):
185
+ # cam_quat_xyzw: (b, n, 4) in xyzw
186
+ # c2w: (b, n, 4, 4)
187
+ b, n = cam_quat_xyzw.shape[:2]
188
+ # 1. xyzw -> wxyz
189
+ cam_quat_wxyz = torch.cat(
190
+ [
191
+ cam_quat_xyzw[..., 3:4], # w
192
+ cam_quat_xyzw[..., 0:1], # x
193
+ cam_quat_xyzw[..., 1:2], # y
194
+ cam_quat_xyzw[..., 2:3], # z
195
+ ],
196
+ dim=-1,
197
+ )
198
+ # 2. Quaternion to matrix
199
+ cam_quat_wxyz_flat = cam_quat_wxyz.reshape(-1, 4)
200
+ rotmat_cam = quat_to_mat(cam_quat_wxyz_flat).reshape(b, n, 3, 3)
201
+ # 3. Transform to world space
202
+ rotmat_c2w = c2w[..., :3, :3]
203
+ rotmat_world = torch.matmul(rotmat_c2w, rotmat_cam)
204
+ # 4. Matrix to quaternion (wxyz)
205
+ rotmat_world_flat = rotmat_world.reshape(-1, 3, 3)
206
+ world_quat_wxyz_flat = mat_to_quat(rotmat_world_flat)
207
+ world_quat_wxyz = world_quat_wxyz_flat.reshape(b, n, 4)
208
+ return world_quat_wxyz
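A sketch of round-tripping a camera through the 9-D pose encoding above (translation, scalar-last quaternion, fov_h, fov_w). The identity poses and intrinsics are dummies; only the shapes and conventions matter:

import torch
from depth_anything_3.model.utils.transform import (
    extri_intri_to_pose_encoding,
    pose_encoding_to_extri_intri,
)

B, S, H, W = 1, 2, 480, 640
extrinsics = torch.eye(4)[:3].repeat(B, S, 1, 1)                    # (B, S, 3, 4), world-to-camera
intrinsics = torch.tensor(
    [[500.0, 0.0, W / 2], [0.0, 500.0, H / 2], [0.0, 0.0, 1.0]]
).repeat(B, S, 1, 1)                                                # (B, S, 3, 3)

enc = extri_intri_to_pose_encoding(extrinsics, intrinsics, image_size_hw=(H, W))  # (B, S, 9)
extr_back, intr_back = pose_encoding_to_extri_intri(enc, image_size_hw=(H, W))
print((extr_back - extrinsics).abs().max(), (intr_back - intrinsics).abs().max())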
depth_anything_3/registry.py ADDED
@@ -0,0 +1,50 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from collections import OrderedDict
16
+ from pathlib import Path
17
+
18
+
19
+ def get_all_models() -> OrderedDict:
20
+ """
21
+ Scans all YAML files in the configs directory and returns a sorted dictionary where:
22
+ - Keys are model names (YAML filenames without the .yaml extension)
23
+ - Values are absolute paths to the corresponding YAML files
24
+ """
25
+ # Get path to the configs directory within the da3 package
26
+ # Works both in development and after pip installation
27
+ # configs_dir = files("depth_anything_3").joinpath("configs")
28
+ configs_dir = Path(__file__).resolve().parent / "configs"
29
+
30
+ # Ensure path is a Path object for consistent cross-platform handling
31
+ configs_dir = Path(configs_dir)
32
+
33
+ model_entries = []
34
+ # Iterate through all items in the configs directory
35
+ for item in configs_dir.iterdir():
36
+ # Filter for YAML files (excluding directories)
37
+ if item.is_file() and item.suffix == ".yaml":
38
+ # Extract model name (filename without .yaml extension)
39
+ model_name = item.stem
40
+ # Get absolute path (resolve() handles symlinks)
41
+ file_abs_path = str(item.resolve())
42
+ model_entries.append((model_name, file_abs_path))
43
+
44
+ # Sort entries by model name and convert to OrderedDict
45
+ sorted_entries = sorted(model_entries, key=lambda x: x[0])
46
+ return OrderedDict(sorted_entries)
47
+
48
+
49
+ # Global registry for external imports
50
+ MODEL_REGISTRY = get_all_models()
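MODEL_REGISTRY is an OrderedDict mapping config names to YAML paths, so model discovery is a plain dictionary lookup. A sketch, assuming the packaged configs/ directory is present:

from depth_anything_3.registry import MODEL_REGISTRY

for name, cfg_path in MODEL_REGISTRY.items():
    print(f"{name}: {cfg_path}")

# Pick a config by name (None if the registry is empty)
default_name = next(iter(MODEL_REGISTRY), None)
default_cfg = MODEL_REGISTRY.get(default_name) if default_name else None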
depth_anything_3/services/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Services module for Depth Anything 3.
17
+ """
18
+
19
+ from depth_anything_3.services.backend import create_app, start_server
20
+
21
+ __all__ = [
22
+ start_server,
23
+ create_app,
24
+ ]
depth_anything_3/services/backend.py ADDED
@@ -0,0 +1,538 @@
1
+ # flake8: noqa: E501
2
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Model backend service for Depth Anything 3.
18
+ Provides HTTP API for model inference with persistent model loading.
19
+ """
20
+
21
+ import gc
22
+ import os
23
+ import posixpath
24
+ import time
25
+ import uuid
26
+ from concurrent.futures import ThreadPoolExecutor
27
+ from typing import Any, Dict, List, Optional
28
+ from urllib.parse import quote
29
+ import numpy as np
30
+ import torch
31
+ import uvicorn
32
+ from fastapi import FastAPI, HTTPException
33
+ from fastapi.responses import FileResponse, HTMLResponse
34
+ from pydantic import BaseModel
35
+
36
+ from ..api import DepthAnything3
37
+
38
+
39
+ class InferenceRequest(BaseModel):
40
+ """Request model for inference API."""
41
+
42
+ image_paths: List[str]
43
+ export_dir: Optional[str] = None
44
+ export_format: str = "mini_npz-glb"
45
+ extrinsics: Optional[List[List[List[float]]]] = None
46
+ intrinsics: Optional[List[List[List[float]]]] = None
47
+ process_res: int = 504
48
+ process_res_method: str = "upper_bound_resize"
49
+ export_feat_layers: List[int] = []
50
+ align_to_input_ext_scale: bool = True
51
+ # GLB export parameters
52
+ conf_thresh_percentile: float = 40.0
53
+ num_max_points: int = 1_000_000
54
+ show_cameras: bool = True
55
+ # Feat_vis export parameters
56
+ feat_vis_fps: int = 15
57
+
58
+
59
+ class InferenceResponse(BaseModel):
60
+ """Response model for inference API."""
61
+
62
+ success: bool
63
+ message: str
64
+ task_id: Optional[str] = None
65
+ export_dir: Optional[str] = None
66
+ export_format: str = "mini_npz-glb"
67
+ processing_time: Optional[float] = None
68
+
69
+
70
+ class TaskStatus(BaseModel):
71
+ """Task status model."""
72
+
73
+ task_id: str
74
+ status: str # "pending", "running", "completed", "failed"
75
+ message: str
76
+ progress: Optional[float] = None # 0.0 to 1.0
77
+ created_at: float
78
+ started_at: Optional[float] = None
79
+ completed_at: Optional[float] = None
80
+ export_dir: Optional[str] = None
81
+ request: Optional[InferenceRequest] = None # Store the original request
82
+
83
+ # Essential task parameters
84
+ num_images: Optional[int] = None # Number of input images
85
+ export_format: Optional[str] = None # Export format
86
+ process_res_method: Optional[str] = None # Processing resolution method
87
+ video_path: Optional[str] = None # Source video path
88
+
89
+
90
+ class ModelBackend:
91
+ """Model backend service with persistent model loading."""
92
+
93
+ def __init__(self, model_dir: str, device: str = "cuda"):
94
+ self.model_dir = model_dir
95
+ self.device = device
96
+ self.model = None
97
+ self.model_loaded = False
98
+ self.load_time = None
99
+ self.load_start_time = None # Time when model loading started
100
+ self.load_completed_time = None # Time when model loading completed
101
+ self.last_used = None
102
+
103
+ def load_model(self):
104
+ """Load model if not already loaded."""
105
+ if self.model_loaded and self.model is not None:
106
+ self.last_used = time.time()
107
+ return self.model
108
+
109
+ try:
110
+ print(f"Loading model from {self.model_dir}...")
111
+ self.load_start_time = time.time()
112
+ start_time = time.time()
113
+
114
+ self.model = DepthAnything3.from_pretrained(self.model_dir).to(self.device)
115
+ self.model.eval()
116
+
117
+ self.model_loaded = True
118
+ self.load_time = time.time() - start_time
119
+ self.load_completed_time = time.time()
120
+ self.last_used = time.time()
121
+
122
+ print(f"Model loaded successfully in {self.load_time:.2f}s")
123
+ return self.model
124
+
125
+ except Exception as e:
126
+ print(f"Failed to load model: {e}")
127
+ raise e
128
+
129
+ def get_model(self):
130
+ """Get model, loading if necessary."""
131
+ if not self.model_loaded:
132
+ return self.load_model()
133
+ self.last_used = time.time()
134
+ return self.model
135
+
136
+ def get_status(self) -> Dict[str, Any]:
137
+ """Get backend status information."""
138
+ # Calculate uptime from when model loading completed
139
+ uptime = 0
140
+ if self.model_loaded and self.load_completed_time:
141
+ uptime = time.time() - self.load_completed_time
142
+
143
+ return {
144
+ "model_loaded": self.model_loaded,
145
+ "model_dir": self.model_dir,
146
+ "device": self.device,
147
+ "load_time": self.load_time,
148
+ "last_used": self.last_used,
149
+ "uptime": uptime,
150
+ }
151
+
152
+
153
+ # Global backend instance
154
+ _backend: Optional[ModelBackend] = None
155
+ _app: Optional[FastAPI] = None
156
+ _tasks: Dict[str, TaskStatus] = {}
157
+ _executor = ThreadPoolExecutor(max_workers=1) # Restrict to single-task execution
158
+ _running_task_id: Optional[str] = None # Currently running task ID
159
+ _task_queue: List[str] = [] # Pending task queue
160
+
161
+ # Task cleanup configuration
162
+ MAX_TASK_HISTORY = 100 # Maximum number of tasks to keep in memory
163
+ CLEANUP_INTERVAL = 300 # Cleanup interval in seconds (5 minutes)
164
+
165
+
166
+ def _process_next_task():
167
+ """Process the next task in the queue."""
168
+ global _task_queue, _running_task_id
169
+
170
+ if not _task_queue or _running_task_id is not None:
171
+ return
172
+
173
+ # Get next task from queue
174
+ task_id = _task_queue.pop(0)
175
+
176
+ # Get task request from tasks dict (we need to store the request)
177
+ if task_id not in _tasks:
178
+ return
179
+
180
+ # Submit task to executor
181
+ _executor.submit(_run_inference_task, task_id)
182
+
183
+
184
+ def _get_gpu_memory_info():
185
+ """Get current GPU memory usage information."""
186
+ if not torch.cuda.is_available():
187
+ return None
188
+
189
+ try:
190
+ device = torch.cuda.current_device()
191
+ total_memory = torch.cuda.get_device_properties(device).total_memory
192
+ allocated_memory = torch.cuda.memory_allocated(device)
193
+ reserved_memory = torch.cuda.memory_reserved(device)
194
+ free_memory = total_memory - reserved_memory
195
+
196
+ return {
197
+ "total_gb": total_memory / 1024**3,
198
+ "allocated_gb": allocated_memory / 1024**3,
199
+ "reserved_gb": reserved_memory / 1024**3,
200
+ "free_gb": free_memory / 1024**3,
201
+ "utilization": (reserved_memory / total_memory) * 100,
202
+ }
203
+ except Exception as e:
204
+ print(f"Warning: Failed to get GPU memory info: {e}")
205
+ return None
206
+
207
+
208
+ def _cleanup_cuda_memory():
209
+ """Helper function to perform comprehensive CUDA memory cleanup."""
210
+ try:
211
+ if torch.cuda.is_available():
212
+ # Log memory before cleanup
213
+ mem_before = _get_gpu_memory_info()
214
+
215
+ torch.cuda.synchronize()
216
+ torch.cuda.empty_cache()
217
+ torch.cuda.ipc_collect()
218
+ gc.collect()
219
+
220
+ # Log memory after cleanup
221
+ mem_after = _get_gpu_memory_info()
222
+
223
+ if mem_before and mem_after:
224
+ freed = mem_before["reserved_gb"] - mem_after["reserved_gb"]
225
+ print(
226
+ f"CUDA cleanup: freed {freed:.2f}GB, "
227
+ f"available: {mem_after['free_gb']:.2f}GB/{mem_after['total_gb']:.2f}GB"
228
+ )
229
+ else:
230
+ print("CUDA memory cleanup completed")
231
+ except Exception as e:
232
+ print(f"Warning: CUDA cleanup failed: {e}")
233
+
234
+
235
+ def _check_memory_availability(required_gb: float = 2.0) -> tuple[bool, str]:
236
+ """
237
+ Check if there's enough GPU memory available.
238
+
239
+ Args:
240
+ required_gb: Minimum required memory in GB
241
+
242
+ Returns:
243
+ Tuple of (is_available, message)
244
+ """
245
+ if not torch.cuda.is_available():
246
+ return False, "CUDA is not available"
247
+
248
+ try:
249
+ mem_info = _get_gpu_memory_info()
250
+ if mem_info is None:
251
+ return True, "Cannot check memory, proceeding anyway"
252
+
253
+ if mem_info["free_gb"] < required_gb:
254
+ return False, (
255
+ f"Insufficient GPU memory: {mem_info['free_gb']:.2f}GB available, "
256
+ f"{required_gb:.2f}GB required. "
257
+ f"Total: {mem_info['total_gb']:.2f}GB, "
258
+ f"Used: {mem_info['reserved_gb']:.2f}GB ({mem_info['utilization']:.1f}%)"
259
+ )
260
+
261
+ return True, (
262
+ f"Memory check passed: {mem_info['free_gb']:.2f}GB available, "
263
+ f"{required_gb:.2f}GB required"
264
+ )
265
+ except Exception as e:
266
+ return True, f"Memory check failed: {e}, proceeding anyway"
267
+
268
+
269
+ def _estimate_memory_requirement(num_images: int, process_res: int) -> float:
270
+ """
271
+ Estimate GPU memory requirement in GB.
272
+
273
+ Args:
274
+ num_images: Number of images to process
275
+ process_res: Processing resolution
276
+
277
+ Returns:
278
+ Estimated memory requirement in GB
279
+ """
280
+ # Rough estimation: base model (2GB) + per-image overhead
281
+ base_memory = 2.0
282
+ per_image_memory = (process_res / 504) ** 2 * 0.5 # Scale with resolution
283
+ total_memory = base_memory + (
284
+ num_images * per_image_memory * 0.1
285
+ ) # Batch processing reduces per-image cost
286
+ return total_memory
287
+
288
+
289
+ def _run_inference_task(task_id: str):
290
+ """Run inference task in background thread with OOM protection."""
291
+ global _tasks, _backend, _running_task_id, _task_queue
292
+
293
+ model = None
294
+ inference_started = False
295
+ start_time = time.time()
296
+
297
+ try:
298
+ # Get task request
299
+ if task_id not in _tasks or _tasks[task_id].request is None:
300
+ print(f"[{task_id}] Task not found or request missing")
301
+ return
302
+
303
+ request = _tasks[task_id].request
304
+ num_images = len(request.image_paths)
305
+
306
+ # Set current running task
307
+ _running_task_id = task_id
308
+
309
+ # Update task status to running
310
+ _tasks[task_id].status = "running"
311
+ _tasks[task_id].started_at = start_time
312
+ _tasks[task_id].message = f"[{task_id}] Starting inference on {num_images} frames..."
313
+ print(f"[{task_id}] Starting inference on {num_images} frames")
314
+
315
+ # Pre-inference cleanup to ensure maximum available memory
316
+ print(f"[{task_id}] Pre-inference cleanup...")
317
+ _cleanup_cuda_memory()
318
+
319
+ # Check memory availability
320
+ estimated_memory = _estimate_memory_requirement(num_images, request.process_res)
321
+ mem_available, mem_msg = _check_memory_availability(estimated_memory)
322
+ print(f"[{task_id}] {mem_msg}")
323
+
324
+ if not mem_available:
325
+ # Try aggressive cleanup
326
+ print(f"[{task_id}] Insufficient memory, attempting aggressive cleanup...")
327
+ _cleanup_cuda_memory()
328
+ time.sleep(0.5) # Give system time to reclaim memory
329
+
330
+ # Check again
331
+ mem_available, mem_msg = _check_memory_availability(estimated_memory)
332
+ if not mem_available:
333
+ raise RuntimeError(
334
+ f"Insufficient GPU memory after cleanup. {mem_msg}\n"
335
+ f"Suggestions:\n"
336
+ f" 1. Reduce process_res (current: {request.process_res})\n"
337
+ f" 2. Process fewer images at once (current: {num_images})\n"
338
+ f" 3. Clear other GPU processes"
339
+ )
340
+
341
+ # Get model (with error handling)
342
+ print(f"[{task_id}] Loading model...")
343
+ _tasks[task_id].message = f"[{task_id}] Loading model..."
344
+ _tasks[task_id].progress = 0.1
345
+
346
+ try:
347
+ model = _backend.get_model()
348
+ except RuntimeError as e:
349
+ if "out of memory" in str(e).lower():
350
+ _cleanup_cuda_memory()
351
+ raise RuntimeError(
352
+ f"OOM during model loading: {str(e)}\n"
353
+ f"Try reducing the batch size or resolution."
354
+ )
355
+ raise
356
+
357
+ print(f"[{task_id}] Model loaded successfully")
358
+ _tasks[task_id].progress = 0.2
359
+
360
+ # Prepare inference parameters
361
+ inference_kwargs = {
362
+ "image": request.image_paths,
363
+ "export_format": request.export_format,
364
+ "process_res": request.process_res,
365
+ "process_res_method": request.process_res_method,
366
+ "export_feat_layers": request.export_feat_layers,
367
+ "align_to_input_ext_scale": request.align_to_input_ext_scale,
368
+ "conf_thresh_percentile": request.conf_thresh_percentile,
369
+ "num_max_points": request.num_max_points,
370
+ "show_cameras": request.show_cameras,
371
+ "feat_vis_fps": request.feat_vis_fps,
372
+ }
373
+
374
+ if request.export_dir:
375
+ inference_kwargs["export_dir"] = request.export_dir
376
+
377
+ if request.extrinsics:
378
+ inference_kwargs["extrinsics"] = np.array(request.extrinsics, dtype=np.float32)
379
+
380
+ if request.intrinsics:
381
+ inference_kwargs["intrinsics"] = np.array(request.intrinsics, dtype=np.float32)
382
+
383
+ # Run inference with timing
384
+ inference_start_time = time.time()
385
+ print(f"[{task_id}] Running model inference...")
386
+ _tasks[task_id].message = f"[{task_id}] Running model inference on {num_images} images..."
387
+ _tasks[task_id].progress = 0.3
388
+
389
+ inference_started = True
390
+
391
+ try:
392
+ model.inference(**inference_kwargs)
393
+ inference_time = time.time() - inference_start_time
394
+ avg_time_per_image = inference_time / num_images if num_images > 0 else 0
395
+
396
+ print(
397
+ f"[{task_id}] Inference completed in {inference_time:.2f}s "
398
+ f"({avg_time_per_image:.2f}s per image)"
399
+ )
400
+
401
+ except RuntimeError as e:
402
+ if "out of memory" in str(e).lower():
403
+ _cleanup_cuda_memory()
404
+ raise RuntimeError(
405
+ f"OOM during inference: {str(e)}\n"
406
+ f"Settings: {num_images} images, resolution={request.process_res}\n"
407
+ f"Suggestions:\n"
408
+ f" 1. Reduce process_res to {int(request.process_res * 0.75)}\n"
409
+ f" 2. Process images in smaller batches\n"
410
+ f" 3. Use process_res_method='resize' instead of 'upper_bound_resize'"
411
+ )
412
+ raise
413
+
414
+ _tasks[task_id].progress = 0.9
415
+
416
+ # Post-inference cleanup
417
+ print(f"[{task_id}] Post-inference cleanup...")
418
+ _cleanup_cuda_memory()
419
+
420
+ # Calculate total processing time
421
+ total_time = time.time() - start_time
422
+
423
+ # Update task status to completed
424
+ _tasks[task_id].status = "completed"
425
+ _tasks[task_id].completed_at = time.time()
426
+ _tasks[task_id].message = (
427
+ f"[{task_id}] Completed in {total_time:.2f}s " f"({avg_time_per_image:.2f}s per image)"
428
+ )
429
+ _tasks[task_id].progress = 1.0
430
+ _tasks[task_id].export_dir = request.export_dir
431
+
432
+ # Clear running state
433
+ _running_task_id = None
434
+
435
+ # Process next task in queue
436
+ _process_next_task()
437
+
438
+ print(f"[{task_id}] Task completed successfully")
439
+ print(
440
+ f"[{task_id}] Total time: {total_time:.2f}s, "
441
+ f"Inference time: {inference_time:.2f}s, "
442
+ f"Avg per image: {avg_time_per_image:.2f}s"
443
+ )
444
+
445
+ except Exception as e:
446
+ # Update task status to failed
447
+ error_msg = str(e)
448
+ total_time = time.time() - start_time
449
+
450
+ print(f"[{task_id}] Task failed after {total_time:.2f}s: {error_msg}")
451
+
452
+ # Always attempt cleanup on failure
453
+ _cleanup_cuda_memory()
454
+
455
+ _tasks[task_id].status = "failed"
456
+ _tasks[task_id].completed_at = time.time()
457
+ _tasks[task_id].message = f"[{task_id}] Failed after {total_time:.2f}s: {error_msg}"
458
+
459
+ # Clear running state
460
+ _running_task_id = None
461
+
462
+ # Process next task in queue
463
+ _process_next_task()
464
+
465
+ finally:
466
+ # Final cleanup in finally block to ensure it always runs
467
+ # This is critical for releasing resources even if unexpected errors occur
468
+ try:
469
+ if inference_started:
470
+ print(f"[{task_id}] Final cleanup in finally block...")
471
+ _cleanup_cuda_memory()
472
+ except Exception as e:
473
+ print(f"[{task_id}] Warning: Finally block cleanup failed: {e}")
474
+
475
+ # Schedule cleanup after task completion
476
+ _schedule_task_cleanup()
477
+
478
+
479
+ def _cleanup_old_tasks():
480
+ """Clean up old completed/failed tasks to prevent memory buildup."""
481
+ global _tasks
482
+
483
+ current_time = time.time()
484
+ tasks_to_remove = []
485
+
486
+ # Find tasks to remove - more aggressive cleanup
487
+ for task_id, task in _tasks.items():
488
+ # Remove completed/failed tasks older than 10 minutes (instead of 1 hour)
489
+ if (
490
+ task.status in ["completed", "failed"]
491
+ and task.completed_at
492
+ and current_time - task.completed_at > 600
493
+ ): # 10 minutes
494
+ tasks_to_remove.append(task_id)
495
+
496
+ # Remove old tasks
497
+ for task_id in tasks_to_remove:
498
+ del _tasks[task_id]
499
+ print(f"[CLEANUP] Removed old task: {task_id}")
500
+
501
+ # If still too many tasks, remove oldest completed/failed tasks
502
+ if len(_tasks) > MAX_TASK_HISTORY:
503
+ completed_tasks = [
504
+ (task_id, task)
505
+ for task_id, task in _tasks.items()
506
+ if task.status in ["completed", "failed"]
507
+ ]
508
+ completed_tasks.sort(key=lambda x: x[1].completed_at or 0)
509
+
510
+ excess_count = len(_tasks) - MAX_TASK_HISTORY
511
+ for i in range(min(excess_count, len(completed_tasks))):
512
+ task_id = completed_tasks[i][0]
513
+ del _tasks[task_id]
514
+ print(f"[CLEANUP] Removed excess task: {task_id}")
515
+
516
+ # Count active tasks (only pending and running)
517
+ active_count = sum(1 for task in _tasks.values() if task.status in ["pending", "running"])
518
+ print(
519
+ "[CLEANUP] Task cleanup completed. "
520
+ f"Total tasks: {len(_tasks)}, Active tasks: {active_count}"
521
+ )
522
+
523
+
524
+ def _schedule_task_cleanup():
525
+ """Schedule task cleanup in background."""
526
+
527
+ def cleanup_worker():
528
+ try:
529
+ time.sleep(2) # Small delay to ensure task status is updated
530
+ _cleanup_old_tasks()
531
+ except Exception as e:
532
+ print(f"[CLEANUP] Cleanup worker failed: {e}")
533
+
534
+ # Run cleanup in background thread
535
+ _executor.submit(cleanup_worker)
536
+
537
+
538
+ #
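A hedged sketch of the pre-flight memory check used by the task runner above: build an InferenceRequest, estimate the GPU memory the job needs, and ask whether it fits before queuing. The image paths are placeholders, and the underscore-prefixed helpers are module-private, so this is illustrative rather than a public API:

from depth_anything_3.services.backend import (
    InferenceRequest,
    _check_memory_availability,
    _estimate_memory_requirement,
)

req = InferenceRequest(
    image_paths=["/tmp/frame_000.png", "/tmp/frame_001.png"],   # placeholder inputs
    export_dir="/tmp/da3_out",
    export_format="mini_npz-glb",
    process_res=504,
)

needed_gb = _estimate_memory_requirement(len(req.image_paths), req.process_res)
ok, msg = _check_memory_availability(needed_gb)
print(f"estimated {needed_gb:.2f} GB -> {'ok' if ok else 'blocked'}: {msg}")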
depth_anything_3/services/gallery.py ADDED
@@ -0,0 +1,562 @@
1
+ #!/usr/bin/env python3
2
+ # flake8: noqa: E501
3
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """
18
+ Depth Anything 3 Gallery Server (two-level, single-file)
19
+ Now supports paginated depth preview (4 per page).
20
+ """
21
+
22
+ import argparse
23
+ import json
24
+ import mimetypes
25
+ import os
26
+ import posixpath
27
+ import sys
28
+ from functools import partial
29
+ from http import HTTPStatus
30
+ from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
31
+ from urllib.parse import quote, unquote
32
+
33
+ # ------------------------------ Embedded HTML ------------------------------ #
34
+
35
+ HTML_PAGE = r"""<!doctype html>
36
+ <html lang="en">
37
+ <head>
38
+ <meta charset="utf-8" />
39
+ <title>Depth Anything 3 Gallery</title>
40
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
41
+ <link rel="icon" href="https://i.postimg.cc/rFSzGJ7J/light-icon.jpg" media="(prefers-color-scheme: light)">
42
+ <link rel="icon" href="https://i.postimg.cc/P5gZfJsf/dark-icon.jpg" media="(prefers-color-scheme: dark)">
43
+ <script type="module" src="https://unpkg.com/@google/model-viewer/dist/model-viewer.min.js"></script>
44
+ <style>
45
+ :root {
46
+ --gap:16px; --card-radius:16px; --shadow:0 8px 24px rgba(0,0,0,.12);
47
+ --maxW:1036px; --maxH:518px;
48
+ --tech-blue: #00d4ff;
49
+ --tech-cyan: #00ffcc;
50
+ --tech-purple: #7877c6;
51
+ }
52
+
53
+ *{ box-sizing:border-box }
54
+
55
+ /* Dark mode tech theme */
56
+ @media (prefers-color-scheme: dark) {
57
+ body{
58
+ margin:0; font:16px/1.5 system-ui,-apple-system,Segoe UI,Roboto,sans-serif;
59
+ background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 50%, #16213e 100%);
60
+ color:#e8eaed;
61
+ position: relative;
62
+ overflow-x: hidden;
63
+ }
64
+
65
+ body::before {
66
+ content: '';
67
+ position: fixed;
68
+ top: 0;
69
+ left: 0;
70
+ right: 0;
71
+ bottom: 0;
72
+ background:
73
+ radial-gradient(circle at 20% 80%, rgba(120, 119, 198, 0.3) 0%, transparent 50%),
74
+ radial-gradient(circle at 80% 20%, rgba(255, 119, 198, 0.3) 0%, transparent 50%),
75
+ radial-gradient(circle at 40% 40%, rgba(120, 219, 255, 0.2) 0%, transparent 50%);
76
+ animation: techPulse 8s ease-in-out infinite;
77
+ z-index: -1;
78
+ }
79
+ }
80
+
81
+ /* Light mode tech theme */
82
+ @media (prefers-color-scheme: light) {
83
+ body{
84
+ margin:0; font:16px/1.5 system-ui,-apple-system,Segoe UI,Roboto,sans-serif;
85
+ background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 50%, #cbd5e1 100%);
86
+ color:#1e293b;
87
+ position: relative;
88
+ overflow-x: hidden;
89
+ }
90
+
91
+ body::before {
92
+ content: '';
93
+ position: fixed;
94
+ top: 0;
95
+ left: 0;
96
+ right: 0;
97
+ bottom: 0;
98
+ background:
99
+ radial-gradient(circle at 20% 80%, rgba(0, 212, 255, 0.1) 0%, transparent 50%),
100
+ radial-gradient(circle at 80% 20%, rgba(0, 102, 255, 0.1) 0%, transparent 50%),
101
+ radial-gradient(circle at 40% 40%, rgba(0, 255, 204, 0.08) 0%, transparent 50%);
102
+ animation: techPulse 8s ease-in-out infinite;
103
+ z-index: -1;
104
+ }
105
+ }
106
+
107
+ @keyframes techPulse {
108
+ 0%, 100% { opacity: 0.5; }
109
+ 50% { opacity: 0.8; }
110
+ }
111
+
112
+ @keyframes techGradient {
113
+ 0% { background-position: 0% 50%; }
114
+ 50% { background-position: 100% 50%; }
115
+ 100% { background-position: 0% 50%; }
116
+ }
117
+
118
+ /* Dark mode header */
119
+ @media (prefers-color-scheme: dark) {
120
+ header{
121
+ padding:20px 24px; position:sticky; top:0;
122
+ background:linear-gradient(180deg,rgba(10,10,10,0.9) 60%,rgba(10,10,10,0));
123
+ z-index:2; border-bottom:1px solid rgba(0, 212, 255, 0.2);
124
+ backdrop-filter: blur(10px);
125
+ }
126
+
127
+ h1{
128
+ margin:0; font-size:22px;
129
+ background: linear-gradient(45deg, var(--tech-blue), var(--tech-cyan), var(--tech-purple));
130
+ background-size: 400% 400%;
131
+ -webkit-background-clip: text;
132
+ background-clip: text;
133
+ color: transparent;
134
+ animation: techGradient 3s ease infinite;
135
+ text-shadow: 0 0 30px rgba(0, 212, 255, 0.5);
136
+ }
137
+
138
+ .muted{ opacity:.7; font-size:13px; color: #a0a0a0; }
139
+
140
+ #backBtn{
141
+ display:none; padding:6px 10px; border-radius:10px;
142
+ border:1px solid rgba(0, 212, 255, 0.3);
143
+ background:rgba(0, 0, 0, 0.3);
144
+ color:#e8eaed; cursor:pointer;
145
+ transition: all 0.3s ease;
146
+ }
147
+
148
+ #backBtn:hover {
149
+ border-color: var(--tech-blue);
150
+ box-shadow: 0 0 10px rgba(0, 212, 255, 0.3);
151
+ }
152
+
153
+ #search{
154
+ flex:1 1 260px; min-width:240px; max-width:520px;
155
+ padding:10px 14px; border-radius:12px;
156
+ border:1px solid rgba(0, 212, 255, 0.3);
157
+ background:rgba(0, 0, 0, 0.3);
158
+ color:#e8eaed; outline:none;
159
+ transition: all 0.3s ease;
160
+ }
161
+
162
+ #search:focus {
163
+ border-color: var(--tech-blue);
164
+ box-shadow: 0 0 10px rgba(0, 212, 255, 0.3);
165
+ }
166
+ }
167
+
168
+ /* Light mode header */
169
+ @media (prefers-color-scheme: light) {
170
+ header{
171
+ padding:20px 24px; position:sticky; top:0;
172
+ background:linear-gradient(180deg,rgba(248,250,252,0.9) 60%,rgba(248,250,252,0));
173
+ z-index:2; border-bottom:1px solid rgba(0, 212, 255, 0.3);
174
+ backdrop-filter: blur(10px);
175
+ }
176
+
177
+ h1{
178
+ margin:0; font-size:22px;
179
+ background: linear-gradient(45deg, #0066ff, #00d4ff, #00ffcc);
180
+ background-size: 400% 400%;
181
+ -webkit-background-clip: text;
182
+ background-clip: text;
183
+ color: transparent;
184
+ animation: techGradient 3s ease infinite;
185
+ text-shadow: 0 0 20px rgba(0, 102, 255, 0.3);
186
+ }
187
+
188
+ .muted{ opacity:.7; font-size:13px; color: #64748b; }
189
+
190
+ #backBtn{
191
+ display:none; padding:6px 10px; border-radius:10px;
192
+ border:1px solid rgba(0, 212, 255, 0.4);
193
+ background:rgba(255, 255, 255, 0.8);
194
+ color:#1e293b; cursor:pointer;
195
+ transition: all 0.3s ease;
196
+ }
197
+
198
+ #backBtn:hover {
199
+ border-color: #0066ff;
200
+ box-shadow: 0 0 10px rgba(0, 102, 255, 0.3);
201
+ }
202
+
203
+ #search{
204
+ flex:1 1 260px; min-width:240px; max-width:520px;
205
+ padding:10px 14px; border-radius:12px;
206
+ border:1px solid rgba(0, 212, 255, 0.4);
207
+ background:rgba(255, 255, 255, 0.8);
208
+ color:#1e293b; outline:none;
209
+ transition: all 0.3s ease;
210
+ }
211
+
212
+ #search:focus {
213
+ border-color: #0066ff;
214
+ box-shadow: 0 0 10px rgba(0, 102, 255, 0.3);
215
+ }
216
+ }
217
+
218
+ .row{ display:flex; gap:12px; align-items:center; flex-wrap:wrap; justify-content:center; }
219
+
220
+ main{ padding:16px 24px 24px; display:grid; place-items:center; }
221
+
222
+ .group-wrap{ width:min(900px,100%); }
223
+ .group-list{ list-style:none; margin:0; padding:0; display:grid; gap:10px; }
224
+
225
+ /* Dark mode cards */
226
+ @media (prefers-color-scheme: dark) {
227
+ .group-item{
228
+ display:flex; align-items:center; gap:12px; padding:12px 14px;
229
+ background:rgba(0, 0, 0, 0.3); border:1px solid rgba(0, 212, 255, 0.2); border-radius:14px; cursor:pointer;
230
+ transition: all 0.3s ease;
231
+ backdrop-filter: blur(10px);
232
+ }
233
+ .group-item:hover{
234
+ transform: translateY(-1px);
235
+ border-color:var(--tech-blue);
236
+ box-shadow: 0 4px 15px rgba(0, 212, 255, 0.2);
237
+ }
238
+
239
+ .card{
240
+ background:rgba(0, 0, 0, 0.3); border:1px solid rgba(0, 212, 255, 0.2); border-radius:var(--card-radius);
241
+ overflow:hidden; box-shadow:var(--shadow);
242
+ transition:all 0.3s ease; cursor:pointer; display:flex; flex-direction:column; max-width:var(--maxW);
243
+ backdrop-filter: blur(10px);
244
+ }
245
+ .card:hover{
246
+ transform:translateY(-2px);
247
+ border-color:var(--tech-blue);
248
+ box-shadow: 0 8px 25px rgba(0, 212, 255, 0.2);
249
+ }
250
+ .thumb-box{
251
+ position:relative; width:100%; aspect-ratio:2/1;
252
+ background:linear-gradient(135deg, #0e121b 0%, #1a1a2e 100%);
253
+ display:grid; place-items:center; overflow:hidden;
254
+ border-bottom: 1px solid rgba(0, 212, 255, 0.1);
255
+ }
256
+ .open{
257
+ font-size:12px; opacity:.7; padding:6px 8px;
258
+ border:1px solid rgba(0, 212, 255, 0.3);
259
+ border-radius:10px;
260
+ background:rgba(0, 212, 255, 0.1);
261
+ transition: all 0.3s ease;
262
+ }
263
+ .open:hover {
264
+ background:rgba(0, 212, 255, 0.2);
265
+ border-color: var(--tech-blue);
266
+ }
267
+ }
268
+
269
+ /* Light mode cards */
270
+ @media (prefers-color-scheme: light) {
271
+ .group-item{
272
+ display:flex; align-items:center; gap:12px; padding:12px 14px;
273
+ background:rgba(255, 255, 255, 0.8); border:1px solid rgba(0, 212, 255, 0.3); border-radius:14px; cursor:pointer;
274
+ transition: all 0.3s ease;
275
+ backdrop-filter: blur(10px);
276
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
277
+ }
278
+ .group-item:hover{
279
+ transform: translateY(-1px);
280
+ border-color:#0066ff;
281
+ box-shadow: 0 4px 15px rgba(0, 102, 255, 0.2);
282
+ }
283
+
284
+ .card{
285
+ background:rgba(255, 255, 255, 0.8); border:1px solid rgba(0, 212, 255, 0.3); border-radius:var(--card-radius);
286
+ overflow:hidden; box-shadow:0 4px 6px rgba(0, 0, 0, 0.1);
287
+ transition:all 0.3s ease; cursor:pointer; display:flex; flex-direction:column; max-width:var(--maxW);
288
+ backdrop-filter: blur(10px);
289
+ }
290
+ .card:hover{
291
+ transform:translateY(-2px);
292
+ border-color:#0066ff;
293
+ box-shadow: 0 8px 25px rgba(0, 102, 255, 0.2);
294
+ }
295
+ .thumb-box{
296
+ position:relative; width:100%; aspect-ratio:2/1;
297
+ background:linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%);
298
+ display:grid; place-items:center; overflow:hidden;
299
+ border-bottom: 1px solid rgba(0, 212, 255, 0.2);
300
+ }
301
+ .open{
302
+ font-size:12px; opacity:.7; padding:6px 8px;
303
+ border:1px solid rgba(0, 212, 255, 0.4);
304
+ border-radius:10px;
305
+ background:rgba(0, 212, 255, 0.1);
306
+ transition: all 0.3s ease;
307
+ }
308
+ .open:hover {
309
+ background:rgba(0, 212, 255, 0.2);
310
+ border-color: #0066ff;
311
+ }
312
+ }
313
+
314
+ .gname{ font-weight:600; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; width:100%; }
315
+ .grid{
316
+ width:min(1200px,100%);
317
+ display:grid;
318
+ grid-template-columns:repeat(auto-fill,minmax(260px,1fr));
319
+ gap:var(--gap);
320
+ align-items:start;
321
+ justify-items:stretch;
322
+ margin: 0 auto;
323
+ padding: 0 20px;
324
+ }
325
+ .thumb{ max-width:100%; max-height:100%; object-fit:contain; display:block; }
326
+ .meta{ padding:12px 14px; display:flex; justify-content:space-between; align-items:center; gap:8px; }
327
+ .title{ font-weight:600; font-size:14px; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
328
+ .empty{ opacity:.6; padding:40px 0; text-align:center; }
329
+ .crumb{ font-size:13px; opacity:.8; }
330
+
331
+ .overlay{ position:fixed; inset:0; background:rgba(0,0,0,.6); display:none; place-items:center; padding:20px; z-index:10; }
332
+ .overlay.show{ display:grid; }
333
+
334
+ /* Dark mode viewer */
335
+ @media (prefers-color-scheme: dark) {
336
+ .viewer{
337
+ inline-size:min(92vw,var(--maxW));
338
+ block-size:min(82vh,var(--maxH));
339
+ background:#0e121b; border:1px solid rgba(0, 212, 255, 0.3); border-radius:18px; overflow:hidden; position:relative; box-shadow:0 12px 36px rgba(0,0,0,.35);
340
+ display:grid;
341
+ }
342
+ .chip{ background:rgba(0,0,0,.45); border:1px solid rgba(0, 212, 255, 0.3); color:#e8eaed; padding:6px 10px; border-radius:12px; font-size:12px; max-width:60%; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
343
+ .btn{ margin-left:auto; background:rgba(0, 0, 0, 0.3); color:#e8eaed; border:1px solid rgba(0, 212, 255, 0.3); border-radius:10px; padding:6px 10px; cursor:pointer; transition: all 0.3s ease; }
344
+ .btn:hover { border-color: var(--tech-blue); box-shadow: 0 0 10px rgba(0, 212, 255, 0.3); }
345
+ .mv-box{ width:100%; aspect-ratio:1036/518; background:#0b0d12; border:1px solid rgba(0, 212, 255, 0.2); border-radius:12px; overflow:hidden; }
346
+ .mv-box model-viewer{ width:100%; height:100%; background:#0b0d12; }
347
+ .res-cell{ position:relative; width:100%; aspect-ratio:2/1; background:#0e121b; border:1px solid rgba(0, 212, 255, 0.2); border-radius:12px; overflow:hidden; display:grid; place-items:center; }
348
+ .res-empty{ position:absolute; inset:0; display:grid; place-items:center; opacity:.55; font-size:12px; color:#9aa0a6; }
349
+ .download-icon{ background:rgba(0, 0, 0, 0.6); border:1px solid rgba(0, 212, 255, 0.3); color:#e8eaed; box-shadow:0 4px 12px rgba(0,0,0,0.3); }
350
+ .download-icon:hover{ background:rgba(0, 212, 255, 0.2); border-color:var(--tech-blue); box-shadow:0 0 20px rgba(0, 212, 255, 0.4); transform:scale(1.05); }
351
+ }
352
+
353
+ /* Light mode viewer */
354
+ @media (prefers-color-scheme: light) {
355
+ .viewer{
356
+ inline-size:min(92vw,var(--maxW));
357
+ block-size:min(82vh,var(--maxH));
358
+ background:#f8fafc; border:1px solid rgba(0, 212, 255, 0.4); border-radius:18px; overflow:hidden; position:relative; box-shadow:0 12px 36px rgba(0,0,0,.15);
359
+ display:grid;
360
+ }
361
+ .chip{ background:rgba(255,255,255,0.8); border:1px solid rgba(0, 212, 255, 0.4); color:#1e293b; padding:6px 10px; border-radius:12px; font-size:12px; max-width:60%; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; }
362
+ .btn{ margin-left:auto; background:rgba(255, 255, 255, 0.8); color:#1e293b; border:1px solid rgba(0, 212, 255, 0.4); border-radius:10px; padding:6px 10px; cursor:pointer; transition: all 0.3s ease; }
363
+ .btn:hover { border-color: #0066ff; box-shadow: 0 0 10px rgba(0, 102, 255, 0.3); }
364
+ .mv-box{ width:100%; aspect-ratio:1036/518; background:#f8fafc; border:1px solid rgba(0, 212, 255, 0.3); border-radius:12px; overflow:hidden; }
365
+ .mv-box model-viewer{ width:100%; height:100%; background:#f8fafc; }
366
+ .res-cell{ position:relative; width:100%; aspect-ratio:2/1; background:#f8fafc; border:1px solid rgba(0, 212, 255, 0.3); border-radius:12px; overflow:hidden; display:grid; place-items:center; }
367
+ .res-empty{ position:absolute; inset:0; display:grid; place-items:center; opacity:.55; font-size:12px; color:#64748b; }
368
+ .download-icon{ background:rgba(255, 255, 255, 0.9); border:1px solid rgba(0, 212, 255, 0.4); color:#1e293b; box-shadow:0 4px 12px rgba(0,0,0,0.15); }
369
+ .download-icon:hover{ background:rgba(0, 212, 255, 0.2); border-color:#0066ff; box-shadow:0 0 20px rgba(0, 102, 255, 0.4); transform:scale(1.05); }
370
+ }
371
+
372
+ .viewer-header{ position:absolute; top:8px; left:8px; right:8px; display:flex; gap:8px; align-items:center; z-index:2; }
373
+ .viewer-body{ height:100%; display:grid; grid-template-rows:auto auto; gap:12px; padding:36px 8px 8px 8px; overflow:auto; }
374
+ .res-grid{ display:grid; grid-template-columns:1fr 1fr; gap:8px; }
375
+ .res-img{ max-width:100%; max-height:100%; object-fit:contain; display:block; }
376
+ .download-icon{ position:absolute; bottom:16px; right:16px; width:44px; height:44px; border-radius:50%; display:grid; place-items:center; font-size:20px; cursor:pointer; z-index:3; transition:all 0.3s ease; }
377
+
378
+ /* Pagination controls */
379
+ .pager {
380
+ grid-column: 1 / -1;
381
+ justify-content: center;
382
+ align-items: center;
383
+ display: flex;
384
+ gap: 16px;
385
+ margin-top: 8px;
386
+ font-size: 13px;
387
+ text-align: center;
388
+ }
389
+
390
+ /* Dark mode pagination */
391
+ @media (prefers-color-scheme: dark) {
392
+ .pager {
393
+ color: #ccc;
394
+ }
395
+ .pager button {
396
+ padding: 4px 10px;
397
+ border-radius: 8px;
398
+ border: 1px solid rgba(0, 212, 255, 0.3);
399
+ background: rgba(0, 0, 0, 0.3);
400
+ color: #e8eaed;
401
+ cursor: pointer;
402
+ transition: all 0.3s ease;
403
+ }
404
+ .pager button:hover:not(:disabled) {
405
+ border-color: var(--tech-blue);
406
+ box-shadow: 0 0 8px rgba(0, 212, 255, 0.2);
407
+ }
408
+ .pager button:disabled {
409
+ opacity: 0.4;
410
+ cursor: not-allowed;
411
+ }
412
+ }
413
+
414
+ /* Light mode pagination */
415
+ @media (prefers-color-scheme: light) {
416
+ .pager {
417
+ color: #64748b;
418
+ }
419
+ .pager button {
420
+ padding: 4px 10px;
421
+ border-radius: 8px;
422
+ border: 1px solid rgba(0, 212, 255, 0.4);
423
+ background: rgba(255, 255, 255, 0.8);
424
+ color: #1e293b;
425
+ cursor: pointer;
426
+ transition: all 0.3s ease;
427
+ }
428
+ .pager button:hover:not(:disabled) {
429
+ border-color: #0066ff;
430
+ box-shadow: 0 0 8px rgba(0, 102, 255, 0.2);
431
+ }
432
+ .pager button:disabled {
433
+ opacity: 0.4;
434
+ cursor: not-allowed;
435
+ }
436
+ }
437
+
438
+ /* Intro card styles */
439
+ @media (prefers-color-scheme: dark) {
440
+ .intro-card {
441
+ background: linear-gradient(135deg, rgba(0, 212, 255, 0.1) 0%, rgba(0, 102, 255, 0.1) 100%);
442
+ border: 1px solid rgba(0, 212, 255, 0.2);
443
+ backdrop-filter: blur(10px);
444
+ }
445
+ .intro-title {
446
+ background: linear-gradient(45deg, var(--tech-blue), var(--tech-cyan), var(--tech-purple));
447
+ background-size: 400% 400%;
448
+ -webkit-background-clip: text;
449
+ background-clip: text;
450
+ color: transparent;
451
+ animation: techGradient 3s ease infinite;
452
+ text-shadow: 0 0 20px rgba(0, 212, 255, 0.3);
453
+ }
454
+ .intro-description {
455
+ color: #e0e0e0;
456
+ }
457
+ }
458
+
459
+ @media (prefers-color-scheme: light) {
460
+ .intro-card {
461
+ background: linear-gradient(135deg, rgba(0, 212, 255, 0.05) 0%, rgba(0, 102, 255, 0.05) 100%);
462
+ border: 1px solid rgba(0, 212, 255, 0.3);
463
+ backdrop-filter: blur(10px);
464
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
465
+ }
466
+ .intro-title {
467
+ background: linear-gradient(45deg, #0066ff, #00d4ff, #00ffcc);
468
+ background-size: 400% 400%;
469
+ -webkit-background-clip: text;
470
+ background-clip: text;
471
+ color: transparent;
472
+ animation: techGradient 3s ease infinite;
473
+ text-shadow: 0 0 15px rgba(0, 102, 255, 0.2);
474
+ }
475
+ .intro-description {
476
+ color: #334155;
477
+ }
478
+ }
479
+
480
+ footer{
481
+ opacity:.55;
482
+ font-size:12px;
483
+ padding:12px 24px 24px;
484
+ text-align:center;
485
+ display:flex;
486
+ justify-content:center;
487
+ align-items:center;
488
+ width:100%;
489
+ }
490
+ </style>
491
+ </head>
492
+ <body>
493
+ <header>
494
+ <div class="row">
495
+ <button id="backBtn">← Back</button>
496
+ <h1 id="pageTitle">Depth Anything 3 Gallery</h1>
497
+ <span id="crumb" class="crumb"></span>
498
+ <input id="search" placeholder="Search…" />
499
+ </div>
500
+ <div class="muted" id="hint" style="text-align: center;">Level 1 shows groups only; click a group to browse scenes and previews.</div>
501
+ </header>
502
+
503
+ <main>
504
+ <!-- Tech intro card -->
505
+ <div class="intro-card" style="margin-bottom: 30px; padding: 25px; border-radius: 15px; text-align: center; max-width: 800px;">
506
+ <h2 class="intro-title" style="margin: 0 0 15px 0; font-size: 1.8em; font-weight: 700;">
507
+ 🎯 Depth Anything 3 Gallery
508
+ </h2>
509
+ <p class="intro-description" style="margin: 0; font-size: 1.1em; line-height: 1.6;">
510
+ Explore 3D reconstructions and depth visualizations from Depth Anything 3.
511
+ Browse through groups of scenes, preview 3D models, and examine depth maps interactively.
512
+ </p>
513
+ </div>
514
+
515
+ <div id="level1" class="group-wrap" aria-live="polite">
516
+ <ul id="groupList" class="group-list"></ul>
517
+ <div id="groupEmpty" class="empty" style="display:none;">No available groups</div>
518
+ </div>
519
+
520
+ <div id="level2" style="display:none; width:100%;" aria-live="polite">
521
+ <div id="topPager" class="pager" style="margin-bottom: 16px;"></div>
522
+ <div id="grid" class="grid"></div>
523
+ <div id="sceneEmpty" class="empty" style="display:none;">No available scenes in this group</div>
524
+ </div>
525
+ </main>
526
+
527
+ <div id="overlay" class="overlay" role="dialog" aria-modal="true" aria-label="3D Preview">
528
+ <div class="viewer" id="viewer">
529
+ <div class="viewer-header">
530
+ <div id="viewerTitle" class="chip">Loading…</div>
531
+ <button id="toggleView" class="btn" title="Toggle between 3D-only and resource view">Resource View</button>
532
+ <button id="closeBtn" class="btn">Close</button>
533
+ </div>
534
+ <div id="downloadBtn" class="download-icon" title="Download GLB model">⬇</div>
535
+ <div class="viewer-body">
536
+ <div class="mv-box"><model-viewer id="mv"
537
+ src=""
538
+ ar
539
+ camera-controls
540
+ auto-rotate
541
+ interaction-prompt="auto"
542
+ shadow-intensity="0.7"
543
+ exposure="1.0"
544
+ alt="GLB Preview"></model-viewer></div>
545
+ <div class="res-grid" id="resGrid" hidden></div>
546
+ </div>
547
+ </div>
548
+ </div>
549
+
550
+ <footer>Depth Anything 3 Gallery. Copyright 2025 Depth Anything 3 authors.</footer>
551
+
552
+ <script>
553
+ const level1=document.getElementById('level1'),level2=document.getElementById('level2'),pageTitle=document.getElementById('pageTitle'),crumb=document.getElementById('crumb'),backBtn=document.getElementById('backBtn'),hint=document.getElementById('hint'),searchInput=document.getElementById('search'),groupList=document.getElementById('groupList'),groupEmpty=document.getElementById('groupEmpty'),topPager=document.getElementById('topPager'),grid=document.getElementById('grid'),sceneEmpty=document.getElementById('sceneEmpty'),overlay=document.getElementById('overlay'),viewer=document.getElementById('viewer'),mv=document.getElementById('mv'),viewerTitle=document.getElementById('viewerTitle'),downloadBtn=document.getElementById('downloadBtn'),toggleViewBtn=document.getElementById('toggleView'),closeBtn=document.getElementById('closeBtn'),resGrid=document.getElementById('resGrid');
554
+ let GROUPS=[],SCENES=[],currentGroup=null,currentScene=null,currentPage=1,currentScenePage=1;
555
+
556
+ const qs=()=>new URLSearchParams(location.search);
557
+ async function loadGroups(){const r=await fetch('/manifest.json',{cache:'no-store'});if(!r.ok)throw new Error(r.status+' '+r.statusText);const j=await r.json();GROUPS=j.groups||[];renderGroups(GROUPS);}
558
+ async function loadScenes(g){const r=await fetch('/manifest/'+encodeURIComponent(g)+'.json',{cache:'no-store'});if(!r.ok)throw new Error(r.status+' '+r.statusText);const j=await r.json();SCENES=j.items||[];const p=parseInt(qs().get('page'))||1;renderScenes(SCENES,p);}
559
+ function renderGroups(list){groupList.innerHTML='';const q=searchInput.value.trim().toLowerCase();const f=list.filter(g=>(g.title||g.id||'').toLowerCase().includes(q));if(!f.length){groupEmpty.style.display='';return;}groupEmpty.style.display='none';for(const g of f){const li=document.createElement('li');li.className='group-item';li.title=g.title||g.id;li.onclick=()=>enterLevel2(g.id,{push:true});const name=document.createElement('div');name.className='gname';name.textContent=g.title||g.id;li.appendChild(name);groupList.appendChild(li);}}
560
+ function renderScenes(list,page=1){topPager.innerHTML='';grid.innerHTML='';const q=searchInput.value.trim().toLowerCase();const f=list.filter(x=>(x.title||'').toLowerCase().includes(q)||(x.id||'').toLowerCase().includes(q));if(!f.length){sceneEmpty.style.display='';topPager.style.display='none';return;}sceneEmpty.style.display='none';topPager.style.display='flex';const perPage=16;const total=f.length;const totalPages=Math.max(1,Math.ceil(total/perPage));currentScenePage=page;const u=new URL(location.href);u.searchParams.set('page',page);history.replaceState(null,'',u);const subset=f.slice((page-1)*perPage,page*perPage);for(const i of subset){const c=document.createElement('div');c.className='card';c.title=i.title;const b=document.createElement('div');b.className='thumb-box';const img=document.createElement('img');img.className='thumb';img.loading='lazy';img.alt=i.title;img.src=i.thumbnail;b.appendChild(img);const m=document.createElement('div');m.className='meta';const t=document.createElement('div');t.className='title';t.textContent=i.title;const o=document.createElement('div');o.className='open';o.textContent='Preview';m.appendChild(t);m.appendChild(o);c.appendChild(b);c.appendChild(m);c.onclick=()=>openViewer(i,{push:true});grid.appendChild(c);}function buildPager(){const pg=document.createElement('div');pg.className='pager';const prev=document.createElement('button');prev.textContent='← Prev';prev.disabled=page<=1;prev.onclick=()=>renderScenes(list,page-1);const info=document.createElement('span');info.textContent=`${page} / ${totalPages}`;const next=document.createElement('button');next.textContent='Next →';next.disabled=page>=totalPages;next.onclick=()=>renderScenes(list,page+1);pg.appendChild(prev);pg.appendChild(info);pg.appendChild(next);return pg;}topPager.innerHTML='';topPager.appendChild(buildPager());grid.appendChild(buildPager());}
561
+ function enterLevel1({push=false}={}){currentGroup=null;pageTitle.textContent='Depth Anything 3 Gallery';crumb.textContent='';backBtn.style.display='none';hint.style.display='';level1.style.display='';level2.style.display='none';overlay.classList.remove('show');mv.src='';const u=new URL(location.href);u.searchParams.delete('group');u.searchParams.delete('id');u.searchParams.delete('page');push?history.pushState(null,'',u):history.replaceState(null,'',u);searchInput.value='';loadGroups().catch(e=>{groupList.innerHTML='';groupEmpty.style.display='';groupEmpty.textContent='Failed to load groups: '+e;});}
562
+ async function enterLevel2(g,{push=false}={}){currentGroup=g;pageTitle.textContent=g;crumb.textContent='(group)';backBtn.style.display='';hint.style.display='none';level1.style.display='none';level2.style.display='';overlay.classList.remove('show');mv.src='';const u=new URL(location.href);u.searchParams.set('group',g);u.searchParams.delete('id');push?history.pushState(null,'',u):history.replaceState(null,'',u);searchInput.value='';try{await loadScenes(g);const id=qs().get('id');if(id){const hit=SCENES.find(x=>x.id
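
The embedded page requests /manifest.json for the group index and /manifest/&lt;group&gt;.json for each group's scenes, and the visible JavaScript only relies on the fields id, title, and thumbnail when rendering the two levels. The sketch below produces manifests in that shape; it is inferred purely from those fetch calls and field accesses, the actual manifest builder in gallery.py is not shown here and may differ, and the directory layout and file names are illustrative.

import json
from pathlib import Path


def build_group_index(root: Path) -> dict:
    """Top-level index consumed by loadGroups(): one entry per group directory."""
    return {
        "groups": [
            {"id": d.name, "title": d.name}
            for d in sorted(root.iterdir())
            if d.is_dir()
        ]
    }


def build_scene_manifest(group_dir: Path) -> dict:
    """Per-group manifest consumed by loadScenes(): one entry per scene."""
    items = []
    for scene in sorted(group_dir.iterdir()):
        if not scene.is_dir():
            continue
        thumb = scene / "thumbnail.png"  # assumed thumbnail file name
        items.append({
            "id": scene.name,
            "title": scene.name,
            "thumbnail": f"/{group_dir.name}/{scene.name}/{thumb.name}",
        })
    return {"items": items}


if __name__ == "__main__":
    root = Path("gallery_root")  # illustrative export root
    if root.exists():
        print(json.dumps(build_group_index(root), indent=2))
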
depth_anything_3/services/inference_service.py ADDED
@@ -0,0 +1,225 @@
1
+ # Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Unified Inference Service
17
+ Provides unified interface for local and remote inference
18
+ """
19
+
20
+ from typing import Any, Dict, List, Optional, Union
21
+ import numpy as np
22
+ import requests
23
+ import typer
24
+
25
+ from ..api import DepthAnything3
26
+
27
+
28
+ class InferenceService:
29
+ """Unified inference service class"""
30
+
31
+ def __init__(self, model_dir: str, device: str = "cuda"):
32
+ self.model_dir = model_dir
33
+ self.device = device
34
+ self.model = None
35
+
36
+ def load_model(self):
37
+ """Load model"""
38
+ if self.model is None:
39
+ typer.echo(f"Loading model from {self.model_dir}...")
40
+ self.model = DepthAnything3.from_pretrained(self.model_dir).to(self.device)
41
+ return self.model
42
+
43
+ def run_local_inference(
44
+ self,
45
+ image_paths: List[str],
46
+ export_dir: str,
47
+ export_format: str = "mini_npz-glb",
48
+ process_res: int = 504,
49
+ process_res_method: str = "upper_bound_resize",
50
+ export_feat_layers: Optional[List[int]] = None,
51
+ extrinsics: Optional[np.ndarray] = None,
52
+ intrinsics: Optional[np.ndarray] = None,
53
+ align_to_input_ext_scale: bool = True,
54
+ conf_thresh_percentile: float = 40.0,
55
+ num_max_points: int = 1_000_000,
56
+ show_cameras: bool = True,
57
+ feat_vis_fps: int = 15,
58
+ ) -> Any:
59
+ """Run local inference"""
60
+ if export_feat_layers is None:
61
+ export_feat_layers = []
62
+
63
+ model = self.load_model()
64
+
65
+ # Prepare inference parameters
66
+ inference_kwargs = {
67
+ "image": image_paths,
68
+ "export_dir": export_dir,
69
+ "export_format": export_format,
70
+ "process_res": process_res,
71
+ "process_res_method": process_res_method,
72
+ "export_feat_layers": export_feat_layers,
73
+ "align_to_input_ext_scale": align_to_input_ext_scale,
74
+ "conf_thresh_percentile": conf_thresh_percentile,
75
+ "num_max_points": num_max_points,
76
+ "show_cameras": show_cameras,
77
+ "feat_vis_fps": feat_vis_fps,
78
+ }
79
+
80
+ # Add pose data (if exists)
81
+ if extrinsics is not None:
82
+ inference_kwargs["extrinsics"] = extrinsics
83
+ if intrinsics is not None:
84
+ inference_kwargs["intrinsics"] = intrinsics
85
+
86
+ # Run inference
87
+ typer.echo(f"Running inference on {len(image_paths)} images...")
88
+ prediction = model.inference(**inference_kwargs)
89
+
90
+ typer.echo(f"Results saved to {export_dir}")
91
+ typer.echo(f"Export format: {export_format}")
92
+
93
+ return prediction
94
+
95
+ def run_backend_inference(
96
+ self,
97
+ image_paths: List[str],
98
+ export_dir: str,
99
+ backend_url: str,
100
+ export_format: str = "mini_npz-glb",
101
+ process_res: int = 504,
102
+ process_res_method: str = "upper_bound_resize",
103
+ export_feat_layers: Optional[List[int]] = None,
104
+ extrinsics: Optional[np.ndarray] = None,
105
+ intrinsics: Optional[np.ndarray] = None,
106
+ align_to_input_ext_scale: bool = True,
107
+ conf_thresh_percentile: float = 40.0,
108
+ num_max_points: int = 1_000_000,
109
+ show_cameras: bool = True,
110
+ feat_vis_fps: int = 15,
111
+ ) -> Dict[str, Any]:
112
+ """Run backend inference"""
113
+ if export_feat_layers is None:
114
+ export_feat_layers = []
115
+
116
+ # Check backend status
117
+ if not self._check_backend_status(backend_url):
118
+ raise typer.BadParameter(f"Backend service is not running at {backend_url}")
119
+
120
+ # Prepare payload
121
+ payload = {
122
+ "image_paths": image_paths,
123
+ "export_dir": export_dir,
124
+ "export_format": export_format,
125
+ "process_res": process_res,
126
+ "process_res_method": process_res_method,
127
+ "export_feat_layers": export_feat_layers,
128
+ "align_to_input_ext_scale": align_to_input_ext_scale,
129
+ "conf_thresh_percentile": conf_thresh_percentile,
130
+ "num_max_points": num_max_points,
131
+ "show_cameras": show_cameras,
132
+ "feat_vis_fps": feat_vis_fps,
133
+ }
134
+
135
+ # Add pose data (if exists)
136
+ if extrinsics is not None:
137
+ payload["extrinsics"] = [ext.astype(np.float64).tolist() for ext in extrinsics]
138
+ if intrinsics is not None:
139
+ payload["intrinsics"] = [intr.astype(np.float64).tolist() for intr in intrinsics]
140
+
141
+ # Submit task
142
+ typer.echo("Submitting inference task to backend...")
143
+ try:
144
+ response = requests.post(f"{backend_url}/inference", json=payload, timeout=30)
145
+ response.raise_for_status()
146
+ result = response.json()
147
+
148
+ if result["success"]:
149
+ task_id = result["task_id"]
150
+ typer.echo("Task submitted successfully!")
151
+ typer.echo(f"Task ID: {task_id}")
152
+ typer.echo(f"Results will be saved to: {export_dir}")
153
+ typer.echo(f"Check backend logs for progress updates with task ID: {task_id}")
154
+ return result
155
+ else:
156
+ raise typer.BadParameter(
157
+ f"Backend inference submission failed: {result['message']}"
158
+ )
159
+ except requests.exceptions.RequestException as e:
160
+ raise typer.BadParameter(f"Backend inference submission failed: {e}") from e
161
+
162
+ def _check_backend_status(self, backend_url: str) -> bool:
163
+ """Check backend status"""
164
+ try:
165
+ response = requests.get(f"{backend_url}/status", timeout=5)
166
+ return response.status_code == 200
167
+ except Exception:
168
+ return False
169
+
170
+
171
+ def run_inference(
172
+ image_paths: List[str],
173
+ export_dir: str,
174
+ model_dir: str,
175
+ device: str = "cuda",
176
+ backend_url: Optional[str] = None,
177
+ export_format: str = "mini_npz-glb",
178
+ process_res: int = 504,
179
+ process_res_method: str = "upper_bound_resize",
180
+ export_feat_layers: Optional[List[int]] = None,
181
+ extrinsics: Optional[np.ndarray] = None,
182
+ intrinsics: Optional[np.ndarray] = None,
183
+ align_to_input_ext_scale: bool = True,
184
+ conf_thresh_percentile: float = 40.0,
185
+ num_max_points: int = 1_000_000,
186
+ show_cameras: bool = True,
187
+ feat_vis_fps: int = 15,
188
+ ) -> Union[Any, Dict[str, Any]]:
189
+ """Unified inference interface"""
190
+
191
+ service = InferenceService(model_dir, device)
192
+
193
+ if backend_url:
194
+ return service.run_backend_inference(
195
+ image_paths=image_paths,
196
+ export_dir=export_dir,
197
+ backend_url=backend_url,
198
+ export_format=export_format,
199
+ process_res=process_res,
200
+ process_res_method=process_res_method,
201
+ export_feat_layers=export_feat_layers,
202
+ extrinsics=extrinsics,
203
+ intrinsics=intrinsics,
204
+ align_to_input_ext_scale=align_to_input_ext_scale,
205
+ conf_thresh_percentile=conf_thresh_percentile,
206
+ num_max_points=num_max_points,
207
+ show_cameras=show_cameras,
208
+ feat_vis_fps=feat_vis_fps,
209
+ )
210
+ else:
211
+ return service.run_local_inference(
212
+ image_paths=image_paths,
213
+ export_dir=export_dir,
214
+ export_format=export_format,
215
+ process_res=process_res,
216
+ process_res_method=process_res_method,
217
+ export_feat_layers=export_feat_layers,
218
+ extrinsics=extrinsics,
219
+ intrinsics=intrinsics,
220
+ align_to_input_ext_scale=align_to_input_ext_scale,
221
+ conf_thresh_percentile=conf_thresh_percentile,
222
+ num_max_points=num_max_points,
223
+ show_cameras=show_cameras,
224
+ feat_vis_fps=feat_vis_fps,
225
+ )
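
A usage sketch for run_inference as defined above: the same call either loads the model in-process or forwards the job to a running backend service, depending on whether backend_url is supplied. The image paths, checkpoint directory, and backend address below are placeholders; only the keyword names come from the signature in this file.

from depth_anything_3.services.inference_service import run_inference

images = ["scene/frame_000.png", "scene/frame_001.png"]  # illustrative paths

# Local inference: loads the model in-process and writes results to export_dir.
prediction = run_inference(
    image_paths=images,
    export_dir="outputs/scene",
    model_dir="checkpoints/da3",  # assumed checkpoint location
    device="cuda",
    export_format="mini_npz-glb",
)

# Remote inference: submits the same job to a backend service instead.
result = run_inference(
    image_paths=images,
    export_dir="outputs/scene",
    model_dir="checkpoints/da3",
    backend_url="http://localhost:8000",  # assumed backend address
)
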
requirements.txt ADDED
@@ -0,0 +1 @@
1
+ gradio>=4.0.0
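
requirements.txt pins only gradio, while the service modules above also import numpy, requests, and typer, and the model code presumably needs torch. Whether those packages come preinstalled with the Space's base image is an assumption; a quick runtime check like the sketch below makes any gap obvious.

# Sanity-check that the runtime provides the modules the code above imports.
# Whether the Space's base image supplies them is an assumption; extend
# requirements.txt for anything reported as missing.
import importlib

for mod in ("gradio", "numpy", "requests", "typer", "torch"):
    try:
        importlib.import_module(mod)
        print(f"{mod}: ok")
    except ImportError:
        print(f"{mod}: MISSING - add it to requirements.txt")
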