{ "sam": { "params": 95569152, "architecture": "SAM ViT-B", "image_size": 1024, "patch_size": 16, "embed_dim": 768, "depth": 12, "num_heads": 12 }, "clip": { "params": 303177728, "architecture": "CLIP-Large", "image_size": 224, "patch_size": 14, "width": 1024, "layers": 24, "heads": 16 }, "projector": { "params": 2622720, "type": "linear", "input_dim": 2048, "output_dim": 1280 }, "total_params": 401369600, "output_tokens": 256, "output_dim": 1280 }