Depth Estimation & Point Cloud Reconstruction

This tutorial demonstrates how to use ZenSVI to estimate depth information from street view imagery, and then integrate the depth and color information to reconstruct a point cloud.
Contributor: Zicheng Fan

Import module

#pip install --upgrade zensvi
#pip install img2vec_pytorch
#pip install faiss-cpu
import sys
import os

# Work out the directory this tutorial runs from (docs/examples). A plain
# script defines __file__; a notebook kernel does not, so fall back to the
# current working directory in that case.
try:
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    notebook_dir = os.getcwd()

# The src folder sits two levels above the notebook location
src_path = os.path.normpath(os.path.join(notebook_dir, '../../src/'))

# Put the src folder at the front of the import search path, only once
if sys.path.count(src_path) == 0:
    sys.path.insert(0, src_path)

# Now import your package
from zensvi.transform import PointCloudProcessor
from zensvi.cv import DepthEstimator
Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.
Weights file already exists.
/data/zicheng/.conda/envs/torch_cuda_118/lib/python3.11/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
  warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)

Download the test dataset

from huggingface_hub import HfApi, hf_hub_download


def download_folder(repo_id, repo_type, folder_path, local_dir):
    """
    Download an entire folder from a huggingface dataset repository.

    Files are selected by whole path components: ``"a/b"`` and ``"a/b/"``
    both select everything under ``a/b/`` (and a file named exactly ``a/b``),
    but never a sibling such as ``a/b_other/``. An empty ``folder_path``
    selects every file in the repository.

    repo_id : string
        The ID of the repository (e.g., 'username/repo_name').
    repo_type : string
        Type of the repo, dataset or model.
    folder_path : string
        The path to the folder within the repository.
    local_dir : string
        Local folder to download the data. This mimics git behaviour
    """
    api = HfApi()

    # Normalise the folder so the prefix test below always ends on a "/"
    # boundary; a bare startswith(folder_path) would also match sibling
    # folders that merely share the same leading characters.
    folder = folder_path.rstrip("/")
    prefix = f"{folder}/" if folder else ""

    # list all files in the repo, keep the ones within folder_path
    all_files = api.list_repo_files(repo_id, repo_type=repo_type)
    files_list = [f for f in all_files if f == folder or f.startswith(prefix)]

    # download each of those files
    for file_path in files_list:
        hf_hub_download(repo_id=repo_id, repo_type=repo_type,
                        filename=file_path, local_dir=local_dir)


# Settings for fetching the example data
local_dir = "zensvi_example_data/"  # the local folder on your computer where the data will be downloaded
folder_path = "input/depth_point_cloud/"  # the specific data used by this example
repo_type = "dataset"  # required by the API when the repo is a dataset
repo_id = "NUS-UAL/zensvi_test_data"  # the test dataset repo

# By default, huggingface stores downloads in the .cache/huggingface folder;
# local_dir is passed here as the local folder to download the data to.
download_folder(
    repo_id=repo_id,
    repo_type=repo_type,
    folder_path=folder_path,
    local_dir=local_dir,
)

Depth Estimation

Depth Anything V2 is applied in ZenSVI to infer both absolute and relative depth based on Street View Imagery (SVI). Here we attempt the absolute depth inference.

from zensvi.cv import DepthEstimator

# Input folder of color images and the folder that receives the estimated depth maps
dir_input = "zensvi_example_data/input/depth_point_cloud/images/color"
dir_image_output = "zensvi_example_data/input/depth_point_cloud/images/depth"

# device: device to use (either "cpu" or "gpu")
# task: task to perform (either "relative" or "absolute")
depth_estimator = DepthEstimator(device="cpu", task="absolute")

# Run the estimation over the input folder
depth_estimator.estimate_depth(dir_input, dir_image_output)
xFormers not available
xFormers not available
Using cpu
Estimating depth: 100%|██████████| 2/2 [00:11<00:00,  5.51s/it]

Point Cloud Reconstruction

Part 1: Define the PointCloudProcessor

# PointCloudProcessor must already be imported (see the import cell at the top).
# Create the processor from the two image folders: color images and depth maps.
processor = PointCloudProcessor(
    depth_folder='zensvi_example_data/input/depth_point_cloud/images/depth',
    image_folder='zensvi_example_data/input/depth_point_cloud/images/color',
)

We can visualize a color image and corresponding depth image from the two folders.

import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

color_path = Path('zensvi_example_data/input/depth_point_cloud/images/color/VSsVjWlr4orKerabFRy-dQ.jpg')
depth_path = Path('zensvi_example_data/input/depth_point_cloud/images/depth/VSsVjWlr4orKerabFRy-dQ.tiff')

# Read color image. cv2.imread returns None (it does not raise) when the file
# is missing or cannot be decoded, so fail early with a clear message.
raw_img = cv2.imread(str(color_path))
if raw_img is None:
    raise FileNotFoundError(f"Could not read color image: {color_path}")

# Read depth image as stored on disk (no channel or bit-depth conversion)
depth_img = cv2.imread(str(depth_path), cv2.IMREAD_UNCHANGED)
if depth_img is None:
    raise FileNotFoundError(f"Could not read depth image: {depth_path}")
depth = depth_img.astype(np.float32)

# Normalize depth map to 0-1 range for display only; `depth` itself stays
# unscaled. A completely flat depth map has zero range, so show it as zeros
# instead of dividing by zero.
depth_lo = depth.min()
depth_span = depth.max() - depth_lo
if depth_span > 0:
    depth_normalized = (depth - depth_lo) / depth_span
else:
    depth_normalized = np.zeros_like(depth)

# Create vertical layout subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

# Display original image (OpenCV loads BGR, matplotlib expects RGB)
ax1.imshow(cv2.cvtColor(raw_img, cv2.COLOR_BGR2RGB))
ax1.set_title('Original Image')
ax1.axis('off')

# Display depth map
ax2.imshow(depth_normalized, cmap='jet')
ax2.set_title('Depth Map')
ax2.axis('off')

plt.tight_layout()
plt.show()

# Print depth map information
print(f"Depth map dimensions: {depth.shape}")
print(f"Depth value range: [{depth.min()}, {depth.max()}]")

Part 2: Reconstruct point clouds from single panorama and absolute depth

import cv2
import torch
import sys
import pandas as pd


# Convert the depth map and color image loaded above into one point cloud
pcd = processor.convert_to_point_cloud(
    depth,
    raw_img,
    depth_max=None,
    use_absolute_depth=True,
)

# Visualize point cloud using plotly
processor.visualize_point_cloud(
    pcd,
    marker_size=1,
    opacity=0.8,
    camera_eye={"x": 0, "y": 0, "z": -1},
    camera_up={"x": 0, "y": -1, "z": 0},
)

Part 3: Reconstruct from multiple panoramas with metadata

ZenSVI also supports converting multiple SVI inputs to point clouds with the help of an index file.
An example dataframe is shown below.
The image id is the only attribute required to index the color and depth images when generating a single point cloud. Besides the image id, possible metadata include the image angle (‘heading’) and the real-world coordinates of the image (‘x_proj’, ‘y_proj’), depending on availability. They are useful for processing multiple images and for aligning the generated point clouds.

# Read the metadata table that indexes the example images
import pandas as pd
data = pd.read_csv('zensvi_example_data/input/depth_point_cloud/meta_data.csv')
# Bare name as the last line of the cell: the notebook displays the dataframe
data
Unnamed: 0 year month lat lon id heading geometry y_proj x_proj
0 0 2018 8 40.773640 -73.954823 Y2y7An1aRCeA5Y4nW7ITrg 3.627108 POINT (-8232613.214232705 4979010.676803163) -8232613.214 4979010.677
1 1 2019 5 40.775753 -73.956686 VSsVjWlr4orKerabFRy-dQ 5.209303 POINT (-8232820.629621736 4979321.30902424) -8232820.630 4979321.309

We can load the images as a dictionary of arrays according to the indexing dataframe.

# Load all the images based on the dataframe.
# NOTE(review): _load_images has a leading underscore, i.e. it is a non-public
# method of the processor — confirm whether a public entry point exists.
images = processor._load_images(data)
# Bare name as the last line of the cell: the notebook displays the dictionary
images
{'Y2y7An1aRCeA5Y4nW7ITrg': {'depth': array([[35, 35, 35, ..., 32, 32, 31],
         [36, 35, 35, ..., 32, 33, 32],
         [36, 35, 34, ..., 32, 33, 33],
         ...,
         [ 6,  6,  6, ...,  6,  6,  6],
         [ 6,  6,  6, ...,  6,  6,  6],
         [ 6,  6,  6, ...,  6,  6,  6]], dtype=uint8),
  'color': array([[[132, 171, 230],
          [132, 171, 230],
          [132, 171, 230],
          ...,
          [130, 173, 226],
          [130, 173, 226],
          [130, 173, 226]],
  
         [[132, 171, 230],
          [132, 171, 230],
          [132, 171, 230],
          ...,
          [130, 173, 226],
          [130, 173, 226],
          [130, 173, 226]],
  
         [[131, 171, 230],
          [131, 171, 230],
          [131, 171, 230],
          ...,
          [130, 173, 228],
          [130, 173, 228],
          [130, 173, 228]],
  
         ...,
  
         [[103, 107, 118],
          [106, 110, 121],
          [111, 115, 126],
          ...,
          [ 98, 107, 116],
          [ 98, 107, 116],
          [ 97, 106, 115]],
  
         [[111, 113, 125],
          [112, 114, 126],
          [114, 116, 128],
          ...,
          [106, 113, 121],
          [105, 112, 120],
          [104, 111, 119]],
  
         [[133, 135, 147],
          [133, 135, 147],
          [133, 135, 147],
          ...,
          [136, 143, 151],
          [136, 143, 151],
          [136, 143, 151]]], dtype=uint8)},
 'VSsVjWlr4orKerabFRy-dQ': {'depth': array([[30, 30, 30, ..., 30, 30, 29],
         [29, 29, 29, ..., 29, 29, 28],
         [29, 29, 29, ..., 29, 29, 29],
         ...,
         [ 6,  6,  6, ...,  6,  6,  6],
         [ 6,  6,  6, ...,  6,  6,  6],
         [ 6,  6,  6, ...,  6,  6,  6]], dtype=uint8),
  'color': array([[[249, 255, 251],
          [249, 255, 251],
          [249, 255, 251],
          ...,
          [254, 255, 255],
          [254, 255, 255],
          [254, 255, 255]],
  
         [[245, 255, 255],
          [243, 255, 255],
          [245, 255, 255],
          ...,
          [248, 255, 255],
          [247, 255, 255],
          [247, 255, 255]],
  
         [[223, 247, 255],
          [221, 248, 255],
          [223, 247, 255],
          ...,
          [229, 249, 255],
          [228, 248, 255],
          [228, 248, 255]],
  
         ...,
  
         [[ 75,  60,  57],
          [ 78,  63,  60],
          [ 79,  64,  61],
          ...,
          [ 65,  49,  49],
          [ 66,  50,  50],
          [ 66,  50,  50]],
  
         [[ 80,  65,  62],
          [ 83,  68,  65],
          [ 83,  68,  65],
          ...,
          [ 66,  50,  50],
          [ 67,  51,  51],
          [ 68,  52,  52]],
  
         [[ 87,  72,  69],
          [ 89,  74,  71],
          [ 90,  75,  72],
          ...,
          [ 67,  51,  51],
          [ 69,  53,  53],
          [ 69,  53,  53]]], dtype=uint8)}}

Generate a point cloud from a specific image in the dataframe.

image_id = 'Y2y7An1aRCeA5Y4nW7ITrg'

# Look up the depth map and color image loaded for this image id
depth_img = images[image_id]["depth"]
color_img = images[image_id]["color"]

# Convert the depth map to float32, as in the single-panorama example, and pass
# the converted array on (previously it was computed but the raw array was used).
depth = depth_img.astype(np.float32)

pcd = processor.convert_to_point_cloud(depth, color_img, depth_max=None, use_absolute_depth=True)
# Bare name as the last line of the cell: the notebook displays the point cloud summary
pcd
PointCloud with 131072 points.

We can also process multiple images in a loop and apply some point cloud processing steps.

# Build one point cloud for every image listed in the dataframe
point_clouds = processor.process_multiple_images(
    data,
    depth_max=None,
    use_absolute_depth=True,
)
point_clouds
[PointCloud with 131072 points., PointCloud with 131072 points.]

With the point clouds generated, we can further modify and clean them. The processing steps include:

  • relocate the point clouds to their real-world coordinates;

  • align the point clouds according to the ‘heading’ information stored with SVI;

  • crop the point clouds based on a self-defined 3D bounding box (to remove unnecessary part)

This part will be improved with more functions and more explicit control.

# Optionally, transform the point clouds
transformed_clouds = []
# Pair each point cloud with its metadata row by position. Looking rows up with
# data.at[i, ...] and an enumerate counter only works while the dataframe keeps
# its default 0..n-1 index (it breaks after filtering or re-indexing).
for pcd, origin_x, origin_y, angle in zip(
    point_clouds, data['x_proj'], data['y_proj'], data['heading']
):
    box_extent = [100, 100, 100]  # Example box dimensions
    box_center = [origin_x, origin_y, 1]  # Example box center
    # crop and transform the point clouds with the parameters
    transformed_pcd = processor.transform_point_cloud(pcd, origin_x, origin_y, angle, box_extent, box_center)
    transformed_clouds.append(transformed_pcd)
transformed_clouds
[PointCloud with 108589 points., PointCloud with 119811 points.]

Similarly, we can visualize the transformed point clouds in 3D with the Plotly library.

# Show the second transformed point cloud (for demonstration), rendered with plotly
processor.visualize_point_cloud(
    transformed_clouds[1],
    marker_size=1,
    opacity=0.8,
    camera_eye={"x": 0, "y": 0, "z": -1},
    camera_up={"x": 0, "y": -1, "z": 0},
)

Part 5: Save point cloud in different formats

The generated point clouds can be saved locally in different formats.

# Run the conversion once per output format used in this example; every call
# writes into the same output folder and rebinds point_clouds to its result.
for save_format in ("pcd", "ply", "npz", "csv"):
    point_clouds = processor.process_multiple_images(
        data,
        depth_max=None,
        use_absolute_depth=True,
        output_dir='zensvi_example_data/output/pointclouds',
        save_format=save_format,
    )

Part 6: Point Cloud Reconstruction via VGGT

As an initial attempt, we have integrated Visual Geometry Grounded Transformer (VGGT) into ZenSVI and are exploring a different way to reconstruct street scenes with SVI.

import torch
import sys
import os
import numpy as np
from pathlib import Path
import cv2

# Add necessary paths
# Construct the path to the zensvi/transform source folder relative to the
# notebook location (notebook_dir is computed in the first code cell).
src_path = os.path.normpath(os.path.join(notebook_dir, '../../src/zensvi/transform'))

# Add that folder to sys.path. This must happen before the import below, which
# loads image_to_pointcloud_vggt as a top-level module rather than via the package.
if src_path not in sys.path:
    sys.path.insert(0, src_path)


from image_to_pointcloud_vggt import VGGTProcessor

# Initialize VGGT processor with its default arguments
vggt_processor = VGGTProcessor(
)

# Set input image path
image_folder = "zensvi_example_data/input/depth_point_cloud/images/perspective"
image_extensions = [".png", ".jpg", ".jpeg"]

# Automatically get all images from the image folder.
# The suffix is compared case-insensitively in a single pass, so each file is
# listed exactly once (globbing "*.jpg" and "*.JPG" separately returns the same
# file twice on case-insensitive filesystems) and mixed-case suffixes such as
# ".Jpg" are picked up too. Sorting makes the order independent of the
# filesystem's listing order.
image_names = sorted(
    path
    for path in Path(image_folder).glob("*")
    if path.suffix.lower() in image_extensions
)


# Use bfloat16 when a CUDA device with compute capability major version >= 8
# is available, otherwise float16
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16

# Process images and generate point cloud.
# Gradients are not needed for inference. torch.autocast(device_type="cuda", ...)
# replaces the deprecated torch.cuda.amp.autocast(...) spelling.
with torch.no_grad(), torch.autocast(device_type="cuda", dtype=dtype):
    # Use VGGT processor to generate depth map and point cloud
    predictions = vggt_processor.process_images(image_names)
    points_centered, colors_flat, conf_flat, cam_to_world = vggt_processor.generate_point_cloud(predictions)
=== VGGT Processor Initialization Started ===
VGGT Path: /data/zicheng/zensvi_update/20250616_updates/ZenSVI/src/zensvi/transform/vggt
Added VGGT path to sys.path
Utils Path: /data/zicheng/zensvi_update/20250616_updates/ZenSVI/src/zensvi/transform/vggt/vggt/utils
Added Utils path to sys.path
Using Device: cuda
Data Type: torch.bfloat16
Loading VGGT model with local cache: /data/zicheng/zensvi_update/20250616_updates/ZenSVI/models
=== VGGT Processor Initialization Completed ===
Warning: Found images with different shapes: {(476, 518), (504, 518), (462, 518), (490, 518), (448, 518), (518, 518)}
# Show the VGGT point cloud (centered points and their colors)
vggt_processor.visualize_point_cloud(
    points_centered,
    colors_flat,
    marker_size=1,
    opacity=0.8,
    camera_eye={"x": 0, "y": 0, "z": -1},
    camera_up={"x": 0, "y": -1, "z": 0},
)