probcomp · horizon-blue · Dec 5, 2025 · Dec 5, 2025 · Dec 5, 2025 · Dec 5, 2025
diff --git a/genmetaballs/src/cuda/bindings.cu b/genmetaballs/src/cuda/bindings.cu
@@ -253,10 +253,22 @@ template <MemoryLocation location>
 void bind_fmb_scene(nb::module_& m, const char* name) {
     nb::class_<FMBScene<location>>(m, name)
         .def(nb::init<size_t>(), nb::arg("size"))
+        .def(nb::init<const std::vector<FMB>&, const std::vector<float>&>(), nb::arg("fmbs"),
+             nb::arg("log_weights"),
+             "Construct FMBScene from a list of FMBs and corresponding log weights")
         .def_prop_ro("size", &FMBScene<location>::size)
         .def("__len__", &FMBScene<location>::size)
-        .def("__getitem__", &FMBScene<location>::get_fmb, nb::arg("idx"),
-             "Get the (FMB, log_weight) tuple at index i")
+        .def(
+            "__getitem__",
+            // Convert cuda::std::tuple to std::tuple for nanobind
+            [](const FMBScene<location>& scene, size_t idx) {
+                const auto& [fmb, log_weight] = scene[idx];
+                // for device data, the types would be thrust::device_reference, which cannot be
+                // returned directly to Python. The static cast forces a copy (to host) to be made.
+                return std::make_tuple(static_cast<const FMB&>(fmb),
+                                       static_cast<const float&>(log_weight));
+            },
+            "Get the (FMB, log_weight) tuple at index i")
         .def("__repr__", [=](const FMBScene<location>& scene) {
             return nb::str("{}(size={})").format(name, scene.size());
         });

diff --git a/genmetaballs/src/cuda/core/fmb.cu b/genmetaballs/src/cuda/core/fmb.cu
@@ -15,25 +15,3 @@ CUDA_CALLABLE float FMB::quadratic_form(const Vec3D vec) const {
     const auto shifted_vec = vec - get_mean();
     return dot(shifted_vec, cov_inv_apply(shifted_vec));
 }
-
-template <>
-__host__ FMBScene<MemoryLocation::HOST>::FMBScene(size_t size)
-    : fmbs_{new FMB[size]}, log_weights_{new float[size]}, size_{size} {}
-
-template <>
-__host__ FMBScene<MemoryLocation::DEVICE>::FMBScene(size_t size) : size_{size} {
-    CUDA_CHECK(cudaMalloc(&fmbs_, size * sizeof(FMB)));
-    CUDA_CHECK(cudaMalloc(&log_weights_, size * sizeof(float)));
-}
-
-template <>
-__host__ FMBScene<MemoryLocation::HOST>::~FMBScene() {
-    delete[] fmbs_;
-    delete[] log_weights_;
-}
-
-template <>
-__host__ FMBScene<MemoryLocation::DEVICE>::~FMBScene() {
-    CUDA_CHECK(cudaFree(fmbs_));
-    CUDA_CHECK(cudaFree(log_weights_));
-}
diff --git a/genmetaballs/src/cuda/core/fmb.cuh b/genmetaballs/src/cuda/core/fmb.cuh
@@ -2,7 +2,12 @@
 
 #include <cuda/std/span>
 #include <cuda/std/tuple>
+#include <cuda_runtime.h>
 #include <stdexcept>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <vector>
 
 #include "geometry.cuh"
 #include "utils.cuh"
@@ -46,74 +51,49 @@ public:
 template <MemoryLocation location>
 class FMBScene {
 private:
-    FMB* fmbs_;
-    float* log_weights_;
+    // Host memory -> thrust::host_vector
+    // Device memory -> thrust::device_vector
+    template <typename T>
+    using vector_t = std::conditional_t<location == MemoryLocation::HOST, thrust::host_vector<T>,
+                                        thrust::device_vector<T>>;
+
+    vector_t<FMB> fmbs_;
+    vector_t<float> log_weights_;
     size_t size_;
 
 public:
-    __host__ FMBScene(size_t size);
-
-    __host__ ~FMBScene();
-
-    CUDA_CALLABLE cuda::std::tuple<FMB&, float&> operator[](const uint32_t i) {
-        return cuda::std::tie(fmbs_[i], log_weights_[i]);
+    __host__ FMBScene(size_t size) : size_{size}, fmbs_(size), log_weights_(size) {};
+
+    // Copy constructor from std::vector
+    // This enables easy construction from Python side
+    __host__ FMBScene<location>(const std::vector<FMB>& fmbs, const std::vector<float>& log_weights)
+        : size_{fmbs.size()}, fmbs_(fmbs.begin(), fmbs.end()),
+          log_weights_(log_weights.begin(), log_weights.end()) {
+        if (fmbs.size() != log_weights.size()) {
+            throw std::invalid_argument(
+                "FMBScene constructor: fmbs and log_weights must have the same size");
+        }
     }
 
-    CUDA_CALLABLE cuda::std::tuple<const FMB&, const float&> operator[](const uint32_t i) const {
-        return cuda::std::tie(fmbs_[i], log_weights_[i]);
+    CUDA_CALLABLE auto operator[](const uint32_t i) {
+        return cuda::std::make_tuple(fmbs_[i], log_weights_[i]);
     }
 
-    class Iterator {
-    private:
-        FMB* fmb_ptr_;
-        float* log_weight_ptr_;
-
-    public:
-        CUDA_CALLABLE Iterator(FMB* const fmb_ptr, float* const log_weight_ptr)
-            : fmb_ptr_{fmb_ptr}, log_weight_ptr_{log_weight_ptr} {}
-        CUDA_CALLABLE cuda::std::tuple<FMB&, float&> operator*() {
-            return cuda::std::tie(*fmb_ptr_, *log_weight_ptr_);
-        }
-        CUDA_CALLABLE bool operator!=(const Iterator& other) const {
-            return fmb_ptr_ != other.fmb_ptr_ || log_weight_ptr_ != other.log_weight_ptr_;
-        }
-        CUDA_CALLABLE Iterator& operator++() {
-            fmb_ptr_++, log_weight_ptr_++;
-            return *this;
-        }
-    };
-
-    class ConstIterator {
-    private:
-        const FMB* fmb_ptr_;
-        const float* log_weight_ptr_;
-
-    public:
-        CUDA_CALLABLE ConstIterator(const FMB* const fmb_ptr, const float* const log_weight_ptr)
-            : fmb_ptr_{fmb_ptr}, log_weight_ptr_{log_weight_ptr} {}
-        CUDA_CALLABLE cuda::std::tuple<const FMB&, const float&> operator*() const {
-            return cuda::std::tie(*fmb_ptr_, *log_weight_ptr_);
-        }
-        CUDA_CALLABLE bool operator!=(const ConstIterator& other) const {
-            return fmb_ptr_ != other.fmb_ptr_ || log_weight_ptr_ != other.log_weight_ptr_;
-        }
-        CUDA_CALLABLE ConstIterator& operator++() {
-            fmb_ptr_++, log_weight_ptr_++;
-            return *this;
-        }
-    };
+    CUDA_CALLABLE auto operator[](const uint32_t i) const {
+        return cuda::std::make_tuple(fmbs_[i], log_weights_[i]);
+    }
 
-    CUDA_CALLABLE Iterator begin() {
-        return Iterator(fmbs_, log_weights_);
+    CUDA_CALLABLE auto begin() {
+        return thrust::make_zip_iterator(fmbs_.begin(), log_weights_.begin());
     }
-    CUDA_CALLABLE Iterator end() {
-        return Iterator(fmbs_ + size_, log_weights_ + size_);
+    CUDA_CALLABLE auto end() {
+        return thrust::make_zip_iterator(fmbs_.end(), log_weights_.end());
     }
-    CUDA_CALLABLE ConstIterator begin() const {
-        return ConstIterator(fmbs_, log_weights_);
+    CUDA_CALLABLE auto begin() const {
+        return thrust::make_zip_iterator(fmbs_.begin(), log_weights_.begin());
     }
-    CUDA_CALLABLE ConstIterator end() const {
-        return ConstIterator(fmbs_ + size_, log_weights_ + size_);
+    CUDA_CALLABLE auto end() const {
+        return thrust::make_zip_iterator(fmbs_.end(), log_weights_.end());
     }
     CUDA_CALLABLE const FMB& get_fmb(uint32_t idx) const {
         return fmbs_[idx];

diff --git a/genmetaballs/src/genmetaballs/core/__init__.py b/genmetaballs/src/genmetaballs/core/__init__.py
@@ -10,7 +10,7 @@
     TwoParameterConfidence,
     ZeroParameterConfidence,
 )
-from genmetaballs._genmetaballs_bindings.fmb import CPUFMBScene, GPUFMBScene
+from genmetaballs._genmetaballs_bindings.fmb import FMB, CPUFMBScene, GPUFMBScene
 from genmetaballs._genmetaballs_bindings.image import CPUImage, GPUImage
 from genmetaballs._genmetaballs_bindings.utils import CPUFloatArray2D, GPUFloatArray2D, sigmoid
 
@@ -63,6 +63,19 @@ def make_fmb_scene(size: int, device: DeviceType) -> CPUFMBScene | GPUFMBScene:
         raise ValueError(f"Unsupported device type: {device}")
 
 
+# TODO: create a wrapper class for FMBScene and turn the factory functions into
+# class methods
+def make_fmb_scene_from_values(
+    fmbs: list[fmb.FMB], log_weights: list[float], device: DeviceType
+) -> CPUFMBScene | GPUFMBScene:
+    if device == "cpu":
+        return CPUFMBScene(fmbs, log_weights)
+    elif device == "gpu":
+        return GPUFMBScene(fmbs, log_weights)
+    else:
+        raise ValueError(f"Unsupported device type: {device}")
+
+
 __all__ = [
     "array2d_float",
     "ZeroParameterConfidence",
@@ -74,7 +87,10 @@ def make_fmb_scene(size: int, device: DeviceType) -> CPUFMBScene | GPUFMBScene:
     "intersector",
     "sigmoid",
     "FourParameterBlender",
+    "FMB",
+    "Intrinsics",
     "ThreeParameterBlender",
     "make_image",
     "make_fmb_scene",
+    "make_fmb_scene_from_values",
 ]
diff --git a/tests/python_tests/test_fmb.py b/tests/python_tests/test_fmb.py
@@ -3,7 +3,7 @@
 from scipy.spatial.distance import mahalanobis
 from scipy.spatial.transform import Rotation as Rot
 
-from genmetaballs.core import fmb, geometry, make_fmb_scene
+from genmetaballs.core import fmb, geometry, make_fmb_scene, make_fmb_scene_from_values
 
 FMB = fmb.FMB
 Pose, Vec3D, Rotation = geometry.Pose, geometry.Vec3D, geometry.Rotation
@@ -48,3 +48,33 @@ def test_fmb_scene_creation():
     gpu_scene = make_fmb_scene(20, device="gpu")
     assert isinstance(gpu_scene, fmb.GPUFMBScene)
     assert len(gpu_scene) == 20
+
+
+@pytest.mark.parametrize("device", ["cpu", "gpu"])
+def test_fmb_scene_creation_from_lists(rng, device):
+    fmbs = []
+    log_weights = []
+    gt_translations = []
+    gt_extents = []
+    num_balls = 15
+    for _ in range(num_balls):
+        quat = rng.uniform(size=4).astype(np.float32)
+        tran, extent = rng.uniform(size=(2, 3)).astype(np.float32)
+        pose = Pose.from_components(Rotation.from_quat(*quat), Vec3D(*tran))
+        fmbs.append(FMB(pose, *extent))
+        log_weights.append(rng.uniform())
+        gt_translations.append(tran)
+        gt_extents.append(extent)
+
+    scene = make_fmb_scene_from_values(fmbs, log_weights, device=device)
+
+    assert len(scene) == num_balls
+    # Verify that we can retrieve each FMB and log weight correctly
+    for i in range(num_balls):
+        fmb_i, log_weight = scene[i]
+        translation = fmb_i.pose.tran
+        assert np.allclose([translation.x, translation.y, translation.z], gt_translations[i])
+
+        fmb_extent = fmb_i.extent
+        assert np.allclose(fmb_extent, gt_extents[i])
+        assert np.isclose(log_weight, log_weights[i])