Source code for simvx.graphics.renderer.particle_compute

"""GPU compute shader particle simulation.

Dispatches a compute shader to update particle positions, velocities, lifetimes,
and visual properties entirely on the GPU — avoiding per-frame CPU-to-GPU uploads
for particle data.
"""

import logging
from typing import Any

import numpy as np
import vulkan as vk

from ..gpu.descriptors import (
    DescriptorWriteBatch,
    allocate_descriptor_set,
    create_descriptor_set_layout,
    create_pool_for_types,
)
from ..gpu.memory import create_buffer, upload_numpy
from ..gpu.pipeline_compute import create_compute_pipeline

# Public API of this module: only the simulation class is exported.
__all__ = ["ParticleCompute"]

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Must match PARTICLE_DTYPE from core/particles.py (16 floats x 4 bytes = 64 bytes)
_PARTICLE_GPU_STRIDE = 16 * 4
# Number of particles covered by one dispatched workgroup (used to compute the
# group count in ``dispatch``). Presumably must equal the local_size_x declared
# in particle_sim.comp — TODO confirm against the shader.
_WORKGROUP_SIZE = 256
# VK_WHOLE_SIZE as unsigned uint64 — the vulkan Python package exposes it as -1 (signed),
# which triggers OverflowError when assigned to a cffi unsigned field.
_VK_WHOLE_SIZE_U64 = 0xFFFFFFFFFFFFFFFF

# Push constant layout (must match particle_sim.comp):
#   vec3  emitter_pos      (12) + float dt          (4)  = 16
#   vec3  gravity          (12) + float damping      (4)  = 16
#   vec3  initial_velocity (12) + float vel_spread   (4)  = 16
#   vec4  start_colour      (16)                           = 16
#   vec4  end_colour        (16)                           = 16
#   float start_scale      (4)  + float end_scale    (4)
#     + float emission_radius(4) + uint max_particles (4) = 16
#   uint  frame_seed       (4)  + 3x uint pad        (12) = 16
# Total = 112 bytes
_PUSH_CONSTANT_SIZE = 112


[docs]
class ParticleCompute:
    """GPU-based particle simulation via Vulkan compute shader.

    Creates a compute pipeline that updates particle state (position, velocity,
    colour, scale, lifetime) in an SSBO. The same SSBO can be bound by the
    graphics particle pass for zero-copy rendering.
    """

    def __init__(self, engine: Any):
        self._engine = engine
        self._max_particles: int = 0
        self._frame_counter: int = 0

        # GPU resources
        self._particle_buf: Any = None
        self._particle_mem: Any = None
        self._compute_pipeline: Any = None
        self._compute_layout: Any = None
        self._compute_module: Any = None
        self._desc_layout: Any = None
        self._desc_pool: Any = None
        self._desc_set: Any = None
        # Graphics-stage descriptor pool + set used by ``render()``. Separate
        # from the compute pool so the graphics pipeline layout can bind the
        # same SSBO with its own ``ParticlePass._ssbo_layout`` handle.
        self._graphics_desc_pool: Any = None
        self._graphics_desc_set: Any = None
        self._ready = False


[docs]
    def setup(self, max_particles: int = 65536) -> None:
        """Create compute pipeline, SSBO, and descriptor set.

        Args:
            max_particles: Maximum number of particles in the simulation buffer.
        """
        self._max_particles = max_particles
        e = self._engine
        device = e.ctx.device
        phys = e.ctx.physical_device

        # Particle SSBO — device-local with host-visible for initial upload
        buf_size = max_particles * _PARTICLE_GPU_STRIDE
        self._particle_buf, self._particle_mem = create_buffer(
            device,
            phys,
            buf_size,
            vk.VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
            vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        )

        # Descriptor set layout — single SSBO binding for compute
        self._desc_layout = create_descriptor_set_layout(device, [
            (0, vk.VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk.VK_SHADER_STAGE_COMPUTE_BIT, 1),
        ])
        self._desc_pool = create_pool_for_types(
            device, {vk.VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 1},
        )
        self._desc_set = allocate_descriptor_set(device, self._desc_pool, self._desc_layout)
        with DescriptorWriteBatch(device) as batch:
            batch.ssbo(self._desc_set, 0, self._particle_buf, buf_size)

        self._compute_pipeline, self._compute_layout, self._compute_module = create_compute_pipeline(
            device, e.shader_dir / "particle_sim.comp", [self._desc_layout], _PUSH_CONSTANT_SIZE,
        )

        # Initialize buffer to zero (all particles dead)
        zeros = np.zeros(max_particles * _PARTICLE_GPU_STRIDE // 4, dtype=np.float32)
        # Set lifetime=1.0 and age=999.0 so all particles are "dead" and will respawn
        particle_floats = zeros.reshape(max_particles, 16)
        particle_floats[:, 13] = 1.0  # lifetime
        particle_floats[:, 14] = 999.0  # age > lifetime → dead
        upload_numpy(device, self._particle_mem, zeros)

        self._ready = True
        log.debug("Particle compute initialized (max %d particles)", max_particles)



[docs]
    def dispatch(self, cmd: Any, dt: float, emitter_config: dict) -> None:
        """Dispatch the compute shader to simulate one step.

        Args:
            cmd: Active command buffer (must be outside a render pass).
            dt: Delta time in seconds.
            emitter_config: Dict with emitter parameters:
                - emitter_pos: (x, y, z)
                - gravity: (x, y, z)
                - damping: float
                - initial_velocity: (x, y, z)
                - velocity_spread: float
                - start_colour: (r, g, b, a)
                - end_colour: (r, g, b, a)
                - start_scale: float
                - end_scale: float
                - emission_radius: float
        """
        if not self._ready:
            return

        self._frame_counter += 1

        # Build push constants (112 bytes = 28 floats/uints)
        pc = np.zeros(28, dtype=np.float32)

        pos = emitter_config.get("emitter_pos", (0.0, 0.0, 0.0))
        pc[0:3] = pos
        pc[3] = dt

        grav = emitter_config.get("gravity", (0.0, -9.8, 0.0))
        pc[4:7] = grav
        pc[7] = float(emitter_config.get("damping", 0.0))

        vel = emitter_config.get("initial_velocity", (0.0, 5.0, 0.0))
        pc[8:11] = vel
        pc[11] = float(emitter_config.get("velocity_spread", 0.3))

        sc = emitter_config.get("start_colour", (1.0, 1.0, 1.0, 1.0))
        pc[12:16] = sc

        ec = emitter_config.get("end_colour", (1.0, 1.0, 1.0, 0.0))
        pc[16:20] = ec

        pc[20] = float(emitter_config.get("start_scale", 1.0))
        pc[21] = float(emitter_config.get("end_scale", 0.0))
        pc[22] = float(emitter_config.get("emission_radius", 1.0))

        # max_particles and frame_seed as uint32 — reinterpret float bits
        uint_view = pc.view(np.uint32)
        uint_view[23] = self._max_particles
        uint_view[24] = self._frame_counter
        # pad slots 25-27 are already zero

        pc_bytes = pc.tobytes()
        ffi = vk.ffi
        cbuf = ffi.new("char[]", pc_bytes)

        # Memory barrier: ensure previous frame's compute writes are visible
        barrier = ffi.new("VkBufferMemoryBarrier*")
        barrier.sType = vk.VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER
        barrier.srcAccessMask = vk.VK_ACCESS_SHADER_WRITE_BIT
        barrier.dstAccessMask = vk.VK_ACCESS_SHADER_READ_BIT | vk.VK_ACCESS_SHADER_WRITE_BIT
        barrier.srcQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED
        barrier.dstQueueFamilyIndex = vk.VK_QUEUE_FAMILY_IGNORED
        barrier.buffer = self._particle_buf
        barrier.offset = 0
        barrier.size = _VK_WHOLE_SIZE_U64

        vk.vkCmdPipelineBarrier(
            cmd,
            vk.VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
            vk.VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
            0,
            0,
            None,
            1,
            [barrier[0]],
            0,
            None,
        )

        # Bind compute pipeline
        vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_COMPUTE, self._compute_pipeline)
        vk.vkCmdBindDescriptorSets(
            cmd,
            vk.VK_PIPELINE_BIND_POINT_COMPUTE,
            self._compute_layout,
            0,
            1,
            [self._desc_set],
            0,
            None,
        )
        vk._vulkan.lib.vkCmdPushConstants(
            cmd,
            self._compute_layout,
            vk.VK_SHADER_STAGE_COMPUTE_BIT,
            0,
            len(pc_bytes),
            cbuf,
        )

        # Dispatch enough workgroups to cover all particles
        group_count = (self._max_particles + _WORKGROUP_SIZE - 1) // _WORKGROUP_SIZE
        vk.vkCmdDispatch(cmd, group_count, 1, 1)

        # Barrier: compute writes → vertex shader reads
        barrier.srcAccessMask = vk.VK_ACCESS_SHADER_WRITE_BIT
        barrier.dstAccessMask = vk.VK_ACCESS_SHADER_READ_BIT
        vk.vkCmdPipelineBarrier(
            cmd,
            vk.VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
            vk.VK_PIPELINE_STAGE_VERTEX_SHADER_BIT,
            0,
            0,
            None,
            1,
            [barrier[0]],
            0,
            None,
        )



[docs]
    def get_particle_ssbo(self) -> Any:
        """Return the particle SSBO buffer handle for use by the rendering pass."""
        return self._particle_buf



[docs]
    def render(
        self,
        cmd: Any,
        particle_pass: Any,
        view_proj: np.ndarray,
        camera_right: np.ndarray,
        camera_up: np.ndarray,
        extent: tuple[int, int],
    ) -> None:
        """Draw the GPU-simulated particles using the shared billboard pipeline.

        Reuses the same graphics pipeline as ``ParticlePass`` (identical
        particle.vert / particle.frag shaders) but binds our own compute-
        owned SSBO in place of the pass's CPU-uploaded one. A lazy-allocated
        descriptor set matching ``ParticlePass._ssbo_layout`` points at the
        compute buffer so no per-frame upload is needed.
        """
        if not self._ready or particle_pass is None or not particle_pass._ready:
            return

        if self._graphics_desc_set is None:
            self._graphics_desc_set = _allocate_graphics_ssbo_set(
                self._engine.ctx.device,
                particle_pass._ssbo_layout,
                self._particle_buf,
                self._max_particles * _PARTICLE_GPU_STRIDE,
            )

        # Push constants: mat4 view_proj (64) + vec3 camera_right + pad (16)
        # + vec3 camera_up + pad (16) = 96 bytes. Same layout as ParticlePass.
        pc = np.zeros(24, dtype=np.float32)
        pc[:16] = view_proj.T.ravel()
        pc[16:19] = camera_right
        pc[20:23] = camera_up
        pc_bytes = pc.tobytes()
        ffi = vk.ffi
        cbuf = ffi.new("char[]", pc_bytes)

        vk_vp = vk.VkViewport(
            x=0.0, y=0.0,
            width=float(extent[0]), height=float(extent[1]),
            minDepth=0.0, maxDepth=1.0,
        )
        vk.vkCmdSetViewport(cmd, 0, 1, [vk_vp])
        scissor = vk.VkRect2D(
            offset=vk.VkOffset2D(x=0, y=0),
            extent=vk.VkExtent2D(width=extent[0], height=extent[1]),
        )
        vk.vkCmdSetScissor(cmd, 0, 1, [scissor])

        vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, particle_pass._pipeline)
        vk.vkCmdBindDescriptorSets(
            cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, particle_pass._pipeline_layout,
            0, 1, [self._graphics_desc_set], 0, None,
        )
        vk._vulkan.lib.vkCmdPushConstants(
            cmd, particle_pass._pipeline_layout,
            vk.VK_SHADER_STAGE_VERTEX_BIT,
            0, len(pc_bytes), cbuf,
        )
        vk.vkCmdDraw(cmd, self._max_particles * 6, 1, 0, 0)



[docs]
    def get_particle_memory(self) -> Any:
        """Return the particle SSBO memory handle."""
        return self._particle_mem



[docs]
    @property
    def max_particles(self) -> int:
        return self._max_particles



[docs]
    @property
    def ready(self) -> bool:
        return self._ready



[docs]
    def upload_initial_particles(self, particles: np.ndarray) -> None:
        """Seed the GPU buffer with CPU-generated particle data.

        Args:
            particles: Numpy array with dtype matching PARTICLE_DTYPE.
                       Length must not exceed max_particles.
        """
        if not self._ready:
            return
        count = min(len(particles), self._max_particles)
        if count == 0:
            return
        upload_numpy(self._engine.ctx.device, self._particle_mem, particles[:count])
        log.debug("Uploaded %d initial particles to GPU", count)



[docs]
    def cleanup(self) -> None:
        """Destroy all GPU resources."""
        if not self._ready:
            return
        device = self._engine.ctx.device
        for obj, fn in [
            (self._compute_pipeline, vk.vkDestroyPipeline),
            (self._compute_layout, vk.vkDestroyPipelineLayout),
            (self._compute_module, vk.vkDestroyShaderModule),
            (self._desc_layout, vk.vkDestroyDescriptorSetLayout),
            (self._desc_pool, vk.vkDestroyDescriptorPool),
        ]:
            if obj:
                fn(device, obj, None)
        if self._particle_buf:
            vk.vkDestroyBuffer(device, self._particle_buf, None)
        if self._particle_mem:
            vk.vkFreeMemory(device, self._particle_mem, None)
        self._ready = False
        log.debug("Particle compute resources cleaned up")



def _allocate_graphics_ssbo_set(device: Any, layout: Any, buf: Any, size: int) -> Any:
    """Build a one-binding descriptor set that exposes ``buf`` as an SSBO.

    ``ParticleCompute.render`` uses this to plug the compute-owned particle
    buffer into the graphics particle pipeline while leaving ParticlePass's
    own descriptor set (which references the CPU-uploaded buffer) untouched.

    A dedicated descriptor pool is created for the set on every call. Only the
    set is returned; the pool handle is not handed back to the caller.

    Args:
        device: Vulkan device handle.
        layout: Descriptor set layout the new set must match.
        buf: Buffer written to binding 0 of the set.
        size: Byte range of ``buf`` to expose.

    Returns:
        The allocated and written descriptor set.
    """
    pool_sizes = {vk.VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: 1}
    descriptor_pool = create_pool_for_types(device, pool_sizes)
    ssbo_set = allocate_descriptor_set(device, descriptor_pool, layout)
    with DescriptorWriteBatch(device) as writes:
        writes.ssbo(ssbo_set, 0, buf, size)
    return ssbo_set