Graphics Programming Virtual Meetup

Discord

https://discord.gg/6TTRA5h

Twitter

https://twitter.com/GraphicMeetup

Youtube Channel

https://www.youtube.com/channel/UCbX05PBAE-582PYaRXdjRnw/

CUDA Flocking Simulation

Credits

University of Pennsylvania CIS 565 2020 Project 1
"Efficient Neighbor Searching for Agent-based
Simulation on GPU"

Boid Simulation

Rule 1: Separation

Separation Pseudocode

def seperation(boid: Boid, boids: Boid[]) {
  c: Vec3 = 0

  for (b : boids) {
    if b != boid and distance(b, boid) < seperation_distance
      c -= (neighbor.position - boid.position)
  }

  return c * seperation_scale
}

Rule 2: Alignment

Alignment Pseudocode

def alignment(boid: Boid, boids: Boid[]) {
  perceived_velocity: Vec3 = 0
  neighbors_count = 0

  for (b : boids) {
    if b != boid and distance(b, boid) < alignment_distance
      perceived_velocity += b.velocity - boid.velocity
      ++neighbors_count
  }
  
  if (neighbors_count >= 0) perceived_velocity /= neighbors_count

  return perceived_velocity * alignment_scale
}

Rule 3: Cohesion

Cohesion Pseudocode

def cohesion(boid: Boid, boids: Boid[]) {
  center_of_mass: Vec3 = 0
  neighbors_count = 0

  for (b : boids) {
    if b != boid and distance(b, boid) < cohesion_distance
      center_of_mass += b.position
      ++neighbors_count
  }
  
  if (neighbors_count > 0) {
    center_of_mass /= neighbors_count
    return (center_of_mass - boid.position) * cohesion_scale
  }

  return 0
}

Naive Implementation: Pseudocode

def step_naive(pos, vel1, vel2) {
  for parallel (boid : boids) {
    new_vel = vel1[boid];
    for (others : boids) {
      // Accumulate for new_vel
    }
    vel2[boid] = new_vel;
  }

  update_pos(pos, vel2);
  
  swap(vel1, vel2);
}

Naive Implementation: Pseudocode

def step_naive(pos, vel1, vel2) {
  for parallel (boid : boids) {
    new_vel = vel1[boid];
    for (others : boids) {
      // Accumulate for new_vel
    }
    vel2[boid] = new_vel;
  }

  update_pos(pos, vel2);
  
  swap(vel1, vel2);
}

Explicit Euler

Naive Implementation: Pseudocode

def step_naive(pos, vel1, vel2) {
  for parallel (boid : boids) {
    new_vel = vel1[boid];
    for (others : boids) {
      // Accumulate for new_vel
    }
    vel2[boid] = new_vel;
  }

  update_pos(pos, vel2);
  
  swap(vel1, vel2);
}

Buffer Ping-Ponging

Uniform Grid

Grid Search

How to store the grid?

Sort boids according to grid

Sort:

Thrust Library to Rescue

thrust::sort_by_key(grid_indices,
                    grid_indices + objects_count,
                    boid_indices);

Sort:

Note: Data not sorted

Sort:

Indirection:

The first and last boid of each grid

Uniform Grid: Pseudocode

def step_uniform_grid(pos, vel1, vel2) {
  indices, grid_indices = compute_indices();
  
  sort_by_key(grid_indices, indices);
  
  grid_first, grid_last = identify_first_last(grid_indices);

  
  for parallel (boid : boids) {
	neighbor_grid_cells = calculate_neighbor_grid_cells(boid);

    new_vel = vel1[boid];
    for (cell : neighbor_grid_cells) {
      for (neighbor from grid_first[cell] to grid_last[cell]) {
        neighbor_pos = pos[indices[neighbor]];
        neighbor_vel = vel[indices[neighbor]];
        // Accumulate for new_vel
      }
    }
    vel2[boid] = new_vel;
  }

  update_pos(pos, vel2);
  swap(vel1, vel2);
}

Cutting the middleman

thrust::copy(pos, pos + objects_count, pos_sorted);
thrust::copy(vel, vel + objects_count, vel_sorted);
thrust::sort_by_key(grid_indices,
                    grid_indices + objects_count,
                    pos_sorted,
                    vel_sorted);

Cutting the middleman

thrust::copy(pos, pos + objects_count, pos_sorted);
thrust::copy(vel, vel + objects_count, vel_sorted);
thrust::sort_by_key(grid_indices,
                    grid_indices + objects_count,
                    pos_sorted,
                    vel_sorted);

Cutting the middleman

thrust::sort_by_key(grid_indices,
                    grid_indices + objects_count,
                    indices);
thrust::gather(indices, indices + objects_count, pos, pos_sorted);
thrust::gather(indices, indices + objects_count, vel, vel_sorted);

Coherent Grid: Pseudocode

def step_coherent_grid(pos, vel1, vel2) {
  indices, grid_indices = compute_indices();
  
  pos_sorted, vel_sorted = pos, vel1
  sort_by_key(grid_indices, pos_sorted, vel_sorted);
  
  grid_first, grid_last = identify_first_last(grid_indices);

  
  for parallel (boid : boids) {
	neighbor_grid_cells = calculate_neighbor_grid_cells(boid);

    new_vel = vel1[boid];
    for (cell : neighbor_grid_cells) {
      for (neighbor from grid_first[cell] to grid_last[cell]) {
        neighbor_pos = pos_sorted[neighbor];
        neighbor_vel = vel_sorted[neighbor];
        // Accumulate for new_vel
      }
    }
    vel2[boid] = new_vel;
  }

  update_pos(pos, vel2);
  swap(vel1, vel2);
  swap(pos, pos_sorted);
}

Shared-memory Optimization

CUDA Thread Hierarchy

Credit: docs.nvidia.com/cuda/cuda-c-programming-guide/graphics/grid-of-thread-blocks.png

CUDA Memory

Credit: https://www.3dgep.com/cuda-memory-model/

CUDA Memory

Slow

Credit: https://www.3dgep.com/cuda-memory-model/

CUDA Memory

Slow

Credit: https://www.3dgep.com/cuda-memory-model/

Faster (Read-Only on Device)

CUDA Memory

Slow

Credit: https://www.3dgep.com/cuda-memory-model/

Faster (Read-Only on Device)

Fast

shovel all neighbor grids of a warp to shared memory

Shared Grid Pseudocode

def step_shared_grid(pos, vel1, vel2) {
  // same

  for parallel (warp: warps) {
    while (not process all neighbors) {
    
      // calculate all neighbor grid cells in a wrap
      
      // copy some boids in neighbor cells to shared memory based on capacity

      for parallel boids in wrap {
        for (neighbor from shared_first to shared_last) {
          // Accumulate for new_vel
        }
        vel2[boid] = new_vel;
      }
    }
 }
  
  // same
}

Live Demo

Graphics Programming Virtual Meetup