MAPS: GPU Memory Abstraction and Optimization Framework
MAPS Framework


MAPS Showcase



Showcase
  • 2-Dimensional Convolution
    __constant__ float kConvKernel[(2 * RADIUS + 1) * (2 * RADIUS + 1)];
    
    __global__ void Conv2(maps::Window2DSingleGPU<float, BW, BH, RADIUS> in,
                          maps::StructuredInjective2DSingleGPU<float, BW, BH> out) {
        MAPS_INIT(in, out);
        
        
        MAPS_FOREACH(oiter, out) {
            int i = 0;
            *oiter = 0.0f;
            
            
            MAPS_FOREACH_ALIGNED(iter, in, oiter) {
                *oiter += *iter * kConvKernel[i++];
            }
        }
        
        out.commit();
    }
    __constant__ float kConvKernel[(2 * RADIUS + 1) * (2 * RADIUS + 1)];
    
    __global__ void Conv2 MAPS_MULTIDEF(maps::Window2D<float, BW, BH, RADIUS> in,
                                        maps::StructuredInjective2D<float, BW, BH> out) {
        MAPS_MULTI_INITVARS(in, out);
        
        
        MAPS_FOREACH(oiter, out) {
            int i = 0;
            *oiter = 0.0f;
            
            
            MAPS_FOREACH_ALIGNED(iter, in, oiter) {
                *oiter += *iter * kConvKernel[i++];
            }
        }
        
        out.commit();
    }
  • Jacobi (with multiple elements per thread)
    template <typename T, int IPX, int IPY>
    __global__ void JacobiMAPSKernel(maps::WindowSingleGPU<T, 2, BW, BH, 1, 1, IPX, IPY> inFrame, 
                                     maps::StructuredInjectiveOutputSingleGPU<T, 2, BW, BH, 1, IPX, IPY> outFrame) {
        MAPS_INIT(inFrame, outFrame);
    
        // If there are no items to write, return
        if (outFrame.Items() == 0)
            return;
    
        
        MAPS_FOREACH(oiter, outFrame) {
            *oiter = (inFrame.aligned_at(oiter, 0, -1) +
                      inFrame.aligned_at(oiter, -1, 0) +
                      inFrame.aligned_at(oiter,  1, 0) +
                      inFrame.aligned_at(oiter, 0,  1)) / T(4);
        }
    
        outFrame.commit();
    }
    template <typename T, int IPX, int IPY>
    __global__ void JacobiMAPSKernel MAPS_MULTIDEF(maps::Window<T, 2, BW, BH, 1, 1, IPX, IPY> inFrame, 
                                                   maps::StructuredInjectiveOutput<T, 2, BW, BH, 1, IPX, IPY> outFrame) {
        MAPS_MULTI_INITVARS(inFrame, outFrame);
    
        // If there are no items to write, return
        if (outFrame.Items() == 0)
            return;
    
        
        MAPS_FOREACH(oiter, outFrame) {
            *oiter = (inFrame.aligned_at(oiter, 0, -1) +
                      inFrame.aligned_at(oiter, -1, 0) +
                      inFrame.aligned_at(oiter,  1, 0) +
                      inFrame.aligned_at(oiter, 0,  1)) / T(4);
        }
    
        outFrame.commit();
    }
  • Histogram
    template <typename T, typename BinT, int BINS>
    __global__ void Histogram(maps::Window2DSingleGPU<T, BW, BH, 0> in,
                              maps::ReductiveStaticSingleGPU<BinT, BW*BH, BINS> out) {
        MAPS_INIT(in, out);
        
        
        MAPS_FOREACH(oiter, out) {
            int bin = *iter.align(oiter);
            ++oiter[bin];
        }
        
        out.commit();
    }
    template <typename T, typename BinT, int BINS>
    __global__ void Histogram MAPS_MULTIDEF(maps::Window2D<T, BW, BH, 0> in,
                                            maps::ReductiveStatic<BinT, BW*BH, BINS> out) {
        MAPS_MULTI_INITVARS(in, out);
        
        
        MAPS_FOREACH(oiter, out) {
            int bin = *iter.align(oiter);
            ++oiter[bin];
        }
        
        out.commit();
    }
  • Game of Life (Advanced API)
    template <typename T, int IPX, int IPY>
    __global__ void GoLMAPSKernel(maps::WindowSingleGPU<T, 2, BW, BH, 1, 1, IPX, IPY> current_gen, 
                                  maps::StructuredInjectiveOutputSingleGPU<T, 2, BW, BH, 1, IPX, IPY> next_gen) {
        
        
        
        // Allocate shared memory for containers
        __shared__ typename decltype(current_gen)::SharedData sdata;
        __shared__ typename decltype(next_gen)::SharedData osdata;
        
        // Initialize containers (pre-synchronization)
        current_gen.init_async(sdata);
        next_gen.init_async(osdata);
        
        // Synchronize thread-blocks
        __syncthreads();
        
        // Initialize containers (post-sync)
        current_gen.init_async_postsync();
        next_gen.init_async_postsync();
    
        // If there are no items to write, return
        if (next_gen.Items() == 0)
            return;
    
        
        MAPS_FOREACH(oiter, next_gen) {
            int numLiveNeighbors = 0;
            int isLive;
    
            // Determine number of live neighbors
            
            MAPS_FOREACH_ALIGNED(iter, current_gen, oiter) {
                if (iter.index() == 4)
                    isLive = *iter;
                else
                    numLiveNeighbors += *iter;
            }
    
            // Game of Life conditions
            if (isLive) {
                if (numLiveNeighbors < 2 || numLiveNeighbors > 3)
                    isLive = 0;
            } else {
                if (numLiveNeighbors == 3)
                    isLive = 1;
            }
    
            // Fill output cell
            *oiter = isLive;
        }
    
        next_gen.commit();
    }
    template <typename T, int IPX, int IPY>
    __global__ void GoLMAPSKernel MAPS_MULTIDEF(maps::Window<T, 2, BW, BH, 1, 1, IPX, IPY> current_gen, 
                                                maps::StructuredInjectiveOutput<T, 2, BW, BH, 1, IPX, IPY> next_gen) {
        // Initialize multi-GPU device abstraction
        MAPS_MULTI_INIT();
        
        // Allocate shared memory for containers
        __shared__ typename decltype(current_gen)::SharedData sdata;
        __shared__ typename decltype(next_gen)::SharedData osdata;
        
        // Initialize containers (pre-synchronization)
        current_gen.init_async(sdata);
        next_gen.init_async(osdata);
        
        // Synchronize thread-blocks
        __syncthreads();
        
        // Initialize containers (post-sync)
        current_gen.init_async_postsync();
        next_gen.init_async_postsync();
    
        // If there are no items to write, return
        if (next_gen.Items() == 0)
            return;
    
        
        MAPS_FOREACH(oiter, next_gen) {
            int numLiveNeighbors = 0;
            int isLive;
    
            // Determine number of live neighbors
            
            MAPS_FOREACH_ALIGNED(iter, current_gen, oiter) {
                if (iter.index() == 4)
                    isLive = *iter;
                else
                    numLiveNeighbors += *iter;
            }
    
            // Game of Life conditions
            if (isLive) {
                if (numLiveNeighbors < 2 || numLiveNeighbors > 3)
                    isLive = 0;
            } else {
                if (numLiveNeighbors == 3)
                    isLive = 1;
            }
    
            // Fill output cell
            *oiter = isLive;
        }
    
        next_gen.commit();
    }
  • N-Body Simulation
    template <int IPX>
    __global__ void NBodyTimeStep MAPS_MULTIDEF(maps::Window1D<float4, BLOCK_WIDTH, 0, IPX> bodies_in,
                                                maps::Block1D<float4, 0, BLOCK_WIDTH, IPX> other_bodies,
                                                maps::StructuredInjective1D<float4, BLOCK_WIDTH, IPX> outBodies,
                                                maps::Window1D<float4, BLOCK_WIDTH, 0, IPX> velocity_in,
                                                maps::StructuredInjective1D<float4, BLOCK_WIDTH, IPX> velocity_out,
                                                float deltaTime) {
        MAPS_MULTI_INITVARS(bodies_in, other_bodies, outBodies, velocity_in, velocity_out);
        
        
        MAPS_FOREACH(oiter, outBodies)          // Initialize outputs
            *oiter = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        
        do {                                    // Compute forces of all bodies
            
            MAPS_FOREACH(oiter, outBodies) {
                
                MAPS_FOREACH_ALIGNED(iter, other_bodies, oiter)
                    bodyBodyInteraction<float>(*oiter, *bodies_in.align(oiter), *iter);
            }
    
            other_bodies.nextChunk();
        } while (!other_bodies.isDone());
    
        auto viter = velocity_out.begin();
        
        MAPS_FOREACH(oiter, outBodies) {        // Integrate: compute outputs from acceleration
            float4 vel = *velocity_in.align(oiter);
    
            vel.x += (*oiter).x * deltaTime;    // Compute velocity
            vel.y += (*oiter).y * deltaTime;
            vel.z += (*oiter).z * deltaTime;
            vel *= DAMPING_FACTOR;
            
            *oiter = *bodies_in.align(oiter);   // Get current position
            (*oiter).x += vel.x * deltaTime;    // Update position
            (*oiter).y += vel.y * deltaTime;
            (*oiter).z += vel.z * deltaTime;
            
            *viter = vel;                       // Write velocity and increment iterator
            ++viter;
        }
        
        velocity_out.commit();                  // Commit velocity and positions
        outBodies.commit();
    }
  • Matrix Multiplication
    template <typename T>
    __global__ void GEMMKernel(MAPS_MULTIDEF2,
                               maps::Block2D<T, 0, BLOCK_WIDTH, BLOCK_HEIGHT> A,
                               maps::Block2D<T, 1, BLOCK_WIDTH, BLOCK_HEIGHT> B,
                               maps::StructuredInjective2D<T, BLOCK_WIDTH, BLOCK_HEIGHT> C) {
        MAPS_MULTI_INITVARS(A, B, C);
        *C.begin() = T(0);
    
        do {
            
            MAPS_FOREACH(oiter, C) {            
                
                auto B_iter = B.align(oiter);      // Initialize B's iterator in addition to A
                
                MAPS_FOREACH_ALIGNED(A_iter, A, oiter) {
                    *oiter += (*A_iter) * (*B_iter);
                    ++B_iter;
                }
            }
            maps::NextChunkAll(A, B);              // Efficiently advance chunk on both containers
        } while (!A.isDone());
    
        if (C.Items() > 0)                         // Output results
            C.commit();
    }
  • Matrix Multiplication (CUBLAS)
    
    bool SGEMMRoutine(void *context, int deviceIdx, cudaStream_t stream,
                      const maps::multi::GridSegment& task_segment,
                      const std::vector<void *>& parameters,
                      const std::vector<maps::multi::DatumSegment>& container_segments,
                      const std::vector<maps::multi::DatumSegment>& container_allocation) {
        CUBLASContext *c = (CUBLASContext *)context;    
        
        float alpha, beta;
        maps::GetConstantParameter(parameters[3], alpha); // Obtain constants
        maps::GetConstantParameter(parameters[4], beta);
    
        int m, n, k;
        m = container_segments[0].m_dimensions[0]; // Compute matrix segment dimensions
        n = container_segments[1].m_dimensions[0];
        k = container_segments[2].m_dimensions[1];
    
        // Set GPU stream and call actual kernel
        CUBLAS_CHECK(cublasSetStream(c->handles[deviceIdx], stream));
        CUBLAS_CHECK(cublasSgemm(c->handles[deviceIdx], CUBLAS_OP_N, CUBLAS_OP_N, m, k, n, &alpha, 
                                 (float *)parameters[0], 
                                 container_segments[0].m_stride_bytes / sizeof(float), 
                                 (float *)parameters[1], 
                                 container_segments[1].m_stride_bytes / sizeof(float), &beta, 
                                 (float *)parameters[2],
                                 container_segments[2].m_stride_bytes / sizeof(float)));
        return true;
    }
    
    sched.InvokeUnmodified(SGEMMRoutine, &context, dim3(),
                           maps::multi::Block2DUnmodified<0, float>(A),
                           maps::multi::Block2DUnmodified<1, float>(B),
                           maps::multi::StructuredInjective2D<float>(C), 
                           alpha, beta);
  • Sparse Matrix-Vector Multiplication (SpMV)
    template <typename T>
    __global__ void SpMVMapsMultiKernel MAPS_MULTIDEF(maps::Adjacency<T, T> in_graph,
                                                      maps::StructuredInjective1D<T, BLOCK_WIDTH> out) {
        MAPS_MULTI_INITVARS(in_graph, out);
    
        if (out.Items() == 0)
            return;
        T result = T(0);
        
        const auto eit = in_graph.end();
        for (auto it = in_graph.begin(); it != eit; ++it) {
            result +=
                (*it).edge_weight *           // Matrix value 
                (*it).adjacent_node_value;    // Vector value
        }
        
        *out.begin() = result;                // Write and commit output
        out.commit();
    }