ICS-E4020: OpenMP vs. CUDA

Four examples

Here are four code snippets that are roughly equivalent. Each of them makes the following function calls, each exactly once, in some order and possibly in parallel:

    foo(0)
    foo(1)
    ...
    foo(55554)

OpenMP: 55555 iterations of a parallel for loop

int main() {
    // Number of loop iterations, i.e. foo(0) ... foo(count - 1).
    const int count = 55555;

    // Start a team of threads, then share the loop iterations among them.
    // This is the long form of the combined "parallel for" directive.
    #pragma omp parallel
    {
        #pragma omp for
        for (int idx = 0; idx < count; idx++) {
            foo(idx);
        }
    }
}

OpenMP: 55555 tasks

int main() {
    // A team of threads is started, but only one of them runs the loop
    // that creates the 55555 tasks; any thread of the team may then
    // execute the tasks.
    #pragma omp parallel
    {
        #pragma omp single
        for (int idx = 0; idx < 55555; idx++) {
            // Each task captures the value the index had when the task
            // was created.
            #pragma omp task firstprivate(idx)
            {
                foo(idx);
            }
        }
    }
}

CUDA: 1 thread per block, 55555 blocks

// Calls foo with the x-index of the thread block that runs it.
// Meant for a 1D launch with a single thread per block, as in
// kernel<<<55555, 1>>>(): with more threads per block, every thread of a
// block would call foo with the same argument.
__global__ void kernel() {
    int blockIndex = blockIdx.x;
    foo(blockIndex);
}

// Launches 55555 blocks of one thread each and waits for them to finish.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main() {
    kernel<<<55555, 1>>>();

    // A kernel launch returns nothing; an invalid launch configuration is
    // only reported through cudaGetLastError().
    if (cudaGetLastError() != cudaSuccess) {
        return 1;
    }

    // The launch is asynchronous: wait for the kernel to finish, and pick
    // up any error that occurred while it was running.
    if (cudaDeviceSynchronize() != cudaSuccess) {
        return 1;
    }
    return 0;
}

CUDA: 100 threads per block, 556 blocks

// Calls foo(i) where i is the global index of the calling thread, for
// 0 <= i < 55555. Meant for a 1D launch with at least 55555 threads in
// total, e.g. kernel<<<556, 100>>>().
__global__ void kernel() {
    // Read the block size from the launch configuration instead of
    // hard-coding it, so the index stays correct if the number of threads
    // per block is changed at the launch site.
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid may contain more threads than there is work (556 * 100 is
    // more than 55555); the surplus threads do nothing.
    if (i < 55555) {
        foo(i);
    }
}

// Launches 556 blocks of 100 threads each and waits for them to finish.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main() {
    const int total = 55555;
    const int threadsPerBlock = 100;
    // Round up so that the last, partially used block is included:
    // (55555 + 99) / 100 = 556 blocks.
    const int blocks = (total + threadsPerBlock - 1) / threadsPerBlock;

    kernel<<<blocks, threadsPerBlock>>>();

    // A kernel launch returns nothing; an invalid launch configuration is
    // only reported through cudaGetLastError().
    if (cudaGetLastError() != cudaSuccess) {
        return 1;
    }

    // The launch is asynchronous: wait for the kernel to finish, and pick
    // up any error that occurred while it was running.
    if (cudaDeviceSynchronize() != cudaSuccess) {
        return 1;
    }
    return 0;
}