代码之家 › 专栏 › 技术社区 › user366312

如何在不使用循环的情况下将两个数组相加?

cuda

user366312 · 技术社区 · 3 年前

以下程序(来源见原文链接)使用循环将两个数组相加:

#include <iostream>
#include <math.h>

// Kernel function to add the elements of two arrays: y[i] = x[i] + y[i]
// for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks. The grid-stride loop lets any grid
// size cover all n elements (each thread handles indices index, index + stride,
// index + 2 * stride, ...), so the launch does not need one thread per element.
// No shared memory is used. x and y must each hold at least n floats that are
// readable from the device. n <= 0 results in no work.
__global__ void add(int n, float *x, float *y)
{
  // Index math is done in 64 bits: with plain int, "i += stride" can overflow
  // (undefined behavior) once n gets close to INT_MAX.
  const long long stride = (long long)blockDim.x * gridDim.x;
  long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
  for (; i < n; i += stride)
    y[i] = x[i] + y[i];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two arrays of 1<<20 floats on the GPU and prints the largest deviation
// of the result from the expected value 3.0f.
// Returns 0 on success, 1 if any CUDA call fails.
int main(void)
{
  int N = 1<<20;
  // Start as null so the cudaFree calls at the end are harmless no-ops
  // if an allocation fails.
  float *x = 0, *y = 0;

  // Allocate Unified Memory - accessible from CPU or GPU
  bool ok = checkCuda(cudaMallocManaged(&x, N*sizeof(float)), "cudaMallocManaged(x)") &&
            checkCuda(cudaMallocManaged(&y, N*sizeof(float)), "cudaMallocManaged(y)");

  if (ok)
  {
    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++)
    {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }

    // Run kernel on 1M elements on the GPU
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;  // ceil(N / blockSize)
    add<<<numBlocks, blockSize>>>(N, x, y);

    // A kernel launch returns no status itself: cudaGetLastError reports an
    // invalid launch, and cudaDeviceSynchronize both waits for the GPU to
    // finish (required before reading y on the host) and reports faults
    // that happened while the kernel was running.
    ok = checkCuda(cudaGetLastError(), "add kernel launch") &&
         checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }

  if (ok)
  {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return ok ? 0 : 1;
}

另一方面,以下程序没有使用循环:

#include <iostream>
#include <cuda_runtime.h>

#define ARRAY_SIZE 512

// Element-wise add without a loop: each thread updates at most one element,
// y[gid] = x[gid] + y[gid].
//
// Launch layout: 1D grid of 1D blocks with at least n threads in total;
// threads whose global index falls at or beyond n do nothing.
// No shared memory is used.
__global__ void add(int n, float *x, float *y)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have no element to process.
    if (gid >= n) {
        return;
    }

    y[gid] += x[gid];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two ARRAY_SIZE-element arrays on the GPU using explicit device
// allocations and host<->device copies, then prints every sum.
// Returns 0 on success, 1 if any CUDA call fails (nothing is printed then).
int main()
{
    const size_t bytes = ARRAY_SIZE * sizeof(float);

    // Initialize the arrays on the host (CPU)
    float host_a[ARRAY_SIZE], host_b[ARRAY_SIZE];
    float host_result[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        host_a[i] = i;
        host_b[i] = i * 2;
    }

    // Allocate memory on the GPU for the arrays. The pointers start as null
    // so the cudaFree calls at the end are harmless no-ops after a failure.
    float *a = 0, *b = 0;
    bool ok = checkCuda(cudaMalloc(&a, bytes), "cudaMalloc(a)") &&
              checkCuda(cudaMalloc(&b, bytes), "cudaMalloc(b)");

    // Copy the arrays from the host (CPU) to the device (GPU)
    ok = ok &&
         checkCuda(cudaMemcpy(a, host_a, bytes, cudaMemcpyHostToDevice), "copy host_a to device") &&
         checkCuda(cudaMemcpy(b, host_b, bytes, cudaMemcpyHostToDevice), "copy host_b to device");

    if (ok)
    {
        // Launch a kernel to add the arrays: one thread per element,
        // rounding the block count up so every element is covered.
        int threadsPerBlock = 256;
        int blocksPerGrid = (ARRAY_SIZE + threadsPerBlock - 1) / threadsPerBlock;
        add<<<blocksPerGrid, threadsPerBlock>>>(ARRAY_SIZE, a, b);

        // A kernel launch returns no status itself; an invalid launch is
        // only visible through cudaGetLastError.
        ok = checkCuda(cudaGetLastError(), "add kernel launch");
    }

    // Copy the result from the device (GPU) to the host (CPU).
    // The kernel wrote its sums into b. Errors raised while the kernel was
    // running are reported by this call.
    ok = ok &&
         checkCuda(cudaMemcpy(host_result, b, bytes, cudaMemcpyDeviceToHost), "copy result to host");

    if (ok)
    {
        // Print the result (host_b still holds the original inputs)
        for (int i = 0; i < ARRAY_SIZE; i++) {
            std::cout << host_a[i] << " + " << host_b[i] << " = " << host_result[i] << std::endl;
        }
    }

    // Free the memory on the GPU
    cudaFree(a);
    cudaFree(b);

    return ok ? 0 : 1;
}

然而,它们实现了相同的结果。

请解释原因和方式。

1 回复 | 直到 3 年前

Robert Crovella 3 年前

CUDA 的设计思路是通过启动大量线程来并行完成工作。“每个线程要做什么”由内核代码来定义。

没有循环的内核代码让每个线程只更新一个输出点,因此,为了覆盖所有输出点,您需要启动(至少)与输出点数量一样多的线程。

带循环的内核代码可以让单个线程负责多个输出点的计算。因此,它可以用更少的线程完成相同的“工作”。

这篇关于 grid-stride loop(网格步长循环)的资料讨论了这两种确定网格大小的方法之间的典型关系。CUDA 网格(grid)是指某一次内核启动所启动的全部线程的集合。

您所展示的带循环的示例实际上使用的就是网格步长循环,这种设计可以用几乎任意数量的线程来完成一份固定的工作量。

如果想系统地入门 CUDA,您可能会对这个培训课程感兴趣。