以下程序(来源见此处)在内核中使用循环将两个数组相加:
#include <iostream>
#include <math.h>
// Element-wise accumulation on the device: y[i] = x[i] + y[i] for 0 <= i < n.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global
// thread index and then advances by the total number of threads in the grid
// (a grid-stride loop), so every element below n is visited regardless of
// how many blocks/threads were launched.
//
// n - number of elements to process
// x - input array, read only by this kernel
// y - input/output array, updated in place
__global__ void add(int n, float *x, float *y)
{
  const int first = blockIdx.x * blockDim.x + threadIdx.x;
  const int step = blockDim.x * gridDim.x;

  int pos = first;
  while (pos < n) {
    y[pos] = x[pos] + y[pos];
    pos += step;
  }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two 1M-element arrays on the GPU using Unified Memory and verifies
// that every result element equals 3.0f.
// Returns 0 on success and 1 if any CUDA runtime call or the launch fails.
int main(void)
{
  int N = 1<<20;
  float *x = 0, *y = 0;

  // Allocate Unified Memory - accessible from CPU or GPU
  bool ok = checkCuda(cudaMallocManaged(&x, N*sizeof(float)), "cudaMallocManaged(x)")
         && checkCuda(cudaMallocManaged(&y, N*sizeof(float)), "cudaMallocManaged(y)");

  if (ok) {
    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
      x[i] = 1.0f;
      y[i] = 2.0f;
    }

    // Run kernel on 1M elements on the GPU, one thread per element
    // (ceil-division so a partial last block is still launched)
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(N, x, y);

    // A launch does not return a status: configuration errors are read with
    // cudaGetLastError(), execution errors surface at the synchronization.
    // The synchronization is also required before the host reads y.
    ok = checkCuda(cudaGetLastError(), "add kernel launch")
      && checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }

  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory on every path; cudaFree accepts a null pointer
  cudaFree(x);
  cudaFree(y);
  return ok ? 0 : 1;
}
.
另一方面,以下程序的内核中没有使用循环:
#include <iostream>
#include <cuda_runtime.h>
#define ARRAY_SIZE 512
// Element-wise accumulation on the device: y[i] = x[i] + y[i] for 0 <= i < n.
//
// Launch layout: 1D grid of 1D blocks with at least n threads in total.
// Each thread handles exactly one element, the one at its global thread
// index; threads whose index falls at or beyond n do nothing.
//
// n - number of elements to process
// x - input array, read only by this kernel
// y - input/output array, updated in place
__global__ void add(int n, float *x, float *y)
{
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;

  // Surplus threads in the last block have no element to work on.
  if (tid >= n)
    return;

  y[tid] = x[tid] + y[tid];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two ARRAY_SIZE-element arrays on the GPU using explicit device
// allocations and host<->device copies, then prints every sum.
// Returns 0 on success and 1 if any CUDA runtime call or the launch fails.
int main()
{
  const size_t bytes = ARRAY_SIZE * sizeof(float);

  // Initialize the arrays on the host (CPU)
  float host_a[ARRAY_SIZE], host_b[ARRAY_SIZE];
  for (int i = 0; i < ARRAY_SIZE; i++) {
    host_a[i] = i;
    host_b[i] = i * 2;
  }

  // Allocate memory on the GPU for the arrays, then copy the inputs from
  // the host (CPU) to the device (GPU); stop at the first failure
  float *a = 0, *b = 0;
  bool ok = cudaSucceeded(cudaMalloc(&a, bytes), "cudaMalloc(a)")
         && cudaSucceeded(cudaMalloc(&b, bytes), "cudaMalloc(b)")
         && cudaSucceeded(cudaMemcpy(a, host_a, bytes, cudaMemcpyHostToDevice),
                          "cudaMemcpy(host_a -> a)")
         && cudaSucceeded(cudaMemcpy(b, host_b, bytes, cudaMemcpyHostToDevice),
                          "cudaMemcpy(host_b -> b)");

  float host_result[ARRAY_SIZE];
  if (ok) {
    // Launch a kernel to add the arrays, one thread per element
    // (ceil-division so a partial last block is still launched)
    int threadsPerBlock = 256;
    int blocksPerGrid = (ARRAY_SIZE + threadsPerBlock - 1) / threadsPerBlock;
    add<<<blocksPerGrid, threadsPerBlock>>>(ARRAY_SIZE, a, b);

    // A launch does not return a status; read it with cudaGetLastError().
    // Then copy the result from the device (GPU) to the host (CPU). This
    // blocking copy is issued on the same default stream as the kernel, so
    // it runs after the kernel has finished and reports execution errors.
    ok = cudaSucceeded(cudaGetLastError(), "add kernel launch")
      && cudaSucceeded(cudaMemcpy(host_result, b, bytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(b -> host_result)");
  }

  if (ok) {
    // Print the result
    for (int i = 0; i < ARRAY_SIZE; i++) {
      std::cout << host_a[i] << " + " << host_b[i] << " = " << host_result[i] << std::endl;
    }
  }

  // Free the memory on the GPU on every path; cudaFree accepts a null pointer
  cudaFree(a);
  cudaFree(b);
  return ok ? 0 : 1;
}
然而,这两个程序得到了相同的效果(都正确完成了数组的逐元素相加)。
请解释其原因以及各自的工作方式。