## ¶ 簡介

CPU 與 GPU 在計算浮點數 (floating-point number) 時會有差異。

// Mandelbrot-style divergence counter (CPU / host-compiled version).
// Iterates z = z^2 + c over the complex constant c = (c_re, c_im), starting
// from z = c, and returns how many iterations (capped at `max`) ran before
// |z|^2 exceeded 4 — the classic escape test (|z| > 2 implies divergence).
// NOTE: the expression order here is intentional and must not be rearranged;
// the article compares this bit-for-bit against an identical GPU version, and
// reassociating float arithmetic can change the result.
int diverge_cpu(float c_re, float c_im, int max)
{
float z_re = c_re, z_im = c_im;
int i;
for (i = 0; i < max; ++i)
{

// |z|^2 > 4 means |z| > 2: the orbit has escaped, stop counting.
if (z_re * z_re + z_im * z_im > 4.f)
break;

// z = z^2 + c, using (a + bi)^2 = (a^2 - b^2) + 2abi.
float new_re = z_re * z_re - z_im * z_im;
float new_im = 2.f * z_re * z_im;
z_re = c_re + new_re;
z_im = c_im + new_im;
}

// i is the iteration count at which the loop stopped (== max if it never escaped).
return i;
}


## ¶ GPU 與 CPU 浮點數運算差異示例

test.cu

#include <cuda_runtime.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define INPUT_X -0.0612500f
#define INPUT_Y -0.9916667f

// CPU reference: counts iterations of z = z^2 + c (z starting at c) until
// |z|^2 > 4 or `max` iterations elapse, and returns that count.
// Deliberately written token-for-token the same as diverge_gpu below — the
// experiment measures how host vs device float arithmetic diverges on
// identical source, so the statement order must stay untouched.
int diverge_cpu(float c_re, float c_im, int max)
{
float z_re = c_re, z_im = c_im;
int i;
for (i = 0; i < max; ++i)
{

// Escape test: |z| > 2 guarantees the orbit diverges.
if (z_re * z_re + z_im * z_im > 4.f)
break;

// Complex square: (a + bi)^2 = (a^2 - b^2) + 2abi, then add c.
float new_re = z_re * z_re - z_im * z_im;
float new_im = 2.f * z_re * z_im;
z_re = c_re + new_re;
z_im = c_im + new_im;
}

return i;
}

// GPU version: identical source to diverge_cpu above, compiled for the device.
// Counts iterations of z = z^2 + c (z starting at c) until |z|^2 > 4 or `max`
// iterations elapse. The device compiler may contract the float math (e.g.
// into FMAs) differently from the host compiler, which is exactly the
// discrepancy this article demonstrates — do not reorder these statements.
__device__ int diverge_gpu(float c_re, float c_im, int max)
{
float z_re = c_re, z_im = c_im;
int i;
for (i = 0; i < max; ++i)
{

// Escape test: |z| > 2 guarantees the orbit diverges.
if (z_re * z_re + z_im * z_im > 4.f)
break;

// Complex square: (a + bi)^2 = (a^2 - b^2) + 2abi, then add c.
float new_re = z_re * z_re - z_im * z_im;
float new_im = 2.f * z_re * z_im;
z_re = c_re + new_re;
z_im = c_im + new_im;
}

return i;
}

// Fills c[0..n) with the divergence count for the fixed test point
// (INPUT_X, INPUT_Y): every thread computes the same value so the host can
// compare any element against the CPU result.
// Expects a 1-D grid of 1-D blocks; no shared memory.
__global__ void kernel(int *c, int n)
{
    // Flat global thread ID for a 1-D launch.
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: the grid usually launches more threads than there are
    // elements (main launches 1024 threads for n == 100); without this check
    // the extra threads write past the end of the allocation.
    if (id < n)
        c[id] = diverge_gpu(INPUT_X, INPUT_Y, 256);
}

// Abort-on-error helper: every CUDA runtime call returns a cudaError_t, and a
// sticky error from an unchecked call makes every later call fail mysteriously.
#define CHECK_CUDA(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

int main(int argc, char *argv[])
{
    int n = 100;

    // One thread per element, rounded up so every element is covered.
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;
    // Size the device buffer for EVERY launched thread, not just n: the grid
    // launches gridSize * blockSize threads, so this stays in bounds even if
    // the kernel writes one slot per thread without an id < n guard.
    int totalThreads = gridSize * blockSize;

    int *h_c = (int *)malloc(n * sizeof(int));
    if (h_c == NULL) {
        fprintf(stderr, "host malloc of %zu bytes failed\n", n * sizeof(int));
        return EXIT_FAILURE;
    }

    int *d_c = NULL;
    CHECK_CUDA(cudaMalloc(&d_c, totalThreads * sizeof(int)));

    // GPU side: every thread computes the same divergence count.
    kernel<<<gridSize, blockSize>>>(d_c, n);
    CHECK_CUDA(cudaGetLastError()); // catches bad launch configuration
    // cudaMemcpy blocks until the kernel finishes, so it also surfaces any
    // asynchronous execution error.
    CHECK_CUDA(cudaMemcpy(h_c, d_c, n * sizeof(int), cudaMemcpyDeviceToHost));

    // CPU side: same inputs, same source, host-compiled float math.
    int cpu_result = diverge_cpu(INPUT_X, INPUT_Y, 256);

    // The two counts may legitimately differ — that is the point of the demo.
    printf("GPU vs CPU: %d, %d\n", h_c[0], cpu_result);

    CHECK_CUDA(cudaFree(d_c));
    free(h_c);

    return 0;
}


$ nvcc test.cu; ./a.out
GPU vs CPU: 39, 40


## ¶ GPU 與 CPU 浮點數運算的差別

Even in the strict world of IEEE 754 operations, minor details such as organization of parentheses or thread counts can affect the final result. Take this into account when doing comparisons between implementations.

x = (x * y) * z; // 不等於  x *= y * z;
z = (x - y) + y; // 不等於 z = x;
z = x + x * y; // 不等於 z = x * (1.0 + y);
y = x / 5.0; // 不等於 y = x * 0.2;


The consequence is that different math libraries cannot be expected to compute exactly the same result for a given input. This applies to GPU programming as well. Functions compiled for the GPU will use the NVIDIA CUDA math library implementation while functions compiled for the CPU will use the host compiler math library implementation (e.g., glibc on Linux). Because these implementations are independent and neither is guaranteed to be correctly rounded, the results will often differ slightly.

## ¶ 結論

CPU 與 GPU 在計算浮點數時會有差異。這代表我們在處理 CPU 與 GPU 相互資料的時候要特別小心，像是異質運算 kernel 同時會在 CPU 跑和在 GPU 跑時，數據跑出來的結果就有機會不一樣。