cuda加速nbody问题

大三下学期分布式并行计算第二次实验。

串行版本：

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"
#include <cuda_runtime.h>
 
#define SOFTENING 1e-9f
 
/*
 * Each body contains x, y, and z coordinate positions,
 * as well as velocities in the x, y, and z directions.
 */
 
typedef struct { float x, y, z, vx, vy, vz; } Body;
 
/*
 * Do not modify this function. A constraint of this exercise is
 * that it remain a host function.
 */
 
void randomizeBodies(float *data, int n) {
  for (int i = 0; i < n; i++) {
    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
  }
}
 
/*
 * This function calculates the gravitational impact of all bodies in the system
 * on all others, but does not update their positions.
 */
 
void bodyForce(Body *p, float dt, int n) {
  for (int i = 0; i < n; ++i) {
    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
 
    for (int j = 0; j < n; j++) {
      float dx = p[j].x - p[i].x;
      float dy = p[j].y - p[i].y;
      float dz = p[j].z - p[i].z;
      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
      float invDist = rsqrtf(distSqr);
      float invDist3 = invDist * invDist * invDist;
 
      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
    }
 
    p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
  }
}
 
int main(const int argc, const char** argv) {
 
  /*
   * Do not change the value for `nBodies` here. If you would like to modify it,
   * pass values into the command line.
   */
  
  int nBodies = 2<<11;
  int salt = 0;
  if (argc > 1) nBodies = 2<<atoi(argv[1]);
 
  /*
   * This salt is for assessment reasons. Tampering with it will result in automatic failure.
   */
 
  if (argc > 2) salt = atoi(argv[2]);
 
  const float dt = 0.01f; // time step
  const int nIters = 10;  // simulation iterations
 
  int bytes = nBodies * sizeof(Body);
  float *buf;
 
  buf = (float *)malloc(bytes);
 
  Body *p = (Body*)buf;
 
  /*
   * As a constraint of this exercise, `randomizeBodies` must remain a host function.
   */
 
  randomizeBodies(buf, 6 * nBodies); // Init pos / vel data
 
  double totalTime = 0.0;
 
  /*
   * This simulation will run for 10 cycles of time, calculating gravitational
   * interaction amongst bodies, and adjusting their positions to reflect.
   */
 
  /*******************************************************************/
  // Do not modify these 2 lines of code.
  for (int iter = 0; iter < nIters; iter++) {
    StartTimer();
  /*******************************************************************/
 
  /*
   * You will likely wish to refactor the work being done in `bodyForce`,
   * as well as the work to integrate the positions.
   */
 
    bodyForce(p, dt, nBodies); // compute interbody forces
 
  /*
   * This position integration cannot occur until this round of `bodyForce` has completed.
   * Also, the next round of `bodyForce` cannot begin until the integration is complete.
   */
 
    for (int i = 0 ; i < nBodies; i++) { // integrate position
      p[i].x += p[i].vx*dt;
      p[i].y += p[i].vy*dt;
      p[i].z += p[i].vz*dt;
    }
 
  /*******************************************************************/
  // Do not modify the code in this section.
    const double tElapsed = GetTimer() / 1000.0;
    totalTime += tElapsed;
  }
 
  double avgTime = totalTime / (double)(nIters);
  float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
 
#ifdef ASSESS
  checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
  checkAccuracy(buf, nBodies);
  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
  salt += 1;
#endif
  /*******************************************************************/
 
  /*
   *  Feel free to modify code below.
   */
 
  free(buf);
}

并行版本：


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"
#include <cuda_runtime.h>

#define SOFTENING 1e-9f

/*
 * Each body contains x, y, and z coordinate positions,
 * as well as velocities in the x, y, and z directions.
 */

typedef struct { float x, y, z, vx, vy, vz; } Body;

/*
 * Do not modify this function. A constraint of this exercise is
 * that it remain a host function.
 */

void randomizeBodies(float *data, int n) {
  for (int i = 0; i < n; i++) {
    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
  }
}

/*
 * This function calculates the gravitational impact of all bodies in the system
 * on all others, but does not update their positions.
 */

__global__ void bodyForce(Body *p, float dt, int n) {
        int i=threadIdx.x+blockIdx.x*blockDim.x;
        if(i<n){
                float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
                for (int j = 0; j < n; j++) {
                        float dx = p[j].x - p[i].x;
                        float dy = p[j].y - p[i].y;
                        float dz = p[j].z - p[i].z;
                        float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
                        float invDist = rsqrtf(distSqr);
                        float invDist3 = invDist * invDist * invDist;
                        Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
                }
                p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
        }
}

__global__ void integrate_position(Body *p,float dt,int n){
        int i=threadIdx.x+blockIdx.x*blockDim.x;
        if(i<n){
                // integrate position
                p[i].x += p[i].vx*dt;
                p[i].y += p[i].vy*dt;
                p[i].z += p[i].vz*dt;
        }
}

int main(const int argc, const char** argv) {

  /*
   * Do not change the value for `nBodies` here. If you would like to modify it,
   * pass values into the command line.
   */

  int nBodies = 2<<11;
  int salt = 0;
  if (argc > 1) nBodies = 2<<atoi(argv[1]);

  /*
   * This salt is for assessment reasons. Tampering with it will result in automatic failure.
   */

  if (argc > 2) salt = atoi(argv[2]);

  const float dt = 0.01f; // time step
  const int nIters = 10;  // simulation iterations

  int bytes = nBodies * sizeof(Body);
  float *buf;
  cudaMallocHost((void **)&buf,bytes);

  /*
   * As a constraint of this exercise, `randomizeBodies` must remain a host function.
   */
  randomizeBodies(buf, 6 * nBodies); // Init pos / vel data
  float *d_buf;
  cudaMalloc((void **)&d_buf,bytes);
  Body *d_p=(Body *)d_buf;

  cudaMemcpy(d_buf,buf,bytes,cudaMemcpyHostToDevice);
  double totalTime = 0.0;

  size_t block_size=32;
  size_t block_num=nBodies/block_size;

  /*
   * This simulation will run for 10 cycles of time, calculating gravitational
   * interaction amongst bodies, and adjusting their positions to reflect.
   */

  /*******************************************************************/
  // Do not modify these 2 lines of code.
  for (int iter = 0; iter < nIters; iter++) {
    StartTimer();
  /*******************************************************************/

  /*
   * You will likely wish to refactor the work being done in `bodyForce`,
   * as well as the work to integrate the positions.
   */

    bodyForce<<<block_num,block_size>>>(d_p, dt, nBodies); // compute interbody forces

  /*
   * This position integration cannot occur until this round of `bodyForce` has completed.
   * Also, the next round of `bodyForce` cannot begin until the integration is complete.
   */
   integrate_position<<<block_num,block_size>>>(d_p,dt,nBodies);

   if(iter==nIters-1)
       cudaMemcpy(buf,d_buf,bytes,cudaMemcpyDeviceToHost);

  /*******************************************************************/
  // Do not modify the code in this section.
    const double tElapsed = GetTimer() / 1000.0;
    totalTime += tElapsed;
  }



  double avgTime = totalTime / (double)(nIters);
  float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
  


#ifdef ASSESS
  checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
  checkAccuracy(buf, nBodies);
  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
  salt += 1;
#endif
  /*******************************************************************/

  /*
   * Feel free to modify code below.
   */
  cudaFree(buf);
  cudaFree(d_buf);
}

最终优化：

我们观察发现，bodyForce函数耗时最久，我们考虑优化这个函数。这里我们还是用最常见的循环展开。

if(i<n){
                float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
                for (int j = 0; j < n; j++) {
                        float dx = p[j].x - p[i].x;
                        float dy = p[j].y - p[i].y;
                        float dz = p[j].z - p[i].z;
                        float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
                        float invDist = rsqrtf(distSqr);
                        float invDist3 = invDist * invDist * invDist;
                        Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
                }
                p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
        }

我们考虑将这个循环拆开，拆成以BLOCK_STEP 组，这样子就可以加速计算。[ i , i+BLOCK_SIZE ,i+2*BLOCK_SIZE …] 为一组

如上图所示，我们配置函数 <<< BLOCK_STEP * (n / BLOCK_SIZE), BLOCK_SIZE>>>

让第一列的全部来计算第 [ 0-BLOCK_SIZE）号，第二列计算第 [ BLOCK_SIZE+1，2*BLOCK_SIZE ) 号。每一列是一个BLOCK 分别有 BLOCK_SIZE 个线程。

然后展开上面的循环，让第一列的第一行那个 BLOCK 计算该BLOCK和\{BLOCK，BLOCK+BLOCK_STEP …. ,BLOCK + t* BLOCK_STEP\} 之间的数值。

第 $j$ 列的block计算第 [ BLOCK_SIZE j , BLOCK_SIZE (j+1) ) 星星的数值

第 $i$ 行的block计算对应那个星星分别和该组引力大小。


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"
#include <cuda_runtime.h>

#define SOFTENING 1e-9f
#define BLOCK_SIZE 32
#define BLOCK_STEP 32
/*
 * Each body contains x, y, and z coordinate positions,
 * as well as velocities in the x, y, and z directions.
 */

typedef struct { float x, y, z, vx, vy, vz; } Body;

/*
 * Do not modify this function. A constraint of this exercise is
 * that it remain a host function.
 */

void randomizeBodies(float *data, int n) {
  for (int i = 0; i < n; i++) {
    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
  }
}

/*
 * This function calculates the gravitational impact of all bodies in the system
 * on all others, but does not update their positions.
 */

__global__ void bodyForce(Body *p, float dt, int n,int* flag) {
        int i=threadIdx.x+(blockIdx.x % (n/BLOCK_SIZE))*blockDim.x;
        //int i = threadIdx.x + (int)(blockIdx.x / BLOCK_STEP) * blockDim.x;
        
        Body ptemp = p[i];
        __shared__ float3 spos[BLOCK_SIZE];
        float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
        Body temp;
        
        for(int start_block=int(blockIdx.x / (n/BLOCK_SIZE)) ; start_block<n/BLOCK_SIZE;start_block+=BLOCK_STEP){
            temp=p[threadIdx.x+blockDim.x*start_block ];
            spos[threadIdx.x]=make_float3(temp.x, temp.y, temp.z);
            __syncthreads();

            for(int j=0;j<BLOCK_SIZE;j++){
                float dx = spos[j].x - ptemp.x;
                float dy = spos[j].y - ptemp.y;
                float dz = spos[j].z - ptemp.z;
                float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
                float invDist = rsqrtf(distSqr);
                float invDist3 = invDist * invDist * invDist;
                Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
            }
        
            __syncthreads();
        }
        atomicAdd(&p[i].vx, dt * Fx);
        atomicAdd(&p[i].vy, dt * Fy);
        atomicAdd(&p[i].vz, dt * Fz);
        // atomicSub(&flag[i], 1);
        // if(!atomicMax(&flag[i], 0)){
        //         atomicAdd(&p[i].x,p[i].vx*dt);
        //         atomicAdd(&p[i].y,p[i].vy*dt);
        //         atomicAdd(&p[i].z,p[i].vz*dt);
        //         atomicExch(&flag[i],BLOCK_STEP);
        // }
        // 不太对，我在2^11次方正常运行，在2^15次方结果不对。
}

__global__ void integrate_position(Body *p,float dt,int n){
        int i=threadIdx.x+blockIdx.x*blockDim.x;
        if(i<n){

                // integrate position
                p[i].x += p[i].vx*dt;
                p[i].y += p[i].vy*dt;
                p[i].z += p[i].vz*dt;
        }
}

int main(const int argc, const char** argv) {

  /*
   * Do not change the value for `nBodies` here. If you would like to modify it,
   * pass values into the command line.
   */

  int nBodies = 2<<11;
  int salt = 0;
  if (argc > 1) nBodies = 2<<atoi(argv[1]);

  /*
   * This salt is for assessment reasons. Tampering with it will result in automatic failure.
   */

  if (argc > 2) salt = atoi(argv[2]);

  const float dt = 0.01f; // time step
  const int nIters = 10;  // simulation iterations

  int bytes = nBodies * sizeof(Body);
  float *buf;
  cudaMallocHost((void **)&buf,bytes);

  /*
   * As a constraint of this exercise, `randomizeBodies` must remain a host function.
   */
  randomizeBodies(buf, 6 * nBodies); // Init pos / vel data
  float *d_buf;
  cudaMalloc((void **)&d_buf,bytes);
  Body *d_p=(Body *)d_buf;

  cudaMemcpy(d_buf,buf,bytes,cudaMemcpyHostToDevice);
  double totalTime = 0.0;

  size_t block_num=nBodies/BLOCK_SIZE;


  /*
   * This simulation will run for 10 cycles of time, calculating gravitational
   * interaction amongst bodies, and adjusting their positions to reflect.
   */

  /*******************************************************************/
  // Do not modify these 2 lines of code.
  for (int iter = 0; iter < nIters; iter++) {
    StartTimer();
  /*******************************************************************/

  /*
   * You will likely wish to refactor the work being done in `bodyForce`,
   * as well as the work to integrate the positions.
   */

    bodyForce<<<block_num*BLOCK_STEP,BLOCK_SIZE>>>(d_p, dt, nBodies,d_flag); // compute interbody forces

  /*
   * This position integration cannot occur until this round of `bodyForce` has completed.
   * Also, the next round of `bodyForce` cannot begin until the integration is complete.
   */
   integrate_position<<<block_num,BLOCK_SIZE>>>(d_p,dt,nBodies);

   if(iter==nIters-1)
       cudaMemcpy(buf,d_buf,bytes,cudaMemcpyDeviceToHost);

  /*******************************************************************/
  // Do not modify the code in this section.
    const double tElapsed = GetTimer() / 1000.0;
    totalTime += tElapsed;
  }



  double avgTime = totalTime / (double)(nIters);
  float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
  


#ifdef ASSESS
  checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
  checkAccuracy(buf, nBodies);
  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
  salt += 1;
#endif
  /*******************************************************************/

  /*
   * Feel free to modify code below.
   */
  cudaFree(buf);
  cudaFree(d_buf);
}

经过优化之后，我们成功加速了 bodyForce 函数