cuda加速nbody问题

大三下学期分布式并行计算第二次实验。

串行版本:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"
#include <cuda_runtime.h>

#define SOFTENING 1e-9f

/*
* Each body contains x, y, and z coordinate positions,
* as well as velocities in the x, y, and z directions.
*/

typedef struct { float x, y, z, vx, vy, vz; } Body;

/*
* Do not modify this function. A constraint of this exercise is
* that it remain a host function.
*/

void randomizeBodies(float *data, int n) {
for (int i = 0; i < n; i++) {
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
}
}

/*
* This function calculates the gravitational impact of all bodies in the system
* on all others, but does not update their positions.
*/

void bodyForce(Body *p, float dt, int n) {
for (int i = 0; i < n; ++i) {
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;

for (int j = 0; j < n; j++) {
float dx = p[j].x - p[i].x;
float dy = p[j].y - p[i].y;
float dz = p[j].z - p[i].z;
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;

Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
}

p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
}
}

int main(const int argc, const char** argv) {

/*
* Do not change the value for `nBodies` here. If you would like to modify it,
* pass values into the command line.
*/

int nBodies = 2<<11;
int salt = 0;
if (argc > 1) nBodies = 2<<atoi(argv[1]);

/*
* This salt is for assessment reasons. Tampering with it will result in automatic failure.
*/

if (argc > 2) salt = atoi(argv[2]);

const float dt = 0.01f; // time step
const int nIters = 10; // simulation iterations

int bytes = nBodies * sizeof(Body);
float *buf;

buf = (float *)malloc(bytes);

Body *p = (Body*)buf;

/*
* As a constraint of this exercise, `randomizeBodies` must remain a host function.
*/

randomizeBodies(buf, 6 * nBodies); // Init pos / vel data

double totalTime = 0.0;

/*
* This simulation will run for 10 cycles of time, calculating gravitational
* interaction amongst bodies, and adjusting their positions to reflect.
*/

/*******************************************************************/
// Do not modify these 2 lines of code.
for (int iter = 0; iter < nIters; iter++) {
StartTimer();
/*******************************************************************/

/*
* You will likely wish to refactor the work being done in `bodyForce`,
* as well as the work to integrate the positions.
*/

bodyForce(p, dt, nBodies); // compute interbody forces

/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/

for (int i = 0 ; i < nBodies; i++) { // integrate position
p[i].x += p[i].vx*dt;
p[i].y += p[i].vy*dt;
p[i].z += p[i].vz*dt;
}

/*******************************************************************/
// Do not modify the code in this section.
const double tElapsed = GetTimer() / 1000.0;
totalTime += tElapsed;
}

double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;

#ifdef ASSESS
checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
checkAccuracy(buf, nBodies);
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
salt += 1;
#endif
/*******************************************************************/

/*
* Feel free to modify code below.
*/

free(buf);
}

并行版本:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"
#include <cuda_runtime.h>

#define SOFTENING 1e-9f

/*
* Each body contains x, y, and z coordinate positions,
* as well as velocities in the x, y, and z directions.
*/

typedef struct { float x, y, z, vx, vy, vz; } Body;

/*
* Do not modify this function. A constraint of this exercise is
* that it remain a host function.
*/

void randomizeBodies(float *data, int n) {
for (int i = 0; i < n; i++) {
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
}
}

/*
* This function calculates the gravitational impact of all bodies in the system
* on all others, but does not update their positions.
*/

__global__ void bodyForce(Body *p, float dt, int n) {
int i=threadIdx.x+blockIdx.x*blockDim.x;
if(i<n){
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
for (int j = 0; j < n; j++) {
float dx = p[j].x - p[i].x;
float dy = p[j].y - p[i].y;
float dz = p[j].z - p[i].z;
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
}
p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
}
}

__global__ void integrate_position(Body *p,float dt,int n){
int i=threadIdx.x+blockIdx.x*blockDim.x;
if(i<n){
// integrate position
p[i].x += p[i].vx*dt;
p[i].y += p[i].vy*dt;
p[i].z += p[i].vz*dt;
}
}

int main(const int argc, const char** argv) {

/*
* Do not change the value for `nBodies` here. If you would like to modify it,
* pass values into the command line.
*/

int nBodies = 2<<11;
int salt = 0;
if (argc > 1) nBodies = 2<<atoi(argv[1]);

/*
* This salt is for assessment reasons. Tampering with it will result in automatic failure.
*/

if (argc > 2) salt = atoi(argv[2]);

const float dt = 0.01f; // time step
const int nIters = 10; // simulation iterations

int bytes = nBodies * sizeof(Body);
float *buf;
cudaMallocHost((void **)&buf,bytes);

/*
* As a constraint of this exercise, `randomizeBodies` must remain a host function.
*/
randomizeBodies(buf, 6 * nBodies); // Init pos / vel data
float *d_buf;
cudaMalloc((void **)&d_buf,bytes);
Body *d_p=(Body *)d_buf;

cudaMemcpy(d_buf,buf,bytes,cudaMemcpyHostToDevice);
double totalTime = 0.0;

size_t block_size=32;
size_t block_num=nBodies/block_size;

/*
* This simulation will run for 10 cycles of time, calculating gravitational
* interaction amongst bodies, and adjusting their positions to reflect.
*/

/*******************************************************************/
// Do not modify these 2 lines of code.
for (int iter = 0; iter < nIters; iter++) {
StartTimer();
/*******************************************************************/

/*
* You will likely wish to refactor the work being done in `bodyForce`,
* as well as the work to integrate the positions.
*/

bodyForce<<<block_num,block_size>>>(d_p, dt, nBodies); // compute interbody forces

/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/
integrate_position<<<block_num,block_size>>>(d_p,dt,nBodies);

if(iter==nIters-1)
cudaMemcpy(buf,d_buf,bytes,cudaMemcpyDeviceToHost);

/*******************************************************************/
// Do not modify the code in this section.
const double tElapsed = GetTimer() / 1000.0;
totalTime += tElapsed;
}



double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;



#ifdef ASSESS
checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
checkAccuracy(buf, nBodies);
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
salt += 1;
#endif
/*******************************************************************/

/*
* Feel free to modify code below.
*/
cudaFree(buf);
cudaFree(d_buf);
}

最终优化:

我们观察发现,bodyForce函数耗时最久,我们考虑优化这个函数。这里我们还是用最常见的循环展开。

1
2
3
4
5
6
7
8
9
10
11
12
13
if(i<n){
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
for (int j = 0; j < n; j++) {
float dx = p[j].x - p[i].x;
float dy = p[j].y - p[i].y;
float dz = p[j].z - p[i].z;
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
}
p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
}

我们考虑将这个循环拆开,拆成以BLOCK_STEP 组,这样子就可以加速计算。[ i , i+BLOCK_SIZE ,i+2*BLOCK_SIZE …]​ 为一组

如上图所示,我们配置函数 <<< BLOCK_STEP * (n / BLOCK_SIZE), BLOCK_SIZE>>>

让第一列的全部来计算第 [ 0-BLOCK_SIZE)号,第二列计算第 [ BLOCK_SIZE+1,2*BLOCK_SIZE ) 号。每一列是一个BLOCK 分别有 BLOCK_SIZE 个线程。

然后展开上面的循环,让第一列的第一行那个 BLOCK 计算该BLOCK和\{BLOCK,BLOCK+BLOCK_STEP …. ,BLOCK + t* BLOCK_STEP\} ​ 之间的数值。

第 $j$ 列的block计算第 [ BLOCK_SIZE j , BLOCK_SIZE (j+1) )​ 星星的数值

第 $i$ 行的block计算对应那个星星分别和该组引力大小。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"
#include <cuda_runtime.h>

#define SOFTENING 1e-9f
#define BLOCK_SIZE 32
#define BLOCK_STEP 32
/*
* Each body contains x, y, and z coordinate positions,
* as well as velocities in the x, y, and z directions.
*/

typedef struct { float x, y, z, vx, vy, vz; } Body;

/*
* Do not modify this function. A constraint of this exercise is
* that it remain a host function.
*/

void randomizeBodies(float *data, int n) {
for (int i = 0; i < n; i++) {
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
}
}

/*
* This function calculates the gravitational impact of all bodies in the system
* on all others, but does not update their positions.
*/

__global__ void bodyForce(Body *p, float dt, int n,int* flag) {
int i=threadIdx.x+(blockIdx.x % (n/BLOCK_SIZE))*blockDim.x;
//int i = threadIdx.x + (int)(blockIdx.x / BLOCK_STEP) * blockDim.x;

Body ptemp = p[i];
__shared__ float3 spos[BLOCK_SIZE];
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
Body temp;

for(int start_block=int(blockIdx.x / (n/BLOCK_SIZE)) ; start_block<n/BLOCK_SIZE;start_block+=BLOCK_STEP){
temp=p[threadIdx.x+blockDim.x*start_block ];
spos[threadIdx.x]=make_float3(temp.x, temp.y, temp.z);
__syncthreads();

for(int j=0;j<BLOCK_SIZE;j++){
float dx = spos[j].x - ptemp.x;
float dy = spos[j].y - ptemp.y;
float dz = spos[j].z - ptemp.z;
float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
}

__syncthreads();
}
atomicAdd(&p[i].vx, dt * Fx);
atomicAdd(&p[i].vy, dt * Fy);
atomicAdd(&p[i].vz, dt * Fz);
// atomicSub(&flag[i], 1);
// if(!atomicMax(&flag[i], 0)){
// atomicAdd(&p[i].x,p[i].vx*dt);
// atomicAdd(&p[i].y,p[i].vy*dt);
// atomicAdd(&p[i].z,p[i].vz*dt);
// atomicExch(&flag[i],BLOCK_STEP);
// }
// 不太对,我在2^11次方正常运行,在2^15次方结果不对。
}

__global__ void integrate_position(Body *p,float dt,int n){
int i=threadIdx.x+blockIdx.x*blockDim.x;
if(i<n){

// integrate position
p[i].x += p[i].vx*dt;
p[i].y += p[i].vy*dt;
p[i].z += p[i].vz*dt;
}
}

int main(const int argc, const char** argv) {

/*
* Do not change the value for `nBodies` here. If you would like to modify it,
* pass values into the command line.
*/

int nBodies = 2<<11;
int salt = 0;
if (argc > 1) nBodies = 2<<atoi(argv[1]);

/*
* This salt is for assessment reasons. Tampering with it will result in automatic failure.
*/

if (argc > 2) salt = atoi(argv[2]);

const float dt = 0.01f; // time step
const int nIters = 10; // simulation iterations

int bytes = nBodies * sizeof(Body);
float *buf;
cudaMallocHost((void **)&buf,bytes);

/*
* As a constraint of this exercise, `randomizeBodies` must remain a host function.
*/
randomizeBodies(buf, 6 * nBodies); // Init pos / vel data
float *d_buf;
cudaMalloc((void **)&d_buf,bytes);
Body *d_p=(Body *)d_buf;

cudaMemcpy(d_buf,buf,bytes,cudaMemcpyHostToDevice);
double totalTime = 0.0;

size_t block_num=nBodies/BLOCK_SIZE;


/*
* This simulation will run for 10 cycles of time, calculating gravitational
* interaction amongst bodies, and adjusting their positions to reflect.
*/

/*******************************************************************/
// Do not modify these 2 lines of code.
for (int iter = 0; iter < nIters; iter++) {
StartTimer();
/*******************************************************************/

/*
* You will likely wish to refactor the work being done in `bodyForce`,
* as well as the work to integrate the positions.
*/

bodyForce<<<block_num*BLOCK_STEP,BLOCK_SIZE>>>(d_p, dt, nBodies,d_flag); // compute interbody forces

/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/
integrate_position<<<block_num,BLOCK_SIZE>>>(d_p,dt,nBodies);

if(iter==nIters-1)
cudaMemcpy(buf,d_buf,bytes,cudaMemcpyDeviceToHost);

/*******************************************************************/
// Do not modify the code in this section.
const double tElapsed = GetTimer() / 1000.0;
totalTime += tElapsed;
}



double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;



#ifdef ASSESS
checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
checkAccuracy(buf, nBodies);
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
salt += 1;
#endif
/*******************************************************************/

/*
* Feel free to modify code below.
*/
cudaFree(buf);
cudaFree(d_buf);
}

经过优化之后,我们成功加速了 bodyForce 函数