tintegration.cuh - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
HTML git clone git://src.adamsgaard.dk/sphere
DIR Log
DIR Files
DIR Refs
DIR LICENSE
---
tintegration.cuh (17137B)
---
1 #ifndef INTEGRATION_CUH_
2 #define INTEGRATION_CUH_
3
4 // integration.cuh
5 // Functions responsible for temporal integration
6
7 //// Choose temporal integration scheme. Uncomment only one!
8 //#define EULER
9 //#define TY2
10 #define TY3
11
12 // Second order integration scheme based on Taylor expansion of particle kinematics.
13 // Kernel executed on device, and callable from host only.
14 __global__ void integrate(
15 const Float4* __restrict__ dev_x_sorted,
16 const Float4* __restrict__ dev_vel_sorted,
17 const Float4* __restrict__ dev_angvel_sorted,
18 Float4* __restrict__ dev_x,
19 Float4* __restrict__ dev_vel,
20 Float4* __restrict__ dev_angvel,
21 const Float4* __restrict__ dev_force,
22 const Float4* __restrict__ dev_torque,
23 Float4* __restrict__ dev_angpos,
24 Float4* __restrict__ dev_acc,
25 Float4* __restrict__ dev_angacc,
26 Float4* __restrict__ dev_vel0,
27 Float4* __restrict__ dev_angvel0,
28 Float4* __restrict__ dev_xyzsum,
29 const unsigned int* __restrict__ dev_gridParticleIndex,
30 const unsigned int iter,
31 const int* __restrict__ dev_walls_wmode,
32 const Float4* __restrict__ dev_walls_mvfd,
33 const Float* __restrict__ dev_walls_tau_eff_x_partial,
34 const Float* __restrict__ dev_walls_tau_x,
35 const Float tau_x,
36 const int change_velocity_state, // 1: v *= vel_fac, -1: v /= vel_fac
37 const Float velocity_factor,
38 const unsigned int blocksPerGrid)
39 {
40 unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
41
42 if (idx < devC_np) { // Condition prevents block size error
43
44 // Copy data to temporary arrays to avoid any potential
45 // read-after-write, write-after-read, or write-after-write hazards.
46 __syncthreads();
47 unsigned int orig_idx = dev_gridParticleIndex[idx];
48 const Float4 force = dev_force[orig_idx];
49 const Float4 torque = dev_torque[orig_idx];
50 const Float4 angpos = dev_angpos[orig_idx];
51 const Float4 x = dev_x_sorted[idx];
52 const Float4 vel = dev_vel_sorted[idx];
53 const Float4 angvel = dev_angvel_sorted[idx];
54 Float4 xyzsum = dev_xyzsum[orig_idx];
55
56 // Read the mode of the top wall
57 int wall0mode;
58 Float wall0mass;
59 if (devC_nw > 0) {
60 wall0mode = dev_walls_wmode[0];
61 wall0mass = dev_walls_mvfd[0].x;
62 }
63
64 // Find the final sum of shear stresses on the top particles
65 Float tau_eff_x = 0.0;
66 if (devC_nw > 0 && wall0mode == 3)
67 for (int i=0; i<blocksPerGrid; ++i)
68 tau_eff_x += dev_walls_tau_eff_x_partial[i];
69
70 // Get old accelerations for three-term Taylor expansion. These values
71 // don't exist in the first time step
72 #ifdef TY3
73 Float4 acc0, angacc0;
74 if (iter == 0) {
75 acc0 = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
76 angacc0 = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
77 } else {
78 __syncthreads();
79 acc0 = dev_acc[orig_idx];
80 angacc0 = dev_angacc[orig_idx];
81 }
82 #endif
83
84 const Float radius = x.w;
85
86 // New values
87 Float4 x_new, vel_new, angpos_new, angvel_new;
88
89 // Coherent read from constant memory to registers
90 const Float dt = devC_dt;
91 const Float3 origo = MAKE_FLOAT3(
92 devC_grid.origo[0],
93 devC_grid.origo[1],
94 devC_grid.origo[2]);
95 const Float3 L = MAKE_FLOAT3(
96 devC_grid.L[0],
97 devC_grid.L[1],
98 devC_grid.L[2]);
99
100 // Particle mass
101 Float m = 4.0/3.0 * PI * radius*radius*radius * devC_params.rho;
102
103 // Find the acceleration by Newton's second law
104 Float4 acc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
105 acc.x = force.x/m + devC_params.g[0];
106 acc.y = force.y/m + devC_params.g[1];
107 acc.z = force.z/m + devC_params.g[2];
108
109 // Find the angular acceleration by Newton's second law
110 // (angacc = (total moment)/Intertia, intertia = 2/5*m*r^2)
111 Float4 angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
112 angacc.x = torque.x * 1.0 / (2.0/5.0 * m * radius*radius);
113 angacc.y = torque.y * 1.0 / (2.0/5.0 * m * radius*radius);
114 angacc.z = torque.z * 1.0 / (2.0/5.0 * m * radius*radius);
115
116 // Fixed shear stress BC
117 if (wall0mode == 3 && vel.w > 0.0001 && x.z > devC_grid.L[2]*0.5) {
118
119 // the force should be positive when abs(tau_x) > abs(tau_eff_x)
120 const Float f_tau_x =
121 (tau_x + tau_eff_x)*devC_grid.L[0]*devC_grid.L[1];
122
123 acc.x = f_tau_x/wall0mass; // acceleration = force/mass
124 acc.y = 0.0;
125 acc.z -= devC_params.g[2];
126
127 // disable rotation
128 angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
129 }
130
131 // Modify the acceleration if the particle is marked as having a fixed
132 // velocity. In that case, zero the horizontal acceleration and disable
133 // gravity to counteract segregation. Particles may move in the
134 // z-dimension, to allow for dilation.
135 else if (vel.w > 0.0001 && vel.w < 10.0) {
136
137 acc.x = 0.0;
138 acc.y = 0.0;
139 acc.z -= devC_params.g[2];
140
141 // Zero the angular acceleration
142 angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
143 }
144
145 // 2d kinematic update by disabling linear acceleration along y and
146 // disabling angular acceleration by everything but y.
147 if (vel.w >= 10.0) {
148 acc.y = 0.0;
149 angacc.x = 0.0;
150 angacc.z = 0.0;
151 }
152
153 if (vel.w < -0.0001 && vel.w > -10.0) {
154 acc.x = 0.0;
155 acc.y = 0.0;
156 acc.z = 0.0;
157
158 // Zero the angular acceleration
159 angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
160 }
161
162 if (vel.w <= -10.0) {
163 angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
164 }
165
166 #ifdef EULER
167 // Forward Euler
168 // Truncation error O(dt^2) for positions and velocities
169 x_new.x = x.x + vel.x*dt;
170 x_new.y = x.y + vel.y*dt;
171 x_new.z = x.z + vel.z*dt;
172 x_new.w = x.w; // transfer radius
173
174 vel_new.x = vel.x + acc.x*dt;
175 vel_new.y = vel.y + acc.y*dt;
176 vel_new.z = vel.z + acc.z*dt;
177 vel_new.w = vel.w; // transfer fixvel
178
179 angpos_new.x = angpos.x + angvel.x*dt;
180 angpos_new.y = angpos.y + angvel.y*dt;
181 angpos_new.z = angpos.z + angvel.z*dt;
182 angpos_new.w = angpos.w;
183
184 angvel_new.x = angvel.x + angacc.x*dt;
185 angvel_new.y = angvel.y + angacc.y*dt;
186 angvel_new.z = angvel.z + angacc.z*dt;
187 angvel_new.w = angvel.w;
188
189 // Add horizontal-displacement for this time step to the sum of
190 // horizontal displacements
191 xyzsum.x += vel.x*dt;
192 xyzsum.y += vel.y*dt;
193 xyzsum.z += vel.z*dt;
194 #endif
195
196 #ifdef TY2
197 // Two-term Taylor expansion (TY2)
198 // Truncation error O(dt^3) for positions, O(dt^2) for velocities
199 x_new.x = x.x + vel.x*dt + 0.5*acc.x*dt*dt;
200 x_new.y = x.y + vel.y*dt + 0.5*acc.y*dt*dt;
201 x_new.z = x.z + vel.z*dt + 0.5*acc.z*dt*dt;
202 x_new.w = x.w; // transfer radius
203
204 vel_new.x = vel.x + acc.x*dt;
205 vel_new.y = vel.y + acc.y*dt;
206 vel_new.z = vel.z + acc.z*dt;
207 vel_new.w = vel.w; // transfer fixvel
208
209 angpos_new.x = angpos.x + angvel.x*dt + 0.5*angacc.x*dt*dt;
210 angpos_new.y = angpos.y + angvel.y*dt + 0.5*angacc.y*dt*dt;
211 angpos_new.z = angpos.z + angvel.z*dt + 0.5*angacc.z*dt*dt;
212 angpos_new.w = angpos.w;
213
214 angvel_new.x = angvel.x + angacc.x*dt;
215 angvel_new.y = angvel.y + angacc.y*dt;
216 angvel_new.z = angvel.z + angacc.z*dt;
217 angvel_new.w = angvel.w;
218
219 // Add horizontal-displacement for this time step to the sum of
220 // horizontal displacements
221 xyzsum.x += vel.x*dt + 0.5*acc.x*dt*dt;
222 xyzsum.y += vel.y*dt + 0.5*acc.y*dt*dt;
223 xyzsum.z += vel.z*dt + 0.5*acc.z*dt*dt;
224 #endif
225
226 #ifdef TY3
227 // Three-term Taylor expansion (TY3)
228 // Truncation error O(dt^4) for positions, O(dt^3) for velocities
229 // Approximate acceleration change by backwards difference:
230 const Float3 dacc_dt = MAKE_FLOAT3(
231 (acc.x - acc0.x)/dt,
232 (acc.y - acc0.y)/dt,
233 (acc.z - acc0.z)/dt);
234
235 const Float3 dangacc_dt = MAKE_FLOAT3(
236 (angacc.x - angacc0.x)/dt,
237 (angacc.y - angacc0.y)/dt,
238 (angacc.z - angacc0.z)/dt);
239
240 x_new.x = x.x + vel.x*dt + 0.5*acc.x*dt*dt + 1.0/6.0*dacc_dt.x*dt*dt*dt;
241 x_new.y = x.y + vel.y*dt + 0.5*acc.y*dt*dt + 1.0/6.0*dacc_dt.y*dt*dt*dt;
242 x_new.z = x.z + vel.z*dt + 0.5*acc.z*dt*dt + 1.0/6.0*dacc_dt.z*dt*dt*dt;
243 x_new.w = x.w; // transfer radius
244
245 vel_new.x = vel.x + acc.x*dt + 0.5*dacc_dt.x*dt*dt;
246 vel_new.y = vel.y + acc.y*dt + 0.5*dacc_dt.y*dt*dt;
247 vel_new.z = vel.z + acc.z*dt + 0.5*dacc_dt.z*dt*dt;
248 vel_new.w = vel.w; // transfer fixvel
249
250 angpos_new.x = angpos.x + angvel.x*dt + 0.5*angacc.x*dt*dt
251 + 1.0/6.0*dangacc_dt.x*dt*dt*dt;
252 angpos_new.y = angpos.y + angvel.y*dt + 0.5*angacc.y*dt*dt
253 + 1.0/6.0*dangacc_dt.y*dt*dt*dt;
254 angpos_new.z = angpos.z + angvel.z*dt + 0.5*angacc.z*dt*dt
255 + 1.0/6.0*dangacc_dt.z*dt*dt*dt;
256 angpos_new.w = angpos.w;
257
258 angvel_new.x = angvel.x + angacc.x*dt + 0.5*dangacc_dt.x*dt*dt;
259 angvel_new.y = angvel.y + angacc.y*dt + 0.5*dangacc_dt.y*dt*dt;
260 angvel_new.z = angvel.z + angacc.z*dt + 0.5*dangacc_dt.z*dt*dt;
261 angvel_new.w = angvel.w;
262
263 // Add horizontal-displacement for this time step to the sum of
264 // horizontal displacements
265 xyzsum.x += vel.x*dt + 0.5*acc.x*dt*dt + 1.0/6.0*dacc_dt.x*dt*dt*dt;
266 xyzsum.y += vel.y*dt + 0.5*acc.y*dt*dt + 1.0/6.0*dacc_dt.y*dt*dt*dt;
267 xyzsum.z += vel.z*dt + 0.5*acc.z*dt*dt + 1.0/6.0*dacc_dt.z*dt*dt*dt;
268 #endif
269
270 // Move particles outside the domain across periodic boundaries
271 if (devC_grid.periodic == 1) {
272 if (x_new.x < origo.x)
273 x_new.x += L.x;
274 if (x_new.x > L.x)
275 x_new.x -= L.x;
276 if (ND == 3) {
277 if (x_new.y < origo.y)
278 x_new.y += L.y;
279 if (x_new.y > L.y)
280 x_new.y -= L.y;
281 }
282 } else if (devC_grid.periodic == 2) {
283 if (x_new.x < origo.x)
284 x_new.x += L.x;
285 if (x_new.x > L.x)
286 x_new.x -= L.x;
287 }
288
289 // step-wise velocity change for rate-and-state experiments
290 if (vel.w > 5.0 && vel.w < 10.0) {
291 if (change_velocity_state == 1)
292 vel_new.x *= velocity_factor;
293 else if (change_velocity_state == -1)
294 vel_new.x /= velocity_factor;
295 }
296
297 // Hold threads for coalesced write
298 __syncthreads();
299
300 // Store data in global memory at original, pre-sort positions
301 dev_xyzsum[orig_idx] = xyzsum;
302 dev_acc[orig_idx] = acc;
303 dev_angacc[orig_idx] = angacc;
304 dev_angvel[orig_idx] = angvel_new;
305 dev_vel[orig_idx] = vel_new;
306 dev_angpos[orig_idx] = angpos_new;
307 dev_x[orig_idx] = x_new;
308 }
309 } // End of integrate(...)
310
311
312 // Reduce wall force contributions from particles to a single value per wall
313 __global__ void summation(
314 const Float* __restrict__ in,
315 Float* __restrict__ out)
316 {
317 __shared__ Float cache[256];
318 unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
319 unsigned int cacheIdx = threadIdx.x;
320
321 Float temp = 0.0f;
322 while (idx < devC_np) {
323 temp += in[idx];
324 idx += blockDim.x * gridDim.x;
325 }
326
327 // Set the cache values
328 cache[cacheIdx] = temp;
329
330 __syncthreads();
331
332 // For reductions, threadsPerBlock must be a power of two
333 // because of the following code
334 unsigned int i = blockDim.x/2;
335 while (i != 0) {
336 if (cacheIdx < i)
337 cache[cacheIdx] += cache[cacheIdx + i];
338 __syncthreads();
339 i /= 2;
340 }
341
342 // Write sum for block to global memory
343 if (cacheIdx == 0)
344 out[blockIdx.x] = cache[0];
345 }
346
347 // Update wall positions
348 __global__ void integrateWalls(
349 Float4* __restrict__ dev_walls_nx,
350 Float4* __restrict__ dev_walls_mvfd,
351 const int* __restrict__ dev_walls_wmode,
352 const Float* __restrict__ dev_walls_force_partial,
353 Float* __restrict__ dev_walls_acc,
354 const unsigned int blocksPerGrid,
355 const Float t_current,
356 const unsigned int iter)
357 {
358 unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
359
360 if (idx < devC_nw) { // Condition prevents block size error
361
362 // Copy data to temporary arrays to avoid any potential
363 // read-after-write, write-after-read, or write-after-write hazards.
364 Float4 w_nx = dev_walls_nx[idx];
365 Float4 w_mvfd = dev_walls_mvfd[idx];
366 int wmode = dev_walls_wmode[idx]; // Wall BC, 0: fixed, 1: sigma0, 2: vel
367
368 if (wmode != 0) { // wmode == 0: Wall fixed: do nothing
369
370 #ifdef TY3
371 Float acc0;
372 if (iter == 0)
373 acc0 = 0.0;
374 else
375 acc0 = dev_walls_acc[idx];
376 #endif
377
378 // Find the final sum of forces on wall
379 w_mvfd.z = 0.0;
380 for (int i=0; i<blocksPerGrid; ++i) {
381 w_mvfd.z += dev_walls_force_partial[i];
382 }
383
384 const Float dt = devC_dt;
385
386 // Apply sinusoidal variation if specified
387 const Float sigma_0 = w_mvfd.w
388 + devC_params.sigma0_A*sin(2.0*PI*devC_params.sigma0_f * t_current);
389
390 // Normal load = Normal stress times wall surface area,
391 // directed downwards.
392 const Float N = -sigma_0*devC_grid.L[0]*devC_grid.L[1];
393
394 // Calculate resulting acceleration of wall
395 // (Wall mass is stored in x component of position Float4)
396 Float acc = (w_mvfd.z + N)/w_mvfd.x;
397
398 // Apply gravity
399 /*if (idx == 0)
400 acc += devC_params.g[2];
401 else if (idx == 1)
402 acc += devC_params.g[0];
403 else if (idx == 2)
404 acc += devC_params.g[1];*/
405
406 // If Wall BC is controlled by velocity, it should not change
407 if (wmode == 2) {
408 acc = 0.0;
409 }
410
411 #ifdef EULER
412 // Forward Euler tempmoral integration.
413
414 // Update position
415 w_nx.w += w_mvfd.y*dt;
416
417 // Update velocity
418 w_mvfd.y += acc*dt;
419 #endif
420
421 #ifdef TY2
422 // Two-term Taylor expansion for tempmoral integration.
423 // The truncation error is O(dt^3) for positions and O(dt^2) for
424 // velocities.
425
426 // Update position
427 w_nx.w += w_mvfd.y*dt + 0.5*acc*dt*dt;
428
429 // Update velocity
430 w_mvfd.y += acc*dt;
431 #endif
432
433 #ifdef TY3
434 // Three-term Taylor expansion for tempmoral integration.
435 // The truncation error is O(dt^4) for positions and O(dt^3) for
436 // velocities. The acceleration change approximated by backwards
437 // central difference:
438 const Float dacc_dt = (acc - acc0)/dt;
439
440 // Update position
441 w_nx.w += w_mvfd.y*dt + 0.5*acc*dt*dt + 1.0/6.0*dacc_dt*dt*dt*dt;
442
443 // Update velocity
444 w_mvfd.y += acc*dt + 0.5*dacc_dt*dt*dt;
445 #endif
446
447 // Store data in global memory
448 __syncthreads();
449 dev_walls_nx[idx] = w_nx;
450 dev_walls_mvfd[idx] = w_mvfd;
451 dev_walls_acc[idx] = acc;
452 }
453 }
454 } // End of integrateWalls(...)
455
456
457 // Finds shear stress per particle adjacent to top wall (idx=0).
458 // The fixvel value is saved in vel.w.
459 // Particles who do not fulfill the criteria will have a value of 0.0 written to
460 // dev_walls_tau_eff_x_pp.
461 __global__ void findShearStressOnFixedMovingParticles(
462 const Float4* __restrict__ dev_x,
463 const Float4* __restrict__ dev_vel,
464 const Float4* __restrict__ dev_force,
465 Float* __restrict__ dev_walls_tau_eff_x_pp)
466 {
467 unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
468
469 if (idx < devC_np) { // Condition prevents block size error
470
471 // Copy data to temporary arrays to avoid any potential
472 // read-after-write, write-after-read, or write-after-write hazards.
473 __syncthreads();
474 const Float z = dev_x[idx].z;
475 const Float fixvel = dev_vel[idx].w;
476 const Float force_x = dev_force[idx].x;
477
478 // Only select fixed velocity (fixvel > 0.0, fixvel = vel.w) particles
479 // at the top boundary (z > L[0]/2)
480 Float f_x = 0.0;
481 if (fixvel > 0.0 && z > devC_grid.L[2]*0.5)
482 f_x = force_x;
483
484 __syncthreads();
485 // Convert force to shear stress and save
486 dev_walls_tau_eff_x_pp[idx] = f_x/(devC_grid.L[0]*devC_grid.L[1]);
487 }
488 }
489 #endif
490 // vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4