URI: 
       tintegration.cuh - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
  HTML git clone git://src.adamsgaard.dk/sphere
   DIR Log
   DIR Files
   DIR Refs
   DIR LICENSE
       ---
       tintegration.cuh (17137B)
       ---
            1 #ifndef INTEGRATION_CUH_
            2 #define INTEGRATION_CUH_
            3 
            4 // integration.cuh
            5 // Functions responsible for temporal integration
            6 
            7 //// Choose temporal integration scheme. Uncomment only one!
            8 //#define EULER
            9 //#define TY2
           10 #define TY3
           11 
           12 // Second order integration scheme based on Taylor expansion of particle kinematics. 
           13 // Kernel executed on device, and callable from host only.
           14 __global__ void integrate(
           15     const Float4* __restrict__ dev_x_sorted,
           16     const Float4* __restrict__ dev_vel_sorted,
           17     const Float4* __restrict__ dev_angvel_sorted,
           18     Float4* __restrict__ dev_x,
           19     Float4* __restrict__ dev_vel,
           20     Float4* __restrict__ dev_angvel,
           21     const Float4* __restrict__ dev_force,
           22     const Float4* __restrict__ dev_torque,
           23     Float4* __restrict__ dev_angpos,
           24     Float4* __restrict__ dev_acc,
           25     Float4* __restrict__ dev_angacc,
           26     Float4* __restrict__ dev_vel0,
           27     Float4* __restrict__ dev_angvel0,
           28     Float4* __restrict__ dev_xyzsum,
           29     const unsigned int* __restrict__ dev_gridParticleIndex,
           30     const unsigned int iter,
           31     const int* __restrict__ dev_walls_wmode,
           32     const Float4* __restrict__ dev_walls_mvfd,
           33     const Float* __restrict__ dev_walls_tau_eff_x_partial,
           34     const Float* __restrict__ dev_walls_tau_x,
           35     const Float tau_x,
           36     const int change_velocity_state, // 1: v *= vel_fac, -1: v /= vel_fac
           37     const Float velocity_factor,
           38     const unsigned int blocksPerGrid)
           39 {
           40     unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
           41 
           42     if (idx < devC_np) { // Condition prevents block size error
           43 
           44         // Copy data to temporary arrays to avoid any potential
           45         // read-after-write, write-after-read, or write-after-write hazards. 
           46         __syncthreads();
           47         unsigned int orig_idx = dev_gridParticleIndex[idx];
           48         const Float4 force    = dev_force[orig_idx];
           49         const Float4 torque   = dev_torque[orig_idx];
           50         const Float4 angpos   = dev_angpos[orig_idx];
           51         const Float4 x        = dev_x_sorted[idx];
           52         const Float4 vel      = dev_vel_sorted[idx];
           53         const Float4 angvel   = dev_angvel_sorted[idx];
           54         Float4 xyzsum = dev_xyzsum[orig_idx];
           55 
           56         // Read the mode of the top wall
           57         int wall0mode;
           58         Float wall0mass;
           59         if (devC_nw > 0) {
           60             wall0mode = dev_walls_wmode[0];
           61             wall0mass = dev_walls_mvfd[0].x;
           62         }
           63 
           64         // Find the final sum of shear stresses on the top particles
           65         Float tau_eff_x = 0.0;
           66         if (devC_nw > 0 && wall0mode == 3)
           67             for (int i=0; i<blocksPerGrid; ++i)
           68                 tau_eff_x += dev_walls_tau_eff_x_partial[i];
           69 
           70         // Get old accelerations for three-term Taylor expansion. These values
           71         // don't exist in the first time step
           72 #ifdef TY3
           73         Float4 acc0, angacc0;
           74         if (iter == 0) {
           75             acc0 = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
           76             angacc0 = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
           77         } else {
           78             __syncthreads();
           79             acc0    = dev_acc[orig_idx];
           80             angacc0 = dev_angacc[orig_idx];
           81         }
           82 #endif
           83 
           84         const Float radius = x.w;
           85 
           86         // New values
           87         Float4 x_new, vel_new, angpos_new, angvel_new;
           88 
           89         // Coherent read from constant memory to registers
           90         const Float dt = devC_dt;
           91         const Float3 origo = MAKE_FLOAT3(
           92             devC_grid.origo[0],
           93             devC_grid.origo[1],
           94             devC_grid.origo[2]); 
           95         const Float3 L = MAKE_FLOAT3(
           96             devC_grid.L[0],
           97             devC_grid.L[1],
           98             devC_grid.L[2]);
           99 
          100         // Particle mass
          101         Float m = 4.0/3.0 * PI * radius*radius*radius * devC_params.rho;
          102 
          103         // Find the acceleration by Newton's second law
          104         Float4 acc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
          105         acc.x = force.x/m + devC_params.g[0];
          106         acc.y = force.y/m + devC_params.g[1];
          107         acc.z = force.z/m + devC_params.g[2];
          108 
          109         // Find the angular acceleration by Newton's second law
          110         // (angacc = (total moment)/Intertia, intertia = 2/5*m*r^2)
          111         Float4 angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
          112         angacc.x = torque.x * 1.0 / (2.0/5.0 * m * radius*radius);
          113         angacc.y = torque.y * 1.0 / (2.0/5.0 * m * radius*radius);
          114         angacc.z = torque.z * 1.0 / (2.0/5.0 * m * radius*radius);
          115 
          116         // Fixed shear stress BC
          117         if (wall0mode == 3 && vel.w > 0.0001 && x.z > devC_grid.L[2]*0.5) {
          118 
          119             // the force should be positive when abs(tau_x) > abs(tau_eff_x)
          120             const Float f_tau_x =
          121                 (tau_x + tau_eff_x)*devC_grid.L[0]*devC_grid.L[1];
          122 
          123             acc.x = f_tau_x/wall0mass;  // acceleration = force/mass
          124             acc.y = 0.0;
          125             acc.z -= devC_params.g[2];
          126 
          127             // disable rotation
          128             angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
          129         }
          130 
          131         // Modify the acceleration if the particle is marked as having a fixed
          132         // velocity. In that case, zero the horizontal acceleration and disable
          133         // gravity to counteract segregation. Particles may move in the
          134         // z-dimension, to allow for dilation.
          135         else if (vel.w > 0.0001 && vel.w < 10.0) {
          136 
          137             acc.x = 0.0;
          138             acc.y = 0.0;
          139             acc.z -= devC_params.g[2];
          140 
          141             // Zero the angular acceleration
          142             angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
          143         }
          144 
          145         // 2d kinematic update by disabling linear acceleration along y and
          146         // disabling angular acceleration by everything but y.
          147         if (vel.w >= 10.0) {
          148             acc.y = 0.0;
          149             angacc.x = 0.0;
          150             angacc.z = 0.0;
          151         }
          152 
          153         if (vel.w < -0.0001 && vel.w > -10.0) {
          154             acc.x = 0.0;
          155             acc.y = 0.0;
          156             acc.z = 0.0;
          157 
          158             // Zero the angular acceleration
          159             angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
          160         }
          161 
          162         if (vel.w <= -10.0) {
          163             angacc = MAKE_FLOAT4(0.0, 0.0, 0.0, 0.0);
          164         }
          165 
          166 #ifdef EULER
          167         // Forward Euler
          168         // Truncation error O(dt^2) for positions and velocities
          169         x_new.x = x.x + vel.x*dt;
          170         x_new.y = x.y + vel.y*dt;
          171         x_new.z = x.z + vel.z*dt;
          172         x_new.w = x.w; // transfer radius
          173 
          174         vel_new.x = vel.x + acc.x*dt;
          175         vel_new.y = vel.y + acc.y*dt;
          176         vel_new.z = vel.z + acc.z*dt;
          177         vel_new.w = vel.w; // transfer fixvel
          178 
          179         angpos_new.x = angpos.x + angvel.x*dt;
          180         angpos_new.y = angpos.y + angvel.y*dt;
          181         angpos_new.z = angpos.z + angvel.z*dt;
          182         angpos_new.w = angpos.w;
          183 
          184         angvel_new.x = angvel.x + angacc.x*dt;
          185         angvel_new.y = angvel.y + angacc.y*dt;
          186         angvel_new.z = angvel.z + angacc.z*dt;
          187         angvel_new.w = angvel.w;
          188 
          189         // Add horizontal-displacement for this time step to the sum of
          190         // horizontal displacements
          191         xyzsum.x += vel.x*dt;
          192         xyzsum.y += vel.y*dt;
          193         xyzsum.z += vel.z*dt;
          194 #endif
          195 
          196 #ifdef TY2
          197         // Two-term Taylor expansion (TY2)
          198         // Truncation error O(dt^3) for positions, O(dt^2) for velocities
          199         x_new.x = x.x + vel.x*dt + 0.5*acc.x*dt*dt;
          200         x_new.y = x.y + vel.y*dt + 0.5*acc.y*dt*dt;
          201         x_new.z = x.z + vel.z*dt + 0.5*acc.z*dt*dt;
          202         x_new.w = x.w; // transfer radius
          203 
          204         vel_new.x = vel.x + acc.x*dt;
          205         vel_new.y = vel.y + acc.y*dt;
          206         vel_new.z = vel.z + acc.z*dt;
          207         vel_new.w = vel.w; // transfer fixvel
          208 
          209         angpos_new.x = angpos.x + angvel.x*dt + 0.5*angacc.x*dt*dt;
          210         angpos_new.y = angpos.y + angvel.y*dt + 0.5*angacc.y*dt*dt;
          211         angpos_new.z = angpos.z + angvel.z*dt + 0.5*angacc.z*dt*dt;
          212         angpos_new.w = angpos.w;
          213 
          214         angvel_new.x = angvel.x + angacc.x*dt;
          215         angvel_new.y = angvel.y + angacc.y*dt;
          216         angvel_new.z = angvel.z + angacc.z*dt;
          217         angvel_new.w = angvel.w;
          218 
          219         // Add horizontal-displacement for this time step to the sum of
          220         // horizontal displacements
          221         xyzsum.x += vel.x*dt + 0.5*acc.x*dt*dt;
          222         xyzsum.y += vel.y*dt + 0.5*acc.y*dt*dt;
          223         xyzsum.z += vel.z*dt + 0.5*acc.z*dt*dt;
          224 #endif
          225 
          226 #ifdef TY3
          227         // Three-term Taylor expansion (TY3)
          228         // Truncation error O(dt^4) for positions, O(dt^3) for velocities
          229         // Approximate acceleration change by backwards difference:
          230         const Float3 dacc_dt = MAKE_FLOAT3(
          231             (acc.x - acc0.x)/dt,
          232             (acc.y - acc0.y)/dt,
          233             (acc.z - acc0.z)/dt);
          234 
          235         const Float3 dangacc_dt = MAKE_FLOAT3(
          236             (angacc.x - angacc0.x)/dt,
          237             (angacc.y - angacc0.y)/dt,
          238             (angacc.z - angacc0.z)/dt);
          239 
          240         x_new.x = x.x + vel.x*dt + 0.5*acc.x*dt*dt + 1.0/6.0*dacc_dt.x*dt*dt*dt;
          241         x_new.y = x.y + vel.y*dt + 0.5*acc.y*dt*dt + 1.0/6.0*dacc_dt.y*dt*dt*dt;
          242         x_new.z = x.z + vel.z*dt + 0.5*acc.z*dt*dt + 1.0/6.0*dacc_dt.z*dt*dt*dt;
          243         x_new.w = x.w; // transfer radius
          244 
          245         vel_new.x = vel.x + acc.x*dt + 0.5*dacc_dt.x*dt*dt;
          246         vel_new.y = vel.y + acc.y*dt + 0.5*dacc_dt.y*dt*dt;
          247         vel_new.z = vel.z + acc.z*dt + 0.5*dacc_dt.z*dt*dt;
          248         vel_new.w = vel.w; // transfer fixvel
          249 
          250         angpos_new.x = angpos.x + angvel.x*dt + 0.5*angacc.x*dt*dt
          251             + 1.0/6.0*dangacc_dt.x*dt*dt*dt;
          252         angpos_new.y = angpos.y + angvel.y*dt + 0.5*angacc.y*dt*dt
          253             + 1.0/6.0*dangacc_dt.y*dt*dt*dt;
          254         angpos_new.z = angpos.z + angvel.z*dt + 0.5*angacc.z*dt*dt
          255             + 1.0/6.0*dangacc_dt.z*dt*dt*dt;
          256         angpos_new.w = angpos.w;
          257 
          258         angvel_new.x = angvel.x + angacc.x*dt + 0.5*dangacc_dt.x*dt*dt;
          259         angvel_new.y = angvel.y + angacc.y*dt + 0.5*dangacc_dt.y*dt*dt;
          260         angvel_new.z = angvel.z + angacc.z*dt + 0.5*dangacc_dt.z*dt*dt;
          261         angvel_new.w = angvel.w;
          262 
          263         // Add horizontal-displacement for this time step to the sum of
          264         // horizontal displacements
          265         xyzsum.x += vel.x*dt + 0.5*acc.x*dt*dt + 1.0/6.0*dacc_dt.x*dt*dt*dt;
          266         xyzsum.y += vel.y*dt + 0.5*acc.y*dt*dt + 1.0/6.0*dacc_dt.y*dt*dt*dt;
          267         xyzsum.z += vel.z*dt + 0.5*acc.z*dt*dt + 1.0/6.0*dacc_dt.z*dt*dt*dt;
          268 #endif
          269 
          270         // Move particles outside the domain across periodic boundaries
          271         if (devC_grid.periodic == 1) {
          272             if (x_new.x < origo.x)
          273                 x_new.x += L.x;
          274             if (x_new.x > L.x)
          275                 x_new.x -= L.x;
          276             if (ND == 3) {
          277                 if (x_new.y < origo.y)
          278                     x_new.y += L.y;
          279                 if (x_new.y > L.y)
          280                     x_new.y -= L.y;
          281             }
          282         } else if (devC_grid.periodic == 2) {
          283             if (x_new.x < origo.x)
          284                 x_new.x += L.x;
          285             if (x_new.x > L.x)
          286                 x_new.x -= L.x;
          287         }
          288 
          289         // step-wise velocity change for rate-and-state experiments
          290         if (vel.w > 5.0 && vel.w < 10.0) {
          291             if (change_velocity_state == 1)
          292                 vel_new.x *= velocity_factor;
          293             else if (change_velocity_state == -1)
          294                 vel_new.x /= velocity_factor;
          295         }
          296 
          297         // Hold threads for coalesced write
          298         __syncthreads();
          299 
          300         // Store data in global memory at original, pre-sort positions
          301         dev_xyzsum[orig_idx]  = xyzsum;
          302         dev_acc[orig_idx]     = acc;
          303         dev_angacc[orig_idx]  = angacc;
          304         dev_angvel[orig_idx]  = angvel_new;
          305         dev_vel[orig_idx]     = vel_new;
          306         dev_angpos[orig_idx]  = angpos_new;
          307         dev_x[orig_idx]       = x_new;
          308     } 
          309 } // End of integrate(...)
          310 
          311 
          312 // Reduce wall force contributions from particles to a single value per wall
          313 __global__ void summation(
          314     const Float* __restrict__ in,
          315     Float* __restrict__ out)
          316 {
          317     __shared__ Float cache[256];
          318     unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
          319     unsigned int cacheIdx = threadIdx.x;
          320 
          321     Float temp = 0.0f;
          322     while (idx < devC_np) {
          323         temp += in[idx];
          324         idx += blockDim.x * gridDim.x;
          325     }
          326 
          327     // Set the cache values
          328     cache[cacheIdx] = temp;
          329 
          330     __syncthreads();
          331 
          332     // For reductions, threadsPerBlock must be a power of two
          333     // because of the following code
          334     unsigned int i = blockDim.x/2;
          335     while (i != 0) {
          336         if (cacheIdx < i)
          337             cache[cacheIdx] += cache[cacheIdx + i];
          338         __syncthreads();
          339         i /= 2;
          340     }
          341 
          342     // Write sum for block to global memory
          343     if (cacheIdx == 0)
          344         out[blockIdx.x] = cache[0];
          345 }
          346 
          347 // Update wall positions
          348 __global__ void integrateWalls(
          349     Float4* __restrict__ dev_walls_nx,
          350     Float4* __restrict__ dev_walls_mvfd,
          351     const int* __restrict__ dev_walls_wmode,
          352     const Float* __restrict__ dev_walls_force_partial,
          353     Float* __restrict__ dev_walls_acc,
          354     const unsigned int blocksPerGrid,
          355     const Float t_current,
          356     const unsigned int iter)
          357 {
          358     unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
          359 
          360     if (idx < devC_nw) { // Condition prevents block size error
          361 
          362         // Copy data to temporary arrays to avoid any potential
          363         // read-after-write, write-after-read, or write-after-write hazards. 
          364         Float4 w_nx   = dev_walls_nx[idx];
          365         Float4 w_mvfd = dev_walls_mvfd[idx];
          366         int wmode = dev_walls_wmode[idx];  // Wall BC, 0: fixed, 1: sigma0, 2: vel
          367 
          368         if (wmode != 0) { // wmode == 0: Wall fixed: do nothing
          369 
          370 #ifdef TY3
          371             Float acc0;
          372             if (iter == 0)
          373                 acc0 = 0.0;
          374             else
          375                 acc0 = dev_walls_acc[idx];
          376 #endif
          377 
          378             // Find the final sum of forces on wall
          379             w_mvfd.z = 0.0;
          380             for (int i=0; i<blocksPerGrid; ++i) {
          381                 w_mvfd.z += dev_walls_force_partial[i];
          382             }
          383 
          384             const Float dt = devC_dt;
          385 
          386             // Apply sinusoidal variation if specified
          387             const Float sigma_0 = w_mvfd.w
          388                 + devC_params.sigma0_A*sin(2.0*PI*devC_params.sigma0_f * t_current);
          389 
          390             // Normal load = Normal stress times wall surface area,
          391             // directed downwards.
          392             const Float N = -sigma_0*devC_grid.L[0]*devC_grid.L[1];
          393 
          394             // Calculate resulting acceleration of wall
          395             // (Wall mass is stored in x component of position Float4)
          396             Float acc = (w_mvfd.z + N)/w_mvfd.x;
          397 
          398             // Apply gravity
          399             /*if (idx == 0)
          400                 acc += devC_params.g[2];
          401             else if (idx == 1)
          402                 acc += devC_params.g[0];
          403             else if (idx == 2)
          404                 acc += devC_params.g[1];*/
          405 
          406             // If Wall BC is controlled by velocity, it should not change
          407             if (wmode == 2) { 
          408                 acc = 0.0;
          409             }
          410 
          411 #ifdef EULER
          412             // Forward Euler tempmoral integration.
          413 
          414             // Update position
          415             w_nx.w += w_mvfd.y*dt;
          416 
          417             // Update velocity
          418             w_mvfd.y += acc*dt;
          419 #endif
          420 
          421 #ifdef TY2
          422             // Two-term Taylor expansion for tempmoral integration.
          423             // The truncation error is O(dt^3) for positions and O(dt^2) for
          424             // velocities.
          425 
          426             // Update position
          427             w_nx.w += w_mvfd.y*dt + 0.5*acc*dt*dt;
          428 
          429             // Update velocity
          430             w_mvfd.y += acc*dt;
          431 #endif
          432 
          433 #ifdef TY3
          434             // Three-term Taylor expansion for tempmoral integration.
          435             // The truncation error is O(dt^4) for positions and O(dt^3) for
          436             // velocities. The acceleration change approximated by backwards
          437             // central difference:
          438             const Float dacc_dt = (acc - acc0)/dt;
          439 
          440             // Update position
          441             w_nx.w += w_mvfd.y*dt + 0.5*acc*dt*dt + 1.0/6.0*dacc_dt*dt*dt*dt;
          442 
          443             // Update velocity
          444             w_mvfd.y += acc*dt + 0.5*dacc_dt*dt*dt;
          445 #endif
          446 
          447             // Store data in global memory
          448             __syncthreads();
          449             dev_walls_nx[idx]   = w_nx;
          450             dev_walls_mvfd[idx] = w_mvfd;
          451             dev_walls_acc[idx]  = acc;
          452         }
          453     }
          454 } // End of integrateWalls(...)
          455 
          456 
          457 // Finds shear stress per particle adjacent to top wall (idx=0).
          458 // The fixvel value is saved in vel.w.
          459 // Particles who do not fulfill the criteria will have a value of 0.0 written to
          460 // dev_walls_tau_eff_x_pp.
          461 __global__ void findShearStressOnFixedMovingParticles(
          462     const Float4* __restrict__ dev_x,
          463     const Float4* __restrict__ dev_vel,
          464     const Float4* __restrict__ dev_force,
          465     Float* __restrict__ dev_walls_tau_eff_x_pp)
          466 {
          467     unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
          468 
          469     if (idx < devC_np) { // Condition prevents block size error
          470 
          471         // Copy data to temporary arrays to avoid any potential
          472         // read-after-write, write-after-read, or write-after-write hazards. 
          473         __syncthreads();
          474         const Float z       = dev_x[idx].z;
          475         const Float fixvel  = dev_vel[idx].w;
          476         const Float force_x = dev_force[idx].x;
          477 
          478         // Only select fixed velocity (fixvel > 0.0, fixvel = vel.w) particles
          479         // at the top boundary (z > L[0]/2)
          480         Float f_x = 0.0;
          481         if (fixvel > 0.0 && z > devC_grid.L[2]*0.5)
          482             f_x = force_x;
          483 
          484         __syncthreads();
          485         // Convert force to shear stress and save
          486         dev_walls_tau_eff_x_pp[idx] = f_x/(devC_grid.L[0]*devC_grid.L[1]);
          487     }
          488 }
          489 #endif
          490 // vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4