cuda_mersennetwisterkernel.h
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #ifndef CUDA_MERSENNETWISTERKERNEL_H_DEFINED
00036 #define CUDA_MERSENNETWISTERKERNEL_H_DEFINED
00037
00038 #include "cuda_mersennetwister.h"
00039 #include "cudadefs.h"
00040 #include "CUDA/cutil.h"
00041 #include <cuda_runtime_api.h>
00042
00043
// Host-side table of per-generator parameters. cuda_c_seedMT() copies each
// entry, overrides its seed, and uploads the result to ds_MT.
// NOTE(review): nothing in this header fills h_MT; presumably it is loaded
// elsewhere in the including translation unit -- confirm.
static mt_struct_stripped h_MT[MT_RNG_COUNT];

// Device-side table with one parameter entry per generator. It is written by
// cuda_c_seedMT() through cudaMemcpyToSymbol and read by cuda_global_randomMT().
static __device__ mt_struct_stripped ds_MT[MT_RNG_COUNT];
00046
00047
00048
00049
/**
 * Assigns the same seed to every generator and uploads the resulting
 * parameter table to the device symbol ds_MT.
 *
 * The host table h_MT is left untouched: its entries are copied into a
 * temporary buffer, the seed field of each copy is overwritten, and the
 * buffer is then transferred with cudaMemcpyToSymbol.
 *
 * If the temporary buffer cannot be allocated, nothing is uploaded and the
 * function returns after reporting the failure (see below).
 *
 * @param seed  value stored in the seed field of every generator entry
 */
void cuda_c_seedMT(unsigned int seed){
    // Same byte count for the allocation and the upload:
    // sizeof(h_MT) == MT_RNG_COUNT * sizeof(mt_struct_stripped).
    const size_t tableBytes = sizeof(h_MT);

    mt_struct_stripped *MT = (mt_struct_stripped *)malloc(tableBytes);
    if(!MT){
        // Report the allocation failure through the same checker used for the
        // upload below. NOTE(review): CUDA_SAFE_CALL is defined outside this
        // file; the explicit return covers builds where it does not abort.
        CUDA_SAFE_CALL( cudaErrorMemoryAllocation );
        return;
    }

    for(int i = 0; i < MT_RNG_COUNT; i++){
        MT[i]      = h_MT[i];
        MT[i].seed = seed;
    }
    CUDA_SAFE_CALL( cudaMemcpyToSymbol(ds_MT, MT, tableBytes) );

    free(MT);
}
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
/**
 * Generates uniform samples with the Mersenne Twister generators whose
 * parameters are stored in ds_MT.
 *
 * Each thread starts at the generator matching its global thread index and
 * advances by the total number of launched threads until MT_RNG_COUNT is
 * reached. For every generator it handles, the thread seeds a private state
 * array from the entry's seed and then emits NPerRng samples.
 *
 * Sample k of generator g is stored at d_Random[g + k * MT_RNG_COUNT] and
 * equals ((float)x + 1.0f) / 4294967296.0f, where x is the tempered 32-bit
 * generator output.
 *
 * @param d_Random  output buffer; elements with index below
 *                  MT_RNG_COUNT * NPerRng are written
 * @param NPerRng   number of samples produced by each generator
 */
__global__ void cuda_global_randomMT(
    float *d_Random,
    int NPerRng
){
    const int globalThread = blockDim.x * blockIdx.x + threadIdx.x;
    const int threadTotal  = blockDim.x * gridDim.x;

    // Per-thread generator state, re-initialised for every generator handled.
    unsigned int state[MT_NN];

    for(int gen = globalThread; gen < MT_RNG_COUNT; gen += threadTotal){
        // Private copy of this generator's parameters.
        const mt_struct_stripped params = ds_MT[gen];

        // Expand the seed into the full state array.
        state[0] = params.seed;
        for(int i = 1; i < MT_NN; i++){
            const unsigned int prev = state[i - 1];
            state[i] = (1812433253U * (prev ^ (prev >> 30)) + i) & MT_WMASK;
        }

        int cur = 0;
        // Word at position cur + 1 from the previous step, carried over so it
        // is not re-read from the state array when it becomes the current word.
        unsigned int nextWord = state[0];

        for(int out = 0; out < NPerRng; out++){
            // Neighbouring positions, wrapped around the circular state array.
            int nxt = cur + 1;
            if(nxt >= MT_NN) nxt -= MT_NN;
            int mid = cur + MT_MM;
            if(mid >= MT_NN) mid -= MT_NN;

            const unsigned int curWord = nextWord;
            nextWord = state[nxt];
            const unsigned int midWord = state[mid];

            // Recurrence: combine the upper bits of the current word with the
            // lower bits of the next one, then mix with the word MT_MM ahead.
            unsigned int x = (curWord & MT_UMASK) | (nextWord & MT_LMASK);
            x = midWord ^ (x >> 1) ^ ((x & 1) ? params.matrix_a : 0);
            state[cur] = x;
            cur = nxt;

            // Tempering of the freshly generated word.
            x ^= (x >> MT_SHIFT0);
            x ^= (x << MT_SHIFTB) & params.mask_b;
            x ^= (x << MT_SHIFTC) & params.mask_c;
            x ^= (x >> MT_SHIFT1);

            // Scale to a float; the +1 keeps the result strictly above zero.
            d_Random[gen + out * MT_RNG_COUNT] = ((float)x + 1.0f) / 4294967296.0f;
        }
    }
}
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
/**
 * Applies the Box-Muller formulas to a pair of values in place.
 *
 * On return u1 holds radius * cos(angle) and u2 holds radius * sin(angle),
 * with radius = sqrt(-2 * ln(u1)) and angle = 2 * PI * u2 computed from the
 * incoming values. The trigonometric terms use the fast __cosf / __sinf
 * intrinsics.
 *
 * u1 must be greater than zero, because logf(0) is -infinity.
 * NOTE(review): PI is not defined in this file; presumably it comes from
 * one of the included project headers -- confirm its type is float.
 *
 * @param u1  in: first input value; out: cosine branch of the transform
 * @param u2  in: second input value; out: sine branch of the transform
 */
__device__ void BoxMuller(float& u1, float& u2){
    const float angle  = 2 * PI * u2;
    const float radius = sqrtf(-2.0f * logf(u1));

    u1 = radius * __cosf(angle);
    u2 = radius * __sinf(angle);
}
00139
/**
 * Transforms the samples in d_Random in place by applying BoxMuller() to
 * consecutive pairs of samples belonging to the same generator.
 *
 * The buffer layout matches cuda_global_randomMT(): sample k of generator g
 * is at d_Random[g + k * MT_RNG_COUNT]. Samples (0,1), (2,3), ... of each
 * generator are transformed together. Each thread starts at the generator
 * matching its global thread index and advances by the total number of
 * launched threads until MT_RNG_COUNT is reached.
 *
 * Only complete pairs are processed. When NPerRng is odd, the last sample of
 * each generator has no partner and is left unchanged, instead of being
 * paired with row NPerRng, which lies past the NPerRng rows written by
 * cuda_global_randomMT().
 *
 * @param d_Random  buffer of samples, transformed in place
 * @param NPerRng   number of samples stored per generator
 */
__global__ void BoxMullerGPU(float *d_Random, int NPerRng){
    const int tid      = blockDim.x * blockIdx.x + threadIdx.x;
    const int THREAD_N = blockDim.x * gridDim.x;

    for(int iRng = tid; iRng < MT_RNG_COUNT; iRng += THREAD_N){
        // iOut + 1 < NPerRng guarantees both members of the pair exist.
        for(int iOut = 0; iOut + 1 < NPerRng; iOut += 2){
            BoxMuller(
                d_Random[iRng + (iOut + 0) * MT_RNG_COUNT],
                d_Random[iRng + (iOut + 1) * MT_RNG_COUNT]
            );
        }
    }
}
00151
00152 #endif