00001 /* 00002 * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. 00003 * 00004 * NOTICE TO USER: 00005 * 00006 * This source code is subject to NVIDIA ownership rights under U.S. and 00007 * international Copyright laws. Users and possessors of this source code 00008 * are hereby granted a nonexclusive, royalty-free license to use this code 00009 * in individual and commercial software. 00010 * 00011 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 00012 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 00013 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 00014 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 00015 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 00016 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 00017 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 00018 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 00019 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 00020 * OR PERFORMANCE OF THIS SOURCE CODE. 00021 * 00022 * U.S. Government End Users. This source code is a "commercial item" as 00023 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 00024 * "commercial computer software" and "commercial computer software 00025 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 00026 * and is provided to the U.S. Government only as a commercial end item. 00027 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 00028 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 00029 * source code with only those rights set forth herein. 00030 * 00031 * Any use of this source code in individual and commercial software must 00032 * include, in the user documentation and internal comments to the code, 00033 * the above Disclaimer and U.S. Government End Users Notice. 00034 */ 00035 #ifndef CUDA_MERSENNETWISTERKERNEL_H_DEFINED 00036 #define CUDA_MERSENNETWISTERKERNEL_H_DEFINED 00037 00038 #include "cuda_mersennetwister.h" 00039 #include "cudadefs.h" 00040 #include "CUDA/cutil.h" 00041 #include <cuda_runtime_api.h> 00042 00043 00044 __device__ static mt_struct_stripped ds_MT[MT_RNG_COUNT]; 00045 static mt_struct_stripped h_MT[MT_RNG_COUNT]; 00046 00047 00048 00049 //Initialize/seed twister for current GPU context 00050 void cuda_c_seedMT(unsigned int seed){ 00051 int i; 00052 //Need to be thread-safe 00053 mt_struct_stripped *MT = (mt_struct_stripped *)malloc(MT_RNG_COUNT * sizeof(mt_struct_stripped)); 00054 00055 for(i = 0; i < MT_RNG_COUNT; i++){ 00056 MT[i] = h_MT[i]; 00057 MT[i].seed = seed; 00058 } 00059 CUDA_SAFE_CALL( cudaMemcpyToSymbol(ds_MT, MT, sizeof(h_MT)) ); 00060 00061 free(MT); 00062 } 00063 00064 00065 //////////////////////////////////////////////////////////////////////////////// 00066 // Write MT_RNG_COUNT vertical lanes of NPerRng random numbers to *d_Random. 00067 // For coalesced global writes MT_RNG_COUNT should be a multiple of warp size. 00068 // Initial states for each generator are the same, since the states are 00069 // initialized from the global seed. In order to improve distribution properties 00070 // on small NPerRng supply dedicated (local) seed to each twister. 00071 // The local seeds, in their turn, can be extracted from global seed 00072 // by means of any simple random number generator, like LCG. 00073 //////////////////////////////////////////////////////////////////////////////// 00074 __global__ void cuda_global_randomMT( 00075 float *d_Random, 00076 int NPerRng 00077 ){ 00078 const int tid = blockDim.x * blockIdx.x + threadIdx.x; 00079 const int THREAD_N = blockDim.x * gridDim.x; 00080 00081 int iState, iState1, iStateM, iOut; 00082 unsigned int mti, mti1, mtiM, x; 00083 unsigned int mt[MT_NN]; 00084 00085 for(int iRng = tid; iRng < MT_RNG_COUNT; iRng += THREAD_N){ 00086 //Load bit-vector Mersenne Twister parameters 00087 mt_struct_stripped config = ds_MT[iRng]; 00088 00089 //Initialize current state 00090 mt[0] = config.seed; 00091 for(iState = 1; iState < MT_NN; iState++) 00092 mt[iState] = (1812433253U * (mt[iState - 1] ^ (mt[iState - 1] >> 30)) + iState) & MT_WMASK; 00093 00094 iState = 0; 00095 mti1 = mt[0]; 00096 for(iOut = 0; iOut < NPerRng; iOut++){ 00097 //iState1 = (iState + 1) % MT_NN 00098 //iStateM = (iState + MT_MM) % MT_NN 00099 iState1 = iState + 1; 00100 iStateM = iState + MT_MM; 00101 if(iState1 >= MT_NN) iState1 -= MT_NN; 00102 if(iStateM >= MT_NN) iStateM -= MT_NN; 00103 mti = mti1; 00104 mti1 = mt[iState1]; 00105 mtiM = mt[iStateM]; 00106 00107 x = (mti & MT_UMASK) | (mti1 & MT_LMASK); 00108 x = mtiM ^ (x >> 1) ^ ((x & 1) ? config.matrix_a : 0); 00109 mt[iState] = x; 00110 iState = iState1; 00111 00112 //Tempering transformation 00113 x ^= (x >> MT_SHIFT0); 00114 x ^= (x << MT_SHIFTB) & config.mask_b; 00115 x ^= (x << MT_SHIFTC) & config.mask_c; 00116 x ^= (x >> MT_SHIFT1); 00117 00118 //Convert to (0, 1] float and write to global memory 00119 d_Random[iRng + iOut * MT_RNG_COUNT] = ((float)x + 1.0f) / 4294967296.0f; 00120 } 00121 } 00122 } 00123 00124 00125 00126 //////////////////////////////////////////////////////////////////////////////// 00127 // Transform each of MT_RNG_COUNT lanes of NPerRng uniformly distributed 00128 // random samples, produced by RandomGPU(), to normally distributed lanes 00129 // using Cartesian form of Box-Muller transformation. 00130 // NPerRng must be even. 00131 //////////////////////////////////////////////////////////////////////////////// 00132 00133 __device__ void BoxMuller(float& u1, float& u2){ 00134 float r = sqrtf(-2.0f * logf(u1)); 00135 float phi = 2 * PI * u2; 00136 u1 = r * __cosf(phi); 00137 u2 = r * __sinf(phi); 00138 } 00139 00140 __global__ void BoxMullerGPU(float *d_Random, int NPerRng){ 00141 const int tid = blockDim.x * blockIdx.x + threadIdx.x; 00142 const int THREAD_N = blockDim.x * gridDim.x; 00143 00144 for(int iRng = tid; iRng < MT_RNG_COUNT; iRng += THREAD_N) 00145 for(int iOut = 0; iOut < NPerRng; iOut += 2) 00146 BoxMuller( 00147 d_Random[iRng + (iOut + 0) * MT_RNG_COUNT], 00148 d_Random[iRng + (iOut + 1) * MT_RNG_COUNT] 00149 ); 00150 } 00151 00152 #endif