cuda_mersennetwisterkernel.h

00001 /*
00002  * Copyright 1993-2007 NVIDIA Corporation.  All rights reserved.
00003  *
00004  * NOTICE TO USER:
00005  *
00006  * This source code is subject to NVIDIA ownership rights under U.S. and
00007  * international Copyright laws.  Users and possessors of this source code
00008  * are hereby granted a nonexclusive, royalty-free license to use this code
00009  * in individual and commercial software.
00010  *
00011  * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
00012  * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
00013  * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
00014  * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
00015  * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
00016  * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
00017  * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
00018  * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
00019  * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
00020  * OR PERFORMANCE OF THIS SOURCE CODE.
00021  *
00022  * U.S. Government End Users.   This source code is a "commercial item" as
00023  * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
00024  * "commercial computer  software"  and "commercial computer software
00025  * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
00026  * and is provided to the U.S. Government only as a commercial end item.
00027  * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
00028  * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
00029  * source code with only those rights set forth herein.
00030  *
00031  * Any use of this source code in individual and commercial software must
00032  * include, in the user documentation and internal comments to the code,
00033  * the above Disclaimer and U.S. Government End Users Notice.
00034  */
00035 #ifndef CUDA_MERSENNETWISTERKERNEL_H_DEFINED
00036 #define CUDA_MERSENNETWISTERKERNEL_H_DEFINED
00037 
00038 #include "cuda_mersennetwister.h"
00039 #include "cudadefs.h"
00040 #include "CUDA/cutil.h"
00041 #include <cuda_runtime_api.h>
00042 
00043 
00044 __device__ static mt_struct_stripped ds_MT[MT_RNG_COUNT];
00045 static mt_struct_stripped h_MT[MT_RNG_COUNT];
00046 
00047 
00048 
00049 //Initialize/seed twister for current GPU context
00050 void cuda_c_seedMT(unsigned int seed){
00051     int i;
00052     //Need to be thread-safe
00053     mt_struct_stripped *MT = (mt_struct_stripped *)malloc(MT_RNG_COUNT * sizeof(mt_struct_stripped));
00054 
00055     for(i = 0; i < MT_RNG_COUNT; i++){
00056         MT[i]      = h_MT[i];
00057         MT[i].seed = seed;
00058     }
00059     CUDA_SAFE_CALL( cudaMemcpyToSymbol(ds_MT, MT, sizeof(h_MT)) );
00060 
00061     free(MT);
00062 }
00063 
00064 
00065 ////////////////////////////////////////////////////////////////////////////////
00066 // Write MT_RNG_COUNT vertical lanes of NPerRng random numbers to *d_Random.
00067 // For coalesced global writes MT_RNG_COUNT should be a multiple of warp size.
00068 // Initial states for each generator are the same, since the states are
00069 // initialized from the global seed. In order to improve distribution properties
00070 // on small NPerRng supply dedicated (local) seed to each twister.
00071 // The local seeds, in their turn, can be extracted from global seed
00072 // by means of any simple random number generator, like LCG.
00073 ////////////////////////////////////////////////////////////////////////////////
00074 __global__ void cuda_global_randomMT(
00075     float *d_Random,
00076     int NPerRng
00077 ){
00078     const int      tid = blockDim.x * blockIdx.x + threadIdx.x;
00079     const int THREAD_N = blockDim.x * gridDim.x;
00080 
00081     int iState, iState1, iStateM, iOut;
00082     unsigned int mti, mti1, mtiM, x;
00083     unsigned int mt[MT_NN];
00084 
00085     for(int iRng = tid; iRng < MT_RNG_COUNT; iRng += THREAD_N){
00086         //Load bit-vector Mersenne Twister parameters
00087         mt_struct_stripped config = ds_MT[iRng];
00088 
00089         //Initialize current state
00090         mt[0] = config.seed;
00091         for(iState = 1; iState < MT_NN; iState++)
00092             mt[iState] = (1812433253U * (mt[iState - 1] ^ (mt[iState - 1] >> 30)) + iState) & MT_WMASK;
00093 
00094         iState = 0;
00095         mti1 = mt[0];
00096         for(iOut = 0; iOut < NPerRng; iOut++){
00097             //iState1 = (iState +     1) % MT_NN
00098             //iStateM = (iState + MT_MM) % MT_NN
00099             iState1 = iState + 1;
00100             iStateM = iState + MT_MM;
00101             if(iState1 >= MT_NN) iState1 -= MT_NN;
00102             if(iStateM >= MT_NN) iStateM -= MT_NN;
00103             mti  = mti1;
00104             mti1 = mt[iState1];
00105             mtiM = mt[iStateM];
00106 
00107             x    = (mti & MT_UMASK) | (mti1 & MT_LMASK);
00108             x    =  mtiM ^ (x >> 1) ^ ((x & 1) ? config.matrix_a : 0);
00109             mt[iState] = x;
00110             iState = iState1;
00111 
00112             //Tempering transformation
00113             x ^= (x >> MT_SHIFT0);
00114             x ^= (x << MT_SHIFTB) & config.mask_b;
00115             x ^= (x << MT_SHIFTC) & config.mask_c;
00116             x ^= (x >> MT_SHIFT1);
00117 
00118             //Convert to (0, 1] float and write to global memory
00119             d_Random[iRng + iOut * MT_RNG_COUNT] = ((float)x + 1.0f) / 4294967296.0f;
00120         }
00121     }
00122 }
00123 
00124 
00125 
00126 ////////////////////////////////////////////////////////////////////////////////
00127 // Transform each of MT_RNG_COUNT lanes of NPerRng uniformly distributed
00128 // random samples, produced by RandomGPU(), to normally distributed lanes
00129 // using Cartesian form of Box-Muller transformation.
00130 // NPerRng must be even.
00131 ////////////////////////////////////////////////////////////////////////////////
00132 
00133 __device__ void BoxMuller(float& u1, float& u2){
00134     float   r = sqrtf(-2.0f * logf(u1));
00135     float phi = 2 * PI * u2;
00136     u1 = r * __cosf(phi);
00137     u2 = r * __sinf(phi);
00138 }
00139 
00140 __global__ void BoxMullerGPU(float *d_Random, int NPerRng){
00141     const int      tid = blockDim.x * blockIdx.x + threadIdx.x;
00142     const int THREAD_N = blockDim.x * gridDim.x;
00143 
00144     for(int iRng = tid; iRng < MT_RNG_COUNT; iRng += THREAD_N)
00145         for(int iOut = 0; iOut < NPerRng; iOut += 2)
00146             BoxMuller(
00147                 d_Random[iRng + (iOut + 0) * MT_RNG_COUNT],
00148                 d_Random[iRng + (iOut + 1) * MT_RNG_COUNT]
00149             );
00150 }
00151 
00152 #endif