cuRAND Usage: CUDA Feature Testing

Author: Greg Gutmann
Affiliation: Tokyo Institute of Technology, Nvidia University Ambassador, Nvidia DLI

Typical cuRAND Usage

The code on this page demonstrates one common approach to generating random numbers on GPU with CUDA using cuRAND.

This approach consists of two steps:

  • First an initialization step: launching a kernel that calls curand_init on a curandState for each thread.
  • Second generating random numbers: reading the curandState from global memory and then calling curand_uniform or one of the other distributions

cuRAND 3.1.4. Distributions: https://docs.nvidia.com/cuda/curand/device-api-overview.html

// seed - determines the starting state. Ex. Input: time(NULL)
// sequence - each sequence will produce different values. Ex. Input: global thread ID
// offset - amount to skip ahead in the random sequence. Ex. Input: 0
// state - a curandState to be initialized
__device__ void curand_init (
    unsigned long long seed, unsigned long long sequence,
    unsigned long long offset, curandState_t *state)

Output of Code Below

Timing is in milliseconds

TwoStepRandom
0.570, 0.226, 0.357, 0.557, 0.174, 0.174, 0.727, 0.154,
Elapsed time setup     2393.354
Elapsed time generate     0.140
Shannon Entropy <13.285>

As seen above, the setup {calling curand_init with a seed, sequence, and state} takes a considerable amount of time. This is likely not an issue though, as this only needs to be done once on initialization. However, the time to generate random values has been slowed down by the fact that we need to read the curandState from global memory. Page two of this post will show an approach that does not require reading the curandState from global memory, as well as not needing the initialization kernel.

Code

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <curand.h>
#include <curand_kernel.h>
#include <stdint.h>
#include <map>

#include <stdio.h>

void ShannonEntropy(int* data, int N, int& min, int& max, float& entropy);

__global__ void setup_kernel(curandState* state, uint64_t seed)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	curand_init(seed, tid, 0, &state[tid]);
}

__global__ void generate_randoms(curandState* globalState, float* randoms)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	curandState localState = globalState[tid];
	randoms[tid * 2 + 0] = curand_uniform(&localState);
	randoms[tid * 2 + 1] = curand_uniform(&localState);
}

int main()
{
	printf("\nTwoStepRandom\n");
	int threads = 256;
	int blocks = 5120;
	int threadCount = blocks * threads;
	int N = blocks * threads * 2;

	curandState* dev_curand_states;
	float* randomValues;
	float* host_randomValues;
	int* host_int;
	
	float time_elapsed_setup;
	float time_elapsed;
	cudaEvent_t startTime;
	cudaEvent_t stopTime;
	cudaStream_t computeStream;

	// Init host memory
	host_randomValues = (float*)malloc(N * sizeof(float));
	host_int = (int*)malloc(N * sizeof(float));

	// Init device memory
	cudaMalloc(&dev_curand_states, threadCount * sizeof(curandState));
	cudaMalloc(&randomValues, N * sizeof(float));

	cudaEventCreate(&startTime);
	cudaEventCreate(&stopTime);
	cudaStreamCreateWithFlags(&computeStream, cudaStreamNonBlocking);

	//  ----- Setup seeds -----
	cudaEventRecord(startTime, computeStream);
	
	setup_kernel << < blocks, threads, 0, computeStream >> > (dev_curand_states, time(NULL));
	
	cudaEventRecord(stopTime, computeStream);
	cudaEventSynchronize(stopTime);
	cudaEventElapsedTime(&time_elapsed_setup, startTime, stopTime);

	// ----- Generate random numbers -----
	cudaEventRecord(startTime, computeStream);
	
	// Needs both read and write from global memory 
	generate_randoms << < blocks, threads, 0, computeStream >> > (dev_curand_states, randomValues);

	cudaEventRecord(stopTime, computeStream);
	cudaEventSynchronize(stopTime);

	cudaEventElapsedTime(&time_elapsed, startTime, stopTime);

	// ----- Concluding Steps -----

	cudaMemcpy(host_randomValues, randomValues, N * sizeof(float), cudaMemcpyDeviceToHost);

	// Convert floats to ints for the shannnon entropy function
	for (int i = 0; i < N; ++i) {
		// Print a few values out
		if (i < 8) { printf("%.3f, ", host_randomValues[i]); }
		
		host_int[i] = (int)(host_randomValues[i] * 10000.0f);
	}

	printf("\n");
	printf("Elapsed time setup    %9.3f\n", time_elapsed_setup);
	printf("Elapsed time generate %9.3f\n", time_elapsed);

	int min, max;
	float entropy;
	ShannonEntropy(host_int, N, min, max, entropy);
	printf("Shannon Entropy <%6.3f>\n", entropy);

	cudaFree(dev_curand_states);
	cudaFree(randomValues);
	free(host_randomValues);
	free(host_int);

	return 0;
}

void ShannonEntropy(int* data, int N, int& min, int& max, float& entropy) {
	entropy = 0; // Init
	min = UINT_MAX;
	max = 0;

	std::map<int, long> counts;
	typename std::map<int, long>::iterator it;

	for (int dataIndex = 0; dataIndex < N; dataIndex++) {
		int dValue = data[dataIndex];
		if (dValue < min) {
			min = dValue;
		}
		if (dValue > max) {
			max = dValue;
		}
		counts[dValue]++;
	}

	it = counts.begin();
	while (it != counts.end()) {
		float p_x = (float)it->second / N;
		if (p_x > 0) entropy -= (float)(p_x * log(p_x) / log(2));
		it++;
	}
}

Contact me if you would like to use the contents of this post. Thanks.
Copyright © 2020 by Gregory Gutmann

Continued to Page 2

Leave a Reply

Close Menu