Peer-to-peer Memory Copy with NVLink: CUDA Feature Testing

Author: Greg Gutmann
Affiliation: Tokyo Institute of Technology, Nvidia University Ambassador, Nvidia DLI

Introduction

The following code demonstrates peer-to-peer (P2P) memory copy in CUDA. It is intended as a simple example; for a more in-depth sample, see the official CUDA samples included with the SDK [Sample Documentation Link].

Code

// P2P Test by Greg Gutmann

#include "stdio.h"
#include "stdint.h"

int main()
{
	// GPUs
	int gpuid_0 = 0;
	int gpuid_1 = 1;

	// Memory Copy Size
	uint32_t size = 1 << 26; // 2^26 bytes = 67,108,864 bytes (~67 MB)

	// Allocate Memory
	uint32_t* dev_0;
	cudaSetDevice(gpuid_0);
	cudaMalloc((void**)&dev_0, size);

	uint32_t* dev_1;
	cudaSetDevice(gpuid_1);
	cudaMalloc((void**)&dev_1, size);

	// Check for peer access between participating GPUs:
	int can_access_peer_0_1;
	int can_access_peer_1_0;
	cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_0, gpuid_1);
	cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_1, gpuid_0);
	printf("cudaDeviceCanAccessPeer(%d->%d): %d\n", gpuid_0, gpuid_1, can_access_peer_0_1);
	printf("cudaDeviceCanAccessPeer(%d->%d): %d\n", gpuid_1, gpuid_0, can_access_peer_1_0);

	if (can_access_peer_0_1 && can_access_peer_1_0) {
		// Enable P2P Access
		cudaSetDevice(gpuid_0);
		cudaDeviceEnablePeerAccess(gpuid_1, 0);
		cudaSetDevice(gpuid_1);
		cudaDeviceEnablePeerAccess(gpuid_0, 0);
	}

	// Init Timing Data
	uint32_t repeat = 10;
	cudaEvent_t start;
	cudaEvent_t stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// Init Stream
	cudaStream_t stream;
	cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

	// ~~ Start Test ~~
	cudaEventRecord(start, stream);

	// Do a P2P memcpy
	for (uint32_t i = 0; i < repeat; ++i) {
		cudaMemcpyAsync(dev_0, dev_1, size, cudaMemcpyDeviceToDevice, stream);
	}

	cudaEventRecord(stop, stream);
	cudaStreamSynchronize(stream);
	// ~~ End of Test ~~

	// Check Timing & Performance
	float time_ms;
	cudaEventElapsedTime(&time_ms, start, stop);
	double time_s = time_ms / 1e3;

	double gb = (double)size * repeat / 1e9;
	double bandwidth = gb / time_s;

	printf("Seconds: %f\n", time_s);
	printf("Unidirectional Bandwidth: %f (GB/s)\n", bandwidth);

	if (can_access_peer_0_1 && can_access_peer_1_0) {
		// Shutdown P2P Settings
		cudaSetDevice(gpuid_0);
		cudaDeviceDisablePeerAccess(gpuid_1);
		cudaSetDevice(gpuid_1);
		cudaDeviceDisablePeerAccess(gpuid_0);
	}

	// Clean Up
	cudaFree(dev_0);
	cudaFree(dev_1);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaStreamDestroy(stream);
	return 0;
}
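
The listing above omits error checking for brevity. In practice each runtime call is worth wrapping; below is a minimal sketch of such a wrapper (the CHECK_CUDA name is my own, not from the original listing, and it additionally needs #include <stdlib.h> for exit):

// Minimal error-checking macro: wraps a CUDA runtime call and aborts
// with the API's error string if the call does not return cudaSuccess.
#define CHECK_CUDA(call) \
	do { \
		cudaError_t err_ = (call); \
		if (err_ != cudaSuccess) { \
			printf("CUDA error: %s at %s:%d\n", \
				cudaGetErrorString(err_), __FILE__, __LINE__); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

// Usage: CHECK_CUDA(cudaDeviceEnablePeerAccess(gpuid_1, 0));

Note that CUDA also provides cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream), which names the source and destination devices explicitly instead of relying on cudaMemcpyDeviceToDevice with pointer-based device inference.
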
Output across NVLink

cudaDeviceCanAccessPeer(0->1): 1
cudaDeviceCanAccessPeer(1->0): 1
Seconds: 0.014270
Unidirectional Bandwidth: 47.028515 (GB/s)
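
As a sanity check, the bandwidth figure follows directly from the numbers above: 2^26 bytes × 10 repeats = 671,088,640 bytes ≈ 0.671 GB, and 0.671 GB / 0.014270 s ≈ 47.0 GB/s.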

Output across PCI-E

cudaDeviceCanAccessPeer(0->2): 0
cudaDeviceCanAccessPeer(2->0): 0
Seconds: 0.061266
Unidirectional Bandwidth: 10.953679 (GB/s)
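
Here cudaDeviceCanAccessPeer returned 0 for the pair, so peer access was never enabled and the device-to-device copy instead gets staged through system memory over PCI-E; the same arithmetic (0.671 GB / 0.061266 s) gives the roughly 11 GB/s shown.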

Results From Running the CUDA p2pBandwidthLatencyTest Sample

Linux machine with four RTX 2080 Ti GPUs, with pairs of GPUs connected by NVLink.
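
For reference, the p2pBandwidthLatencyTest sample reports matrices of pairwise unidirectional and bidirectional bandwidth as well as latency, with P2P both disabled and enabled, which makes the NVLink-connected pairs easy to spot.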

Contact me if you would like to use the contents of this post. Thanks.
Copyright © 2020 by Gregory Gutmann
