Author: Greg Gutmann
Affiliation: Tokyo Institute of Technology, Nvidia University Ambassador, Nvidia DLI
Introduction
The following code demonstrates peer-to-peer memory copy in CUDA. This is intended as a simple example. For a more in depth sample see the official CUDA samples that are included with the SDK [Sample Documentation Link].
Code
Caution code plugin error: replace & with &
// P2P Test by Greg Gutmann
#include "stdio.h"
#include "stdint.h"
int main()
{
// GPUs
int gpuid_0 = 0;
int gpuid_1 = 1;
// Memory Copy Size
uint32_t size = pow(2, 26); // 2^26 = 67MB
// Allocate Memory
uint32_t* dev_0;
cudaSetDevice(gpuid_0);
cudaMalloc((void**)&dev_0, size);
uint32_t* dev_1;
cudaSetDevice(gpuid_1);
cudaMalloc((void**)&dev_1, size);
//Check for peer access between participating GPUs:
int can_access_peer_0_1;
int can_access_peer_1_0;
cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_0, gpuid_1);
cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_1, gpuid_0);
printf("cudaDeviceCanAccessPeer(%d->%d): %d\n", gpuid_0, gpuid_1, can_access_peer_0_1);
printf("cudaDeviceCanAccessPeer(%d->%d): %d\n", gpuid_1, gpuid_0, can_access_peer_1_0);
if (can_access_peer_0_1 && can_access_peer_1_0) {
// Enable P2P Access
cudaSetDevice(gpuid_0);
cudaDeviceEnablePeerAccess(gpuid_1, 0);
cudaSetDevice(gpuid_1);
cudaDeviceEnablePeerAccess(gpuid_0, 0);
}
// Init Timing Data
uint32_t repeat = 10;
cudaEvent_t start;
cudaEvent_t stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Init Stream
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
// ~~ Start Test ~~
cudaEventRecord(start, stream);
//Do a P2P memcpy
for (int i = 0; i < repeat; ++i) {
cudaMemcpyAsync(dev_0, dev_1, size, cudaMemcpyDeviceToDevice, stream);
}
cudaEventRecord(stop, stream);
cudaStreamSynchronize(stream);
// ~~ End of Test ~~
// Check Timing & Performance
float time_ms;
cudaEventElapsedTime(&time_ms, start, stop);
double time_s = time_ms / 1e3;
double gb = size * repeat / (double)1e9;
double bandwidth = gb / time_s;
printf("Seconds: %f\n", time_s);
printf("Unidirectional Bandwidth: %f (GB/s)\n", bandwidth);
if (can_access_peer_0_1 && can_access_peer_1_0) {
// Shutdown P2P Settings
cudaSetDevice(gpuid_0);
cudaDeviceDisablePeerAccess(gpuid_1);
cudaSetDevice(gpuid_1);
cudaDeviceDisablePeerAccess(gpuid_0);
}
// Clean Up
cudaFree(dev_0);
cudaFree(dev_1);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);
}
Output across NVLink
cudaDeviceCanAccessPeer(0->1) : 1
cudaDeviceCanAccessPeer(1->0) : 1
Seconds : 0.014270
Unidirectional Bandwidth : 47.028515 (GB / s)
Output across PCI-E
cudaDeviceCanAccessPeer(0->2): 0
cudaDeviceCanAccessPeer(2->0): 0
Seconds: 0.061266
Unidirectional Bandwidth: 10.953679 (GB/s)
Results From Running the CUDA p2pBandwidthLatencyTest Sample
Linux machine with four RTX 2080 Ti, with pairs of GPU using NVLink.

Contact me if you would like to use the contents of this post. Thanks.
Copyright © 2020 by Gregory Gutmann