Untitled

# cat t1.cu
#include <cuda_runtime.h>
#include <iostream>
#include <omp.h>

// Kernel that prints the ID of the device it is currently executing on.
//
// Launch layout: every launched thread prints its own line, so the caller in
// this file launches it as <<<1, 1>>>. No shared memory is used and there are
// no parameters.
//
// The device ID is queried with the device-side runtime call cudaGetDevice();
// device-side runtime calls need relocatable device code, which is why the
// accompanying build command passes -rdc=true.
__global__ void printGPUDeviceID() {
    // Start from a sentinel so a failed query can never print an indeterminate value.
    int deviceID = -1;
    cudaError_t status = cudaGetDevice(&deviceID);
    if (status != cudaSuccess) {
        // Report the raw error code and skip the normal line instead of printing garbage.
        printf("cudaGetDevice failed in the kernel (error code %d)\n", static_cast<int>(status));
        return;
    }
    printf("Device ID from the kernel: %d\n", deviceID);
}

// Prints a diagnostic for a failed CUDA runtime call made on behalf of
// `device_id` and returns true when `status` is an error, false on success.
static bool cudaCallFailed(cudaError_t status, int device_id) {
    if (status == cudaSuccess) {
        return false;
    }
    printf("CUDA error on device %d: %s\n", device_id, cudaGetErrorString(status));
    return true;
}

// Entry point: runs printGPUDeviceID once on every visible GPU, driving the
// devices from OpenMP host threads, and prints the device ID as seen from the
// host and from the kernel.
//
// Returns 0 when every device completed without a CUDA error, 1 when fewer
// than two devices are available, the device count cannot be queried, or any
// per-device CUDA call fails.
int main() {
    // Get the number of available devices; initialize so a failed query is never read as garbage.
    int num_devices = 0;
    cudaError_t count_status = cudaGetDeviceCount(&num_devices);
    if (count_status != cudaSuccess) {
        std::cout << "cudaGetDeviceCount failed: " << cudaGetErrorString(count_status) << std::endl;
        return 1;
    }
    if (num_devices < 2) {
        std::cout << "This example requires at least two GPUs." << std::endl;
        return 1;
    }

    // Number of devices on which some CUDA call failed (summed across threads).
    int failures = 0;

    // Iterate over devices rather than over threads: num_threads() is only a
    // request, so if the OpenMP runtime delivers a smaller team the remaining
    // devices are still visited. With a full team, schedule(static, 1) gives
    // iteration i to thread i, i.e. one device per thread.
    #pragma omp parallel for num_threads(num_devices) schedule(static, 1) reduction(+ : failures)
    for (int device_id = 0; device_id < num_devices; ++device_id) {
        int thread_id = omp_get_thread_num();  // Get the OpenMP thread ID

        // Set the current device for this host thread
        if (cudaCallFailed(cudaSetDevice(device_id), device_id)) {
            ++failures;
            continue;
        }

        // Get and print the device ID from the host
        int deviceIDFromHost = -1;
        if (cudaCallFailed(cudaGetDevice(&deviceIDFromHost), device_id)) {
            ++failures;
            continue;
        }
        printf("Device ID from the host (thread %d): %d\n", thread_id, deviceIDFromHost);

        // Launch a kernel to print the device ID from the GPU
        printGPUDeviceID<<<1, 1>>>();

        // Launch-time problems (e.g. a bad configuration) are reported via cudaGetLastError()
        if (cudaCallFailed(cudaGetLastError(), device_id)) {
            ++failures;
            continue;
        }

        // Wait for the GPU to finish; execution-time faults surface in this return value
        if (cudaCallFailed(cudaDeviceSynchronize(), device_id)) {
            ++failures;
            continue;
        }
    }

    return failures == 0 ? 0 : 1;
}
# nvcc -Xcompiler -fopenmp t1.cu -o t1 -lgomp -rdc=true 
# OMP_NUM_THREADS=2 ./t1
Device ID from the host (thread 0): 0
Device ID from the host (thread 1): 1
Device ID from the kernel: 1
Device ID from the kernel: 0
#
Editor is loading...