Image conv cuda
unknown
c_cpp
2 years ago
3.9 kB
4
Indexable
#include <cstddef>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <cuda_runtime.h>

// Evaluates a CUDA runtime call and throws std::runtime_error (with the call
// text, source location and CUDA error string) if it did not return
// cudaSuccess. Throwing instead of aborting lets the owning smart pointers
// release device memory while the stack unwinds.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        const cudaError_t cuda_status_ = (call);                              \
        if (cuda_status_ != cudaSuccess) {                                    \
            throw std::runtime_error(std::string(#call) + " failed at " +     \
                                     __FILE__ + ":" +                         \
                                     std::to_string(__LINE__) + ": " +        \
                                     cudaGetErrorString(cuda_status_));       \
        }                                                                     \
    } while (0)

// A single-channel image stored as a flat, row-major array of floats.
class Image {
public:
    // Creates a width x height image with every pixel value-initialized to 0.
    // The element count is computed in size_t so that large dimensions do not
    // overflow int before the vector is sized.
    Image(int width, int height)
        : width(width),
          height(height),
          data(static_cast<std::size_t>(width) * static_cast<std::size_t>(height)) {}

    // Dimensions the image was constructed with.
    int get_width() const { return width; }
    int get_height() const { return height; }

    // Read-only access to the flat pixel buffer.
    const std::vector<float>& get_data() const { return data; }

    // Replaces the pixel buffer with a copy of flat_data. The size is not
    // validated against width * height; callers are responsible for passing
    // a buffer of the right length.
    void set_data_from_flattened(const std::vector<float>& flat_data) { data = flat_data; }

private:
    int width, height;
    std::vector<float> data;
};

// A batch of images that can be flattened into one contiguous buffer.
class ImageVector : public std::vector<Image> {
public:
    // Creates numImages copies of img.
    ImageVector(int numImages, const Image& img) : std::vector<Image>(numImages, img) {}

    // Concatenates the pixel buffers of all images, in order, into a single
    // vector (image 0 first, then image 1, ...).
    std::vector<float> to_flat() const {
        std::size_t total = 0;
        for (const auto& img : *this) {
            total += img.get_data().size();
        }

        std::vector<float> flattened;
        flattened.reserve(total);  // one allocation instead of repeated growth
        for (const auto& img : *this) {
            const auto& img_data = img.get_data();
            flattened.insert(flattened.end(), img_data.begin(), img_data.end());
        }
        return flattened;
    }
};

// Deleter that releases memory obtained from cudaMalloc. The cudaFree result
// is intentionally ignored: a deleter must not throw.
template<typename T>
struct DeviceDeleter {
    void operator()(T* ptr) const { cudaFree(ptr); }
};

// Owning pointer to device memory; frees it with cudaFree on destruction.
template<typename T>
using DeviceUniquePtr = std::unique_ptr<T, DeviceDeleter<T>>;

// Allocates room for `count` elements of T on the device and returns an
// owning pointer. The raw pointer is filled in by cudaMalloc first and only
// then handed to the unique_ptr; writing through the address of the
// unique_ptr object itself would be undefined behavior.
template<typename T>
static DeviceUniquePtr<T> make_device_buffer(std::size_t count) {
    T* raw = nullptr;
    CUDA_CHECK(cudaMalloc(&raw, count * sizeof(T)));
    return DeviceUniquePtr<T>(raw);
}

// Batched 2D sliding-window filter without padding ("valid" region only) and
// without flipping the kernel.
//
// Launch layout: one thread per output pixel in x/y, one grid z-slice per
// image (img = blockIdx.z). Threads outside the output region, or in a
// z-slice beyond numImages, do nothing. No shared memory is used.
//
//   input   numImages images of width x height floats, row-major, back to back
//   kernel  kernelSize x kernelSize filter weights, row-major
//   output  numImages images of (width - kernelSize + 1) x
//           (height - kernelSize + 1) floats, row-major, back to back
__global__ void conv2d(const float* input, const float* kernel, float* output,
                       int width, int height, int kernelSize, int numImages) {
    const int outWidth = width - kernelSize + 1;
    const int outHeight = height - kernelSize + 1;

    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int img = blockIdx.z;

    if (img >= numImages || col >= outWidth || row >= outHeight) {
        return;
    }

    // Per-image base offset is computed in size_t so large batches do not
    // overflow int.
    const float* image = input + static_cast<size_t>(img) * width * height;

    float sum = 0.0f;
    for (int i = 0; i < kernelSize; ++i) {
        for (int j = 0; j < kernelSize; ++j) {
            sum += image[(row + i) * width + (col + j)] * kernel[i * kernelSize + j];
        }
    }

    output[(static_cast<size_t>(img) * outHeight + row) * outWidth + col] = sum;
}

// Builds a small batch of images, runs conv2d on the GPU and unpacks the
// result into an ImageVector. Returns EXIT_FAILURE if the problem size is
// invalid or any CUDA call fails.
int main() {
    const int width = 8;
    const int height = 8;
    const int kernelSize = 3;
    const int numImages = 3;

    const int outWidth = width - kernelSize + 1;
    const int outHeight = height - kernelSize + 1;
    if (numImages <= 0 || kernelSize <= 0 || outWidth <= 0 || outHeight <= 0) {
        std::cerr << "Invalid problem size: the kernel must fit inside the image" << std::endl;
        return EXIT_FAILURE;
    }

    const std::size_t inputCount = static_cast<std::size_t>(numImages) * width * height;
    const std::size_t kernelCount = static_cast<std::size_t>(kernelSize) * kernelSize;
    const std::size_t outputImageSize = static_cast<std::size_t>(outWidth) * outHeight;
    const std::size_t outputCount = numImages * outputImageSize;

    try {
        ImageVector input_images(numImages, Image(width, height));
        // Fill input_images with data
        // ...

        std::vector<float> h_input_flat = input_images.to_flat();
        if (h_input_flat.size() != inputCount) {
            // Guards the host-to-device copy below against a mis-sized image.
            throw std::runtime_error("flattened input does not match numImages * width * height");
        }

        std::vector<float> h_kernel(kernelCount);
        // Fill h_kernel with data
        // ...

        std::vector<float> h_output_flat(outputCount);

        DeviceUniquePtr<float> d_input = make_device_buffer<float>(inputCount);
        DeviceUniquePtr<float> d_kernel = make_device_buffer<float>(kernelCount);
        DeviceUniquePtr<float> d_output = make_device_buffer<float>(outputCount);

        CUDA_CHECK(cudaMemcpy(d_input.get(), h_input_flat.data(),
                              inputCount * sizeof(float), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_kernel.get(), h_kernel.data(),
                              kernelCount * sizeof(float), cudaMemcpyHostToDevice));

        // One thread per output pixel; the grid covers the output extent and
        // uses one z-slice per image.
        const dim3 block(16, 16);
        const dim3 grid((outWidth + block.x - 1) / block.x,
                        (outHeight + block.y - 1) / block.y,
                        static_cast<unsigned int>(numImages));
        conv2d<<<grid, block>>>(d_input.get(), d_kernel.get(), d_output.get(),
                                width, height, kernelSize, numImages);
        CUDA_CHECK(cudaGetLastError());  // reports an invalid launch configuration

        // Blocking copy on the default stream: it waits for the kernel and
        // also surfaces any error raised while the kernel was running.
        CUDA_CHECK(cudaMemcpy(h_output_flat.data(), d_output.get(),
                              outputCount * sizeof(float), cudaMemcpyDeviceToHost));

        // Transfer the output data back into ImageVector
        ImageVector output_images(numImages, Image(outWidth, outHeight));
        auto first = h_output_flat.cbegin();
        for (int img = 0; img < numImages; ++img) {
            const auto last = first + static_cast<std::ptrdiff_t>(outputImageSize);
            output_images[img].set_data_from_flattened(std::vector<float>(first, last));
            first = last;
        }

        // Now output_images contains the result of the convolution
        // ...
    } catch (const std::exception& e) {
        std::cerr << e.what() << std::endl;
        return EXIT_FAILURE;
    }

    return 0;
}
Editor is loading...