Image conv cuda
unknown
c_cpp
3 years ago
3.9 kB
9
Indexable
#include <iostream>
#include <vector>
#include <memory>
#include <cuda_runtime.h>
// A single-channel image stored as a flat buffer of floats.
//
// The constructor sizes the buffer to width * height elements
// (value-initialized to 0.0f). The dimensions are recorded but the class
// exposes no accessor for them.
class Image {
public:
    // Creates an image whose pixel buffer holds width * height floats.
    Image(int width, int height)
        : width_(width),
          height_(height),
          pixels_(width * height) {}

    // Read-only view of the underlying flat pixel buffer.
    const std::vector<float>& get_data() const { return pixels_; }

    // Replaces the whole pixel buffer with a copy of flat_data.
    // No size check is performed: the buffer takes on flat_data's length.
    void set_data_from_flattened(const std::vector<float>& flat_data) {
        pixels_ = flat_data;
    }

private:
    int width_;
    int height_;
    std::vector<float> pixels_;
};
// A batch of images, implemented as a std::vector<Image> with a helper that
// packs every image's pixels into one contiguous buffer.
class ImageVector : public std::vector<Image> {
public:
    // Creates a batch of numImages copies of img.
    ImageVector(int numImages, const Image& img) : std::vector<Image>(numImages, img) {}

    // Returns the pixel buffers of all images concatenated in element order
    // (image 0 first, then image 1, ...). Returns an empty vector for an
    // empty batch.
    std::vector<float> to_flat() const {
        // Measure first so the result is allocated exactly once instead of
        // being regrown on every insert.
        std::vector<float>::size_type total = 0;
        for (const auto& img : *this) {
            total += img.get_data().size();
        }

        std::vector<float> flattened;
        flattened.reserve(total);
        for (const auto& img : *this) {
            const auto& img_data = img.get_data();
            flattened.insert(flattened.end(), img_data.begin(), img_data.end());
        }
        return flattened;
    }
};
// Deleter functor for std::unique_ptr that releases a pointer with cudaFree.
template <typename T>
struct DeviceDeleter {
    void operator()(T* devicePtr) const {
        // The status returned by cudaFree is deliberately discarded here.
        static_cast<void>(cudaFree(devicePtr));
    }
};

// Owning smart pointer whose pointee is released with cudaFree on destruction.
template <typename T>
using DeviceUniquePtr = std::unique_ptr<T, DeviceDeleter<T>>;
// Batched 2D sliding-window filter without padding.
//
// For every image in the batch, each output element is the sum of products of
// a kernelSize x kernelSize window of the input with the filter. The filter is
// applied as stored (it is not flipped).
//
// Parameters:
//   input      - numImages images of width * height floats, row-major, back to back.
//   kernel     - kernelSize * kernelSize filter weights, row-major.
//   output     - numImages images of (width - kernelSize + 1) *
//                (height - kernelSize + 1) floats, row-major, back to back.
//   width      - input image width in elements.
//   height     - input image height in elements.
//   kernelSize - side length of the square filter.
//   numImages  - number of images in the batch.
//
// Launch layout: one thread per output element; x indexes output columns,
// y indexes output rows, blockIdx.z indexes the image. Threads falling
// outside the output extent or beyond numImages do nothing, so the grid may
// be larger than the problem. No shared memory is used.
__global__ void conv2d(const float* input, const float* kernel, float* output, int width, int height, int kernelSize, int numImages) {
    const int outWidth = width - kernelSize + 1;
    const int outHeight = height - kernelSize + 1;

    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int img = blockIdx.z;

    // Guard all three dimensions; when the filter is larger than the image
    // the output extent is non-positive and every thread exits here.
    if (img >= numImages || col >= outWidth || row >= outHeight) {
        return;
    }

    // Offsets are computed in size_t so large batches do not overflow int.
    const size_t imageBase = static_cast<size_t>(img) * width * height;

    float sum = 0.0f;
    for (int i = 0; i < kernelSize; ++i) {
        const size_t rowBase = imageBase + static_cast<size_t>(row + i) * width + col;
        for (int j = 0; j < kernelSize; ++j) {
            sum += input[rowBase + j] * kernel[i * kernelSize + j];
        }
    }

    const size_t outIndex = (static_cast<size_t>(img) * outHeight + row) * outWidth + col;
    output[outIndex] = sum;
}
int main() {
int width = 8;
int height = 8;
int kernelSize = 3;
int numImages = 3;
ImageVector input_images(numImages, Image(width, height));
// Fill input_images with data
// ...
std::vector<float> h_input_flat = input_images.to_flat();
std::vector<float> h_kernel(kernelSize * kernelSize);
// Fill h_kernel with data
// ...
std::vector<float> h_output_flat(numImages * (width - kernelSize + 1) * (height - kernelSize + 1));
DeviceUniquePtr<float> d_input;
DeviceUniquePtr<float> d_kernel;
DeviceUniquePtr<float> d_output;
cudaMalloc((void**)&d_input, numImages * width * height * sizeof(float));
cudaMalloc((void**)&d_kernel, kernelSize * kernelSize * sizeof(float));
cudaMalloc((void**)&d_output, numImages * (width - kernelSize + 1) * (height - kernelSize + 1) * sizeof(float));
cudaMemcpy(d_input.get(), h_input_flat.data(), numImages * width * height * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel.get(), h_kernel.data(), kernelSize * kernelSize * sizeof(float), cudaMemcpyHostToDevice);
dim3 blockDim(16, 16);
dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y, numImages);
conv2d<<<gridDim, blockDim>>>(d_input.get(), d_kernel.get(), d_output.get(), width, height, kernelSize, numImages);
cudaMemcpy(h_output_flat.data(), d_output.get(), numImages * (width - kernelSize + 1) * (height - kernelSize + 1) * sizeof(float), cudaMemcpyDeviceToHost);
// Transfer the output data back into ImageVector
ImageVector output_images(numImages, Image(width - kernelSize + 1, height - kernelSize + 1));
int output_image_size = (width - kernelSize + 1) * (height - kernelSize + 1);
for (int img = 0; img < numImages; ++img) {
std::vector<float> img_data(h_output_flat.begin() + img * output_image_size, h_output_flat.begin() + (img + 1) * output_image_size);
output_images[img].set_data_from_flattened(img_data);
}
// Now output_images contains the result of the convolution
// ...
return 0;
}
Editor is loading...