#include <cstddef>
#include <iostream>

#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
 
// CUDA kernel: converts an interleaved 8-bit image to single-channel grayscale.
//
// Launch layout: a 2D grid of 2D blocks with one thread per output pixel
// (x from the grid's x dimension, y from its y dimension). Threads that land
// outside the width x height image do nothing, so the grid may overshoot.
// No shared memory is used.
//
// Parameters:
//   d_input  - device buffer of width * height * channels bytes, row-major with
//              no row padding; for channels >= 3 the first three samples of
//              each pixel are read as R, G, B (extra samples are ignored).
//   d_output - device buffer of width * height bytes receiving the gray values.
//              Must not overlap d_input.
//   width    - image width in pixels.
//   height   - image height in pixels.
//   channels - samples per input pixel (expected >= 1). With fewer than three
//              channels the first sample is copied through as the gray value.
__global__ void rgbToGrayKernel(const unsigned char* __restrict__ d_input,
                                unsigned char* __restrict__ d_output,
                                int width, int height, int channels) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) {
        return;
    }

    // Index in size_t: width * height * channels can exceed the range of int
    // for large images.
    const size_t pixel = static_cast<size_t>(y) * static_cast<size_t>(width) + static_cast<size_t>(x);
    const size_t idx = pixel * static_cast<size_t>(channels);

    if (channels < 3) {
        // No separate G/B samples exist; reading idx + 1 / idx + 2 would run
        // into the next pixel or past the end of the buffer.
        d_output[pixel] = d_input[idx];
        return;
    }

    const unsigned char r = d_input[idx];
    const unsigned char g = d_input[idx + 1];
    const unsigned char b = d_input[idx + 2];

    // Weighted sum with coefficients 0.299 / 0.587 / 0.114; the cast truncates
    // toward zero.
    d_output[pixel] = static_cast<unsigned char>(0.299f * r + 0.587f * g + 0.114f * b);
}
 
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Loads "input.jpg", converts it to grayscale on the GPU, writes the result to
// "output.jpg" and shows both images until a key is pressed.
// Returns 0 on success, -1 if the image cannot be read or a CUDA call fails.
int main() {
    // Load the image (cv::imread's default flag yields a 3-channel color image).
    cv::Mat inputImage = cv::imread("input.jpg");
    if (inputImage.empty()) {
        std::cerr << "画像が読み込めませんでした!" << std::endl;
        return -1;
    }

    // cv::imread stores color pixels in B,G,R order, while the kernel reads
    // channel 0 as R and channel 2 as B. Reorder first so the luminance
    // weights land on the right channels. The converted Mat is freshly
    // allocated, so its rows are contiguous and can be copied in one transfer.
    cv::Mat rgbImage;
    cv::cvtColor(inputImage, rgbImage, cv::COLOR_BGR2RGB);

    const int width = rgbImage.cols;
    const int height = rgbImage.rows;
    const int channels = rgbImage.channels();

    // Byte counts in size_t so that large images cannot overflow int.
    const size_t pixelCount = static_cast<size_t>(width) * static_cast<size_t>(height);
    const size_t inputBytes = pixelCount * static_cast<size_t>(channels) * sizeof(unsigned char);
    const size_t outputBytes = pixelCount * sizeof(unsigned char);

    // Create the output image
    cv::Mat outputImage(height, width, CV_8UC1);

    // Allocate GPU memory. Pointers start as nullptr so the cudaFree calls
    // below are harmless no-ops if an allocation never happened.
    unsigned char* d_input = nullptr;
    unsigned char* d_output = nullptr;
    bool ok = cudaSucceeded(cudaMalloc((void**)&d_input, inputBytes), "cudaMalloc(d_input)");
    ok = ok && cudaSucceeded(cudaMalloc((void**)&d_output, outputBytes), "cudaMalloc(d_output)");

    // Transfer the data to the GPU
    ok = ok && cudaSucceeded(
                   cudaMemcpy(d_input, rgbImage.data, inputBytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(host to device)");

    if (ok) {
        // Kernel launch configuration: 16x16 threads per block, grid rounded
        // up so every pixel is covered.
        dim3 blockSize(16, 16);
        dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);

        // Launch the kernel; launch-configuration errors only show up through
        // cudaGetLastError().
        rgbToGrayKernel<<<gridSize, blockSize>>>(d_input, d_output, width, height, channels);
        ok = cudaSucceeded(cudaGetLastError(), "rgbToGrayKernel launch");
    }

    // Transfer the result back to the host. This blocking copy also waits for
    // the kernel and reports any error raised while it was running.
    ok = ok && cudaSucceeded(
                   cudaMemcpy(outputImage.data, d_output, outputBytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(device to host)");

    // Release GPU memory before the blocking display so the device buffers are
    // not held while waiting for a key press, and are freed on failure too.
    cudaFree(d_input);
    cudaFree(d_output);

    if (!ok) {
        return -1;
    }

    // Display and save the result
    cv::imshow("Original Image", inputImage);
    cv::imshow("Grayscale Image", outputImage);
    if (!cv::imwrite("output.jpg", outputImage)) {
        std::cerr << "Failed to write output.jpg" << std::endl;
    }
    cv::waitKey(0);

    return 0;
}