OpenCV 的 inpaint 函数只有 CPU 接口。当图片处理流程需要在 CUDA 上进行时,数据在内存和显存之间反复拷贝会影响计算速度。在对填充效果要求不太高的情况下,可以使用下面的 CUDA 算法做简单的填充。
算法来源
算法主要参考了https://github.com/Po-Ting-lin/HairRemoval.git 中的填充代码。该项目主要是去除皮肤上的毛发。对于其如何寻找需要填充的区域就不讨论了,直接使用其分析出的mask图进行填充。
该填充算法对细线和小面积区域的填充效果还可以,对大面积区域的填充效果就很一般了。
1.需要填充的图片 和 mask图
2.inpaint类
使用 InpaintCUDA 类实现 CUDA inpaint 算法,包括 inpaintcuda.h、inpaintcuda.cpp 和 Inpaintkernel.cu 3 个文件。
说明:
类初始化时 填入图片的大小 和 通道。支持单通道或3通道。
函数 inpaint 进行图片填充
src- 需要填充的原图 支持8BIT图 或 16BIT图
mask – 要填充的区域,8bit 单通道图。像素值为 255 的位置会被填充,其余位置保持原图数值
dst – 填充好的图片 是float类型
iters – 迭代次数。每次迭代内部会对每个通道执行 500 次扩散核函数,一般填 1 就够了;填 2 就是 1000 次,以此类推
inpaintcuda.h
//inpaintcuda.h
#ifndef INPAINTCUDA_H
#define INPAINTCUDA_H
#include <opencv2/core.hpp>
#define INPAINT_TILE_X 32
#define INPAINT_TILE_Y 16
#define INPAINT_UNROLL_Y 1
/**
 * GPU inpainting helper that fills masked pixels by repeatedly running a
 * heat-diffusion kernel on each channel of an image.
 *
 * The object owns three device buffers sized for a cols x rows x channels
 * image. They are allocated in the constructor and released in the
 * destructor, so one instance can be reused for many images of that size.
 *
 * Instances are non-copyable: an implicit copy would duplicate the raw device
 * pointers and both objects would free them.
 */
class InpaintCUDA
{
public:
    // cols/rows/channels describe the images later passed to inpaint().
    InpaintCUDA(int cols,int rows, int channels);

    // Copying would lead to a double cudaFree of the owned buffers.
    InpaintCUDA(const InpaintCUDA&) = delete;
    InpaintCUDA& operator=(const InpaintCUDA&) = delete;

    // Fills the masked pixels of src and writes the float result to dst.
    // iters is the number of calls to _pdeHeatDiffusion().
    void inpaint(cv::cuda::GpuMat& src, cv::cuda::GpuMat& mask, cv::cuda::GpuMat& dst ,int iters);

    // Runs the diffusion kernel on every channel plane of d_dst, using
    // d_normalized_mask to decide which pixels may be modified.
    void _pdeHeatDiffusion(uint8_t* d_normalized_mask, float* d_normalized_src, float* d_dst);

    ~InpaintCUDA();
private:
    int cols = 0;       // image width in pixels
    int rows = 0;       // image height in pixels
    int channels = 0;   // number of image channels
    int iters = 0;      // iteration count given to the most recent inpaint() call

    // Device buffer holding the 0/1 mask produced by inpaint().
    uint8_t* d_normalized_mask = nullptr;
    // Device buffer holding the normalized source, one cols*rows float plane per channel.
    float* d_normalized_src = nullptr;
    // Device working buffer (same layout) that the diffusion updates in place.
    float* d_normalized_masked_src_temp = nullptr;

    // Integer division of a by b, rounded up when there is a remainder.
    int iDivUp(int a, int b);
};
#endif // INPAINTCUDA_H
inpaintcuda.cpp
//inpaintcuda.cpp
#include "inpaintcuda.h"
#include "cuda_runtime.h"
#include <opencv2/cudaarithm.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>
using namespace cv;
/**
 * Stores the image geometry and allocates the three device buffers used by
 * inpaint().
 *
 * @param cols     image width in pixels, must be positive
 * @param rows     image height in pixels, must be positive
 * @param channels number of channels, must be positive
 *
 * Throws cv::Exception when an argument is not positive or when a device
 * allocation fails. Buffers allocated before the failure are released first,
 * because the destructor does not run for a constructor that throws.
 */
InpaintCUDA::InpaintCUDA(int cols, int rows, int channels)
{
    this->cols = cols;
    this->rows = rows;
    this->channels = channels;
    this->iters = 0;
    d_normalized_mask = nullptr;
    d_normalized_src = nullptr;
    d_normalized_masked_src_temp = nullptr;

    CV_Assert(cols > 0 && rows > 0 && channels > 0);

    // Compute the element count in size_t so large images do not overflow int.
    const size_t elements = static_cast<size_t>(cols) * rows * channels;

    cudaError_t status = cudaMalloc(reinterpret_cast<void**>(&d_normalized_mask),
                                    elements * sizeof(uint8_t));
    if (status == cudaSuccess) {
        status = cudaMalloc(reinterpret_cast<void**>(&d_normalized_src),
                            elements * sizeof(float));
    }
    if (status == cudaSuccess) {
        status = cudaMalloc(reinterpret_cast<void**>(&d_normalized_masked_src_temp),
                            elements * sizeof(float));
    }

    if (status != cudaSuccess) {
        // cudaFree accepts null pointers, so buffers never allocated are skipped safely.
        cudaFree(d_normalized_mask);
        cudaFree(d_normalized_src);
        cudaFree(d_normalized_masked_src_temp);
        d_normalized_mask = nullptr;
        d_normalized_src = nullptr;
        d_normalized_masked_src_temp = nullptr;
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(status));
    }
}
// Releases the mask, source and working device buffers owned by this object.
InpaintCUDA::~InpaintCUDA()
{
    void* const ownedBuffers[] = {
        d_normalized_mask,
        d_normalized_src,
        d_normalized_masked_src_temp
    };
    for (void* buffer : ownedBuffers) {
        cudaFree(buffer);
    }
}
// Returns a / b, plus one when the division leaves a remainder.
int InpaintCUDA::iDivUp(int a, int b)
{
    const int quotient = a / b;
    const bool hasRemainder = (a % b) != 0;
    if (hasRemainder) {
        return quotient + 1;
    }
    return quotient;
}
/**
 * Fills the masked region of src and writes the result to dst.
 *
 * @param src   image to repair; must be cols x rows with `channels` channels,
 *              matching the values given to the constructor
 * @param mask  CV_8UC1 image of the same size; pixels with value 255 are
 *              filled, all other pixels keep their source value
 * @param dst   receives the result as 32-bit float with the same channel
 *              count, scaled back to the value range of src
 * @param iters number of calls to _pdeHeatDiffusion(); each call launches the
 *              diffusion kernel 500 times per channel
 *
 * Throws cv::Exception when the inputs do not match the configured geometry
 * or when the device-to-device copy fails.
 */
void InpaintCUDA::inpaint(cv::cuda::GpuMat& src, cv::cuda::GpuMat& mask, cv::cuda::GpuMat& dst, int iters)
{
    // The wrappers below alias fixed-size device buffers. With mismatching
    // inputs OpenCV would reallocate the wrapper (leaving our buffer unfilled)
    // or the per-channel writes would run past the end of the allocation.
    CV_Assert(src.cols == cols && src.rows == rows && src.channels() == channels);
    CV_Assert(mask.cols == cols && mask.rows == rows && mask.type() == CV_8UC1);

    this->iters = iters;
    const size_t planeSize = static_cast<size_t>(cols) * rows;

    // Normalize the mask to 0/1: 255 -> 0 (to be filled), anything else -> 1 (keep).
    cuda::GpuMat normalized_mask_mat(rows, cols, CV_8UC1, d_normalized_mask);
    cuda::threshold(mask, normalized_mask_mat, 254, 1, THRESH_BINARY_INV);

    // Normalize every channel into its own float plane and remember the
    // channel maximum so the result can be scaled back afterwards.
    std::vector<double> maxvalues;
    std::vector<cuda::GpuMat> m_list;
    cuda::split(src, m_list);
    for (size_t n = 0; n < m_list.size(); n++) {
        cuda::GpuMat t(rows, cols, CV_32FC1, d_normalized_src + n * planeSize);
        m_list[n].convertTo(t, CV_32F);
        cuda::normalize(t, t, 1.0, 0.0, cv::NORM_INF, -1);
        double minval, maxval;
        cuda::minMax(m_list[n], &minval, &maxval);
        maxvalues.push_back(maxval);
    }

    // The working buffer starts as a copy of the normalized source.
    const cudaError_t copyStatus = cudaMemcpy(d_normalized_masked_src_temp, d_normalized_src,
                                              planeSize * channels * sizeof(float),
                                              cudaMemcpyDeviceToDevice);
    if (copyStatus != cudaSuccess) {
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(copyStatus));
    }

    // Iterative filling.
    for (int n = 0; n < iters; n++) {
        _pdeHeatDiffusion(d_normalized_mask, d_normalized_src, d_normalized_masked_src_temp);
    }

    // Scale each normalized plane back to the original value range.
    std::vector<cuda::GpuMat> restored;
    for (int n = 0; n < channels; n++) {
        cuda::GpuMat plane(rows, cols, CV_32FC1, d_normalized_masked_src_temp + n * planeSize);
        cuda::GpuMat scaled;
        cuda::multiply(plane, maxvalues[n], scaled);
        restored.push_back(scaled);
    }

    if (channels == 1) {
        restored[0].copyTo(dst);
    }
    else {
        cuda::GpuMat merged;
        cuda::merge(restored, merged);
        merged.copyTo(dst);
    }
}
Inpaintkernel.cu
//Inpaintkernel.cu
#include "InpaintCUDA.h"
#include <opencv2/imgcodecs.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
__global__ void _pdeHeatDiffusionKernel(uint8_t* mask, float* src, float* tempSrc, int width, int height);
/**
 * Launches the diffusion kernel 500 times for every channel plane, one CUDA
 * stream per channel, then waits for the device to finish.
 *
 * @param d_normalized_mask device 0/1 mask (cols*rows bytes); non-zero pixels are left untouched
 * @param d_normalized_src  device buffer of normalized source planes, one cols*rows plane per channel
 * @param d_dst             device buffer of the same layout, updated in place
 *
 * Throws cv::Exception if stream creation, a kernel launch or the final
 * synchronization reports a CUDA error. Streams and the host stream array
 * are released before the error is raised.
 */
void InpaintCUDA::_pdeHeatDiffusion(uint8_t* d_normalized_mask, float* d_normalized_src, float* d_dst) {
    cudaStream_t* streams = new cudaStream_t[channels];
    cudaError_t status = cudaSuccess;

    // Track how many streams exist so that only those are destroyed later.
    int createdStreams = 0;
    for (; createdStreams < channels; createdStreams++) {
        status = cudaStreamCreate(&streams[createdStreams]);
        if (status != cudaSuccess)
            break;
    }

    if (status == cudaSuccess) {
        dim3 block(INPAINT_TILE_X, INPAINT_TILE_Y / INPAINT_UNROLL_Y);
        dim3 grid(iDivUp(cols, INPAINT_TILE_X), iDivUp(rows / INPAINT_UNROLL_Y, INPAINT_TILE_Y / INPAINT_UNROLL_Y));
        const size_t planeSize = static_cast<size_t>(cols) * rows;
        for (int k = 0; k < channels; k++) {
            const size_t offset = k * planeSize;
            // Launches on the same stream run in order, so each pass sees the previous one.
            for (int i = 0; i < 500; i++) {
                _pdeHeatDiffusionKernel << <grid, block, 0 , streams[k]>> > (d_normalized_mask, d_normalized_src + offset, d_dst + offset, cols, rows);
            }
        }
        // Launch-configuration errors are only visible through cudaGetLastError().
        status = cudaGetLastError();
        const cudaError_t syncStatus = cudaDeviceSynchronize();
        if (status == cudaSuccess)
            status = syncStatus;
    }

    for (int i = 0; i < createdStreams; i++)
        cudaStreamDestroy(streams[i]);
    delete[] streams;   // previously leaked on every call

    if (status != cudaSuccess) {
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(status));
    }
}
/**
 * One diffusion pass over a single-channel float image.
 *
 * Every pixel whose mask value is 0 is replaced by
 *   center + 0.2 * (up + down + left + right - 4 * center)
 * with neighbour coordinates clamped to the image border. Pixels with a
 * non-zero mask value are not modified.
 *
 * @param mask    width*height bytes; non-zero marks pixels that must keep their value
 * @param src     not read by this kernel; the fidelity term it fed was scaled by
 *                mask[i], which is always 0 for the pixels that get updated.
 *                Kept so the signature matches its declaration.
 * @param tempSrc width*height floats, read and written in place
 * @param width   image width in pixels
 * @param height  image height in pixels
 *
 * Expected launch: a 2D grid covering width x (height / INPAINT_UNROLL_Y)
 * threads; each thread handles rows y, y + height / INPAINT_UNROLL_Y, ...
 *
 * Neighbours are read from the same buffer that other threads are writing in
 * this launch, so a pass may mix old and new neighbour values.
 */
__global__ void _pdeHeatDiffusionKernel(uint8_t* mask, float* src, float* tempSrc, int width, int height) {
    const int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;
    const int rowStride = height / INPAINT_UNROLL_Y;
    if (x >= width || y >= rowStride) return;
#pragma unroll
    for (; y < height; y += rowStride) {
        const int i = y * width + x;
        // Known pixels keep their value. Use continue (not return) so the
        // remaining rows of this thread are still processed when INPAINT_UNROLL_Y > 1.
        if (mask[i]) {
            continue;
        }
        const float center = tempSrc[i];
        tempSrc[i] = center
            + 0.2f
            * (tempSrc[max(0, y - 1) * width + x]
                + tempSrc[min(height - 1, y + 1) * width + x]
                + tempSrc[y * width + max(0, x - 1)]
                + tempSrc[y * width + min(width - 1, x + 1)]
                - 4.0f * center);
    }
}
3.main函数调用方法 我只写核心代码 包含文件和函数入口就省略了
// Load the image to repair and the mask marking the pixels to fill.
Mat src = imread("demo1280.png", cv::ImreadModes::IMREAD_COLOR);
Mat mask = imread("mask_demo.png", cv::ImreadModes::IMREAD_GRAYSCALE);
// imread returns an empty Mat when a file cannot be read; stop before touching the GPU.
CV_Assert(!src.empty() && !mask.empty());
// The mask has to cover exactly the same pixels as the source image.
CV_Assert(mask.size() == src.size());
// Upload both images to device memory.
cuda::GpuMat src_g(src);
cuda::GpuMat mask_g(mask);
// Left empty: inpaint() produces a float image with the channel count of src.
cuda::GpuMat dst_g;
InpaintCUDA ic(src.cols, src.rows , src.channels());
ic.inpaint(src_g, mask_g, dst_g, 1);
// Download the float result and convert it to 8-bit for saving.
Mat dst_cpu;
dst_g.download(dst_cpu);
dst_cpu.convertTo(dst_cpu,CV_8U);
imwrite("dst_cpu.png", dst_cpu);
4.效果