上一篇文章中提到 warpAffine 会分块处理,将坐标映射和插值系数分别存储下来,然后借助 remap 来实现最终的映射。而 remap 会根据映射关系取源像素并加权计算出目的像素值。其最核心的计算为 RemapVec_8u。
cv::remap
d s t ( x , y ) = s r c ( m a p x ( x , y ) , m a p y ( x , y ) ) \mathrm{dst}(x,y)=\mathrm{src}(\mathrm{map}_x(x,y), \mathrm{map}_y(x,y)) dst(x,y)=src(mapx(x,y),mapy(x,y))
定义不同类型的函数表。
remapBilinear
CV_INSTRUMENT_REGION(); static RemapNNFunc nn_tab[] = {
remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>, remapNearest<int>, remapNearest<float>, remapNearest<double>, 0 }; static RemapFunc linear_tab[] = {
remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0, remapBilinear<Cast<float, ushort>, RemapNoVec, float>, remapBilinear<Cast<float, short>, RemapNoVec, float>, 0, remapBilinear<Cast<float, float>, RemapNoVec, float>, remapBilinear<Cast<double, double>, RemapNoVec, float>, 0 }; static RemapFunc cubic_tab[] = {
remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0, remapBicubic<Cast<float, ushort>, float, 1>, remapBicubic<Cast<float, short>, float, 1>, 0, remapBicubic<Cast<float, float>, float, 1>, remapBicubic<Cast<double, double>, float, 1>, 0 }; static RemapFunc lanczos4_tab[] = {
remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0, remapLanczos4<Cast<float, ushort>, float, 1>, remapLanczos4<Cast<float, short>, float, 1>, 0, remapLanczos4<Cast<float, float>, float, 1>, remapLanczos4<Cast<double, double>, float, 1>, 0 };
_map1不能为空,_map2可以。
CV_Assert( !_map1.empty() ); CV_Assert( _map2.empty() || (_map2.size() == _map1.size())); CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue)) Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat(); _dst.create( map1.size(), src.type() ); Mat dst = _dst.getMat(); CV_OVX_RUN( src.type() == CV_8UC1 && dst.type() == CV_8UC1 && !ovx::skipSmallImages<VX_KERNEL_REMAP>(src.cols, src.rows) && (borderType& ~BORDER_ISOLATED) == BORDER_CONSTANT && ((map1.type() == CV_32FC2 && map2.empty() && map1.size == dst.size) || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1 && map1.size == dst.size && map2.size == dst.size) || (map1.empty() && map2.type() == CV_32FC2 && map2.size == dst.size)) && ((borderType & BORDER_ISOLATED) != 0 || !src.isSubmatrix()), openvx_remap(src, dst, map1, map2, interpolation, borderValue));
CV_Assert( dst.cols < SHRT_MAX && dst.rows < SHRT_MAX && src.cols < SHRT_MAX && src.rows < SHRT_MAX ); if( dst.data == src.data ) src = src.clone(); if( interpolation == INTER_AREA ) interpolation = INTER_LINEAR; int type = src.type(), depth = CV_MAT_DEPTH(type); #if defined HAVE_IPP && !IPP_DISABLE_REMAP CV_IPP_CHECK() {
if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) && map1.type() == CV_32FC1 && map2.type() == CV_32FC1 && (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT)) {
int ippInterpolation = interpolation == INTER_NEAREST ? IPPI_INTER_NN : interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC; ippiRemap ippFunc = type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R : type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R : type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R : type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R : type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R : type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R : type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R : type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R : type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0; if (ippFunc) {
bool ok; IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation, borderType, borderValue, &ok); Range range(0, dst.rows); parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); if (ok) {
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); return; } setIppErrorStatus(); } } } #endif
如果是最近邻插值设置nnfunc,否则设置ifunc。
initInterTab2D 返回静态数组的指针给ctab。
RemapNNFunc nnfunc = 0; RemapFunc ifunc = 0; const void* ctab = 0; bool fixpt = depth == CV_8U; bool planar_input = false; if( interpolation == INTER_NEAREST ) {
nnfunc = nn_tab[depth]; CV_Assert( nnfunc != 0 ); } else {
if( interpolation == INTER_LINEAR ) ifunc = linear_tab[depth]; else if( interpolation == INTER_CUBIC ){
ifunc = cubic_tab[depth]; CV_Assert( _src.channels() <= 4 ); } else if( interpolation == INTER_LANCZOS4 ){
ifunc = lanczos4_tab[depth]; CV_Assert( _src.channels() <= 4 ); } else CV_Error( CV_StsBadArg, "Unknown interpolation method" ); CV_Assert( ifunc != 0 ); ctab = initInterTab2D( interpolation, fixpt ); } const Mat *m1 = &map1, *m2 = &map2; if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) || (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) ) {
if( map1.type() != CV_16SC2 ) std::swap(m1, m2); } else {
CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) ); planar_input = map1.channels() == 1; }
调用 RemapInvoker 的函数。
RemapInvoker invoker(src, dst, m1, m2, borderType, borderValue, planar_input, nnfunc, ifunc, ctab); parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
initInterTab2D
initInterTab2D 存储_tab中两个核参数相乘的结果。
static bool inittab[INTER_MAX+1] = {
false}; float* tab = 0; short* itab = 0; int ksize = 0; if( method == INTER_LINEAR ) tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2; else if( method == INTER_CUBIC ) tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4; else if( method == INTER_LANCZOS4 ) tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8; else CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );
if( !inittab[method] ) {
AutoBuffer<float> _tab(8*INTER_TAB_SIZE); int i, j, k1, k2; initInterTab1D(method, _tab.data(), INTER_TAB_SIZE); for( i = 0; i < INTER_TAB_SIZE; i++ ) for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize ) {
int isum = 0; NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2; NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2; for( k1 = 0; k1 < ksize; k1++ ) {
float vy = _tab[i*ksize + k1]; for( k2 = 0; k2 < ksize; k2++ ) {
float v = vy*_tab[j*ksize + k2]; tab[k1*ksize + k2] = v; isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE); } }
计算完成后,tab和itab重新指向首地址。
if( isum != INTER_REMAP_COEF_SCALE ) {
int diff = isum - INTER_REMAP_COEF_SCALE; int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2; for( k1 = ksize2; k1 < ksize2+2; k1++ ) for( k2 = ksize2; k2 < ksize2+2; k2++ ) {
if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] ) mk1 = k1, mk2 = k2; else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] ) Mk1 = k1, Mk2 = k2; } if( diff < 0 ) itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff); else itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff); } } tab -= INTER_TAB_SIZE2*ksize*ksize; itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SIMD128 if( method == INTER_LINEAR ) {
for( i = 0; i < INTER_TAB_SIZE2; i++ ) for( j = 0; j < 4; j++ ) {
BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0]; BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1]; BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0]; BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1]; } } #endif inittab[method] = true; } return fixpt ? (const void*)itab : (const void*)tab;
initInterTab1D
float scale = 1.f/tabsz; if( method == INTER_LINEAR ) { for( int i = 0; i < tabsz; i++, tab += 2 ) interpolateLinear( i*scale, tab ); } else if( method == INTER_CUBIC ) { for( int i = 0; i < tabsz; i++, tab += 4 ) interpolateCubic( i*scale, tab ); } else if( method == INTER_LANCZOS4 ) { for( int i = 0; i < tabsz; i++, tab += 8 ) interpolateLanczos4( i*scale, tab ); } else CV_Error( CV_StsBadArg, "Unknown interpolation method" ); interpolateLinear
计算插值系数。
coeffs[0] = 1.f - x; coeffs[1] = x;
RemapInvoker
public: RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1, const Mat *_m2, int _borderType, const Scalar &_borderValue, int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) : ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2), borderType(_borderType), borderValue(_borderValue), planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab) {
}
brows0为缓冲区行数,bcols0为列数。
dpart为目的矩阵的当前处理区域。
_bufxy为x和y的缓冲区。bufxy为当前 RoI 区域。
virtual void operator() (const Range& range) const CV_OVERRIDE {
int x, y, x1, y1; const int buf_size = 1 << 14; int brows0 = std::min(128, dst->rows), map_depth = m1->depth(); int bcols0 = std::min(buf_size/brows0, dst->cols); brows0 = std::min(buf_size/bcols0, dst->rows); Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa; if( !nnfunc ) _bufa.create(brows0, bcols0, CV_16UC1); for( y = range.start; y < range.end; y += brows0 ) {
for( x = 0; x < dst->cols; x += bcols0 ) {
int brows = std::min(brows0, range.end - y); int bcols = std::min(bcols0, dst->cols - x); Mat dpart(*dst, Rect(x, y, bcols, brows)); Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
如果为最近邻映射。
if( nnfunc ) {
if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format bufxy = (*m1)(Rect(x, y, bcols, brows)); else if( map_depth != CV_32F ) {
for( y1 = 0; y1 < brows; y1++ ) {
short* XY = bufxy.ptr<short>(y1); const short* sXY = m1->ptr<short>(y+y1) + x*2; const ushort* sA = m2->ptr<ushort>(y+y1) + x; for( x1 = 0; x1 < bcols; x1++ ) {
int a = sA[x1] & (INTER_TAB_SIZE2-1); XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0]; XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1]; } } } else if( !planar_input ) (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth()); else {
for( y1 = 0; y1 < brows; y1++ ) {
short* XY = bufxy.ptr<short>(y1); const float* sX = m1->ptr<float>(y+y1) + x; const float* sY = m2->ptr<float>(y+y1) + x; x1 = 0; #if CV_SIMD128 {
int span = v_float32x4::nlanes; for( ; x1 <= bcols - span * 2; x1 += span * 2 ) {
v_int32x4 ix0 = v_round(v_load(sX + x1)); v_int32x4 iy0 = v_round(v_load(sY + x1)); v_int32x4 ix1 = v_round(v_load(sX + x1 + span)); v_int32x4 iy1 = v_round(v_load(sY + x1 + span)); v_int16x8 dx, dy; dx = v_pack(ix0, ix1); dy = v_pack(iy0, iy1); v_store_interleave(XY + x1 * 2, dx, dy); } } #endif for( ; x1 < bcols; x1++ ) {
XY[x1*2] = saturate_cast<short>(sX[x1]); XY[x1*2+1] = saturate_cast<short>(sY[x1]); } } } nnfunc( *src, dpart, bufxy, borderType, borderValue ); continue; }
Mat bufa(_bufa, Rect(0, 0, bcols, brows)); for( y1 = 0; y1 < brows; y1++ ) {
short* XY = bufxy.ptr<short>(y1); ushort* A = bufa.ptr<ushort>(y1); if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) ) {
bufxy = (*m1)(Rect(x, y, bcols, brows)); const ushort* sA = m2->ptr<ushort>(y+y1) + x; x1 = 0; #if CV_SIMD128 {
v_uint16x8 v_scale = v_setall_u16(INTER_TAB_SIZE2 - 1); int span = v_uint16x8::nlanes; for( ; x1 <= bcols - span; x1 += span ) v_store((unsigned short*)(A + x1), v_load(sA + x1) & v_scale); } #endif for( ; x1 < bcols; x1++ ) A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); }
如果m1为单通道
else if( planar_input ) {
const float* sX = m1->ptr<float>(y+y1) + x; const float* sY = m2->ptr<float>(y+y1) + x; x1 = 0; #if CV_SIMD128 {
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1); int span = v_float32x4::nlanes; for( ; x1 <= bcols - span * 2; x1 += span * 2 ) {
v_int32x4 v_sx0 = v_round(v_scale * v_load(sX + x1)); v_int32x4 v_sy0 = v_round(v_scale * v_load(sY + x1)); v_int32x4 v_sx1 = v_round(v_scale * v_load(sX + x1 + span)); v_int32x4 v_sy1 = v_round(v_scale * v_load(sY + x1 + span)); v_uint16x8 v_sx8 = v_reinterpret_as_u16(v_pack(v_sx0 & v_scale2, v_sx1 & v_scale2)); v_uint16x8 v_sy8 = v_reinterpret_as_u16(v_pack(v_sy0 & v_scale2, v_sy1 & v_scale2)); v_uint16x8 v_v = v_shl<INTER_BITS>(v_sy8) | (v_sx8); v_store(A + x1, v_v); v_int16x8 v_d0 = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1)); v_int16x8 v_d1 = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1)); v_store_interleave(XY + (x1 << 1), v_d0, v_d1); } } #endif for( ; x1 < bcols; x1++ ) {
int sx = cvRound(sX[x1]*INTER_TAB_SIZE); int sy = cvRound(sY[x1]*INTER_TAB_SIZE); int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1)); XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS); XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS); A[x1] = (ushort)v; } } else {
const float* sXY = m1->ptr<float>(y+y1) + x*2; x1 = 0; #if CV_SIMD128 {
v_float32x4 v_scale = v_setall_f32((float)INTER_TAB_SIZE); v_int32x4 v_scale2 = v_setall_s32(INTER_TAB_SIZE - 1), v_scale3 = v_setall_s32(INTER_TAB_SIZE); int span = v_float32x4::nlanes; for( ; x1 <= bcols - span * 2; x1 += span * 2 ) {
v_float32x4 v_fx, v_fy; v_load_deinterleave(sXY + (x1 << 1), v_fx, v_fy); v_int32x4 v_sx0 = v_round(v_fx * v_scale); v_int32x4 v_sy0 = v_round(v_fy * v_scale); v_load_deinterleave(sXY + ((x1 + span) << 1), v_fx, v_fy); v_int32x4 v_sx1 = v_round(v_fx * v_scale); v_int32x4 v_sy1 = v_round(v_fy * v_scale); v_int32x4 v_v0 = v_muladd(v_scale3, (v_sy0 & v_scale2), (v_sx0 & v_scale2)); v_int32x4 v_v1 = v_muladd(v_scale3, (v_sy1 & v_scale2), (v_sx1 & v_scale2)); v_uint16x8 v_v8 = v_reinterpret_as_u16(v_pack(v_v0, v_v1)); v_store(A + x1, v_v8); v_int16x8 v_dx = v_pack(v_shr<INTER_BITS>(v_sx0), v_shr<INTER_BITS>(v_sx1)); v_int16x8 v_dy = v_pack(v_shr<INTER_BITS>(v_sy0), v_shr<INTER_BITS>(v_sy1)); v_store_interleave(XY + (x1 << 1), v_dx, v_dy); } } #endif for( ; x1 < bcols; x1++ ) {
int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE); int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE); int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1)); XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS); XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS); A[x1] = (ushort)v; } } }
调用ifunc完成插值操作。
ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue); } } }
类成员。
private: const Mat* src; Mat* dst; const Mat *m1, *m2; int borderType; Scalar borderValue; int planar_input; RemapNNFunc nnfunc; RemapFunc ifunc; const void *ctab;
remapBilinear
typedef typename CastOp::rtype T; typedef typename CastOp::type1 WT; Size ssize = _src.size(), dsize = _dst.size(); const int cn = _src.channels(); const AT* wtab = (const AT*)_wtab; const T* S0 = _src.ptr<T>(); size_t sstep = _src.step/sizeof(S0[0]); T cval[CV_CN_MAX]; CastOp castOp; VecOp vecOp; for(int k = 0; k < cn; k++ ) cval[k] = saturate_cast<T>(_borderValue[k & 3]); unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0); CV_Assert( !ssize.empty() ); #if CV_SIMD128 if( _src.type() == CV_8UC3 ) width1 = std::max(ssize.width-2, 0); #endif D指向目的图上的当前行。
XY和FXY指向整数和小数部分的值。
curInlier判断是否超出边界。
如果dx在区域内,根据XY中的值是否超出边界来判断。
X0是上一次处理完后的行位置,也是本次起点,X1是本次处理的终点。
dy = 0时,由于X0=0,所以跳过不处理。如果prevInlier和curInlier相等则跳过不处理。
dx = dsize.width本是一个不存在的位置,这使得在行尾prevInlier和curInlier不相等,马上进行处理。
for(int dy = 0; dy < dsize.height; dy++ ) {
T* D = _dst.ptr<T>(dy); const short* XY = _xy.ptr<short>(dy); const ushort* FXY = _fxy.ptr<ushort>(dy); int X0 = 0; bool prevInlier = false; for(int dx = 0; dx <= dsize.width; dx++ ) {
bool curInlier = dx < dsize.width ? (unsigned)XY[dx*2] < width1 && (unsigned)XY[dx*2+1] < height1 : !prevInlier; if( curInlier == prevInlier ) continue; int X1 = dx; dx = X0; X0 = X1; prevInlier = curInlier;
f ( x , y ) = a 0 α 0 β 0 + a 1 α 1 β 0 + b 0 α 0 β 1 + b 1 α 1 β 1 \begin{aligned} f(x, y) &= a_0\alpha_0\beta_0+ a_1\alpha_1\beta_0 + b_0\alpha_0\beta_1+ b_1\alpha_1\beta_1 \end{aligned} f(x,y)=a0α0β0+a1α1β0+b0α0β1+b1α1β1
如果不是内露层,调用vecOp处理可向量化的数据;剩余部分循环处理。总共处理X1-X0个像素。
S为对应到源图的左上点地址,w为融合后的4个权重参数。以FXY数组的值作为索引,可以获得4个中心系数乘积。
if( !curInlier ) {
int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx ); D += len*cn; dx += len; if( cn == 1 ) {
for( ; dx < X1; dx++, D++ ) {
int sx = XY[dx*2], sy = XY[dx*2+1]; const AT* w = wtab + FXY[dx]*4; const T* S = S0 + sy*sstep + sx; *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3])); } } else if( cn == 2 ) for( ; dx < X1; dx++, D += 2 ) {
int sx = XY[dx*2], sy = XY[dx*2+1]; const AT* w = wtab + FXY[dx]*4; const T* S = S0 + sy*sstep + sx*2; WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3]; WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3]; D[0] = castOp(t0); D[1] = castOp(t1); } else if( cn == 3 ) for( ; dx < X1; dx++, D += 3 ) {
int sx = XY[dx*2], sy = XY[dx*2+1]; const AT* w = wtab + FXY[dx]*4; const T* S = S0 + sy*sstep + sx*3; WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3]; WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3]; WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3]; D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2); } else if( cn == 4 ) for( ; dx < X1; dx++, D += 4 ) {
int sx = XY[dx*2], sy = XY[dx*2+1]; const AT* w = wtab + FXY[dx]*4; const T* S = S0 + sy*sstep + sx*4; WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3]; WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3]; D[0] = castOp(t0); D[1] = castOp(t1); t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3]; t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3]; D[2] = castOp(t0); D[3] = castOp(t1); } else for( ; dx < X1; dx++, D += cn ) {
int sx = XY[dx*2], sy = XY[dx*2+1]; const AT* w = wtab + FXY[dx]*4; const T* S = S0 + sy*sstep + sx*cn; for(int k = 0; k < cn; k++ ) {
WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3]; D[k] = castOp(t0); } } }
else {
if( borderType == BORDER_TRANSPARENT && cn != 3 ) {
D += (X1 - dx)*cn; dx = X1; continue; }
if( cn == 1 ) for( ; dx < X1; dx++, D++ ) {
int sx = XY[dx*2], sy = XY[dx*2+1]; if( borderType == BORDER_CONSTANT && (sx >= ssize.width || sx+1 < 0 || sy >= ssize.height || sy+1 < 0) ) {
D[0] = cval[0]; } else {
int sx0, sx1, sy0, sy1; T v0, v1, v2, v3; const AT* w = wtab + FXY[dx]*4; if( borderType == BORDER_REPLICATE ) {
sx0 = clip(sx, 0, ssize.width); sx1 = clip(sx+1, 0, ssize.width); sy0 = clip(sy, 0, ssize.height); sy1 = clip(sy+1, 0, ssize.height); v0 = S0[sy0*sstep + sx0]; v1 = S0[sy0*sstep + sx1]; v2 = S0[sy1*sstep + sx0]; v3 = S0[sy1*sstep + sx1]; } else {
sx0 = borderInterpolate(sx, ssize.width, borderType); sx1 = borderInterpolate(sx+1, ssize.width, borderType); sy0 = borderInterpolate(sy, ssize.height, borderType); sy1 = borderInterpolate(sy+1, ssize.height, borderType); v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0]; v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0]; v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0]; v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0]; } D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3])); } }
多通道的边界处理。
else for( ; dx < X1; dx++, D += cn ) {
int sx = XY[dx*2], sy = XY[dx*2+1]; if( borderType == BORDER_CONSTANT && (sx >= ssize.width || sx+1 < 0 || sy >= ssize.height || sy+1 < 0) ) {
for(int k = 0; k < cn; k++ ) D[k] = cval[k]; } else {
int sx0, sx1, sy0, sy1; const T *v0, *v1, *v2, *v3; const AT* w = wtab + FXY[dx]*4; if( borderType == BORDER_REPLICATE ) {
sx0 = clip(sx, 0, ssize.width); sx1 = clip(sx+1, 0, ssize.width); sy0 = clip(sy, 0, ssize.height); sy1 = clip(sy+1, 0, ssize.height); v0 = S0 + sy0*sstep + sx0*cn; v1 = S0 + sy0*sstep + sx1*cn; v2 = S0 + sy1*sstep + sx0*cn; v3 = S0 + sy1*sstep + sx1*cn; } else if( borderType == BORDER_TRANSPARENT && ((unsigned)sx >= (unsigned)(ssize.width-1) || (unsigned)sy >= (unsigned)(ssize.height-1))) continue; else {
sx0 = borderInterpolate(sx, ssize.width, borderType); sx1 = borderInterpolate(sx+1, ssize.width, borderType); sy0 = borderInterpolate(sy, ssize.height, borderType); sy1 = borderInterpolate(sy+1, ssize.height, borderType); v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0]; v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0]; v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0]; v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0]; } for(int k = 0; k < cn; k++ ) D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3])); } } } } }
RemapVec_8u
int operator()( const Mat& _src, void* _dst, const short* XY, const ushort* FXY, const void* _wtab, int width ) const {
int cn = _src.channels(), x = 0, sstep = (int)_src.step; if( (cn != 1 && cn != 3 && cn != 4) || sstep >= 0x8000 ) return 0; const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1); const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0]; uchar* D = (uchar*)_dst; v_int32x4 delta = v_setall_s32(INTER_REMAP_COEF_SCALE / 2); v_int16x8 xy2ofs = v_reinterpret_as_s16(v_setall_s32(cn + (sstep << 16))); int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4]; const uchar* src_limit_8bytes = _src.datalimit - v_int16x8::nlanes; #define CV_PICK_AND_PACK_RGB(ptr, offset, result) \ { \ const uchar* const p = ((const uchar*)ptr) + (offset); \ if (p <= src_limit_8bytes) \ { \ v_uint8x16 rrggbb, dummy; \ v_uint16x8 rrggbb8, dummy8; \ v_uint8x16 rgb0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \ v_uint8x16 rgb1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + 3), 0, 0, 0)); \ v_zip(rgb0, rgb1, rrggbb, dummy); \ v_expand(rrggbb, rrggbb8, dummy8); \ result = v_reinterpret_as_s16(rrggbb8); \ } \ else \ { \ result = v_int16x8((short)p[0], (short)p[3], /* r0r1 */ \ (short)p[1], (short)p[4], /* g0g1 */ \ (short)p[2], (short)p[5], /* b0b1 */ 0, 0); \ } \ } #define CV_PICK_AND_PACK_RGBA(ptr, offset, result) \ { \ const uchar* const p = ((const uchar*)ptr) + (offset); \ CV_DbgAssert(p <= src_limit_8bytes); \ v_uint8x16 rrggbbaa, dummy; \ v_uint16x8 rrggbbaa8, dummy8; \ v_uint8x16 rgba0 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p), 0, 0, 0)); \ v_uint8x16 rgba1 = v_reinterpret_as_u8(v_int32x4(*(unaligned_int*)(p + v_int32x4::nlanes), 0, 0, 0)); \ v_zip(rgba0, rgba1, rrggbbaa, dummy); \ v_expand(rrggbbaa, rrggbbaa8, dummy8); \ result = v_reinterpret_as_s16(rrggbbaa8); \ } #define CV_PICK_AND_PACK4(base,offset) \ v_uint16x8(*(unaligned_ushort*)(base + offset[0]), *(unaligned_ushort*)(base + offset[1]), \ *(unaligned_ushort*)(base + offset[2]), *(unaligned_ushort*)(base + offset[3]), \ 0, 0, 0, 0)
if( cn == 1 ) {
for( ; x <= width - 8; x += 8 ) {
v_int16x8 _xy0 = v_load(XY + x*2); v_int16x8 _xy1 = v_load(XY + x*2 + 8); v_int32x4 v0, v1, v2, v3, a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2;
v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs ); v_int32x4 xy1 = v_dotprod( _xy1, xy2ofs ); v_store( iofs0, xy0 ); v_store( iofs1, xy1 );
v_uint16x8 stub, dummy; v_uint16x8 vec16; vec16 = CV_PICK_AND_PACK4(S0, iofs0); v_expand(v_reinterpret_as_u8(vec16), stub, dummy); v0 = v_reinterpret_as_s32(stub); vec16 = CV_PICK_AND_PACK4(S1, iofs0); v_expand(v_reinterpret_as_u8(vec16), stub, dummy); v1 = v_reinterpret_as_s32(stub);
f ( x , y ) = ( a 0 α 0 + a 1 α 1 ) β 0 + ( b 0 α 0 + b 1 α 1 ) β 1 = a 0 α 0 β 0 + a 1 α 1 β 0 + b 0 α 0 β 1 + b 1 α 1 β 1 = f ( Q 11 ) ( x 2 − x ) ( y 2 − y ) + f ( Q 12 ) ( x − x 1 ) ( y 2 − y ) + f ( Q 21 ) ( x 2 − x ) ( y 2 − y ) + f ( Q 22 ) ( x − x 1 ) ( y − y 1 ) \begin{aligned} f(x, y) &= (a_0\alpha_0+ a_1\alpha_1)\beta_0 + (b_0\alpha_0+ b_1\alpha_1)\beta_1\\ &= a_0\alpha_0\beta_0+ a_1\alpha_1\beta_0 + b_0\alpha_0\beta_1+ b_1\alpha_1\beta_1\\ &= f(Q_{11})(x_2 -x)(y_2 -y) + f(Q_{12})(x-x_1)(y_2 -y) + f(Q_{21})(x_2 -x)(y_2 -y) + f(Q_{22})(x-x_1)(y-y_1) \end{aligned} f(x,y)=(a0α0+a1α1)β0+(b0α0+b1α1)β1=a0α0β0+a1α1β0+b0α0β1+b1α1β1=f(Q11)(x2−x)(y2−y)+f(Q12)(x−x1)(y2−y)+f(Q21)(x2−x)(y2−y)+f(Q22)(x−x1)(y−y1)
v_zip(v_load_low((int*)(wtab + FXY[x] * 4)), v_load_low((int*)(wtab + FXY[x + 1] * 4)), a0, a1); v_zip(v_load_low((int*)(wtab + FXY[x + 2] * 4)), v_load_low((int*)(wtab + FXY[x + 3] * 4)), b0, b1); v_recombine(a0, b0, a2, b2); v1 = v_dotprod(v_reinterpret_as_s16(v1), v_reinterpret_as_s16(b2), delta); v0 = v_dotprod(v_reinterpret_as_s16(v0), v_reinterpret_as_s16(a2), v1);
对于iofs1重复以上操作得到v2。
vec16 = CV_PICK_AND_PACK4(S0, iofs1); v_expand(v_reinterpret_as_u8(vec16), stub, dummy); v2 = v_reinterpret_as_s32(stub); vec16 = CV_PICK_AND_PACK4(S1, iofs1); v_expand(v_reinterpret_as_u8(vec16), stub, dummy); v3 = v_reinterpret_as_s32(stub); v_zip(v_load_low((int*)(wtab + FXY[x + 4] * 4)), v_load_low((int*)(wtab + FXY[x + 5] * 4)), c0, c1); v_zip(v_load_low((int*)(wtab + FXY[x + 6] * 4)), v_load_low((int*)(wtab + FXY[x + 7] * 4)), d0, d1); v_recombine(c0, d0, c2, d2); v3 = v_dotprod(v_reinterpret_as_s16(v3), v_reinterpret_as_s16(d2), delta); v2 = v_dotprod(v_reinterpret_as_s16(v2), v_reinterpret_as_s16(c2), v3);
v0 = v0 >> INTER_REMAP_COEF_BITS; v2 = v2 >> INTER_REMAP_COEF_BITS; v_pack_u_store(D + x, v_pack(v0, v2)); } }
如果是三通道数据。
else if( cn == 3 ) {
for( ; x <= width - 5; x += 4, D += 12 ) {
v_int16x8 u0, v0, u1, v1; v_int16x8 _xy0 = v_load(XY + x * 2); v_int32x4 xy0 = v_dotprod(_xy0, xy2ofs); v_store(iofs0, xy0); int offset0 = FXY[x] * 16; int offset1 = FXY[x + 1] * 16; int offset2 = FXY[x + 2] * 16; int offset3 = FXY[x + 3] * 16; v_int16x8 w00 = v_load(wtab + offset0); v_int16x8 w01 = v_load(wtab + offset0 + 8); v_int16x8 w10 = v_load(wtab + offset1); v_int16x8 w11 = v_load(wtab + offset1 + 8); CV_PICK_AND_PACK_RGB(S0, iofs0[0], u0); CV_PICK_AND_PACK_RGB(S1, iofs0[0], v0); CV_PICK_AND_PACK_RGB(S0, iofs0[1], u1); CV_PICK_AND_PACK_RGB(S1, iofs0[1], v1); v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; result0 = v_rotate_left<1>(result0); v_int16x8 result8 = v_pack(result0, result1); v_uint8x16 result16 = v_pack_u(result8, result8); v_store_low(D, v_rotate_right<1>(result16)); w00 = v_load(wtab + offset2); w01 = v_load(wtab + offset2 + 8); w10 = v_load(wtab + offset3); w11 = v_load(wtab + offset3 + 8); CV_PICK_AND_PACK_RGB(S0, iofs0[2], u0); CV_PICK_AND_PACK_RGB(S1, iofs0[2], v0); CV_PICK_AND_PACK_RGB(S0, iofs0[3], u1); CV_PICK_AND_PACK_RGB(S1, iofs0[3], v1); result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; result0 = v_rotate_left<1>(result0); result8 = v_pack(result0, result1); result16 = v_pack_u(result8, result8); v_store_low(D + 6, v_rotate_right<1>(result16)); } } else if( cn == 4 ) {
for( ; x <= width - 4; x += 4, D += 16 ) {
v_int16x8 _xy0 = v_load(XY + x * 2); v_int16x8 u0, v0, u1, v1; v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs ); v_store(iofs0, xy0); int offset0 = FXY[x] * 16; int offset1 = FXY[x + 1] * 16; int offset2 = FXY[x + 2] * 16; int offset3 = FXY[x + 3] * 16; v_int16x8 w00 = v_load(wtab + offset0); v_int16x8 w01 = v_load(wtab + offset0 + 8); v_int16x8 w10 = v_load(wtab + offset1); v_int16x8 w11 = v_load(wtab + offset1 + 8); CV_PICK_AND_PACK_RGBA(S0, iofs0[0], u0); CV_PICK_AND_PACK_RGBA(S1, iofs0[0], v0); CV_PICK_AND_PACK_RGBA(S0, iofs0[1], u1); CV_PICK_AND_PACK_RGBA(S1, iofs0[1], v1); v_int32x4 result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; v_int32x4 result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; v_int16x8 result8 = v_pack(result0, result1); v_pack_u_store(D, result8); w00 = v_load(wtab + offset2); w01 = v_load(wtab + offset2 + 8); w10 = v_load(wtab + offset3); w11 = v_load(wtab + offset3 + 8); CV_PICK_AND_PACK_RGBA(S0, iofs0[2], u0); CV_PICK_AND_PACK_RGBA(S1, iofs0[2], v0); CV_PICK_AND_PACK_RGBA(S0, iofs0[3], u1); CV_PICK_AND_PACK_RGBA(S1, iofs0[3], v1); result0 = v_dotprod(u0, w00, v_dotprod(v0, w01, delta)) >> INTER_REMAP_COEF_BITS; result1 = v_dotprod(u1, w10, v_dotprod(v1, w11, delta)) >> INTER_REMAP_COEF_BITS; result8 = v_pack(result0, result1); v_pack_u_store(D + 8, result8); } } return x; }
参考资料:
- Warp Affine
- How to work with OpenCV application based on OpenVX API
- OpenVX sample #7163
- opencv/cmake/FindOpenVX.cmake
- Hybrid CV/DL pipelines with OpenCV 4.4 G-API
- opencv和openvx
- Showing our vision with OpenVX™ 1.1 support for PowerVR GPUs
- Amdovx Core
- OpenVX 1.3 is Here!
- Color Copy OpenVX* Sample
- opencv ncnn warpaffine 性能测试
- OpenCV warpAffine的天坑
- Geometric Transformations of Images
- Decompose a 2D arbitrary transform into only scaling and rotation
- Decompose affine transformation (including shear in x and y)
- Decomposing an Affine transformation
- Decomposing an Affine transformation
- Given this transformation matrix, how do I decompose it into translation, rotation and scale matrices?
- Opencv 仿射变换原理代码解析
- 如何通俗地讲解「仿射变换」这个概念?
- OpenCV代码提取:warpAffine函数的实现
- NEON优化——OpenCV WarpAffine最近邻
- Subtracting 0x8000 from an int
- OpenCV2:Mat属性type,depth,step
- OpenCV中如何获取Mat类型中的步长stride及分析 C++实现
- NEON is the new black: fast JPEG optimization on ARM server
- 大端序与小端序
- Remapping
- C++ 命令模式讲解和代码示例
- Specify an origin to warpPerspective() function in OpenCV 2.x
- 计算机视觉2D几何基元及其变换介绍和OpenCV WarpPerspective源码分析
发布者:全栈程序员-站长,转载请注明出处:https://javaforall.net/225776.html原文链接:https://javaforall.net
