TNN 的仿射变换形态介于 OpenCV 和 ncnn 之间。其处理流程与 OpenCV 较为相似并做了一些优化,不同的地方在于数据处理宽度为4,比较小。在性能表现方面中规中矩,小图上不及 ncnn。
TNN/blob/master/source/tnn/utils/mat_utils.cc#L156">MatUtils::WarpAffine
CheckSrcAndDstMat 对输入输出的所在设备、数据类型及尺寸等进行检查。
ArmMatConverterAcc::WarpAffine 即 arm 设备上的实现。
auto ret = CheckSrcAndDstMat(src, dst, true, true, true);
if (ret != TNN_OK) {
return ret;
}
if (dst.GetData() == nullptr) {
// set dst size to src size
dst = Mat(dst.GetDeviceType(), dst.GetMatType(), src.GetDims());
}
MAT_CONVERTER_PREPARATION(src.GetDeviceType());
return converter->WarpAffine(src, dst, param, command_queue);
TNN/blob/master/source/tnn/device/arm/arm_mat_converter.cc#L174">ArmMatConverterAcc::WarpAffine
CheckMatConverterParams 检查输入输出是否为空,所属设备是否一致。
AFFINE_CHECK_RUN 检查参数,目前仅支持常量填充。
Status ret = TNN_OK;
ret = CheckMatConverterParams(src, dst, true);
if (ret != TNN_OK)
return ret;
int dst_width = dst.GetWidth();
int dst_height = dst.GetHeight();
if (dst_width == 0 || dst_height == 0) {
return Status(TNNERR_INVALID_INPUT, "dst size is zero");
}
if (src.GetMatType() == NGRAY) {
AFFINE_CHECK_RUN(WarpAffineBilinearC1, WarpAffineNearestC1);
} else if (src.GetMatType() == N8UC3) {
AFFINE_CHECK_RUN(WarpAffineBilinearC3, WarpAffineNearestC3);
} else if (src.GetMatType() == N8UC4) {
AFFINE_CHECK_RUN(WarpAffineBilinearC4, WarpAffineNearestC4);
} else if (src.GetMatType() == NNV21 || src.GetMatType() == NNV12) {
AFFINE_CHECK_RUN(WarpAffineBilinearYUV420sp, WarpAffineNearestYUV420sp);
} else {
return Status(TNNERR_PARAM_ERR, "ArmMatConverterAcc::WarpAffine, convert type not support yet");
}
return ret;
TNN/blob/master/source/tnn/device/arm/arm_mat_util.cc#L1404">WarpAffineBilinearC1
脱离了 TNN 定义的结构体,便于移植。
调用模板函数 WarpAffineBilinear。
WarpAffineBilinear<1>(src, batch, src_w, src_h, dst, dst_w, dst_h, transform, border_val);
TNN/blob/master/source/tnn/device/arm/arm_mat_util.cc#L1369">WarpAffineBilinear
WarpAffineInit 为buffer
开辟内存并计算好adelta
和bdelta
。
buf_loc
和tab_loc
为局部缓冲区和表。
src2
指向第2行。
int src_plane = src_h * src_w * schannel;
int* buffer = nullptr;
WarpAffineInit(dst, batch, dst_w, dst_h, schannel, border_val, transform, &buffer);
int* adelta = buffer;
int* bdelta = buffer + dst_w * 2;
int max_num_threads = OMP_MAX_THREADS_NUM_;
int* buf_loc = new int[dst_w * max_num_threads];
short* tab_loc = new short[dst_w * max_num_threads];
const unsigned char* src2 = src + src_w * schannel;
dst_loc_base
为当初行偏移。
buf_loc_t
和tab_loc_t
指向当前线程可用的内存。
WarpAffinePrepareOneRow 计算映射关系,即生成 map。
WarpAffineCalculateOneRow 根据源像素生成结果。
OMP_PARALLEL_FOR_
for (int y = 0; y < dst_h * batch; ++y) {
int thread_id = OMP_TID_;
int x_count = 0;
int end_x = 0;
int dst_loc_base = y * dst_w * schannel;
int* buf_loc_t = buf_loc + thread_id * dst_w;
short* tab_loc_t = tab_loc + thread_id * dst_w;
WarpAffinePrepareOneRow(buf_loc_t, tab_loc_t, adelta, bdelta, schannel, src, src_w, src_h,
dst + dst_loc_base, dst_w, y % dst_h, (y / dst_h) * src_plane, x_count, end_x, border_val);
WarpAffineCalculateOneRow(end_x - x_count + 1, end_x, schannel, dst_loc_base, buf_loc_t, tab_loc_t, src, src2, dst);
}
delete[] buf_loc;
delete[] tab_loc;
free(buffer);
TNN/blob/master/source/tnn/device/arm/arm_mat_util.cc#L877">WarpAffineInit
将目标填充边界值。比较暴力。
InitInterTab2D 生成插值表。
WarpAffineMatrixInverse 对参数矩阵求逆。
uint8_t border_ival = (uint8_t)border_val;
memset(dst, border_ival, batch * dst_h * dst_w * channel);
// Init LookUp Table
InitInterTab2D();
double m[6];
WarpAffineMatrixInverse(transform, m);
预先计算行列元素变换系数,adelta
为
M
11
x
M_{11}x
M11x 和
M
21
x
M_{21}x
M21x,bdelta
为
M
12
y
+
M
13
M_{12}y+ M_{13}
M12y+M13 和
M
22
y
+
M
23
M_{22}y+ M_{23}
M22y+M23。
*buffer = reinterpret_cast<int*>(armMalloc((dst_w + dst_h) * 2 * sizeof(int)));
int* adelta = *buffer;
int* bdelta = *buffer + dst_w * 2;
for (int x = 0; x < dst_w; x++) {
*adelta++ = SATURATE_CAST_INT(m[0] * x * 1024);
*adelta++ = SATURATE_CAST_INT(m[3] * x * 1024);
}
for (int y = 0; y < dst_h; y++) {
*bdelta++ = SATURATE_CAST_INT((m[1] * y + m[2]) * 1024);
*bdelta++ = SATURATE_CAST_INT((m[4] * y + m[5]) * 1024);
}
TNN/blob/master/source/tnn/device/arm/arm_mat_util.cc#L828">InitInterTab2D
static bool inited = false;
if (inited) {
return;
}
short* itab = BilinearTab_i[0][0];
int ksize = KSIZE;
float* _tab = new float[2 * INTER_TAB_SIZE];
int i, j, k1, k2;
InitInterTab1D(_tab, INTER_TAB_SIZE);
for (i = 0; i < INTER_TAB_SIZE; i++) {
for (j = 0; j < INTER_TAB_SIZE; j++, itab += ksize * ksize) {
int isum = 0;
for (k1 = 0; k1 < ksize; k1++) {
float vy = _tab[i * ksize + k1];
for (k2 = 0; k2 < ksize; k2++) {
float v = vy * _tab[j * ksize + k2];
isum += itab[k1 * ksize + k2] = SATURATE_CAST_SHORT(v * INTER_REMAP_COEF_SCALE);
}
}
if (isum != INTER_REMAP_COEF_SCALE) {
int diff = isum - INTER_REMAP_COEF_SCALE;
int ksize2 = ksize / 2, Mk1 = ksize2, Mk2 = ksize2, mk1 = ksize2, mk2 = ksize2;
for (k1 = ksize2; k1 < ksize2 + 2; k1++)
for (k2 = ksize2; k2 < ksize2 + 2; k2++) {
if (itab[k1 * ksize + k2] < itab[mk1 * ksize + mk2])
mk1 = k1, mk2 = k2;
else if (itab[k1 * ksize + k2] > itab[Mk1 * ksize + Mk2])
Mk1 = k1, Mk2 = k2;
}
if (diff < 0)
itab[Mk1 * ksize + Mk2] = (short)(itab[Mk1 * ksize + Mk2] - diff);
else
itab[mk1 * ksize + mk2] = (short)(itab[mk1 * ksize + mk2] - diff);
}
}
}
delete[] _tab;
TNN/blob/master/source/tnn/utils/mat_converter_utils.cc#L128">WarpAffineMatrixInverse
M
=
[
m
0
m
1
m
2
m
3
m
4
m
5
m
6
m
7
m
8
]
=
[
m
0
m
1
m
2
m
3
m
4
m
5
m
6
m
7
m
8
]
=
[
A
B
O
D
]
M = \begin{bmatrix} m_0 & m_1 & m_2 \\ m_3 & m_4 & m_5 \\ m_6 & m_7 & m_8 \end{bmatrix} =\begin{bmatrix} \begin{array}{c c|c} m_0 & m_1 & m_2 \\ m_3 & m_4 & m_5 \\\hline m_6 & m_7 & m_8 \end{array} \end{bmatrix} =\begin{bmatrix} A & B \\ O & D \end{bmatrix}
M=⎣⎡m0m3m6m1m4m7m2m5m8⎦⎤=⎣⎡m0m3m6m1m4m7m2m5m8⎦⎤=[AOBD]
对于块上三角矩阵,且
D
=
I
D=I
D=I
M
−
1
=
[
A
−
1
−
A
−
1
B
D
−
1
O
D
−
1
]
=
[
A
−
1
−
A
−
1
B
O
I
]
M^{-1}=\begin{bmatrix} A^{-1} & -A^{-1}BD^{-1} \\ O & D^{-1} \end{bmatrix} =\begin{bmatrix} A^{-1} & -A^{-1}B \\ O & I \end{bmatrix}
M−1=[A−1O−A−1BD−1D−1]=[A−1O−A−1BI]
其中
A
−
1
=
1
d
e
t
(
A
)
a
d
j
(
A
)
=
1
m
0
m
4
−
m
1
m
3
[
A
11
A
12
A
21
A
22
]
=
1
m
0
m
4
−
m
1
m
3
[
m
4
−
m
1
−
m
3
m
0
]
\begin{aligned} A^{−1}&=\frac{1}{\mathrm{det}(A)}\mathrm{adj}(A)\\ &=\frac{1}{m_0 m_4- m_1 m_3}\begin{bmatrix} A_{11} & A_{12} \\ A_{21} & A_{22} \end{bmatrix}\\ &=\frac{1}{m_0 m_4- m_1 m_3}\begin{bmatrix} m_4 & -m_1 \\ -m_3 & m_0 \end{bmatrix} \end{aligned}
A−1=det(A)1adj(A)=m0m4−m1m31[A11A21A12A22]=m0m4−m1m31[m4−m3−m1m0]
double M[6];
M[0] = transform[0][0];
M[1] = transform[0][1];
M[2] = transform[0][2];
M[3] = transform[1][0];
M[4] = transform[1][1];
M[5] = transform[1][2];
// Inverse transform matrix
double D = M[0] * M[4] - M[1] * M[3];
D = D != 0 ? 1. / D : 0;
double A11 = M[4] * D, A22 = M[0] * D;
inverse[0] = A11;
inverse[1] = M[1] * (-D);
inverse[3] = M[3] * (-D);
inverse[4] = A22;
double b1 = -A11 * M[2] - inverse[1] * M[5];
double b2 = -inverse[3] * M[2] - A22 * M[5];
inverse[2] = b1;
inverse[5] = b2;
TNN/blob/master/source/tnn/device/arm/arm_mat_util.cc#L914">WarpAffinePrepareOneRow
src2
没有传入,而是再次计算了一遍。
vld1q_s32 从内存加载4个元素到寄存器。
_bdelta0
加载当前行所对应的
M
12
y
M_{12}y
M12y 和
M
22
y
M_{22}y
M22y,_bdelta
则为两份的拼接。
_src_w
为不同维度上的步长。
const unsigned char* src2 = src + src_w * channel;
short xy_loc_buf[dst_w * 2];
short tb_loc_buf[dst_w];
int sc_loc_buf[dst_w];
int* adelta_p = adelta;
short* xy_loc_buf_p = xy_loc_buf;
short* tb_loc_buf_p = tb_loc_buf;
int* sc_loc_buf_p = sc_loc_buf;
int x = 0;
#ifdef TNN_USE_NEON
int32x2_t _bdelta0 = vld1_s32(bdelta + 2 * y);
int32x4_t _bdelta = vcombine_s32(_bdelta0, _bdelta0);
int32x4_t _offset = vdupq_n_s32(16);
int16x8_t _mask = vdupq_n_s16(31);
int16x8_t _coeff = {1,32,1,32,1,32,1,32};
int32x4_t _channel = vdupq_n_s32(channel);
int32x4_t _soffset = vdupq_n_s32(src_offset);
int16x4_t _src_w = {1, (short)src_w,1,(short)src_w};
每次处理目的区域的4个元素。
vaddq_s32 执行4个整数的加法。
_xyxy0
为前两个目的像素所对应的源图坐标
(
M
11
x
+
M
12
y
+
M
13
,
M
21
x
+
M
22
y
+
M
23
)
(M_{11}x + M_{12}y + M_{13}, M_{21}x+M_{22}y + M_{23})
(M11x+M12y+M13,M21x+M22y+M23),_xyxy1
为后两个。ncnn 中 x 和 y 是分开的。
右移10位还原后,_xyxy0s
和_xyxy1s
为真实坐标值。
vcombine_s16 将两个较小的向量合并为一个较大的向量。
_xyxy01s
为4对坐标值。
vst1q_s16 存储8个元素到内存。
for (; x < dst_w>>2<<2; x += 4) {
int32x4_t _xyxy0 = vaddq_s32(vld1q_s32(adelta_p), _offset);
int32x4_t _xyxy1 = vaddq_s32(vld1q_s32(adelta_p + 4), _offset);
_xyxy0 = vaddq_s32(_xyxy0, _bdelta);
_xyxy1 = vaddq_s32(_xyxy1, _bdelta);
int16x4_t _xyxy0s = vshrn_n_s32(_xyxy0, 10);
//int16x8_t _xyxy01s = vshrn_high_n_s32(_xyxy0s, _xyxy1, 10);
int16x4_t _xyxy1s = vshrn_n_s32(_xyxy1, 10);
int16x8_t _xyxy01s = vcombine_s16(_xyxy0s, _xyxy1s);
vst1q_s16(xy_loc_buf_p, _xyxy01s);
vmull_s16 为向量长乘。
_src_0
中得到两部分偏移,即x
和y*w
。
vmulq_s32
vst1_s16 存储到内存中。
sc_loc_buf_p
保存4个目的像素对应到源图上的偏移。
int32x4_t _src_0 = vmull_s16(_xyxy0s, _src_w);
int32x4_t _src_1 = vmull_s16(vget_high_s16(_xyxy01s), _src_w);
vst1q_s32(sc_loc_buf_p, vaddq_s32(vmulq_s32(_channel, VPADDQ_S32(_src_0, _src_1)), _soffset));
减少右移位数可以保留一定范围内的小数值。
_xyxy0s = vshrn_n_s32(_xyxy0, 5);
//_xyxy01s = vshrn_high_n_s32(_xyxy0s, _xyxy1, 5);
_xyxy1s = vshrn_n_s32(_xyxy1, 5);
_xyxy01s = vcombine_s16(_xyxy0s, _xyxy1s);
int16x8_t _tab_xys = vmulq_s16(vandq_s16(_xyxy01s, _mask), _coeff);
vst1_s16(tb_loc_buf_p, vpadd_s16(vget_low_s16(_tab_xys), vget_high_s16(_tab_xys)));
adelta_p += 8;
xy_loc_buf_p += 8;
tb_loc_buf_p += 4;
sc_loc_buf_p += 4;
}
if (dst_w % 4) {
x -= 4;
}
#endif
对于末尾未对齐的数据。
new_x
为
M
11
x
+
M
12
y
+
M
13
M_{11}x + M_{12}y + M_{13}
M11x+M12y+M13, new_y
为
M
21
x
+
M
22
y
+
M
23
M_{21}x+M_{22}y + M_{23}
M21x+M22y+M23。
由于adelta
和bdelta
乘以了1024,右移后new_x_loc
和new_y_loc
为对应源图坐标。
tb_loc_buf
将 x 和 y 的小数部分拼接起来存储。
sc_loc_buf
记录一维格式下的地址。
for (; x < dst_w; ++x) {
int new_x = adelta[2 * x] + bdelta[2 * y] + 16;
int new_y = adelta[2 * x + 1] + bdelta[2 * y + 1] + 16;
int new_x_loc = new_x >> 10;
int new_y_loc = new_y >> 10;
xy_loc_buf[2 * x] = new_x_loc;
xy_loc_buf[2 * x + 1] = new_y_loc;
tb_loc_buf[x] = ((new_x >> 5) & 31) + ((new_y >> 5) & 31) * 32;
sc_loc_buf[x] = (new_x_loc + new_y_loc * src_w) * channel + src_offset;
}
CheckDataIsOnBoundary 检查数据是否在边界上。
如果源图上的坐标在边界内,则保存到buf_loc
,插值系数保存到tab_loc
;
否则,如果在边界上则计算出变换后的结果。
wtab
为系数乘积表。
mask0
、mask1
、mask2
和mask3
用于判断插值的4个点是否超出。
for (x = 0; x < dst_w; ++x) {
short new_x_loc = xy_loc_buf[2 * x];
short new_y_loc = xy_loc_buf[2 * x + 1];
short new_xy_float = tb_loc_buf[x];
int src_loc = sc_loc_buf[x];
if ((unsigned)new_x_loc < (src_w - 1) && (unsigned)new_y_loc < (src_h - 1)) {
buf_loc[x] = src_loc;
tab_loc[x] = new_xy_float;
x_count++;
end_x = x;
} else if (CheckDataIsOnBoundary(new_x_loc, new_y_loc, src_w, src_h)) {
short* wtab = BilinearTab_i[new_xy_float][0];
int dsc_loc = x * channel;
int mask0 = new_x_loc >= 0 && new_y_loc >= 0;
int mask1 = new_x_loc <= (src_w - 2) && new_y_loc >= 0;
int mask2 = new_x_loc >= 0 && new_y_loc <= (src_h - 2);
int mask3 = new_x_loc <= (src_w - 2) && new_y_loc <= (src_h - 2);
for (int c = 0; c < channel; ++c) {
int val_xy = 0;
val_xy += wtab[0] * (mask0 ? src[src_loc + c] : border_val);
val_xy += wtab[1] * (mask1 ? src[src_loc + channel + c] : border_val);
val_xy += wtab[2] * (mask2 ? src2[src_loc + c] : border_val);
val_xy += wtab[3] * (mask3 ? src2[src_loc + channel + c] : border_val);
dst[dsc_loc + c] = SATURATE_CAST_UCHAR((val_xy + (1 << 14)) >> 15);
}
}
}
TNN/blob/master/source/tnn/device/arm/arm_mat_util.cc#L1010">WarpAffineCalculateOneRow
buf_loc_p
对应 OpenCV 中的_map1
,但这里是一维的。
const int* buf_loc_p = buf_loc + begin_x;
const short* tab_loc_p = tab_loc + begin_x;
const short* tab_p = BilinearTab_i[0][0];
int x = begin_x;
#ifdef TNN_USE_NEON
#define MAKE_CAL(n) \
_val0 = vmull_s16(_tab0, vget_low_s16(_src16_0##n)); \
_val1 = vmull_s16(_tab1, vget_high_s16(_src16_0##n)); \
_val2 = vmull_s16(_tab2, vget_low_s16(_src16_1##n)); \
_val3 = vmull_s16(_tab3, vget_high_s16(_src16_1##n)); \
_res0123 = VPADDQ_S32(VPADDQ_S32(_val0, _val1), VPADDQ_S32(_val2, _val3)); \
_res0123 = vaddq_s32(_res0123, _offset); \
_res16 = vshrn_n_s32(_res0123, 15); \
_resu8.val[n] = vqmovun_s16(vcombine_s16(_res16, _res16)); \
#define CAL_C0() MAKE_CAL(0)
#define CAL_C1() MAKE_CAL(1)
#define CAL_C2() MAKE_CAL(2)
#define CAL_C3() MAKE_CAL(3)
#endif
对于单通道图像,加载4个对应源像素周围的4点。
dst_p
为当前目的图上的起始地址。
_src_loc
为4个源像素的偏移,考虑到插值像素的局部性质,与src1
和src2
配合可以访问4个目的像素所需的元素。
_src01
存储了前两个目的像素所对应的元素,_src23
为后两个。
vmovl_u8 左移变成uint16
,_src16_0
和_src16_1
与系数乘积宽度相同。
if (channel == 1) {
#ifdef TNN_USE_NEON
uint8_t* dst_p = dst + dst_loc_base + begin_x * 1;
int32x4_t _offset = vdupq_n_s32(1 << 14);
int simd_loop = 0;
for (; x <= end_x - 3; x += 4) {
int32x4_t _src_loc = vld1q_s32(buf_loc_p);
uint8x8_t _src01 = uint8x8_t();
_src01 = vld1_lane_u8(src1 + _src_loc[0], _src01, 0);
_src01 = vld1_lane_u8(src1 + _src_loc[0] + 1, _src01, 1);
_src01 = vld1_lane_u8(src2 + _src_loc[0], _src01, 2);
_src01 = vld1_lane_u8(src2 + _src_loc[0] + 1, _src01, 3);
_src01 = vld1_lane_u8(src1 + _src_loc[1], _src01, 4);
_src01 = vld1_lane_u8(src1 + _src_loc[1] + 1, _src01, 5);
_src01 = vld1_lane_u8(src2 + _src_loc[1], _src01, 6);
_src01 = vld1_lane_u8(src2 + _src_loc[1] + 1, _src01, 7);
int16x8_t _src16_0 = vreinterpretq_s16_u16(vmovl_u8(_src01));
uint8x8_t _src23 = uint8x8_t();
_src23 = vld1_lane_u8(src1 + _src_loc[2], _src23, 0);
_src23 = vld1_lane_u8(src1 + _src_loc[2] + 1, _src23, 1);
_src23 = vld1_lane_u8(src2 + _src_loc[2], _src23, 2);
_src23 = vld1_lane_u8(src2 + _src_loc[2] + 1, _src23, 3);
_src23 = vld1_lane_u8(src1 + _src_loc[3], _src23, 4);
_src23 = vld1_lane_u8(src1 + _src_loc[3] + 1, _src23, 5);
_src23 = vld1_lane_u8(src2 + _src_loc[3], _src23, 6);
_src23 = vld1_lane_u8(src2 + _src_loc[3] + 1, _src23, 7);
int16x8_t _src16_1 = vreinterpretq_s16_u16(vmovl_u8(_src23));
_tab0
、_tab1
、_tab2
和_tab3
为4个目的像素对应的系数乘积。
_val0
为第一个目的像素的分立结果。
VPADDQ_S32 调用 vpaddq_s32,相邻元素相加。
_res0123
为计算得到的4个目的像素值。
_offset
为舍入值。右移后得到_res16
。
vqmovun_s16 每个有符号整数值,将其饱和为一个原始宽度的一半的无符号整数值。
由于 vqmovun_s16 每次处理8个数所以这里存在一半的浪费。
int16x4_t _tab_loc = vld1_s16(tab_loc_p);
int16x4_t _tab0 = vld1_s16(tab_p + _tab_loc[0] * 4);
int16x4_t _tab1 = vld1_s16(tab_p + _tab_loc[1] * 4);
int16x4_t _tab2 = vld1_s16(tab_p + _tab_loc[2] * 4);
int16x4_t _tab3 = vld1_s16(tab_p + _tab_loc[3] * 4);
int32x4_t _val0 = vmull_s16(_tab0, vget_low_s16(_src16_0));
int32x4_t _val1 = vmull_s16(_tab1, vget_high_s16(_src16_0));
int32x4_t _val2 = vmull_s16(_tab2, vget_low_s16(_src16_1));
int32x4_t _val3 = vmull_s16(_tab3, vget_high_s16(_src16_1));
int32x4_t _res0123 = VPADDQ_S32(VPADDQ_S32(_val0, _val1), VPADDQ_S32(_val2, _val3));
_res0123 = vaddq_s32(_res0123, _offset);
int16x4_t _res16 = vshrn_n_s32(_res0123, 15);
uint8x8_t _resu8 = vqmovun_s16(vcombine_s16(_res16, _res16));
保存4个结果。
vst1_lane_u8(dst_p, _resu8, 0);
vst1_lane_u8(dst_p + 1, _resu8, 1);
vst1_lane_u8(dst_p + 2, _resu8, 2);
vst1_lane_u8(dst_p + 3, _resu8, 3);
buf_loc_p += 4;
tab_loc_p += 4;
dst_p += 4;
++simd_loop;
}
x = begin_x + (simd_loop << 2);
#endif
dst_loc
为目的地址,src_loc
为对应到源图上的地址。
point0
、point1
、point2
和point3
为源图上周围4点,val_xy0
为双线性插值的结果。
for (; x <= end_x; x++) {
int dst_loc = dst_loc_base + x * 1;
int src_loc = buf_loc[x];
short* wtab = BilinearTab_i[tab_loc[x]][0];
int point0 = src1[src_loc];
int point1 = src1[src_loc + 1];
int point2 = src2[src_loc];
int point3 = src2[src_loc + 1];
int val_xy0 = wtab[0] * point0 + wtab[1] * point1 + wtab[2] * point2 + wtab[3] * point3;
dst[dst_loc] = SATURATE_CAST_UCHAR((val_xy0 + (1 << 14)) >> 15);
}
对于2通道的图像,
} else if (channel == 2) {
#ifdef TNN_USE_NEON
uint8_t* dst_p = dst + dst_loc_base + begin_x * 2;
int32x4_t _offset = vdupq_n_s32(1 << 14);
int simd_loop = 0;
for (; x <= end_x - 3; x += 4) {
int32x4_t _src_loc = vld1q_s32(buf_loc_p);
uint8x8x2_t _src01 = uint8x8x2_t();
_src01 = vld2_lane_u8(src1 + _src_loc[0], _src01, 0);
_src01 = vld2_lane_u8(src1 + _src_loc[0] + 2, _src01, 1);
_src01 = vld2_lane_u8(src2 + _src_loc[0], _src01, 2);
_src01 = vld2_lane_u8(src2 + _src_loc[0] + 2, _src01, 3);
_src01 = vld2_lane_u8(src1 + _src_loc[1], _src01, 4);
_src01 = vld2_lane_u8(src1 + _src_loc[1] + 2, _src01, 5);
_src01 = vld2_lane_u8(src2 + _src_loc[1], _src01, 6);
_src01 = vld2_lane_u8(src2 + _src_loc[1] + 2, _src01, 7);
int16x8_t _src16_00 = vreinterpretq_s16_u16(vmovl_u8(_src01.val[0]));
int16x8_t _src16_01 = vreinterpretq_s16_u16(vmovl_u8(_src01.val[1]));
uint8x8x2_t _src23 = uint8x8x2_t();
_src23 = vld2_lane_u8(src1 + _src_loc[2], _src23, 0);
_src23 = vld2_lane_u8(src1 + _src_loc[2] + 2, _src23, 1);
_src23 = vld2_lane_u8(src2 + _src_loc[2], _src23, 2);
_src23 = vld2_lane_u8(src2 + _src_loc[2] + 2, _src23, 3);
_src23 = vld2_lane_u8(src1 + _src_loc[3], _src23, 4);
_src23 = vld2_lane_u8(src1 + _src_loc[3] + 2, _src23, 5);
_src23 = vld2_lane_u8(src2 + _src_loc[3], _src23, 6);
_src23 = vld2_lane_u8(src2 + _src_loc[3] + 2, _src23, 7);
int16x8_t _src16_10 = vreinterpretq_s16_u16(vmovl_u8(_src23.val[0]));
int16x8_t _src16_11 = vreinterpretq_s16_u16(vmovl_u8(_src23.val[1]));
int16x4_t _tab_loc = vld1_s16(tab_loc_p);
int16x4_t _tab0 = vld1_s16(tab_p + _tab_loc[0] * 4);
int16x4_t _tab1 = vld1_s16(tab_p + _tab_loc[1] * 4);
int16x4_t _tab2 = vld1_s16(tab_p + _tab_loc[2] * 4);
int16x4_t _tab3 = vld1_s16(tab_p + _tab_loc[3] * 4);
int32x4_t _val0, _val1, _val2, _val3, _res0123;
int16x4_t _res16;
uint8x8x2_t _resu8;
CAL_C0();
CAL_C1();
vst2_lane_u8(dst_p, _resu8, 0);
vst2_lane_u8(dst_p + 2, _resu8, 1);
vst2_lane_u8(dst_p + 4, _resu8, 2);
vst2_lane_u8(dst_p + 6, _resu8, 3);
buf_loc_p += 4;
tab_loc_p += 4;
dst_p += 8;
++simd_loop;
}
x = begin_x + (simd_loop << 2);
#endif
for (; x <= end_x; x++) {
int dst_loc = dst_loc_base + x * 2;
int src_loc = buf_loc[x];
short* wtab = BilinearTab_i[tab_loc[x]][0];
int point00 = src1[src_loc];
int point01 = src1[src_loc + 1];
int point02 = src1[src_loc + 2];
int point03 = src1[src_loc + 3];
int point10 = src2[src_loc];
int point11 = src2[src_loc + 1];
int point12 = src2[src_loc + 2];
int point13 = src2[src_loc + 3];
int val_xy0 = wtab[0] * point00 + wtab[1] * point02 + wtab[2] * point10 + wtab[3] * point12;
int val_xy1 = wtab[0] * point01 + wtab[1] * point03 + wtab[2] * point11 + wtab[3] * point13;
dst[dst_loc] = SATURATE_CAST_UCHAR((val_xy0 + (1 << 14)) >> 15);
dst[dst_loc + 1] = SATURATE_CAST_UCHAR((val_xy1 + (1 << 14)) >> 15);
}
对于3通道数据,
} else if (channel == 3) {
#ifdef TNN_USE_NEON
uint8_t* dst_p = dst + dst_loc_base + begin_x * 3;
int32x4_t _offset = vdupq_n_s32(1 << 14);
int simd_loop = 0;
for (; x <= end_x - 3; x += 4) {
int32x4_t _src_loc = vld1q_s32(buf_loc_p);
uint8x8x3_t _src01 = uint8x8x3_t();
_src01 = vld3_lane_u8(src1 + _src_loc[0], _src01, 0);
_src01 = vld3_lane_u8(src1 + _src_loc[0] + 3, _src01, 1);
_src01 = vld3_lane_u8(src2 + _src_loc[0], _src01, 2);
_src01 = vld3_lane_u8(src2 + _src_loc[0] + 3, _src01, 3);
_src01 = vld3_lane_u8(src1 + _src_loc[1], _src01, 4);
_src01 = vld3_lane_u8(src1 + _src_loc[1] + 3, _src01, 5);
_src01 = vld3_lane_u8(src2 + _src_loc[1], _src01, 6);
_src01 = vld3_lane_u8(src2 + _src_loc[1] + 3, _src01, 7);
int16x8_t _src16_00 = vreinterpretq_s16_u16(vmovl_u8(_src01.val[0]));
int16x8_t _src16_01 = vreinterpretq_s16_u16(vmovl_u8(_src01.val[1]));
int16x8_t _src16_02 = vreinterpretq_s16_u16(vmovl_u8(_src01.val[2]));
uint8x8x3_t _src23 = uint8x8x3_t();
_src23 = vld3_lane_u8(src1 + _src_loc[2], _src23, 0);
_src23 = vld3_lane_u8(src1 + _src_loc[2] + 3, _src23, 1);
_src23 = vld3_lane_u8(src2 + _src_loc[2], _src23, 2);
_src23 = vld3_lane_u8(src2 + _src_loc[2] + 3, _src23, 3);
_src23 = vld3_lane_u8(src1 + _src_loc[3], _src23, 4);
_src23 = vld3_lane_u8(src1 + _src_loc[3] + 3, _src23, 5);
_src23 = vld3_lane_u8(src2 + _src_loc[3], _src23, 6);
_src23 = vld3_lane_u8(src2 + _src_loc[3] + 3, _src23, 7);
int16x8_t _src16_10 = vreinterpretq_s16_u16(vmovl_u8(_src23.val[0]));
int16x8_t _src16_11 = vreinterpretq_s16_u16(vmovl_u8(_src23.val[1]));
int16x8_t _src16_12 = vreinterpretq_s16_u16(vmovl_u8(_src23.val[2]));
int16x4_t _tab_loc = vld1_s16(tab_loc_p);
int16x4_t _tab0 = vld1_s16(tab_p + _tab_loc[0] * 4);
int16x4_t _tab1 = vld1_s16(tab_p + _tab_loc[1] * 4);
int16x4_t _tab2 = vld1_s16(tab_p + _tab_loc[2] * 4);
int16x4_t _tab3 = vld1_s16(tab_p + _tab_loc[3] * 4);
int32x4_t _val0, _val1, _val2, _val3, _res0123;
int16x4_t _res16;
uint8x8x3_t _resu8;
CAL_C0();
CAL_C1();
CAL_C2();
vst3_lane_u8(dst_p, _resu8, 0);
vst3_lane_u8(dst_p + 3, _resu8, 1);
vst3_lane_u8(dst_p + 6, _resu8, 2);
vst3_lane_u8(dst_p + 9, _resu8, 3);
buf_loc_p += 4;
tab_loc_p += 4;
dst_p += 12;
++simd_loop;
}
x = begin_x + (simd_loop << 2);
#endif
for (; x <= end_x; x++) {
int dst_loc = dst_loc_base + x * 3;
int src_loc = buf_loc[x];
short* wtab = BilinearTab_i[tab_loc[x]][0];
int point00 = src1[src_loc];
int point01 = src1[src_loc + 1];
int point02 = src1[src_loc + 2];
int point03 = src1[src_loc + 3];
int point04 = src1[src_loc + 4];
int point05 = src1[src_loc + 5];
int point10 = src2[src_loc];
int point11 = src2[src_loc + 1];
int point12 = src2[src_loc + 2];
int point13 = src2[src_loc + 3];
int point14 = src2[src_loc + 4];
int point15 = src2[src_loc + 5];
int val_xy0 = wtab[0] * point00 + wtab[1] * point03 + wtab[2] * point10 + wtab[3] * point13;
int val_xy1 = wtab[0] * point01 + wtab[1] * point04 + wtab[2] * point11 + wtab[3] * point14;
int val_xy2 = wtab[0] * point02 + wtab[1] * point05 + wtab[2] * point12 + wtab[3] * point15;
dst[dst_loc] = SATURATE_CAST_UCHAR((val_xy0 + (1 << 14)) >> 15);
dst[dst_loc + 1] = SATURATE_CAST_UCHAR((val_xy1 + (1 << 14)) >> 15);
dst[dst_loc + 2] = SATURATE_CAST_UCHAR((val_xy2 + (1 << 14)) >> 15);
}
} else if (channel == 4) {
#ifdef TNN_USE_NEON
uint8_t* dst_p = dst + dst_loc_base + begin_x * 4;
int32x4_t _offset = vdupq_n_s32(1 << 14);
int simd_loop = 0;
for (; x <= end_x - 3; x += 4) {
int32x4_t _src_loc = vld1q_s32(buf_loc_p);
uint8x8x4_t _src01 = uint8x8x4_t();
_src01 = vld4_lane_u8(src1 + _src_loc[0], _src01, 0);
_src01 = vld4_lane_u8(src1 + _src_loc[0] + 4, _src01, 1);
_src01 = vld4_lane_u8(src2 + _src_loc[0], _src01, 2);
_src01 = vld4_lane_u8(src2 + _src_loc[0] + 4, _src01, 3);
_src01 = vld4_lane_u8(src1 + _src_loc[1], _src01, 4);
_src01 = vld4_lane_u8(src1 + _src_loc[1] + 4, _src01, 5);
_src01 = vld4_lane_u8(src2 + _src_loc[1], _src01, 6);
_src01 = vld4_lane_u8(src2 + _src_loc[1] + 4, _src01, 7);
int16x8_t _src16_00 = vreinterpretq_s16_u16(vmovl_u8(_src01.val[0]));
int16x8_t _src16_01 = vreinterpretq_s16_u16(vmovl_u8(_src01.val[1]));
int16x8_t _src16_02 = vreinterpretq_s16_u16(vmovl_u8(_src01.val[2]));
int16x8_t _src16_03 = vreinterpretq_s16_u16(vmovl_u8(_src01.val[3]));
uint8x8x4_t _src23 = uint8x8x4_t();
_src23 = vld4_lane_u8(src1 + _src_loc[2], _src23, 0);
_src23 = vld4_lane_u8(src1 + _src_loc[2] + 4, _src23, 1);
_src23 = vld4_lane_u8(src2 + _src_loc[2], _src23, 2);
_src23 = vld4_lane_u8(src2 + _src_loc[2] + 4, _src23, 3);
_src23 = vld4_lane_u8(src1 + _src_loc[3], _src23, 4);
_src23 = vld4_lane_u8(src1 + _src_loc[3] + 4, _src23, 5);
_src23 = vld4_lane_u8(src2 + _src_loc[3], _src23, 6);
_src23 = vld4_lane_u8(src2 + _src_loc[3] + 4, _src23, 7);
int16x8_t _src16_10 = vreinterpretq_s16_u16(vmovl_u8(_src23.val[0]));
int16x8_t _src16_11 = vreinterpretq_s16_u16(vmovl_u8(_src23.val[1]));
int16x8_t _src16_12 = vreinterpretq_s16_u16(vmovl_u8(_src23.val[2]));
int16x8_t _src16_13 = vreinterpretq_s16_u16(vmovl_u8(_src23.val[3]));
int16x4_t _tab_loc = vld1_s16(tab_loc_p);
int16x4_t _tab0 = vld1_s16(tab_p + _tab_loc[0] * 4);
int16x4_t _tab1 = vld1_s16(tab_p + _tab_loc[1] * 4);
int16x4_t _tab2 = vld1_s16(tab_p + _tab_loc[2] * 4);
int16x4_t _tab3 = vld1_s16(tab_p + _tab_loc[3] * 4);
int32x4_t _val0, _val1, _val2, _val3, _res0123;
int16x4_t _res16;
uint8x8x4_t _resu8;
CAL_C0();
CAL_C1();
CAL_C2();
CAL_C3();
vst4_lane_u8(dst_p, _resu8, 0);
vst4_lane_u8(dst_p + 4, _resu8, 1);
vst4_lane_u8(dst_p + 8, _resu8, 2);
vst4_lane_u8(dst_p + 12, _resu8, 3);
buf_loc_p += 4;
tab_loc_p += 4;
dst_p += 16;
++simd_loop;
}
x = begin_x + (simd_loop << 2);
#endif
for (; x <= end_x; x++) {
int dst_loc = dst_loc_base + x * 4;
int src_loc = buf_loc[x];
short* wtab = BilinearTab_i[tab_loc[x]][0];
int point00 = src1[src_loc];
int point01 = src1[src_loc + 1];
int point02 = src1[src_loc + 2];
int point03 = src1[src_loc + 3];
int point04 = src1[src_loc + 4];
int point05 = src1[src_loc + 5];
int point06 = src1[src_loc + 6];
int point07 = src1[src_loc + 7];
int point10 = src2[src_loc];
int point11 = src2[src_loc + 1];
int point12 = src2[src_loc + 2];
int point13 = src2[src_loc + 3];
int point14 = src2[src_loc + 4];
int point15 = src2[src_loc + 5];
int point16 = src2[src_loc + 6];
int point17 = src2[src_loc + 7];
int val_xy0 = wtab[0] * point00 + wtab[1] * point04 + wtab[2] * point10 + wtab[3] * point14;
int val_xy1 = wtab[0] * point01 + wtab[1] * point05 + wtab[2] * point11 + wtab[3] * point15;
int val_xy2 = wtab[0] * point02 + wtab[1] * point06 + wtab[2] * point12 + wtab[3] * point16;
int val_xy3 = wtab[0] * point03 + wtab[1] * point07 + wtab[2] * point13 + wtab[3] * point17;
dst[dst_loc] = SATURATE_CAST_UCHAR((val_xy0 + (1 << 14)) >> 15);
dst[dst_loc + 1] = SATURATE_CAST_UCHAR((val_xy1 + (1 << 14)) >> 15);
dst[dst_loc + 2] = SATURATE_CAST_UCHAR((val_xy2 + (1 << 14)) >> 15);
dst[dst_loc + 3] = SATURATE_CAST_UCHAR((val_xy3 + (1 << 14)) >> 15);
}
}
#ifdef TNN_USE_NEON
#undef MAKE_CAL
#undef CAL_C0
#undef CAL_C1
#undef CAL_C2
#undef CAL_C3
#endif
参考资料:
- Simple 3x3 matrix inverse code (C++)
- Inverse of a Matrix
- How to Find the Inverse of a 3x3 Matrix
- Matrix inversion of a3×3matrix
- Efficient 4x4 matrix inverse (affine transform)
- Inverses of 2 × 2 Block Matrices
- General Formula: Matrix Inversion Lemma
- 常用NEON 内置函数记录备用
- Neon Intrinsics各函数介绍