网站开发后端语言,网站加栏目,建设网站学什么条件,小微企业建站目录
前言
知识直通车
NEON转置指令
右旋90
4x4矩阵右旋实例
灰度图（单通道）右旋90
彩图（RGB三通道）右旋90
左旋90
4x4矩阵左旋实例
灰度图（单通道）左旋90
彩图（RGB三通道）左旋90…目录
前言
知识直通车
NEON转置指令
右旋90
4x4矩阵右旋实例
灰度图单通道右旋90
彩图RGB三通道右旋90
左旋90
4x4矩阵左旋实例
灰度图单通道左旋90
彩图RGB三通道左旋90
RK3288-linux(硬浮点) 耗时对比 前言
图像旋转就是简单的坐标变换，1080p的图像右旋90度，在RK3288-linux(硬浮点)板子上用opencv4.1耗时也就17ms搞定，本没啥好讲的。但是凡事追求极致嘛，能加速的就给他加速一把。当然这态度的问题只是其中一个原因，另外一个原因是最近在公司遇到一个对旋转90度的加速需求：在某些场景下需要先做图像旋转，然后再送入神经网络做人脸检测，而由于整个人脸识别工程的运行导致板子的资源极大消耗，使得简单的图像旋转的时间延长至三四十ms，这是很可怕的（人脸检测也就二三十ms左右吧），所以这是写这篇文章的直接导火索。
当然类似的利用NEON对矩阵旋转加速的解决方案网上已经有人发表过了比如最早的文章https://www.cnblogs.com/hrlnw/p/3723072.html本篇博客会在此基础上进行扩展增加RGB彩图的加速旋转代码并补充opencv与本篇算法在RK3288上的旋转耗时数据对比。 知识直通车
矩阵右旋90的NEON加速https://www.cnblogs.com/hrlnw/p/3723072.html
矩阵右旋180度和270(-90)度https://www.cnblogs.com/hrlnw/p/3767853.html
opencv右旋90、180、270度https://www.cnblogs.com/alexYuin/p/9493242.html
opencv利用transposeflip实现图像旋转虽然比仿射变换函数warpaffine要快不少但是分步的做法总是没有一步到位的解决方案来得快的接下来就开始介绍一步到位法了。 NEON转置指令
可以用neon进行旋转加速的关键要感谢arm出品的vtrn_u8/u16/u32..系列的转置指令虽然是2x2的转置指令但是对于矩阵旋转完全够用取决于怎么用了。
转置的具体情况如下，示意应该很清楚，就不多说了：
uint8x8_t mat0, mat1;
uint8x8x2_t result = vtrn_u8(mat0, mat1);
右旋90
4x4矩阵右旋实例
在介绍图形旋转代码之前先用个小矩阵实例说明一下算法的可行性。先说右旋90度 旋转前矩阵 旋转后矩阵 算法步骤
1相邻两行每2x2做转置(但是注意输入顺序是倒的比如前两行第二行作为vtrn的第一个参数输入第一行作为vtrn的第二个参数输入)结果如下 原矩阵 相邻行倒置 相邻行转置 2每行两个像素值看做一个整体隔行转置依然倒着输入即2、0行一组输入3、1行一组输入这样便可以得到最终的矩阵旋转结果了过程如下 上一步结果 隔行倒置 隔行转置 例子看明白了那下面图像旋转也就很简单了soeasy啊~
图像旋转与矩阵旋转的区别就是.....嗯.....图像比较大下就分块喽那从指令读取数据的角度以及代码量及速度方面的考虑我们将图像分成8x8的小块矩阵一个一个做旋转然后再放到最终图像相应的位置就可以了。
/*
 * Section: grayscale (single-channel) image, rotate right by 90 degrees.
 */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
/*
 * Rotate one 8x8 byte tile clockwise, entirely in registers.
 *
 * On entry rows[k] is source row k of the tile.  On return rows[c] holds
 * source column c read bottom-to-top (row 7 first), i.e. output row c of the
 * rotated tile.  Three rounds of 2x2 transposes (8-, 16- and 32-bit lanes)
 * are used; feeding the LOWER row as the first vtrn operand at every stage is
 * what reverses the vertical order.
 */
static inline void rotate_pos90_gray_tile8x8(uint8x8_t rows[8])
{
    /* After stage 3, result k carries columns col_of[k] and col_of[k] + 4. */
    static const int col_of[4] = { 0, 2, 1, 3 };
    uint16x4_t pairs[8];
    uint32x2_t quads[8];

    /* Stage 1: byte-wise transpose of adjacent rows (1,0) (3,2) (5,4) (7,6). */
    for (int k = 0; k < 4; k++) {
        uint8x8x2_t t = vtrn_u8(rows[2 * k + 1], rows[2 * k]);
        pairs[2 * k]     = vreinterpret_u16_u8(t.val[0]);
        pairs[2 * k + 1] = vreinterpret_u16_u8(t.val[1]);
    }

    /* Stage 2: 16-bit transpose of results two apart, within each half. */
    for (int half = 0; half < 2; half++) {
        for (int k = 0; k < 2; k++) {
            uint16x4x2_t t = vtrn_u16(pairs[4 * half + 2 + k],
                                      pairs[4 * half + k]);
            quads[4 * half + 2 * k]     = vreinterpret_u32_u16(t.val[0]);
            quads[4 * half + 2 * k + 1] = vreinterpret_u32_u16(t.val[1]);
        }
    }

    /* Stage 3: 32-bit transpose joining the bottom half with the top half. */
    for (int k = 0; k < 4; k++) {
        uint32x2x2_t t = vtrn_u32(quads[4 + k], quads[k]);
        rows[col_of[k]]     = vreinterpret_u8_u32(t.val[0]);
        rows[col_of[k] + 4] = vreinterpret_u8_u32(t.val[1]);
    }
}
#endif

/*
 * Rotate a single-channel 8-bit image 90 degrees clockwise.
 *
 * src    source pixels, h rows of w bytes
 * sstep  distance in bytes between consecutive source rows
 * dst    destination pixels, w rows of h bytes; must not overlap src
 * dstep  distance in bytes between consecutive destination rows
 * w, h   source width and height in pixels
 *
 * Mapping: dst[x * dstep + (h - 1 - y)] = src[y * sstep + x].
 *
 * When NEON is available the part of the image that divides into 8x8 tiles
 * is rotated tile by tile; the remaining right/bottom edge (and the whole
 * image on non-NEON builds) is copied pixel by pixel.
 */
static void rotate_pos90_gray(uint8_t* src, size_t sstep, uint8_t* dst, size_t dstep, int w, int h)
{
    int sw = 0; /* width covered by the tiled path  */
    int sh = 0; /* height covered by the tiled path */

    if (w <= 0 || h <= 0) {
        return;
    }

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    sw = w - (w & 7);
    sh = h - (h & 7);

    for (int y = 0; y < sh; y += 8) {
        const uint8_t *in = src + (size_t)y * sstep;
        /* Source rows y..y+7 become destination columns h-y-8..h-y-1. */
        uint8_t *out = dst + (h - y - 8);

        for (int x = 0; x < sw; x += 8) {
            uint8x8_t rows[8];

            for (int k = 0; k < 8; k++) {
                rows[k] = vld1_u8(in + (size_t)k * sstep + x);
            }
            rotate_pos90_gray_tile8x8(rows);
            for (int k = 0; k < 8; k++) {
                vst1_u8(out + (size_t)(x + k) * dstep, rows[k]);
            }
        }
    }
#endif

    /* Bottom strip: rows not covered by the tiled path, full width. */
    for (int y = sh; y < h; y++) {
        const uint8_t *in = src + (size_t)y * sstep;
        uint8_t *out = dst + (h - y - 1);

        for (int x = 0; x < w; x++) {
            out[(size_t)x * dstep] = in[x];
        }
    }

    /* Right strip: columns not covered by the tiled path, tiled rows only. */
    for (int x = sw; x < w; x++) {
        uint8_t *out = dst + (size_t)x * dstep;

        for (int y = 0; y < sh; y++) {
            out[h - y - 1] = src[(size_t)y * sstep + x];
        }
    }
}
彩图RGB三通道右旋90
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
/*
 * Rotate one 8x8 byte tile (a single colour plane) clockwise in registers.
 *
 * On entry rows[k] is source row k of the tile.  On return rows[c] holds
 * source column c read bottom-to-top, i.e. output row c of the rotated tile.
 * The lower row is always passed as the first vtrn operand so that the
 * vertical order ends up reversed.
 */
static inline void rotate_pos90_rgb_tile8x8(uint8x8_t rows[8])
{
    /* After stage 3, result k carries columns col_of[k] and col_of[k] + 4. */
    static const int col_of[4] = { 0, 2, 1, 3 };
    uint16x4_t pairs[8];
    uint32x2_t quads[8];

    /* Stage 1: byte-wise transpose of adjacent rows (1,0) (3,2) (5,4) (7,6). */
    for (int k = 0; k < 4; k++) {
        uint8x8x2_t t = vtrn_u8(rows[2 * k + 1], rows[2 * k]);
        pairs[2 * k]     = vreinterpret_u16_u8(t.val[0]);
        pairs[2 * k + 1] = vreinterpret_u16_u8(t.val[1]);
    }

    /* Stage 2: 16-bit transpose of results two apart, within each half. */
    for (int half = 0; half < 2; half++) {
        for (int k = 0; k < 2; k++) {
            uint16x4x2_t t = vtrn_u16(pairs[4 * half + 2 + k],
                                      pairs[4 * half + k]);
            quads[4 * half + 2 * k]     = vreinterpret_u32_u16(t.val[0]);
            quads[4 * half + 2 * k + 1] = vreinterpret_u32_u16(t.val[1]);
        }
    }

    /* Stage 3: 32-bit transpose joining the bottom half with the top half. */
    for (int k = 0; k < 4; k++) {
        uint32x2x2_t t = vtrn_u32(quads[4 + k], quads[k]);
        rows[col_of[k]]     = vreinterpret_u8_u32(t.val[0]);
        rows[col_of[k] + 4] = vreinterpret_u8_u32(t.val[1]);
    }
}
#endif

/*
 * Rotate an interleaved multi-channel 8-bit image 90 degrees clockwise.
 *
 * src    source pixels, h rows of w pixels, cn bytes per pixel
 * sstep  distance in bytes between consecutive source rows
 * dst    destination pixels, w rows of h pixels; must not overlap src
 * dstep  distance in bytes between consecutive destination rows
 * w, h   source width and height in pixels
 * cn     number of interleaved channels (bytes per pixel)
 *
 * Mapping, for every channel n:
 *   dst[x * dstep + (h - 1 - y) * cn + n] = src[y * sstep + x * cn + n].
 *
 * The NEON tile path de-interleaves with vld3_u8 and therefore only runs
 * when cn == 3; any other channel count, the right/bottom edge, and non-NEON
 * builds use the per-pixel loops.  The edge loops index the source with
 * x * cn (the earlier x + n indexing read the wrong bytes).
 */
static void rotate_pos90_rgb(uint8_t* src, size_t sstep, uint8_t* dst, size_t dstep, int w, int h, int cn)
{
    int sw = 0; /* width covered by the tiled path  */
    int sh = 0; /* height covered by the tiled path */

    if (w <= 0 || h <= 0) {
        return;
    }

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    if (cn == 3) {
        sw = w - (w & 7);
        sh = h - (h & 7);

        for (int y = 0; y < sh; y += 8) {
            const uint8_t *in = src + (size_t)y * sstep;
            /* Source rows y..y+7 become destination pixels h-y-8..h-y-1. */
            uint8_t *out = dst + (size_t)(h - y - 8) * 3;

            for (int x = 0; x < sw; x += 8) {
                uint8x8_t plane[3][8];

                /* Split each of the 8 rows into its three colour planes. */
                for (int k = 0; k < 8; k++) {
                    uint8x8x3_t px = vld3_u8(in + (size_t)k * sstep + (size_t)x * 3);
                    plane[0][k] = px.val[0];
                    plane[1][k] = px.val[1];
                    plane[2][k] = px.val[2];
                }

                for (int c = 0; c < 3; c++) {
                    rotate_pos90_rgb_tile8x8(plane[c]);
                }

                /* Re-interleave and write one destination row per source column. */
                for (int k = 0; k < 8; k++) {
                    uint8x8x3_t px;
                    px.val[0] = plane[0][k];
                    px.val[1] = plane[1][k];
                    px.val[2] = plane[2][k];
                    vst3_u8(out + (size_t)(x + k) * dstep, px);
                }
            }
        }
    }
#endif

    /* Bottom strip: rows not covered by the tiled path, full width. */
    for (int y = sh; y < h; y++) {
        const uint8_t *in = src + (size_t)y * sstep;
        size_t col = (size_t)(h - y - 1) * (size_t)cn;

        for (int x = 0; x < w; x++) {
            uint8_t *out = dst + (size_t)x * dstep + col;

            for (int n = 0; n < cn; n++) {
                out[n] = in[(size_t)x * (size_t)cn + n];
            }
        }
    }

    /* Right strip: columns not covered by the tiled path, tiled rows only. */
    for (int x = sw; x < w; x++) {
        uint8_t *out_row = dst + (size_t)x * dstep;

        for (int y = 0; y < sh; y++) {
            const uint8_t *in = src + (size_t)y * sstep + (size_t)x * (size_t)cn;
            uint8_t *out = out_row + (size_t)(h - y - 1) * (size_t)cn;

            for (int n = 0; n < cn; n++) {
                out[n] = in[n];
            }
        }
    }
}
/* Next section: rotate left by 90 degrees. */
4x4矩阵左旋实例
左旋90度相对右旋而言比较简单了不需要倒置行数据其他的类似 旋转前矩阵 旋转后矩阵 算法步骤
1直接每两行做2x2转置(相邻行转置) 原矩阵 相邻行转置 2类似地每行两个像素值看做一个整体隔行转置但是注意这里需要垂直flip一下不要担心会因此消耗时间只需要在存储的时候上下倒着水平存储就可以了并不耗时 这样便可以得到最终的矩阵左旋结果了 上一步结果 隔行转置 垂直倒置存储 灰度图单通道左旋90
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
/*
 * Transpose one 8x8 byte tile in registers.
 *
 * On entry rows[k] is source row k of the tile.  On return rows[c] holds
 * source column c read top-to-bottom.  Three rounds of 2x2 transposes on
 * 8-, 16- and 32-bit lanes are used.
 */
static inline void rotate_neg90_gray_transpose8x8(uint8x8_t rows[8])
{
    /* After stage 3, result k carries columns col_of[k] and col_of[k] + 4. */
    static const int col_of[4] = { 0, 2, 1, 3 };
    uint16x4_t pairs[8];
    uint32x2_t quads[8];

    /* Stage 1: byte-wise transpose of adjacent rows (0,1) (2,3) (4,5) (6,7). */
    for (int k = 0; k < 4; k++) {
        uint8x8x2_t t = vtrn_u8(rows[2 * k], rows[2 * k + 1]);
        pairs[2 * k]     = vreinterpret_u16_u8(t.val[0]);
        pairs[2 * k + 1] = vreinterpret_u16_u8(t.val[1]);
    }

    /* Stage 2: 16-bit transpose of results two apart, within each half. */
    for (int half = 0; half < 2; half++) {
        for (int k = 0; k < 2; k++) {
            uint16x4x2_t t = vtrn_u16(pairs[4 * half + k],
                                      pairs[4 * half + 2 + k]);
            quads[4 * half + 2 * k]     = vreinterpret_u32_u16(t.val[0]);
            quads[4 * half + 2 * k + 1] = vreinterpret_u32_u16(t.val[1]);
        }
    }

    /* Stage 3: 32-bit transpose joining the top half with the bottom half. */
    for (int k = 0; k < 4; k++) {
        uint32x2x2_t t = vtrn_u32(quads[k], quads[4 + k]);
        rows[col_of[k]]     = vreinterpret_u8_u32(t.val[0]);
        rows[col_of[k] + 4] = vreinterpret_u8_u32(t.val[1]);
    }
}
#endif

/*
 * Rotate a single-channel 8-bit image 90 degrees counter-clockwise.
 *
 * src    source pixels, h rows of w bytes
 * sstep  distance in bytes between consecutive source rows
 * dst    destination pixels, w rows of h bytes; must not overlap src
 * dstep  distance in bytes between consecutive destination rows
 * w, h   source width and height in pixels
 *
 * Mapping: dst[(w - 1 - x) * dstep + y] = src[y * sstep + x].
 *
 * Each 8x8 tile is transposed in registers and its rows are written in
 * reverse vertical order, which costs nothing extra.  The edge loops use
 * dstep as the destination stride, matching the tiled path (they previously
 * used h, which is only correct for an unpadded destination).
 */
static void rotate_neg90_gray(uint8_t* src, size_t sstep, uint8_t* dst, size_t dstep, int w, int h)
{
    int sw = 0; /* width covered by the tiled path  */
    int sh = 0; /* height covered by the tiled path */

    if (w <= 0 || h <= 0) {
        return;
    }

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    sw = w - (w & 7);
    sh = h - (h & 7);

    for (int i = 0; i < sh; i += 8) {
        const uint8_t *in = src + (size_t)i * sstep;

        for (int j = 0; j < sw; j += 8) {
            uint8x8_t rows[8];

            for (int k = 0; k < 8; k++) {
                rows[k] = vld1_u8(in + (size_t)k * sstep + j);
            }
            rotate_neg90_gray_transpose8x8(rows);
            /* Source column j + c lands on destination row w - 1 - (j + c). */
            for (int c = 0; c < 8; c++) {
                vst1_u8(dst + (size_t)(w - 1 - j - c) * dstep + i, rows[c]);
            }
        }
    }
#endif

    /* Bottom strip: rows not covered by the tiled path, full width. */
    for (int y = sh; y < h; y++) {
        const uint8_t *in = src + (size_t)y * sstep;

        for (int x = 0; x < w; x++) {
            dst[(size_t)(w - x - 1) * dstep + y] = in[x];
        }
    }

    /* Right strip: columns not covered by the tiled path, tiled rows only. */
    for (int x = sw; x < w; x++) {
        uint8_t *out = dst + (size_t)(w - x - 1) * dstep;

        for (int y = 0; y < sh; y++) {
            out[y] = src[(size_t)y * sstep + x];
        }
    }
}
/* Next section: colour (RGB, three-channel) image, rotate left by 90 degrees. */
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
/*
 * Transpose one 8x8 byte tile (a single colour plane) in registers.
 *
 * On entry rows[k] is source row k of the tile.  On return rows[c] holds
 * source column c read top-to-bottom.
 */
static inline void rotate_neg90_rgb_transpose8x8(uint8x8_t rows[8])
{
    /* After stage 3, result k carries columns col_of[k] and col_of[k] + 4. */
    static const int col_of[4] = { 0, 2, 1, 3 };
    uint16x4_t pairs[8];
    uint32x2_t quads[8];

    /* Stage 1: byte-wise transpose of adjacent rows (0,1) (2,3) (4,5) (6,7). */
    for (int k = 0; k < 4; k++) {
        uint8x8x2_t t = vtrn_u8(rows[2 * k], rows[2 * k + 1]);
        pairs[2 * k]     = vreinterpret_u16_u8(t.val[0]);
        pairs[2 * k + 1] = vreinterpret_u16_u8(t.val[1]);
    }

    /* Stage 2: 16-bit transpose of results two apart, within each half. */
    for (int half = 0; half < 2; half++) {
        for (int k = 0; k < 2; k++) {
            uint16x4x2_t t = vtrn_u16(pairs[4 * half + k],
                                      pairs[4 * half + 2 + k]);
            quads[4 * half + 2 * k]     = vreinterpret_u32_u16(t.val[0]);
            quads[4 * half + 2 * k + 1] = vreinterpret_u32_u16(t.val[1]);
        }
    }

    /* Stage 3: 32-bit transpose joining the top half with the bottom half. */
    for (int k = 0; k < 4; k++) {
        uint32x2x2_t t = vtrn_u32(quads[k], quads[4 + k]);
        rows[col_of[k]]     = vreinterpret_u8_u32(t.val[0]);
        rows[col_of[k] + 4] = vreinterpret_u8_u32(t.val[1]);
    }
}
#endif

/*
 * Rotate an interleaved multi-channel 8-bit image 90 degrees
 * counter-clockwise.
 *
 * src    source pixels, h rows of w pixels, cn bytes per pixel
 * sstep  distance in bytes between consecutive source rows
 * dst    destination pixels, w rows of h pixels; must not overlap src
 * dstep  distance in bytes between consecutive destination rows
 * w, h   source width and height in pixels
 * cn     number of interleaved channels (bytes per pixel)
 *
 * Mapping, for every channel n:
 *   dst[(w - 1 - x) * dstep + y * cn + n] = src[y * sstep + x * cn + n].
 *
 * The NEON tile path de-interleaves with vld3_u8 and therefore only runs
 * when cn == 3; any other channel count, the right/bottom edge, and non-NEON
 * builds use the per-pixel loops.  The edge loops address destination rows
 * with dstep (they previously used h, which is not the byte length of a
 * multi-channel row and scrambled the edge pixels).
 */
static void rotate_neg90_rgb(uint8_t* src, size_t sstep, uint8_t* dst, size_t dstep, int w, int h, int cn)
{
    int sw = 0; /* width covered by the tiled path  */
    int sh = 0; /* height covered by the tiled path */

    if (w <= 0 || h <= 0) {
        return;
    }

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    if (cn == 3) {
        sw = w - (w & 7);
        sh = h - (h & 7);

        for (int i = 0; i < sh; i += 8) {
            const uint8_t *in = src + (size_t)i * sstep;

            for (int j = 0; j < sw; j += 8) {
                uint8x8_t plane[3][8];

                /* Split each of the 8 rows into its three colour planes. */
                for (int k = 0; k < 8; k++) {
                    uint8x8x3_t px = vld3_u8(in + (size_t)k * sstep + (size_t)j * 3);
                    plane[0][k] = px.val[0];
                    plane[1][k] = px.val[1];
                    plane[2][k] = px.val[2];
                }

                for (int c = 0; c < 3; c++) {
                    rotate_neg90_rgb_transpose8x8(plane[c]);
                }

                /* Source column j + c lands on destination row w - 1 - (j + c). */
                for (int c = 0; c < 8; c++) {
                    uint8x8x3_t px;
                    px.val[0] = plane[0][c];
                    px.val[1] = plane[1][c];
                    px.val[2] = plane[2][c];
                    vst3_u8(dst + (size_t)(w - 1 - j - c) * dstep + (size_t)i * 3, px);
                }
            }
        }
    }
#endif

    /* Bottom strip: rows not covered by the tiled path, full width. */
    for (int y = sh; y < h; y++) {
        const uint8_t *in = src + (size_t)y * sstep;
        size_t col = (size_t)y * (size_t)cn;

        for (int x = 0; x < w; x++) {
            uint8_t *out = dst + (size_t)(w - x - 1) * dstep + col;

            for (int n = 0; n < cn; n++) {
                out[n] = in[(size_t)x * (size_t)cn + n];
            }
        }
    }

    /* Right strip: columns not covered by the tiled path, tiled rows only. */
    for (int x = sw; x < w; x++) {
        uint8_t *out_row = dst + (size_t)(w - x - 1) * dstep;

        for (int y = 0; y < sh; y++) {
            const uint8_t *in = src + (size_t)y * sstep + (size_t)x * (size_t)cn;
            uint8_t *out = out_row + (size_t)y * (size_t)cn;

            for (int n = 0; n < cn; n++) {
                out[n] = in[n];
            }
        }
    }
}
v_store_interleave_rotate函数代码
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
/*
 * Store three 8-lane byte vectors as 8 interleaved 3-byte pixels.
 *
 * Writes 24 bytes starting at ptr in the order a0 b0 c0 a1 b1 c1 ... a7 b7 c7
 * (the vst3_u8 layout).
 *
 * Declared static inline: a plain `inline` definition in C provides no
 * external definition, so a call the compiler chooses not to inline would
 * fail to link.
 */
static inline void v_store_interleave_rotate(unsigned char* ptr, const uint8x8_t a, const uint8x8_t b, const uint8x8_t c)
{
    uint8x8x3_t v;

    v.val[0] = a;
    v.val[1] = b;
    v.val[2] = c;
    vst3_u8(ptr, v);
}
#endif
/* Next section: RK3288-linux (hard-float) timing comparison, OpenCV 4.1 vs. NEON acceleration. */