ARM NEON 8x8 SAD 操作
Posted
技术标签:
【中文标题】ARM NEON 8x8 SAD 操作【英文标题】:ARM NEON 8x8 SAD operation 【发布时间】:2016-09-19 09:22:48 【问题描述】:我对 ARM 和 NEON 编程相当陌生,我被赋予了优化 SAD(绝对差和)函数的任务。我不知道从哪里开始,我尝试了几种生成 NEON 代码的方法,但都没有成功。 顺序函数看起来像这样:
/**
 * Scalar reference implementation: sum of absolute differences (SAD)
 * between two 8x8 blocks of bytes.
 *
 * @param block1  First byte of the first block (read only).
 * @param block2  First byte of the second block (read only).
 * @param stride  Offset, in bytes, between the starts of consecutive rows;
 *                the same stride is applied to both blocks.
 * @param result  Receives the sum of |block2 - block1| over all 64 positions.
 */
void sad_block_8x8(const uint8_t *block1, const uint8_t *block2, int stride, int *result)
{
    /* Accumulate locally so the output is written exactly once. */
    int sum = 0;

    for (int v = 0; v < 8; ++v) {
        const uint8_t *row1 = &block1[v * stride];
        const uint8_t *row2 = &block2[v * stride];

        for (int u = 0; u < 8; ++u) {
            /* uint8_t operands promote to int, so the difference may be negative. */
            sum += abs(row2[u] - row1[u]);
        }
    }

    *result = sum;
}
所以我的问题是:
- 如何在每次迭代中加载寄存器?
- 如何进行计算并将结果存储到变量 result 中?
任何帮助将不胜感激!
好的……所以我的第一次尝试是这样的(它可以运行,但我知道这是一段非常糟糕的 NEON 代码):
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

/**
 * Sum of absolute differences (SAD) between two 8x8 blocks of bytes.
 *
 * When NEON is available each 8-byte row is handled with one vector load per
 * block and one widening absolute-difference-accumulate; otherwise a plain
 * scalar loop computes the same value.
 *
 * @param block1  First byte of the first block (read only).
 * @param block2  First byte of the second block (read only).
 * @param stride  Offset, in bytes, between the starts of consecutive rows;
 *                the same stride is applied to both blocks.
 * @param result  Receives the sum of |block2 - block1| over all 64 positions.
 */
void sad_block_8x8_2(const uint8_t *block1, const uint8_t *block2, int stride, int *result)
{
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Eight 16-bit column sums; each lane holds at most 8 * 255 = 2040. */
    uint16x8_t acc = vdupq_n_u16(0);

    for (int v = 0; v < 8; ++v) {
        const uint8x8_t row1 = vld1_u8(&block1[v * stride]);
        const uint8x8_t row2 = vld1_u8(&block2[v * stride]);

        /* acc += |row2 - row1|, widened to 16 bits per lane. */
        acc = vabal_u8(acc, row2, row1);
    }

    /* Single horizontal reduction: 8 x u16 -> 4 x u32 -> 2 x u64 -> scalar. */
    const uint32x4_t sum32 = vpaddlq_u16(acc);
    const uint64x2_t sum64 = vpaddlq_u32(sum32);
    *result = (int)(vgetq_lane_u64(sum64, 0) + vgetq_lane_u64(sum64, 1));
#else
    int sum = 0;

    for (int v = 0; v < 8; ++v) {
        for (int u = 0; u < 8; ++u) {
            const int diff = block2[v * stride + u] - block1[v * stride + u];
            sum += (diff < 0) ? -diff : diff;
        }
    }

    *result = sum;
#endif
}
请帮忙?
【问题讨论】:
输入数据块应该声明为 const。您是否尝试过支持 NEON 的编译器?它可能会为您自动矢量化。
要获得 8x8 到 8x1,您可以在 arm_neon.h 中使用 uint16x8_t vabal_u8 (uint16x8_t, uint8x8_t, uint8x8_t)
【参考方案1】:
这是您想要的 SAD 算法的更好更清晰的实现:
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

/**
 * Sum of absolute differences (SAD) between two 8x8 blocks of bytes.
 *
 * The pointers are declared __restrict: the caller promises that the two
 * input blocks and the output location do not overlap.
 *
 * @param block1  First byte of the first block (read only).
 * @param block2  First byte of the second block (read only).
 * @param stride  Offset, in bytes, between the starts of consecutive rows;
 *                the same stride is applied to both blocks.
 * @param result  Receives the sum of |block2 - block1| over all 64 positions.
 */
void neon_sad_block_8x8(const uint8_t *__restrict block1, const uint8_t *__restrict block2, int stride, int *__restrict result)
{
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Per-column running sums, widened to 16 bits (max 8 * 255 per lane). */
    uint16x8_t column_sums = vdupq_n_u16(0);

    for (int i = 0; i < 8; i++) {
        const uint8x8_t neon_block1 = vld1_u8(&block1[i * stride]);
        const uint8x8_t neon_block2 = vld1_u8(&block2[i * stride]);

        /* Absolute difference and accumulate in one intrinsic. */
        column_sums = vabal_u8(column_sums, neon_block2, neon_block1);
    }

    /*
     * Reduce once after the loop with pairwise widening adds instead of
     * subscripting vector lanes, which is a compiler extension.
     */
    const uint32x4_t sum32 = vpaddlq_u16(column_sums);
    const uint64x2_t sum64 = vpaddlq_u32(sum32);
    *result = (int)(vgetq_lane_u64(sum64, 0) + vgetq_lane_u64(sum64, 1));
#else
    /* Scalar equivalent for builds without NEON. */
    int sum = 0;

    for (int i = 0; i < 8; i++) {
        for (int k = 0; k < 8; k++) {
            const int diff = block2[i * stride + k] - block1[i * stride + k];
            sum += (diff < 0) ? -diff : diff;
        }
    }

    *result = sum;
#endif
}
这段代码有:
只有一个循环 循环中没有 break 语句 指针由 __restrict 保护【讨论】:
你也可以考虑使用像 uint8x8x4_t 这样的向量数据类型,它被定义为一个包含 uint8x8_t 数组的简单结构体,例如: typedef struct int8x8x4_t { int8x8_t val[4]; } int8x8x4_t; 如果您在 64 位架构上运行代码,您还可以使用 uint8_t vaddvq_u8() 函数,用一条指令将向量中的所有元素相加。【参考方案2】:我们可以去掉这两层循环,让各行的计算并行执行。请注意,我一次性完成所有加载,以消除任何延迟或依赖性。
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

/**
 * Sum of absolute differences (SAD) between a source block and a
 * prediction block of bytes.
 *
 * 8x8 blocks take a NEON fast path when NEON is available. Every other
 * block size (and every build without NEON) uses a scalar loop, so the
 * function always returns a defined value.
 *
 * @param sDPointer      First byte of the source block (only read).
 * @param source_stride  Offset, in bytes, between consecutive source rows.
 * @param pDPointer      First byte of the prediction block (only read).
 * @param pred_stride    Offset, in bytes, between consecutive prediction rows.
 * @param w_block        Block width in bytes.
 * @param h_block        Block height in rows.
 * @return Sum of |source - prediction| over the w_block x h_block area;
 *         0 when either dimension is not positive.
 */
unsigned int sadCalculator_Neon_not_basic(void *sDPointer, int source_stride, void *pDPointer, int pred_stride, int w_block, int h_block)
{
    const uint8_t *src = (const uint8_t *)sDPointer;
    const uint8_t *pred = (const uint8_t *)pDPointer;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    if (w_block == 8 && h_block == 8) {
        /* Eight 16-bit column sums; each lane holds at most 8 * 255 = 2040. */
        uint16x8_t acc = vdupq_n_u16(0);

        for (int row = 0; row < 8; ++row) {
            const uint8x8_t s = vld1_u8(src + row * source_stride);
            const uint8x8_t p = vld1_u8(pred + row * pred_stride);

            /* acc += |s - p|, widened to 16 bits per lane. */
            acc = vabal_u8(acc, s, p);
        }

        /*
         * One horizontal reduction for the whole block. Folding the two
         * halves stays within 16 bits (max 4080 per lane); the pairwise
         * widening adds then collapse 4 x u16 -> 2 x u32 -> 1 x u64.
         */
        const uint16x4_t folded = vadd_u16(vget_low_u16(acc), vget_high_u16(acc));
        const uint32x2_t sum32 = vpaddl_u16(folded);
        const uint64x1_t sum64 = vpaddl_u32(sum32);
        return (unsigned int)vget_lane_u64(sum64, 0);
    }
#endif

    /* General path: any block size, no SIMD requirement. */
    unsigned int sad = 0;

    for (int row = 0; row < h_block; ++row) {
        const uint8_t *s = src + row * source_stride;
        const uint8_t *p = pred + row * pred_stride;

        for (int col = 0; col < w_block; ++col) {
            sad += (s[col] > p[col]) ? (unsigned int)(s[col] - p[col])
                                     : (unsigned int)(p[col] - s[col]);
        }
    }

    return sad;
}
【讨论】:
以上是关于ARM NEON 8x8 SAD 操作的主要内容,如果未能解决你的问题,请参考以下文章