是否有向零舍入的有符号右移 ARM NEON 指令?
Posted
技术标签:
【中文标题】是否有向零舍入的有符号右移 ARM NEON 指令?【英文标题】:Are there are ARM NEON instructions for signed right-shift that round toward zero? 【发布时间】:2018-02-07 14:50:36 【问题描述】:我正在尝试使用 ARM 内建函数(intrinsics)实现一个算法。
算法的其中一步需要对有符号整数做右移,但负值需要向上舍入(即结果的绝对值更小)。例如,右移 1 位时,-8 应得到 -4,而 -1 应得到 0。
换句话说,我希望负值向零舍入,而不是向下舍入:
/*
 * Reference definition of the wanted operation: shift `number` right by
 * `shift` bits, rounding the result toward zero for negative values too.
 *
 * The magnitude is shifted and the sign is re-applied afterwards, so
 * -1 >> 1 yields 0 instead of -1.
 *
 * Precondition: number != INT_MIN (abs() cannot represent its magnitude)
 * and shift is smaller than the bit width of int.
 */
int rightshift(int number, unsigned int shift)
{
    return ((number < 0) ? -1 : 1) * (abs(number) >> shift);
}
我找不到合适的函数来以 SIMD 方式执行此操作。有没有办法在一个函数调用中做到这一点,或者可以使用一些技巧?
【问题讨论】:
微处理器中通常没有这样的指令。 您是想让负数向上取整、正数向下取整吗?我想你无法用一条指令做到这一点。但我认为,您可以在移位前将number >> (bits - 1) & number >> (shift - 1) & 1
添加到数字中(其中bits
是sizeof number * CHAR_BIT
)。
你描述的操作并不是移位。这就是没有对应的底层指令的原因。
您是否使用了整数输入值的全部范围?
我正在尝试使用 ARM 的 int16x4_t 类型。
【参考方案1】:
我认为不存在能在移位的同时向零舍入的单条指令。
但是,您可以通过几条移位和掩码指令相当简单地做到这一点。我们需要做的是:如果起始值是负数,并且存在“移出位”(carry out,即被移出到结果右侧的位中有任何一位是 1),就给结果加一。
我可以用下面的纯 C 代码来证明这一点:
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
/*
 * Branch-free right shift that rounds toward zero.
 *
 * A plain arithmetic right shift rounds toward negative infinity. Adding
 * (2^shift - 1) to negative inputs before shifting makes every negative
 * value that is not an exact multiple of 2^shift round up instead.
 *
 * number: value to shift (intended for values in the int16_t range).
 * shift:  shift distance; must be small enough that 1 << shift does not
 *         overflow int.
 * Returns the shifted value converted to int16_t.
 *
 * Relies on >> of a negative int being an arithmetic shift, which is
 * implementation-defined in C.
 */
int16_t rightshift(int number, unsigned int shift)
{
    static const size_t bits = sizeof number * CHAR_BIT;

    /* number >> (bits - 1) is all ones for a negative number and zero
       otherwise, so the bias (1 << shift) - 1 is added to negatives only. */
    number += ((1 << shift) - 1) & (number >> (bits - 1));
    return number >> shift;
}
#include <stdio.h>
/*
 * Demo driver: for every i in [-16, 16] print one row containing
 * rightshift(i, j) for shift distances j = 0..3.
 */
int main()
{
    for (int i = -16; i <= 16; ++i) {
        printf(" %3d: ", i);
        for (int j = 0; j < 4; ++j)
            printf("%4d", rightshift(i, j));
        puts("");   /* end the row */
    }
}
这会编译成相当简洁的无分支(branch-free)汇编,看起来很适合内联(尤其是当 shift
是编译时常量时):
@ rightshift: compiler output for the C helper that shifts right while
@ rounding toward zero. As used below, r0 carries the value (and the
@ result) and r1 carries the shift distance. Straight-line code, no branches
@ apart from the return.
rightshift:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
@ r3 = (1 << r1) - 1: mask of the bits that the shift will discard
movs r3, #1
lsls r3, r3, r1
subs r3, r3, #1
@ r0 asr #31 is all ones when r0 is negative, zero otherwise, so the mask
@ survives only for negative inputs
and r3, r3, r0, asr #31
@ bias negative inputs so the arithmetic shift below rounds toward zero
add r0, r0, r3
asrs r0, r0, r1
bx lr
为了面向 Neon,我又写了一个函数,用多个数据来调用它:
/*
 * Apply rightshift() element-wise: dest[j] = rightshift(src[j], shift)
 * for every j in [0, count).
 *
 * dest and src are restrict-qualified, i.e. the caller promises the two
 * arrays do not overlap.
 */
void do_shift(int16_t *restrict dest, const int16_t *restrict src,
              size_t count, unsigned int shift)
{
    for (size_t j = 0; j < count; ++j)
        dest[j] = rightshift(src[j], shift);
}
还有一个测试程序:
#include <stdio.h>
/*
 * Test driver for do_shift(): runs it over a table of sample values for
 * every shift distance 0..15 and prints one row per input value, with one
 * column per shift distance.
 */
int main()
{
    static const int16_t src[] = {
        -32768, -32767, -32766, -32765, -32764,
        -16384, -16383, -16382, -16381, -16380,
        -8193, -8192, -8191, -8190, -8189,
        -16, -15, -14, -13, -12, -10, -9,
        -8, -7, -6, -5, -4, -3, -2, -1, 0,
        1, 2, 3, 4, 5, 6, 7, 8,
        9, 10, 11, 12, 13, 14, 15, 16,
        1023, 1024, 32767,
    };
    static const size_t count = sizeof src / sizeof *src;

    /* dest[s] holds the results for shift distance s */
    int16_t dest[16][count];
    for (unsigned int i = 0; i < 16; ++i)
        do_shift(dest[i], src, count, i);

    for (size_t i = 0; i < count; ++i) {
        printf("%7d: ", src[i]);
        for (int j = 0; j < 16; ++j)
            printf("%7d", dest[j][i]);
        puts("");   /* end the row */
    }
}
我用 gcc -O3 -march=armv7 -mfpu=neon
编译了这个。我承认我不熟悉 Neon 指令,但结果可能具有启发性:
@ do_shift: compiler-generated listing for the element-wise loop
@     dest[j] = rightshift(src[j], shift)      for j in [0, count)
@ Registers as used below: r0 = dest, r1 = src, r2 = count, r3 = shift.
@ Layout of the routine:
@   1. up to 7 leading elements handled one at a time,
@   2. a NEON loop (.L6) handling 8 halfwords per iteration,
@   3. a fully unrolled scalar tail (.L3) for up to 14 leftover elements.
@ Every scalar step has the same shape: ldrsh loads a sign-extended
@ halfword, "and ..., asr #31" keeps the bias mask r5 only for negative
@ values, add applies it, sxth truncates to 16 bits, asr shifts by r3 and
@ strh stores the result.
@ NOTE(review): the register-list braces ({...}) on push/pop/vst1 appear to
@ have been stripped from this listing - confirm against real compiler
@ output before assembling.
do_shift:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ count == 0: return immediately (.L21) without touching the stack
cmp r2, #0
beq .L21
push r4, r5, r6, r7, r8, lr
@ Setup (instructions are interleaved):
@   r4 = (-((src >> 1) & 3)) & 7   number of leading scalar elements
@        (presumably chosen to align src for the vector loads - TODO confirm)
@   r5 = sign-extended 16-bit value of (1 << shift) - 1, the bias mask
@   if (count - 1) < (r4 + 7), unsigned: too few elements for the vector
@   loop, handle everything in the scalar tail via .L8
ubfx r4, r1, #1, #2
negs r4, r4
movs r5, #1
and r4, r4, #7
lsls r5, r5, r3
adds r7, r4, #7
subs r6, r2, #1
subs r5, r5, #1
cmp r6, r7
sxth r5, r5
bcc .L8
@ no leading elements: go to the vector setup through .L9
cmp r4, #0
beq .L9
@ leading element 0; the cmp result is consumed by the beq after the store
ldrsh r7, [r1]
cmp r4, #1
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0] @ movhi
beq .L9
@ leading element 1
ldrsh r7, [r1, #2]
cmp r4, #2
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #2] @ movhi
beq .L9
@ leading element 2
ldrsh r7, [r1, #4]
cmp r4, #3
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #4] @ movhi
beq .L9
@ leading element 3
ldrsh r7, [r1, #6]
cmp r4, #4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #6] @ movhi
beq .L9
@ leading element 4
ldrsh r7, [r1, #8]
cmp r4, #5
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #8] @ movhi
beq .L9
@ leading element 5, plus element 6 only when r4 == 7 (the "eq" IT blocks);
@ r8 = 7 if r4 == 7, otherwise 6: the number of leading elements completed
ldrsh r7, [r1, #10]
cmp r4, #7
ite eq
moveq r8, r4
movne r8, #6
and r6, r5, r7, asr #31
add r6, r6, r7
it eq
ldrsheq r7, [r1, #12]
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #10] @ movhi
itttt eq
andeq r6, r5, r7, asr #31
addeq r6, r6, r7
sxtheq r6, r6
asreq r6, r6, r3
it eq
strheq r6, [r0, #12] @ movhi
@ Vector setup:
@   q10 = -shift in every 32-bit lane (vdup.32 + vneg.s32); the loop shifts
@         with vshl by this register (per the ARM documentation for VSHL
@         (register), a negative count shifts right - confirm there)
@   q12 = bias mask r5 in every 16-bit lane
@   lr  = count - r4, elements still to process
@   ip  = lr >> 3, number of 8-element iterations
@   r6  = src + 2*r4 (read pointer), r4 = dest + 2*r4 (write pointer)
@   r7  = iteration counter, starts at 0
.L4:
vdup.32 q10, r3
sub lr, r2, r4
lsls r4, r4, #1
movs r7, #0
vneg.s32 q10, q10
adds r6, r1, r4
lsr ip, lr, #3
add r4, r4, r0
vdup.16 q12, r5
@ Vector loop: 8 halfwords per iteration, repeated while r7 < ip
.L6:
adds r7, r7, #1
adds r6, r6, #16
@ q9 = 8 source halfwords (r6 was already advanced, hence the negative offsets)
vldr d18, [r6, #-16]
vldr d19, [r6, #-8]
@ flags set here are consumed by the bcc at the bottom of the loop
cmp r7, ip
@ q8 = per-lane sign (all ones for negative lanes), masked with the bias,
@ then added to the input
vshr.s16 q8, q9, #15
vand q8, q8, q12
vadd.i16 q8, q8, q9
@ widen both halves to 32 bits, shift by q10, narrow back to 16 bits
vmovl.s16 q9, d16
vmovl.s16 q8, d17
vshl.s32 q9, q9, q10
vshl.s32 q8, q8, q10
vmovn.i32 d22, q9
vmovn.i32 d23, q8
vst1.16 q11, [r4]
add r4, r4, #16
bcc .L6
@ r6 = lr rounded down to a multiple of 8 = elements done by the vector
@ loop; if that covers everything, return, otherwise r4 = r8 + r6 is the
@ index of the first element left for the scalar tail
bic r6, lr, #7
cmp lr, r6
add r4, r8, r6
beq .L1
@ Scalar tail: fully unrolled, starting at element index r4. Before each
@ store the next index is compared with count (r2); "bls .L1" leaves as
@ soon as count <= next index. Indices are scaled by 2 (lsl #1) to address
@ halfwords.
.L3:
ldrsh ip, [r1, r4, lsl #1]
adds r7, r4, #1
cmp r2, r7
and r6, r5, ip, asr #31
add r6, r6, ip
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r4, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #2
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #3
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #4
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #5
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #6
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #7
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #8
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #9
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #10
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #11
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #12
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh r7, [r1, ip, lsl #1]
adds r4, r4, #13
cmp r2, r4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
@ last unrolled step (index r4 + 13); r1, r2 and r3 are reused as scratch
@ because nothing reads the arguments after this point
ldrsh r1, [r1, r4, lsl #1]
and r2, r5, r1, asr #31
add r2, r2, r1
sxth r2, r2
asr r3, r2, r3
strh r3, [r0, r4, lsl #1] @ movhi
@ common exit: restore the saved registers and return by popping into pc
.L1:
pop r4, r5, r6, r7, r8, pc
@ leading elements finished early: record how many were done (r8 = r4)
@ and continue with the vector setup
.L9:
mov r8, r4
b .L4
@ count == 0: plain return, nothing was pushed
.L21:
bx lr
@ too few elements for the vector loop: run the scalar tail from index 0
.L8:
movs r4, #0
b .L3
有很多循环展开使代码变长,但模式应该清晰。
【讨论】:
以上是关于是否有 ARM NEON 指令用于该轮向零的有符号右移?的主要内容,如果未能解决你的问题,请参考以下文章