使用 ARM NEON 执行比 C 代码需要更长的时间
Posted
技术标签:
【中文标题】使用 ARM NEON 执行比 C 代码需要更长的时间【英文标题】:execution with ARM NEON takes longer than C code 【发布时间】:2013-09-09 15:10:32 【问题描述】:我将 Brisk 函数(用于调整图像大小)从 SSE 内部函数转换为 ARM NEON 内部函数,以便在 ARM 架构上执行它。如果支持,Brisk 使用 SSE 函数,否则使用 opencv 函数。 SSE 当然更快。 我逐步转换了 ARM neon 中的 SSE 函数,但是当我测量与 openCV resize 函数相比的执行时间时,结果是我的函数更慢(0.2ms vs 0.4ms)。代码如下:
SSE 版本:
// Half-samples srcimg into dstimg using SSE2 intrinsics: every destination
// byte is produced by averaging a vertical pair of source bytes and then the
// two horizontally adjacent averages (i.e. one 2x2 source block).
//
// srcimg: source image, read through srcimg.data as raw bytes.
// dstimg: destination image; must already have cols/2 x rows/2 size (asserted).
//
// NOTE(review): the pointer arithmetic below (data + cols, data + row*cols)
// assumes one byte per pixel and a row stride equal to cols, i.e. a
// continuous single-channel 8-bit cv::Mat — TODO confirm against callers.
inline void BriskLayer::halfsample(const cv::Mat& srcimg, cv::Mat& dstimg)
// number of destination bytes left over after the 16-byte-wide SIMD blocks:
const unsigned short leftoverCols = ((srcimg.cols%16)/2);// take care with border...
const bool noleftover = (srcimg.cols%16)==0; // note: leftoverCols can be zero but this still false...
// make sure the destination image is of the right size:
assert(srcimg.cols/2==dstimg.cols);
assert(srcimg.rows/2==dstimg.rows);
// mask needed later (keeps the low byte of every 16-bit lane):
register __m128i mask = _mm_set_epi32 (0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
// to be added in order to make successive averaging correct:
// NOTE(review): every byte of this constant is 0x11, not 0x01, and the sums
// computed with it below are overwritten before being used — confirm intent.
register __m128i ones = _mm_set_epi32 (0x11111111, 0x11111111, 0x11111111, 0x11111111);
// data pointers (p1 = upper source row, p2 = the row directly below it):
__m128i* p1=(__m128i*)srcimg.data;
__m128i* p2=(__m128i*)(srcimg.data+srcimg.cols);
__m128i* p_dest=(__m128i*)dstimg.data;
unsigned char* p_dest_char;//=(unsigned char*)p_dest;
// size (both counted in 16-byte blocks; hsize is blocks per source row):
const unsigned int size = (srcimg.cols*srcimg.rows)/16;
const unsigned int hsize = srcimg.cols/16;
__m128i* p_end=p1+size;
unsigned int row=0;
// number of block pairs per row; each pair yields one 16-byte destination store:
const unsigned int end=hsize/2;
// true when a row has an odd number of 16-byte blocks, leaving one unpaired:
bool half_end;
if(hsize%2==0)
half_end=false;
else
half_end=true;
// each pass of this loop consumes two source rows and writes one destination row:
while(p2<p_end)
for(unsigned int i=0; i<end;i++)
// load the two blocks of memory:
__m128i upper;
__m128i lower;
// aligned loads are only used when the width is a multiple of 16:
if(noleftover)
upper=_mm_load_si128(p1);
lower=_mm_load_si128(p2);
else
upper=_mm_loadu_si128(p1);
lower=_mm_loadu_si128(p2);
// NOTE(review): the saturating add is immediately overwritten by the
// average on the next line, so it does not affect the result.
__m128i result1=_mm_adds_epu8 (upper, ones);
result1=_mm_avg_epu8 (upper, lower);
// increment the pointers:
p1++;
p2++;
// load the two blocks of memory:
upper=_mm_loadu_si128(p1);
lower=_mm_loadu_si128(p2);
// NOTE(review): same dead store as for result1 above.
__m128i result2=_mm_adds_epu8 (upper, ones);
result2=_mm_avg_epu8 (upper, lower);
// calculate the shifted versions (one byte down, so the odd-indexed bytes
// land in the positions kept by the mask):
__m128i result1_shifted = _mm_srli_si128 (result1, 1);
__m128i result2_shifted = _mm_srli_si128 (result2, 1);
// pack (even-indexed bytes into result, odd-indexed bytes into result_shifted):
__m128i result=_mm_packus_epi16 (_mm_and_si128 (result1, mask),
_mm_and_si128 (result2, mask));
__m128i result_shifted = _mm_packus_epi16 (_mm_and_si128 (result1_shifted, mask),
_mm_and_si128 (result2_shifted, mask));
// average for the second time (horizontal neighbours):
result=_mm_avg_epu8(result,result_shifted);
// store to memory
_mm_storeu_si128 (p_dest, result);
// increment the pointers:
p1++;
p2++;
p_dest++;
//p_dest_char=(unsigned char*)p_dest;
// if we are not at the end of the row, do the rest:
if(half_end)
// load the two blocks of memory:
__m128i upper;
__m128i lower;
if(noleftover)
upper=_mm_load_si128(p1);
lower=_mm_load_si128(p2);
else
upper=_mm_loadu_si128(p1);
lower=_mm_loadu_si128(p2);
// NOTE(review): same dead store as in the main loop.
__m128i result1=_mm_adds_epu8 (upper, ones);
result1=_mm_avg_epu8 (upper, lower);
// increment the pointers:
p1++;
p2++;
// compute horizontal pairwise average and store
// (scalar: 16 vertically averaged bytes -> 8 destination bytes)
p_dest_char=(unsigned char*)p_dest;
const UCHAR_ALIAS* result=(UCHAR_ALIAS*)&result1;
for(unsigned int j=0; j<8; j++)
*(p_dest_char++)=(*(result+2*j)+*(result+2*j+1))/2;
//p_dest_char=(unsigned char*)p_dest;
else
p_dest_char=(unsigned char*)p_dest;
if(noleftover)
// width is a multiple of 16: just step to the next pair of source rows
row++;
p_dest=(__m128i*)(dstimg.data+row*dstimg.cols);
p1=(__m128i*)(srcimg.data+2*row*srcimg.cols);
//p2=(__m128i*)(srcimg.data+(2*row+1)*srcimg.cols);
//p1+=hsize;
// hsize blocks of 16 bytes == cols bytes here, so p2 is the next row:
p2=p1+hsize;
else
// scalar tail for the columns that did not fill a 16-byte block
const unsigned char* p1_src_char=(unsigned char*)(p1);
const unsigned char* p2_src_char=(unsigned char*)(p2);
// NOTE(review): the source index advances by k, not 2*k, so consecutive
// outputs read overlapping source bytes — confirm this is intended.
for(unsigned int k=0; k<leftoverCols; k++)
unsigned short tmp = p1_src_char[k]+p1_src_char[k+1]+
p2_src_char[k]+p2_src_char[k+1];
*(p_dest_char++)=(unsigned char)(tmp/4);
// done with the two rows:
row++;
p_dest=(__m128i*)(dstimg.data+row*dstimg.cols);
p1=(__m128i*)(srcimg.data+2*row*srcimg.cols);
p2=(__m128i*)(srcimg.data+(2*row+1)*srcimg.cols);
ARM NEON 版本:
// Half-samples srcimg into dstimg using ARM NEON intrinsics: every destination
// byte is produced by averaging a vertical pair of source bytes and then the
// two horizontally adjacent averages (i.e. one 2x2 source block).
//
// srcimg: source image, read through srcimg.data as raw bytes.
// dstimg: destination image; the size checks are commented out here, so the
//         caller must supply a cols/2 x rows/2 buffer.
//
// NOTE(review): the pointer arithmetic below (data + cols, data + row*cols)
// assumes one byte per pixel and a row stride equal to cols — TODO confirm.
// NOTE(review): int32x4_t values are passed to u8/u16/u32 intrinsics
// throughout; presumably this only compiles with lax vector conversions
// enabled — confirm the build flags.
void halfsample(const cv::Mat& srcimg, cv::Mat& dstimg)
// number of destination bytes left over after the 16-byte-wide SIMD blocks:
const unsigned short leftoverCols = ((srcimg.cols%16)/2);// take care with border...
const bool noleftover = (srcimg.cols%16)==0; // note: leftoverCols can be zero but this still false...
// make sure the destination image is of the right size:
//assert(srcimg.cols/2==dstimg.cols);
//assert(srcimg.rows/2==dstimg.rows);
//int32x4_t zero = vdupq_n_s8(0);
// mask needed later (keeps the low byte of every 16-bit lane):
//register __m128i mask = _mm_set_epi32 (0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
int32x4_t mask = vdupq_n_s32(0x00FF00FF);
// to be added in order to make successive averaging correct:
// NOTE(review): the sums computed with this constant are overwritten before
// being used — confirm intent.
int32x4_t ones = vdupq_n_s32(0x11111111);
// NOTE(review): debug output executed on every call; it is part of any
// timing measurement taken around this function.
print128_numhex(mask);
// data pointers (p1 = upper source row, p2 = the row directly below it);
// they are int32_t*, so advancing by 4 elements moves 16 bytes:
int32_t* p1=(int32_t*)srcimg.data;
int32_t* p2=(int32_t*)(srcimg.data+srcimg.cols);
int32_t* p_dest=(int32_t*)dstimg.data;
unsigned char* p_dest_char;//=(unsigned char*)p_dest;
// NOTE(review): this counter is only incremented, never read, in this
// function; the leftover loop further down declares its own k.
int k=0;
// size (both counted in 16-byte blocks; hsize is blocks per source row):
const unsigned int size = (srcimg.cols*srcimg.rows)/16;
const unsigned int hsize = srcimg.cols/16;
// size is in 16-byte blocks, hence *4 to convert to int32_t elements:
int32_t* p_end=p1+size*4;
unsigned int row=0;
// number of block pairs per row; each pair yields one 16-byte destination store:
const unsigned int end=hsize/2;
// true when a row has an odd number of 16-byte blocks, leaving one unpaired:
bool half_end;
if(hsize%2==0)
half_end=false;
else
half_end=true;
// each pass of this loop consumes two source rows and writes one destination row:
while(p2<p_end)
k++;
for(unsigned int i=0; i<end;i++)
// load the two blocks of memory:
int32x4_t upper;
int32x4_t lower;
// NOTE(review): both branches perform exactly the same loads.
if(noleftover)
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
else
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
// NOTE(review): the add is immediately overwritten by the rounding
// halving add on the next line, so it does not affect the result.
int32x4_t result1=vaddq_s32(upper, ones);
result1=vrhaddq_u8(upper, lower);
// increment the pointers:
p1=p1+4;
p2=p2+4;
// load the two blocks of memory:
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
// NOTE(review): same dead store as for result1 above.
int32x4_t result2=vaddq_s32(upper, ones);
result2=vrhaddq_u8(upper, lower);
// calculate the shifted versions (bytes moved one position down with a
// zero byte shifted in at the top):
int32x4_t result1_shifted = vextq_u8(result1,vmovq_n_u8(0),1);
int32x4_t result2_shifted = vextq_u8(result2,vmovq_n_u8(0),1);
// pack (even-indexed bytes into result, odd-indexed bytes into result_shifted):
int32x4_t result= vcombine_u8(vqmovn_u16(vandq_u32(result1, mask)),
vqmovn_u16(vandq_u32 (result2, mask)));
int32x4_t result_shifted = vcombine_u8(vqmovn_u16(vandq_u32 (result1_shifted, mask)),
vqmovn_u16(vandq_u32(result2_shifted, mask)));
// average for the second time (horizontal neighbours):
result=vrhaddq_u8(result,result_shifted);
// store to memory
vst1q_s32(p_dest, result);
// increment the pointers:
p1=p1+4;
p2=p2+4;
p_dest=p_dest+4;
//p_dest_char=(unsigned char*)p_dest;
// if we are not at the end of the row, do the rest:
if(half_end)
// NOTE(review): console output with a flush (std::endl) once per processed
// row pair whenever half_end is true; this is inside the timed path.
std::cout<<"entra in half_end" << std::endl;
// load the two blocks of memory:
int32x4_t upper;
int32x4_t lower;
// NOTE(review): both branches perform exactly the same loads.
if(noleftover)
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
else
upper=vld1q_s32(p1);
lower=vld1q_s32(p2);
// NOTE(review): dead store again; this one uses the saturating vqaddq_s32
// whereas the main loop uses vaddq_s32.
int32x4_t result1=vqaddq_s32(upper, ones);
result1=vrhaddq_u8(upper, lower);
// increment the pointers:
p1=p1+4;
p2=p2+4;
// compute horizontal pairwise average and store
// (scalar: 16 vertically averaged bytes -> 8 destination bytes)
p_dest_char=(unsigned char*)p_dest;
const UCHAR_ALIAS* result=(UCHAR_ALIAS*)&result1;
for(unsigned int j=0; j<8; j++)
*(p_dest_char++)=(*(result+2*j)+*(result+2*j+1))/2;
//p_dest_char=(unsigned char*)p_dest;
else
p_dest_char=(unsigned char*)p_dest;
if(noleftover)
// width is a multiple of 16: just step to the next pair of source rows
row++;
p_dest=(int32_t*)(dstimg.data+row*dstimg.cols);
p1=(int32_t*)(srcimg.data+2*row*srcimg.cols);
//p2=(__m128i*)(srcimg.data+(2*row+1)*srcimg.cols);
//p1+=hsize;
// hsize*4 int32_t elements == cols bytes here, so p2 is the next row:
p2=p1+hsize*4;
else
// scalar tail for the columns that did not fill a 16-byte block
const unsigned char* p1_src_char=(unsigned char*)(p1);
const unsigned char* p2_src_char=(unsigned char*)(p2);
// NOTE(review): the source index advances by k, not 2*k, so consecutive
// outputs read overlapping source bytes — confirm this is intended.
for(unsigned int k=0; k<leftoverCols; k++)
unsigned short tmp = p1_src_char[k]+p1_src_char[k+1]+
p2_src_char[k]+p2_src_char[k+1];
*(p_dest_char++)=(unsigned char)(tmp/4);
// done with the two rows:
row++;
p_dest=(int32_t*)(dstimg.data+row*dstimg.cols);
p1=(int32_t*)(srcimg.data+2*row*srcimg.cols);
p2=(int32_t*)(srcimg.data+(2*row+1)*srcimg.cols);
ARM 和 SSE 函数的输出完全相同。问题是执行时间。
【问题讨论】:
您应该从改进代码开始。第一个 noleftover 部分的 if 和 else 分支完全相同。您给 result1 赋值了两次,却没有使用第一次的结果。不要使用 short 类型,因为 ARM 的字长是 32 位。这些问题大多可能会被编译器优化掉,但无论如何您都应该先清理它们,然后使用像 DS-5 Streamline 这样的分析器——这应该会让您的任务更容易。它提供社区版/免费版。
@auselen 是的,在剩余部分中,我复制了 2 次代码。我对其进行了编辑。关于结果1,我完全按照原始代码中的方式进行操作,但我仍然不清楚为什么要完成该操作,我只是按原样转换它。我会按照你说的那样尝试那个分析器。
您的代码无法编译。所以我假设你被一些 ifdefs 愚弄了。
@auselen 我确定它可以编译。我使用这些参数 -mfloat-abi=softfp -mfpu=neon -flax-vector-conversions。使用 DS-5 Streamline 进行性能分析(profiling)的问题是我使用的是 beagleboard(没有图形界面),我不知道我是否可以在上面使用 DS-5。
您可以在beaglebone 上捕获数据并在PC 主机上进行检查,非常简单。
【参考方案1】:
您应该意识到,无论是内在函数还是内联汇编代码都不能像原生汇编中的手写代码那样“完美无缺”。
更糟糕的是,有时编译器(尤其是像 GCC 之类的开源编译器)会添加一些不必要的指令,这些指令会导致流水线停顿,这会花费超过十个周期。当这发生在最里面的循环中时,这对性能来说是直接致命的。
为什么不发布代码的反汇编?遇到内在函数(intrinsics)问题的人应该总是先查看反汇编。(并尽快停止使用内在函数)
【讨论】:
这不是答案。 当 OP 发布他的反汇编时,我会给出答案 :) 我如何获得反汇编(这是我第一次使用NEON)?以上是关于使用 ARM NEON 执行比 C 代码需要更长的时间的主要内容,如果未能解决你的问题,请参考以下文章
为啥我使用 openMP atomic 的并行代码比串行代码花费更长的时间?