在 g++4.4.7 中为复数运算生成快速汇编

Posted 2023-02-22

技术标签:

【中文标题】在 g++4.4.7 中为复数运算生成快速汇编【英文标题】：Generating fast assembly for complex arithmetic in g++4.4.7 【发布时间】：2017-07-21 14:38:19 【问题描述】：

我有一个非常简单的函数：

__attribute__((noinline))
void benchmark(cfloat* __restrict__ aa, cfloat* __restrict__ bb, cfloat* __restrict__ cc, cfloat* __restrict__ dd, cfloat uu, cfloat vv, size_t nn) 
    for (ssize_t ii=0; ii < nn; ii++) 
        dd[ii] = (
            aa[ii]*uu +
            bb[ii]*vv +
            cc[ii]
        );

根据我定义 cfloat 对象的方式不同，g++4.4.7 会生成差别很大的汇编代码。

第一个版本，如果我这样定义我的 cfloat：

struct cfloat 
    cfloat(float re, float im) : re(re), im(im) 
    float re,im;
;

cfloat operator +(cfloat a, cfloat b) 
    return cfloat(a.re+b.re, a.im+b.im);


cfloat operator *(cfloat a, cfloat b) 
    return cfloat(a.re*b.re-a.im*b.im, a.re*b.im+a.im*b.re);

为基准函数生成的汇编如下（使用 g++ testcx.cc -O3 -o testcx 编译）：

   0x00000000004006a0 <+0>: push   %r15
   0x00000000004006a2 <+2>: test   %r8,%r8
   0x00000000004006a5 <+5>: push   %r14
   0x00000000004006a7 <+7>: push   %r13
   0x00000000004006a9 <+9>: push   %r12
   0x00000000004006ab <+11>:    push   %rbp
   0x00000000004006ac <+12>:    push   %rbx
   0x00000000004006ad <+13>:    movq   %xmm0,-0x28(%rsp)
   0x00000000004006b3 <+19>:    mov    %rdi,-0x38(%rsp)
   0x00000000004006b8 <+24>:    mov    -0x28(%rsp),%rax
   0x00000000004006bd <+29>:    movq   %xmm1,-0x28(%rsp)
   0x00000000004006c3 <+35>:    mov    -0x28(%rsp),%r9
   0x00000000004006c8 <+40>:    je     0x4008a0 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+512>
   0x00000000004006ce <+46>:    mov    %r9,%r15
   0x00000000004006d1 <+49>:    mov    %rax,%r14
   0x00000000004006d4 <+52>:    xor    %r11d,%r11d
   0x00000000004006d7 <+55>:    shr    $0x20,%r15
   0x00000000004006db <+59>:    shr    $0x20,%r14
   0x00000000004006df <+63>:    xor    %r10d,%r10d
   0x00000000004006e2 <+66>:    mov    %r15d,-0x2c(%rsp)
   0x00000000004006e7 <+71>:    xor    %ebp,%ebp
   0x00000000004006e9 <+73>:    xor    %ebx,%ebx
   0x00000000004006eb <+75>:    movss  -0x2c(%rsp),%xmm6
   0x00000000004006f1 <+81>:    mov    %r9d,-0x2c(%rsp)
   0x00000000004006f6 <+86>:    movss  -0x2c(%rsp),%xmm5
   0x00000000004006fc <+92>:    mov    %r14d,-0x2c(%rsp)
   0x0000000000400701 <+97>:    movss  -0x2c(%rsp),%xmm4
   0x0000000000400707 <+103>:   mov    %eax,-0x2c(%rsp)
   0x000000000040070b <+107>:   xor    %r13d,%r13d
   0x000000000040070e <+110>:   xor    %r12d,%r12d
   0x0000000000400711 <+113>:   movabs $0xffffffff00000000,%r9
   0x000000000040071b <+123>:   movss  -0x2c(%rsp),%xmm3
   0x0000000000400721 <+129>:   nopl   0x0(%rax)
   0x0000000000400728 <+136>:   lea    0x0(,%r13,8),%rax
   0x0000000000400730 <+144>:   movaps %xmm6,%xmm1
   0x0000000000400733 <+147>:   movaps %xmm5,%xmm7
   0x0000000000400736 <+150>:   and    $0xffffffff,%ebp
   0x0000000000400739 <+153>:   lea    (%rsi,%rax,1),%r15
   0x000000000040073d <+157>:   lea    (%rdx,%rax,1),%r14
   0x0000000000400741 <+161>:   add    -0x38(%rsp),%rax
   0x0000000000400746 <+166>:   and    $0xffffffff,%ebx
   0x0000000000400749 <+169>:   add    $0x1,%r12
   0x000000000040074d <+173>:   movss  (%r15),%xmm0
   0x0000000000400752 <+178>:   movss  0x4(%r15),%xmm2
   0x0000000000400758 <+184>:   mulss  %xmm0,%xmm1
   0x000000000040075c <+188>:   mulss  %xmm2,%xmm7
   0x0000000000400760 <+192>:   mulss  %xmm5,%xmm0
   0x0000000000400764 <+196>:   mulss  %xmm6,%xmm2
   0x0000000000400768 <+200>:   addss  %xmm7,%xmm1
   0x000000000040076c <+204>:   movaps %xmm3,%xmm7
   0x000000000040076f <+207>:   subss  %xmm2,%xmm0
   0x0000000000400773 <+211>:   movd   %xmm1,-0x30(%rsp)
   0x0000000000400779 <+217>:   mov    -0x30(%rsp),%edi
   0x000000000040077d <+221>:   movaps %xmm4,%xmm1
   0x0000000000400780 <+224>:   movd   %xmm0,-0x30(%rsp)
   0x0000000000400786 <+230>:   mov    %edi,%r15d
   0x0000000000400789 <+233>:   mov    -0x30(%rsp),%edi
   0x000000000040078d <+237>:   movss  (%rax),%xmm0
   0x0000000000400791 <+241>:   shl    $0x20,%r15
   0x0000000000400795 <+245>:   movss  0x4(%rax),%xmm2
   0x000000000040079a <+250>:   mulss  %xmm0,%xmm1
   0x000000000040079e <+254>:   or     %r15,%rbp
   0x00000000004007a1 <+257>:   mulss  %xmm2,%xmm7
   0x00000000004007a5 <+261>:   mov    %edi,%r15d
   0x00000000004007a8 <+264>:   and    %r9,%rbp
   0x00000000004007ab <+267>:   mulss  %xmm3,%xmm0
   0x00000000004007af <+271>:   or     %r15,%rbp
   0x00000000004007b2 <+274>:   mulss  %xmm4,%xmm2
   0x00000000004007b6 <+278>:   addss  %xmm7,%xmm1
   0x00000000004007ba <+282>:   subss  %xmm2,%xmm0
   0x00000000004007be <+286>:   movd   %xmm1,-0x30(%rsp)
   0x00000000004007c4 <+292>:   mov    -0x30(%rsp),%edi
   0x00000000004007c8 <+296>:   movd   %xmm0,-0x30(%rsp)
   0x00000000004007ce <+302>:   mov    %edi,%eax
   0x00000000004007d0 <+304>:   mov    -0x30(%rsp),%edi
   0x00000000004007d4 <+308>:   shl    $0x20,%rax
   0x00000000004007d8 <+312>:   or     %rax,%rbx
   0x00000000004007db <+315>:   and    %r9,%rbx
   0x00000000004007de <+318>:   mov    %edi,%eax
   0x00000000004007e0 <+320>:   or     %rax,%rbx
   0x00000000004007e3 <+323>:   mov    %r10,%rax
   0x00000000004007e6 <+326>:   mov    %rbx,%rdi
   0x00000000004007e9 <+329>:   and    $0xffffffff,%eax
   0x00000000004007ec <+332>:   shr    $0x20,%rdi
   0x00000000004007f0 <+336>:   mov    %edi,-0x20(%rsp)
   0x00000000004007f4 <+340>:   mov    %rbp,%rdi
   0x00000000004007f7 <+343>:   shr    $0x20,%rdi
   0x00000000004007fb <+347>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400801 <+353>:   mov    %edi,-0x10(%rsp)
   0x0000000000400805 <+357>:   addss  -0x10(%rsp),%xmm0
   0x000000000040080b <+363>:   mov    %ebp,-0x10(%rsp)
   0x000000000040080f <+367>:   movss  %xmm0,-0x20(%rsp)
   0x0000000000400815 <+373>:   mov    -0x20(%rsp),%r10d
   0x000000000040081a <+378>:   mov    %ebx,-0x20(%rsp)
   0x000000000040081e <+382>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400824 <+388>:   addss  -0x10(%rsp),%xmm0
   0x000000000040082a <+394>:   shl    $0x20,%r10
   0x000000000040082e <+398>:   or     %rax,%r10
   0x0000000000400831 <+401>:   and    %r9,%r10
   0x0000000000400834 <+404>:   movss  %xmm0,-0x20(%rsp)
   0x000000000040083a <+410>:   mov    -0x20(%rsp),%eax
   0x000000000040083e <+414>:   or     %rax,%r10
   0x0000000000400841 <+417>:   mov    %r11,%rax
   0x0000000000400844 <+420>:   mov    %r10,%rdi
   0x0000000000400847 <+423>:   and    $0xffffffff,%eax
   0x000000000040084a <+426>:   shr    $0x20,%rdi
   0x000000000040084e <+430>:   mov    %edi,-0x20(%rsp)
   0x0000000000400852 <+434>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400858 <+440>:   addss  0x4(%r14),%xmm0
   0x000000000040085e <+446>:   movss  %xmm0,-0x20(%rsp)
   0x0000000000400864 <+452>:   mov    -0x20(%rsp),%r11d
   0x0000000000400869 <+457>:   mov    %r10d,-0x20(%rsp)
   0x000000000040086e <+462>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400874 <+468>:   addss  (%r14),%xmm0
   0x0000000000400879 <+473>:   shl    $0x20,%r11
   0x000000000040087d <+477>:   or     %rax,%r11
   0x0000000000400880 <+480>:   and    %r9,%r11
   0x0000000000400883 <+483>:   movss  %xmm0,-0x20(%rsp)
   0x0000000000400889 <+489>:   mov    -0x20(%rsp),%eax
   0x000000000040088d <+493>:   or     %rax,%r11
   0x0000000000400890 <+496>:   cmp    %r8,%r12
   0x0000000000400893 <+499>:   mov    %r11,(%rcx,%r13,8)
   0x0000000000400897 <+503>:   mov    %r12,%r13
   0x000000000040089a <+506>:   jne    0x400728 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+136>
   0x00000000004008a0 <+512>:   pop    %rbx
   0x00000000004008a1 <+513>:   pop    %rbp
   0x00000000004008a2 <+514>:   pop    %r12
   0x00000000004008a4 <+516>:   pop    %r13
   0x00000000004008a6 <+518>:   pop    %r14
   0x00000000004008a8 <+520>:   pop    %r15
   0x00000000004008aa <+522>:   retq

大约有 133 条指令。

如果我这样定义 cfloat，以数组作为状态：

struct cfloat 
    cfloat(float re, float im)  ri[0] = re; ri[1] = im; 
    float ri[2];
;

cfloat operator +(cfloat a, cfloat b) 
    return cfloat(a.ri[0]+b.ri[0], a.ri[1]+b.ri[1]);


cfloat operator *(cfloat a, cfloat b) 
    return cfloat(a.ri[0]*b.ri[0]-a.ri[1]*b.ri[1], a.ri[0]*b.ri[1]+a.ri[1]*b.ri[0]);

它生成这段汇编：

Dump of assembler code for function _Z9benchmarkP6cfloatS0_S0_S0_S_S_m:
   0x00000000004006a0 <+0>: push   %rbx
   0x00000000004006a1 <+1>: movq   %xmm0,-0x8(%rsp)
   0x00000000004006a7 <+7>: mov    -0x8(%rsp),%r9
   0x00000000004006ac <+12>:    movq   %xmm1,-0x8(%rsp)
   0x00000000004006b2 <+18>:    mov    -0x8(%rsp),%rax
   0x00000000004006b7 <+23>:    mov    %r9d,-0xc(%rsp)
   0x00000000004006bc <+28>:    shr    $0x20,%r9
   0x00000000004006c0 <+32>:    movss  -0xc(%rsp),%xmm9
   0x00000000004006c7 <+39>:    mov    %r9d,-0xc(%rsp)
   0x00000000004006cc <+44>:    movss  -0xc(%rsp),%xmm8
   0x00000000004006d3 <+51>:    mov    %eax,-0xc(%rsp)
   0x00000000004006d7 <+55>:    shr    $0x20,%rax
   0x00000000004006db <+59>:    movss  -0xc(%rsp),%xmm7
   0x00000000004006e1 <+65>:    test   %r8,%r8
   0x00000000004006e4 <+68>:    mov    %eax,-0xc(%rsp)
   0x00000000004006e8 <+72>:    movss  -0xc(%rsp),%xmm6
   0x00000000004006ee <+78>:    je     0x400796 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+246>
   0x00000000004006f4 <+84>:    xor    %eax,%eax
   0x00000000004006f6 <+86>:    xor    %r9d,%r9d
   0x00000000004006f9 <+89>:    nopl   0x0(%rax)
   0x0000000000400700 <+96>:    shl    $0x3,%rax
   0x0000000000400704 <+100>:   movaps %xmm7,%xmm0
   0x0000000000400707 <+103>:   lea    (%rsi,%rax,1),%rbx
   0x000000000040070b <+107>:   movaps %xmm6,%xmm3
   0x000000000040070e <+110>:   lea    (%rcx,%rax,1),%r10
   0x0000000000400712 <+114>:   lea    (%rdx,%rax,1),%r11
   0x0000000000400716 <+118>:   lea    (%rdi,%rax,1),%rax
   0x000000000040071a <+122>:   movss  (%rbx),%xmm1
   0x000000000040071e <+126>:   add    $0x1,%r9
   0x0000000000400722 <+130>:   movss  0x4(%rbx),%xmm5
   0x0000000000400727 <+135>:   mulss  %xmm1,%xmm0
   0x000000000040072b <+139>:   mulss  %xmm5,%xmm3
   0x000000000040072f <+143>:   movss  (%rax),%xmm2
   0x0000000000400733 <+147>:   movaps %xmm8,%xmm10
   0x0000000000400737 <+151>:   mulss  %xmm6,%xmm1
   0x000000000040073b <+155>:   movss  0x4(%rax),%xmm4
   0x0000000000400740 <+160>:   mulss  %xmm7,%xmm5
   0x0000000000400744 <+164>:   mulss  %xmm4,%xmm10
   0x0000000000400749 <+169>:   cmp    %r8,%r9
   0x000000000040074c <+172>:   mov    %r9,%rax
   0x000000000040074f <+175>:   subss  %xmm3,%xmm0
   0x0000000000400753 <+179>:   movaps %xmm2,%xmm3
   0x0000000000400756 <+182>:   mulss  %xmm9,%xmm4
   0x000000000040075b <+187>:   mulss  %xmm9,%xmm3
   0x0000000000400760 <+192>:   addss  %xmm5,%xmm1
   0x0000000000400764 <+196>:   mulss  %xmm8,%xmm2
   0x0000000000400769 <+201>:   subss  %xmm10,%xmm3
   0x000000000040076e <+206>:   addss  %xmm4,%xmm2
   0x0000000000400772 <+210>:   addss  %xmm3,%xmm0
   0x0000000000400776 <+214>:   addss  %xmm2,%xmm1
   0x000000000040077a <+218>:   addss  (%r11),%xmm0
   0x000000000040077f <+223>:   addss  0x4(%r11),%xmm1
   0x0000000000400785 <+229>:   movss  %xmm0,(%r10)
   0x000000000040078a <+234>:   movss  %xmm1,0x4(%r10)
   0x0000000000400790 <+240>:   jne    0x400700 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+96>
   0x0000000000400796 <+246>:   pop    %rbx
   0x0000000000400797 <+247>:   retq   
End of assembler dump.

大约有 59 条指令。而且，我的基准测试显示，第一个版本比第二个版本慢大约 3 倍。

我更喜欢单独的实数/虚数字段，尤其是因为将它们作为一个数组似乎会由于某种原因破坏英特尔编译器中的矢量化器。

有什么方法可以让 gcc 相信这两个类是等价的？

【问题讨论】：

您要为哪种硬件编译？我预计 fused-multiply-add 操作在这里能获得最佳性能。通用 x86-64。事实证明，只要把循环中的“ssize_t”改成“size_t”，g++ 实际上就会向量化第一个示例，而不会向量化第二个。有理由不使用 std::complex 吗？这看起来像是一个已删除问题的副本：***.com/questions/45134703/… page not found.... 还有办法看到它吗？ 【参考方案1】：

说来我自己都不敢相信，但如果我指定一个显式的复制构造函数，问题就会自行解决：

struct cfloat 
    cfloat(float re, float im) : re(re),   im(im)   
    cfloat(const cfloat& o)    : re(o.re), im(o.im) 

    float re,im;
;

现在生成了同样的汇编：

Dump of assembler code for function benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long):
   0x0000000000400600 <+0>: mov    0x8(%rsp),%r10
   0x0000000000400605 <+5>: test   %r10,%r10
   0x0000000000400608 <+8>: je     0x4006aa <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+170>
   0x000000000040060e <+14>:    xor    %eax,%eax
   0x0000000000400610 <+16>:    movss  (%r9),%xmm8
   0x0000000000400615 <+21>:    movss  0x4(%r9),%xmm9
   0x000000000040061b <+27>:    movaps %xmm8,%xmm0
   0x000000000040061f <+31>:    movaps %xmm9,%xmm3
   0x0000000000400623 <+35>:    movss  (%rsi,%rax,8),%xmm1
   0x0000000000400628 <+40>:    movss  0x4(%rsi,%rax,8),%xmm7
   0x000000000040062e <+46>:    mulss  %xmm1,%xmm0
   0x0000000000400632 <+50>:    mulss  %xmm7,%xmm3
   0x0000000000400636 <+54>:    movss  (%r8),%xmm5
   0x000000000040063b <+59>:    movss  0x4(%r8),%xmm6
   0x0000000000400641 <+65>:    mulss  %xmm9,%xmm1
   0x0000000000400646 <+70>:    movaps %xmm6,%xmm10
   0x000000000040064a <+74>:    mulss  %xmm8,%xmm7
   0x000000000040064f <+79>:    movss  (%rdi,%rax,8),%xmm2
   0x0000000000400654 <+84>:    subss  %xmm3,%xmm0
   0x0000000000400658 <+88>:    movaps %xmm5,%xmm3
   0x000000000040065b <+91>:    movss  0x4(%rdi,%rax,8),%xmm4
   0x0000000000400661 <+97>:    mulss  %xmm2,%xmm3
   0x0000000000400665 <+101>:   addss  %xmm7,%xmm1
   0x0000000000400669 <+105>:   mulss  %xmm4,%xmm10
   0x000000000040066e <+110>:   mulss  %xmm6,%xmm2
   0x0000000000400672 <+114>:   mulss  %xmm5,%xmm4
   0x0000000000400676 <+118>:   subss  %xmm10,%xmm3
   0x000000000040067b <+123>:   addss  %xmm4,%xmm2
   0x000000000040067f <+127>:   addss  %xmm3,%xmm0
   0x0000000000400683 <+131>:   addss  %xmm2,%xmm1
   0x0000000000400687 <+135>:   addss  (%rdx,%rax,8),%xmm0
   0x000000000040068c <+140>:   addss  0x4(%rdx,%rax,8),%xmm1
   0x0000000000400692 <+146>:   movss  %xmm0,(%rcx,%rax,8)
   0x0000000000400697 <+151>:   movss  %xmm1,0x4(%rcx,%rax,8)
   0x000000000040069d <+157>:   add    $0x1,%rax
   0x00000000004006a1 <+161>:   cmp    %rax,%r10
   0x00000000004006a4 <+164>:   ja     0x400610 <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+16>
   0x00000000004006aa <+170>:   repz retq 
End of assembler dump.

谁能在规范里给我找出这一条。

【讨论】：

我想这在某种程度上是有道理的。其他代码的问题是 GCC 过于执着地试图在 64 位通用寄存器（GPR）中处理整个结构体；使用显式复制构造函数后，它被搞得足够混乱，以至于认为不能再这样做了，于是或多或少地修复了代码生成。添加复制构造函数可能会更改适用的 ABI 布局规则（就布局而言，它不再是 POD 类型）。是的。例如，查看 operator+。在第一种情况下，xmmX 用于输入/输出，而在“修复后”的情况下，cfloat 通过堆栈按值传递，并使用 RVO 写入输出。【参考方案2】：

您提到您以 Red Hat Enterprise Linux 为目标，并且（在您删除的帖子中）更新的编译器版本会生成更好的代码。您可以使用 Developer Toolset 获得更新的编译器，创建与操作系统其余部分兼容的应用程序：

https://www.softwarecollections.org/en/scls/rhscl/devtoolset-6/ https://developers.redhat.com/products/developertoolset/overview/

【讨论】：

不幸的是，升级对我来说不是一个选择，我提供源代码并且不控制机器。你能影响构建过程吗？使用 DTS 不会引入任何额外的运行时依赖项，并且您不必更改应用程序的部署方式。不，很遗憾，我提供的是源代码，通常无法控制用于构建它的编译器（我们只能使用 ANSI C++）。此外，除非您添加特殊标志，否则 Developer Toolset 生成的代码与其他针对 RHEL 的 C++ 库的 ABI 不兼容，因为 C++11 中 std::string 和 std::list 的布局发生了变化。@DanielH，这是不正确的。GCC 配置了 --with-default-libstdcxx-abi=gcc4-compatible，所以 ABI 默认是向后兼容的。

以上是关于在 g++4.4.7 中为复数运算生成快速汇编的主要内容，如果未能解决你的问题，请参考以下文章

汇编，栈上局部变量的算术运算

汇编--算术运算类指令

汇编语言通用数据处理指令——算术运算类指令

使用 SSE（IA32 汇编）执行简单的算术运算

汇编语言--高级汇编技术

汇编-寄存器数据的存储与变化-算术运算对标志位的影响