punpcklbw(MMX/SSE/AVX 中的交错)的用例都有哪些?
Posted
技术标签:
【中文标题】punpcklbw(MMX/SSE/AVX 中的交错)的用例都有哪些?【英文标题】:Which are the use case of punpcklbw (interleave in MMX/SSE/AVX)?punpcklbw(MMX/SSE/AVX 中的交错)的用例有哪些? 【发布时间】:2021-01-25 07:22:11 【问题描述】:哪些算法可以使用punpcklbw
?
具体来说,punpcklbw xmm0, xmm0
做的是什么?
然而,maskedPow2_Value
有什么用处?
maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val; // Val 是 int
maskedPow2_Value = 0x101010101010101i64 * maskedValue;
(或mov r9, 101010101010101h; imul rdx, r9;
两次)
一个完整的例子(函数名为 CompressPacket 但它可能会产生误导),作为 IDA 反编译的结果:
void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
__int64 maskedPow2_Value; // rdx
unsigned int v5; // ecx
__int64 *bufferOut; // rcx
size_t size_; // r9
size_t i; // r9
size_t size__; // r9
size_t counter; // r8
size_t j; // r9
void *result; // rax
__m128i v13; // xmm0
__int64 lsb4; // rax
size_t counter1; // r9
size_t k; // r9
size_t lsb4_; // r8
__int64 maskedValue; // rdx
*(_QWORD *)&Val = (unsigned __int8)Val;
maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val;
bufferOut = (__int64 *)((char *)Dst + Size);
result = Dst;
switch ( Size )
case 0ui64:
return result;
case 1ui64:
goto LBL_1_F;
case 2ui64:
goto LBL_2_E;
case 3ui64:
goto LBL_3_F;
case 4ui64:
goto LBL_4_C;
case 5ui64:
goto LBL_5_D;
case 6ui64:
goto LBL_6_E;
case 7ui64:
goto LBL_7_F;
case 8ui64:
*(bufferOut - 1) = maskedValue;
return result;
case 9ui64:
*(__int64 *)((char *)bufferOut - 9) = maskedValue;
*((_BYTE *)bufferOut - 1) = maskedValue;
return result;
case 0xAui64:
*(__int64 *)((char *)bufferOut - 10) = maskedValue;
*((_WORD *)bufferOut - 1) = maskedValue;
return result;
case 0xBui64:
*(__int64 *)((char *)bufferOut - 11) = maskedValue;
goto LBL_3_F;
case 0xCui64:
*(__int64 *)((char *)bufferOut - 12) = maskedValue;
LBL_4_C:
*((_DWORD *)bufferOut - 1) = maskedValue;
return result;
case 0xDui64:
*(__int64 *)((char *)bufferOut - 13) = maskedValue;
LBL_5_D:
*(_DWORD *)((char *)bufferOut - 5) = maskedValue;
*((_BYTE *)bufferOut - 1) = maskedValue;
return result;
case 0xEui64:
*(__int64 *)((char *)bufferOut - 14) = maskedValue;
LBL_6_E:
*(_DWORD *)((char *)bufferOut - 6) = maskedValue;
LBL_2_E:
*((_WORD *)bufferOut - 1) = maskedValue;
return result;
case 0xFui64:
*(__int64 *)((char *)bufferOut - 15) = maskedValue;
LBL_7_F:
*(_DWORD *)((char *)bufferOut - 7) = maskedValue;
LBL_3_F:
*(_WORD *)((char *)bufferOut - 3) = maskedValue;
LBL_1_F:
*((_BYTE *)bufferOut - 1) = maskedValue;
return result;
default:
if ( _bittest(dword_7FFFF4B237D8, 1u) )
memset(bufferOut, maskedValue, Size);
return Dst;
maskedPow2_Value = 0x101010101010101i64 * maskedValue;
if ( !_bittest(dword_7FFFF4B237D8, 2u) )
if ( Size >= 0x40 )
v5 = -(int)bufferOut & 7;
if ( v5 )
Size -= v5;
*(_QWORD *)Dst = maskedPow2_Value;
bufferOut = (__int64 *)((char *)Dst + v5);
size_ = Size;
Size &= 0x3Fu;
for ( i = size_ >> 6; i; *(bufferOut - 1) = maskedPow2_Value )
*bufferOut = maskedPow2_Value;
bufferOut[1] = maskedPow2_Value;
bufferOut[2] = maskedPow2_Value;
bufferOut += 8;
*(bufferOut - 5) = maskedPow2_Value;
*(bufferOut - 4) = maskedPow2_Value;
--i;
*(bufferOut - 3) = maskedPow2_Value;
*(bufferOut - 2) = maskedPow2_Value;
size__ = Size;
counter = Size & 7;
for ( j = size__ >> 3; j; --j )
*bufferOut++ = maskedPow2_Value;
for ( ; counter; --counter )
*(_BYTE *)bufferOut = maskedPow2_Value;
bufferOut = (__int64 *)((char *)bufferOut + 1);
return Dst;
v13 = _mm_unpacklo_epi8((__m128i)(unsigned __int64)maskedPow2_Value, (__m128i)(unsigned __int64)maskedPow2_Value);
if ( ((unsigned __int8)bufferOut & 0xF) != 0 )
*(__m128i *)bufferOut = v13;
lsb4 = (unsigned __int8)bufferOut & 0xF;
bufferOut = (__int64 *)((char *)bufferOut - lsb4 + 16);
Size = lsb4 + Size - 16;
counter1 = Size >> 7;
if ( Size >> 7 )
do
*(__m128i *)bufferOut = v13;
*((__m128i *)bufferOut + 1) = v13;
bufferOut += 16;
*((__m128i *)bufferOut - 6) = v13;
*((__m128i *)bufferOut - 5) = v13;
--counter1;
*((__m128i *)bufferOut - 4) = v13;
*((__m128i *)bufferOut - 3) = v13;
*((__m128i *)bufferOut - 2) = v13;
*((__m128i *)bufferOut - 1) = v13;
while ( counter1 );
Size &= 0x7Fu;
for ( k = Size >> 4; k; --k )
*(__m128i *)bufferOut = v13;
bufferOut += 2;
lsb4_ = Size & 0xF;
if ( lsb4_ )
*(__m128i *)((char *)bufferOut + lsb4_ - 16) = v13;
return Dst;
以及 IDA 的反汇编:
.text:00007FFFF4AF6440 ; void *__cdecl CompressPacket(void *Dst, int Val, size_t Size)
.text:00007FFFF4AF6440 CompressPacket proc near ; CODE XREF: j_memset↑j
.text:00007FFFF4AF6440 ; Concurrency::details::ResourceManager::CreateAllocatedNodeData(void)+49↑p ...
.text:00007FFFF4AF6440 mov r11, rcx
.text:00007FFFF4AF6443 movzx edx, dl ; Move with Zero-Extend
.text:00007FFFF4AF6446 cmp r8, 10h ; switch 16 cases
.text:00007FFFF4AF644A jb SetBytes15 ; Jump if Below (CF=1)
.text:00007FFFF4AF6450
.text:00007FFFF4AF6450 def_7FFFF4AF65D2: ; jumptable 00007FFFF4AF65D2 default case
.text:00007FFFF4AF6450 bt cs:dword_7FFFF4B237D8, 1
.text:00007FFFF4AF6458 jnb short mset05 ; Jump if Not Below (CF=0)
.text:00007FFFF4AF645A push rdi
.text:00007FFFF4AF645B mov rdi, rcx
.text:00007FFFF4AF645E mov eax, edx
.text:00007FFFF4AF6460 mov rcx, r8
.text:00007FFFF4AF6463 rep stosb ; Store String
.text:00007FFFF4AF6465 pop rdi
.text:00007FFFF4AF6466 jmp short mset60 ; Jump
.text:00007FFFF4AF6468 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6468
.text:00007FFFF4AF6468 mset05: ; CODE XREF: CompressPacket+18↑j
.text:00007FFFF4AF6468 mov r9, 101010101010101h
.text:00007FFFF4AF6472 imul rdx, r9 ; Signed Multiply
.text:00007FFFF4AF6476 bt cs:dword_7FFFF4B237D8, 2 ; Bit Test
.text:00007FFFF4AF647E jb msetxmm10 ; Jump if Below (CF=1)
.text:00007FFFF4AF6484 cmp r8, 40h ; '@' ; Compare Two Operands
.text:00007FFFF4AF6488 jb short mset20 ; Jump if Below (CF=1)
.text:00007FFFF4AF648A neg rcx ; Two's Complement Negation
.text:00007FFFF4AF648D and ecx, 7 ; Logical AND
.text:00007FFFF4AF6490 jz short mset10 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6492 sub r8, rcx ; Integer Subtraction
.text:00007FFFF4AF6495 mov [r11], rdx
.text:00007FFFF4AF6498
.text:00007FFFF4AF6498 mset10: ; CODE XREF: CompressPacket+50↑j
.text:00007FFFF4AF6498 add rcx, r11 ; Add
.text:00007FFFF4AF649B mov r9, r8
.text:00007FFFF4AF649E and r8, 3Fh ; Logical AND
.text:00007FFFF4AF64A2 shr r9, 6 ; Shift Logical Right
.text:00007FFFF4AF64A6 jnz short mset80 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64A8
.text:00007FFFF4AF64A8 mset20: ; CODE XREF: CompressPacket+48↑j
.text:00007FFFF4AF64A8 ; CompressPacket+CF↓j
.text:00007FFFF4AF64A8 mov r9, r8
.text:00007FFFF4AF64AB and r8, 7 ; Logical AND
.text:00007FFFF4AF64AF shr r9, 3 ; Shift Logical Right
.text:00007FFFF4AF64B3 jz short mset40 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64B5 db 66h, 66h
.text:00007FFFF4AF64B5 xchg ax, ax ; Exchange Register/Memory with Register
.text:00007FFFF4AF64B9 nop ; No Operation
.text:00007FFFF4AF64BA
.text:00007FFFF4AF64BA mset30: ; CODE XREF: CompressPacket+84↓j
.text:00007FFFF4AF64BA mov [rcx], rdx
.text:00007FFFF4AF64BD add rcx, 8 ; Add
.text:00007FFFF4AF64C1 dec r9 ; Decrement by 1
.text:00007FFFF4AF64C4 jnz short mset30 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64C6
.text:00007FFFF4AF64C6 mset40: ; CODE XREF: CompressPacket+73↑j
.text:00007FFFF4AF64C6 test r8, r8 ; Logical Compare
.text:00007FFFF4AF64C9 jz short mset60 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64CB
.text:00007FFFF4AF64CB mset50: ; CODE XREF: CompressPacket+93↓j
.text:00007FFFF4AF64CB mov [rcx], dl
.text:00007FFFF4AF64CD inc rcx ; Increment by 1
.text:00007FFFF4AF64D0 dec r8 ; Decrement by 1
.text:00007FFFF4AF64D3 jnz short mset50 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64D5
.text:00007FFFF4AF64D5 mset60: ; CODE XREF: CompressPacket+26↑j
.text:00007FFFF4AF64D5 ; CompressPacket+89↑j
.text:00007FFFF4AF64D5 mov rax, r11
.text:00007FFFF4AF64D8 retn ; Return Near from Procedure
.text:00007FFFF4AF64D8 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64D9 db 0Fh, 1Fh, 80h, 4 dup(0)
.text:00007FFFF4AF64E0 db 3 dup(66h), 90h
.text:00007FFFF4AF64E4 db 2 dup(66h), 90h
.text:00007FFFF4AF64E7 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64E7
.text:00007FFFF4AF64E7 mset80: ; CODE XREF: CompressPacket+66↑j
.text:00007FFFF4AF64E7 ; CompressPacket+CD↓j
.text:00007FFFF4AF64E7 mov [rcx], rdx
.text:00007FFFF4AF64EA mov [rcx+8], rdx
.text:00007FFFF4AF64EE mov [rcx+10h], rdx
.text:00007FFFF4AF64F2 add rcx, 40h ; '@' ; Add
.text:00007FFFF4AF64F6 mov [rcx-28h], rdx
.text:00007FFFF4AF64FA mov [rcx-20h], rdx
.text:00007FFFF4AF64FE dec r9 ; Decrement by 1
.text:00007FFFF4AF6501 mov [rcx-18h], rdx
.text:00007FFFF4AF6505 mov [rcx-10h], rdx
.text:00007FFFF4AF6509 mov [rcx-8], rdx
.text:00007FFFF4AF650D jnz short mset80 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF650F jmp short mset20 ; Jump
.text:00007FFFF4AF650F ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6511 align 20h
.text:00007FFFF4AF6520
.text:00007FFFF4AF6520 msetxmm10: ; CODE XREF: CompressPacket+3E↑j
.text:00007FFFF4AF6520 movq xmm0, rdx ; Move 64 bits
.text:00007FFFF4AF6525 punpcklbw xmm0, xmm0 ; Unpack Low Packed Data (Byte->Word)
.text:00007FFFF4AF6529 test cl, 0Fh ; Logical Compare
.text:00007FFFF4AF652C jz short msetxmm20 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF652E movups xmmword ptr [rcx], xmm0 ; Move Unaligned Four Packed Single-FP
.text:00007FFFF4AF6531 mov rax, rcx
.text:00007FFFF4AF6534 and rax, 0Fh ; Logical AND
.text:00007FFFF4AF6538 add rcx, 10h ; Add
.text:00007FFFF4AF653C sub rcx, rax ; Integer Subtraction
.text:00007FFFF4AF653F lea r8, [rax+r8-10h] ; Load Effective Address
.text:00007FFFF4AF6544
.text:00007FFFF4AF6544 msetxmm20: ; CODE XREF: CompressPacket+EC↑j
.text:00007FFFF4AF6544 mov r9, r8
.text:00007FFFF4AF6547 shr r9, 7 ; Shift Logical Right
.text:00007FFFF4AF654B jz short msetxmm40 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF654D jmp short msetxmm30 ; Jump
.text:00007FFFF4AF654D ; ---------------------------------------------------------------------------
.text:00007FFFF4AF654F align 10h
.text:00007FFFF4AF6550
.text:00007FFFF4AF6550 msetxmm30: ; CODE XREF: CompressPacket+10D↑j
.text:00007FFFF4AF6550 ; CompressPacket+139↓j
.text:00007FFFF4AF6550 movaps xmmword ptr [rcx], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6553 movaps xmmword ptr [rcx+10h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6557 add rcx, 80h ; '€' ; Add
.text:00007FFFF4AF655E movaps xmmword ptr [rcx-60h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6562 movaps xmmword ptr [rcx-50h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6566 dec r9 ; Decrement by 1
.text:00007FFFF4AF6569 movaps xmmword ptr [rcx-40h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF656D movaps xmmword ptr [rcx-30h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6571 movaps xmmword ptr [rcx-20h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6575 movaps xmmword ptr [rcx-10h], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6579 jnz short msetxmm30 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF657B and r8, 7Fh ; Logical AND
.text:00007FFFF4AF657F
.text:00007FFFF4AF657F msetxmm40: ; CODE XREF: CompressPacket+10B↑j
.text:00007FFFF4AF657F mov r9, r8
.text:00007FFFF4AF6582 shr r9, 4 ; Shift Logical Right
.text:00007FFFF4AF6586 jz short msetxmm60 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6588 nop dword ptr [rax+rax+00000000h] ; No Operation
.text:00007FFFF4AF6590
.text:00007FFFF4AF6590 msetxmm50: ; CODE XREF: CompressPacket+15A↓j
.text:00007FFFF4AF6590 movaps xmmword ptr [rcx], xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6593 add rcx, 10h ; Add
.text:00007FFFF4AF6597 dec r9 ; Decrement by 1
.text:00007FFFF4AF659A jnz short msetxmm50 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF659C
.text:00007FFFF4AF659C msetxmm60: ; CODE XREF: CompressPacket+146↑j
.text:00007FFFF4AF659C and r8, 0Fh ; Logical AND
.text:00007FFFF4AF65A0 jz short msetxmm70 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF65A2 movups xmmword ptr [r8+rcx-10h], xmm0 ; Move Unaligned Four Packed Single-FP
.text:00007FFFF4AF65A8
.text:00007FFFF4AF65A8 msetxmm70: ; CODE XREF: CompressPacket+160↑j
.text:00007FFFF4AF65A8 mov rax, r11
.text:00007FFFF4AF65AB retn ; Return Near from Procedure
.text:00007FFFF4AF65AC ; ---------------------------------------------------------------------------
.text:00007FFFF4AF65AC
.text:00007FFFF4AF65AC SetBytes15: ; CODE XREF: CompressPacket+A↑j
.text:00007FFFF4AF65AC mov r9, 101010101010101h
.text:00007FFFF4AF65B6 imul rdx, r9 ; Signed Multiply
.text:00007FFFF4AF65BA lea r9, cs:7FFFF4AB0000h ; Load Effective Address
.text:00007FFFF4AF65C1 mov eax, ds:(jpt_7FFFF4AF65D2 - 7FFFF4AB0000h)[r9+r8*4]
.text:00007FFFF4AF65C9 add r9, rax ; Add
.text:00007FFFF4AF65CC add rcx, r8 ; Add
.text:00007FFFF4AF65CF mov rax, r11
.text:00007FFFF4AF65D2 jmp r9 ; switch jump
.text:00007FFFF4AF65D2 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF65D5 jpt_7FFFF4AF65D2 dd offset msetTab00 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 ; DATA XREF: CompressPacket+181↑r
.text:00007FFFF4AF65D5 dd offset msetTab01 - 7FFFF4AB0000h ; jump table for switch statement
.text:00007FFFF4AF65D5 dd offset msetTab02 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab03 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab04 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab05 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab06 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab07 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab08 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab09 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab10 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab11 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab12 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab13 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab14 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5 dd offset msetTab15 - 7FFFF4AB0000h
.text:00007FFFF4AF6615 align 20h
.text:00007FFFF4AF6620
.text:00007FFFF4AF6620 msetTab15: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6620 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6620 mov [rcx-0Fh], rdx ; jumptable 00007FFFF4AF65D2 case 15
.text:00007FFFF4AF6624
.text:00007FFFF4AF6624 msetTab07: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6624 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6624 mov [rcx-7], edx ; jumptable 00007FFFF4AF65D2 case 7
.text:00007FFFF4AF6627
.text:00007FFFF4AF6627 msetTab03: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6627 ; CompressPacket+1F3↓j
.text:00007FFFF4AF6627 ; DATA XREF: ...
.text:00007FFFF4AF6627 mov [rcx-3], dx ; jumptable 00007FFFF4AF65D2 case 3
.text:00007FFFF4AF662B
.text:00007FFFF4AF662B msetTab01: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662B ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662B mov [rcx-1], dl ; jumptable 00007FFFF4AF65D2 case 1
.text:00007FFFF4AF662E
.text:00007FFFF4AF662E msetTab00: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662E ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662E retn ; jumptable 00007FFFF4AF65D2 case 0
.text:00007FFFF4AF662F ; ---------------------------------------------------------------------------
.text:00007FFFF4AF662F
.text:00007FFFF4AF662F msetTab11: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662F ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662F mov [rcx-0Bh], rdx ; jumptable 00007FFFF4AF65D2 case 11
.text:00007FFFF4AF6633 jmp short msetTab03 ; jumptable 00007FFFF4AF65D2 case 3
.text:00007FFFF4AF6635 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6635
.text:00007FFFF4AF6635 msetTab14: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6635 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6635 mov [rcx-0Eh], rdx ; jumptable 00007FFFF4AF65D2 case 14
.text:00007FFFF4AF6639
.text:00007FFFF4AF6639 msetTab06: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6639 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6639 mov [rcx-6], edx ; jumptable 00007FFFF4AF65D2 case 6
.text:00007FFFF4AF663C
.text:00007FFFF4AF663C msetTab02: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF663C ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF663C mov [rcx-2], dx ; jumptable 00007FFFF4AF65D2 case 2
.text:00007FFFF4AF6640 retn ; Return Near from Procedure
.text:00007FFFF4AF6641 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6641
.text:00007FFFF4AF6641 msetTab13: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6641 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6641 mov [rcx-0Dh], rdx ; jumptable 00007FFFF4AF65D2 case 13
.text:00007FFFF4AF6645
.text:00007FFFF4AF6645 msetTab05: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6645 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6645 mov [rcx-5], edx ; jumptable 00007FFFF4AF65D2 case 5
.text:00007FFFF4AF6648 mov [rcx-1], dl
.text:00007FFFF4AF664B retn ; Return Near from Procedure
.text:00007FFFF4AF664C ; ---------------------------------------------------------------------------
.text:00007FFFF4AF664C
.text:00007FFFF4AF664C msetTab12: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF664C ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF664C mov [rcx-0Ch], rdx ; jumptable 00007FFFF4AF65D2 case 12
.text:00007FFFF4AF6650
.text:00007FFFF4AF6650 msetTab04: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6650 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6650 mov [rcx-4], edx ; jumptable 00007FFFF4AF65D2 case 4
.text:00007FFFF4AF6653 retn ; Return Near from Procedure
.text:00007FFFF4AF6654 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6654
.text:00007FFFF4AF6654 msetTab10: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6654 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6654 mov [rcx-0Ah], rdx ; jumptable 00007FFFF4AF65D2 case 10
.text:00007FFFF4AF6658 mov [rcx-2], dx
.text:00007FFFF4AF665C retn ; Return Near from Procedure
.text:00007FFFF4AF665D ; ---------------------------------------------------------------------------
.text:00007FFFF4AF665D
.text:00007FFFF4AF665D msetTab09: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF665D ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF665D mov [rcx-9], rdx ; jumptable 00007FFFF4AF65D2 case 9
.text:00007FFFF4AF6661 mov [rcx-1], dl
.text:00007FFFF4AF6664 retn ; Return Near from Procedure
.text:00007FFFF4AF6665 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6665
.text:00007FFFF4AF6665 msetTab08: ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6665 ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6665 mov [rcx-8], rdx ; jumptable 00007FFFF4AF65D2 case 8
.text:00007FFFF4AF6669 retn ; Return Near from Procedure
.text:00007FFFF4AF6669 CompressPacket endp
【问题讨论】:
您可以在 this project 中看到一些示例,它在其中被用于多种用途,包括零扩展、复制字节和重排(shuffle)字节。 与您最近提出的其他问题一样,您的 C 代码似乎是反编译器输出,而不是人工编写的代码。最好明确说明这一点,并指出所使用的反编译器。这样可以避免人们去猜测为什么人类程序员会编写如此奇怪的代码;如果有人熟悉该反编译器及其翻译方式,这甚至可能有所帮助。 @NateEldredge 我已根据您的有用评论进行了相应更新。 【参考方案1】:一个常见的用例是与零进行解包,以将 8 位数字扩展为 16 位(即零扩展),例如 SSE4.1 pmovzxbw
。或者特别是解包 16 字节寄存器的低半部分和高半部分,以获得两个向量,每个向量包含 8 个 16 位元素。 这是唯一一种“解包”这个名称有意义的用例,packuswb
是它的逆操作,将 2 个寄存器合并为 1 个。(或 packsswb
用于有符号饱和。)
“解包”这个名字在其他场合显得很奇怪;它只是一个将两个寄存器中的元素交错的洗牌(shuffle)。 ARM NEON 有一个类似的洗牌指令,其助记符是 "zip"。
在您的情况下,它是“将一个字节广播到 XMM 寄存器”这一操作的一部分,而该操作又是 memset 的一部分。即它是 _mm_set1_epi8(x)
所做的一部分。
与0x0101010101010101
相乘会在一个 64 位整数中把一个字节重复 8 次。这使您可以用标量整数存储来处理零头的 8 个字节(即不足 16 字节倍数的部分),例如 mov [r11], rdx
存储。
鉴于此 8 字节广播作为输入(通过 movq
),只需要一次 SIMD shuffle。用punpcklqdq
复制低 8 字节本来会是我的选择,因为 8 字节粒度的洗牌在像 Core 2 这样非常旧的 CPU 上更高效。但是让这些字节与自身交错是等效的,因为它们反正都相同,结果是一个包含 16 个相同字节副本的 XMM 寄存器。
事实上,SSE2 可以用一条指令广播一个双字:pshufd xmm0, xmm0, 0
,所以如果不是想要一个 8 字节的标量,它可以只使用 imul edx, r9d, 0x01010101
。
使用 8 字节 mov
和 16 字节 movups
存储实现 memset 当然需要此作为输入,如果它使用该策略而不是 rep stosb
策略。
使用 SSSE3,您可以使用带有全零向量的 pshufb
直接广播单个字节(无需先进行乘法运算),为目标的每个元素选择源的第 0 个元素。或使用 AVX2 vpbroadcastb
。跳过整数乘法步骤就可以了;您可以使用来自 xmm0 而不是来自 RDX 的 movq [mem], xmm0
8 字节存储。
若 xmm 寄存器底部有一个字节,而其他元素中是垃圾(即,如果你没有使用 imul
),2x punpcklbw
+ pshufd
可以仅使用 SSE2 进行广播。或者当然是 punpcklbw xmm0,xmm0
/ punpcklwd xmm0,xmm0
作为前 2 次洗牌。或punpcklbw xmm0,xmm0
/pshuflw xmm0,xmm0, 0
/punpcklqdq xmm0,xmm0
。
【讨论】:
你是否暗示它是解压缩算法的一部分(如解压缩)? @Soleil:不,它是 memset 的一部分。 (而 memset 可能被需要重复 1 字节模式的解压缩算法使用)。 那么,它是 memset 的快速/并行版本吗? (它 = punpcklbw) @Soleil:它不是线程并行的,只是使用宽存储。这在现代 C 实现的 memset 中是 100% 标准的做法;任何将 memset 内联为逐字节循环的编译器都是垃圾。当然,您会想使用机器所能执行的最宽的存储指令。这就是 libc 中 memset 的手写 asm 实现会做的事情(例如 glibc 的 code.woboq.org/userspace/glibc/sysdeps/x86_64/multiarch/…),如果编译器选择内联一些代码而不是调用它,它也会做类似的事情。 @Soleil:不,punpcklbw
本身并不是 memset。它只是一个构建块,还有imul
,以及一些用于整理大小和对齐的分支。实际的mov
和movups
指令存储到内存中。请参阅我的更新答案。以上是关于punpcklbw(MMX/SSE/AVX 中的交错)的用例都有哪些?的主要内容,如果未能解决你的问题,请参考以下文章
集合操作 用单链表模拟有序集合,实现集合的加入一个元素、删除一个元素、集合的交、并、差运算。