ref: 4d930c1478f6b9e4851e84e139b4cdf4450fb963
parent: fafbd2f4802df3b858e0e12c3472a4498c20854b
author: Runxi Yu <me@runxiyu.org>
date: Fri Dec 19 17:51:57 EST 2025
adler32: Use meaningful label names and remove generated comments
--- a/internal/adler32/adler32_avx2.s
+++ b/internal/adler32/adler32_avx2.s
@@ -22,242 +22,242 @@
MOVQ buf_base+8(FP), SI
MOVQ buf_len+16(FP), DX
MOVQ buf_cap+24(FP), CX
- WORD $0x8548; BYTE $0xf6 // TESTQ SI, SI // test rsi, rsi
- JE LBB0_1 // <-- // je .LBB0_1
- WORD $0xf889 // MOVL DI, AX // mov eax, edi
- WORD $0x8548; BYTE $0xd2 // TESTQ DX, DX // test rdx, rdx
- JE LBB0_2 // <-- // je .LBB0_2
- NOP // (skipped) // push rbp
- NOP // (skipped) // mov rbp, rsp
- NOP // (skipped) // and rsp, -8
- WORD $0xc189 // MOVL AX, CX // mov ecx, eax
- WORD $0xe9c1; BYTE $0x10 // SHRL $0x10, CX // shr ecx, 16
- WORD $0xb70f; BYTE $0xc0 // MOVZX AX, AX // movzx eax, ax
- CMPQ DX, $0x20 // <-- // cmp rdx, 32
- JB LBB0_17 // <-- // jb .LBB0_17
- LONG $0x078071bf; BYTE $0x80 // MOVL $-0x7ff87f8f, DI // mov edi, 2147975281
- LONG $0xc0eff9c5 // VPXOR X0, X0, X0 // vpxor xmm0, xmm0, xmm0
- VMOVDQA LCPI0_0<>(SB), Y1 // <-- // vmovdqa ymm1, ymmword ptr [rip + .LCPI0_0]
- VPBROADCASTW LCPI0_2<>(SB), Y2 // <-- // vpbroadcastw ymm2, word ptr [rip + .LCPI0_2]
- JMP LBB0_6 // <-- // jmp .LBB0_6
+ WORD $0x8548; BYTE $0xf6
+ JE return_one
+ WORD $0xf889
+ WORD $0x8548; BYTE $0xd2
+ JE return_result
+ NOP
+ NOP
+ NOP
+ WORD $0xc189
+ WORD $0xe9c1; BYTE $0x10
+ WORD $0xb70f; BYTE $0xc0
+ CMPQ DX, $0x20
+ JB tail16_check
+ LONG $0x078071bf; BYTE $0x80
+ LONG $0xc0eff9c5
+ VMOVDQA LCPI0_0<>(SB), Y1
+ VPBROADCASTW LCPI0_2<>(SB), Y2
+ JMP block_loop_setup
-LBB0_7:
- LONG $0xf46ffdc5 // VMOVDQA Y4, Y6 // vmovdqa ymm6, ymm4
- LONG $0xedefd1c5 // VPXOR X5, X5, X5 // vpxor xmm5, xmm5, xmm5
+block_accum_init:
+ LONG $0xf46ffdc5
+ LONG $0xedefd1c5
-LBB0_14:
- SUBQ AX, DX // <-- // sub rdx, rax
- LONG $0xf572ddc5; BYTE $0x05 // ? // vpslld ymm4, ymm5, 5
- LONG $0xdbfeddc5 // VPADDD Y3, Y4, Y3 // vpaddd ymm3, ymm4, ymm3
- LONG $0x397de3c4; WORD $0x01f4 // VEXTRACTI128 $0x1, Y6, X4 // vextracti128 xmm4, ymm6, 1
- LONG $0xecc6c8c5; BYTE $0x88 // VSHUFPS $-0x78, X4, X6, X5 // vshufps xmm5, xmm6, xmm4, 136
- LONG $0xe470f9c5; BYTE $0x88 // VPSHUFD $-0x78, X4, X4 // vpshufd xmm4, xmm4, 136
- LONG $0xe4fed1c5 // VPADDD X4, X5, X4 // vpaddd xmm4, xmm5, xmm4
- LONG $0xec70f9c5; BYTE $0x55 // VPSHUFD $0x55, X4, X5 // vpshufd xmm5, xmm4, 85
- LONG $0xe4fed1c5 // VPADDD X4, X5, X4 // vpaddd xmm4, xmm5, xmm4
- LONG $0xe07ef9c5 // VMOVD X4, AX // vmovd eax, xmm4
- MOVQ AX, CX // <-- // mov rcx, rax
- IMULQ DI, CX // <-- // imul rcx, rdi
- SHRQ $0x2f, CX // <-- // shr rcx, 47
- LONG $0xfff1c969; WORD $0x0000 // IMULL $0xfff1, CX, CX // imul ecx, ecx, 65521
- WORD $0xc829 // SUBL CX, AX // sub eax, ecx
- LONG $0x397de3c4; WORD $0x01dc // VEXTRACTI128 $0x1, Y3, X4 // vextracti128 xmm4, ymm3, 1
- LONG $0xdbfed9c5 // VPADDD X3, X4, X3 // vpaddd xmm3, xmm4, xmm3
- LONG $0xe370f9c5; BYTE $0xee // VPSHUFD $-0x12, X3, X4 // vpshufd xmm4, xmm3, 238
- LONG $0xdcfee1c5 // VPADDD X4, X3, X3 // vpaddd xmm3, xmm3, xmm4
- LONG $0xe370f9c5; BYTE $0x55 // VPSHUFD $0x55, X3, X4 // vpshufd xmm4, xmm3, 85
- LONG $0xdbfed9c5 // VPADDD X3, X4, X3 // vpaddd xmm3, xmm4, xmm3
- LONG $0xd97ef9c5 // VMOVD X3, CX // vmovd ecx, xmm3
- MOVQ CX, R8 // <-- // mov r8, rcx
- IMULQ DI, R8 // <-- // imul r8, rdi
- SHRQ $0x2f, R8 // <-- // shr r8, 47
- LONG $0xf1c06945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R8, R8 // imul r8d, r8d, 65521
- WORD $0x2944; BYTE $0xc1 // SUBL R8, CX // sub ecx, r8d
- CMPQ DX, $0x1f // <-- // cmp rdx, 31
- JBE LBB0_15 // <-- // jbe .LBB0_15
+block_reduce:
+ SUBQ AX, DX
+ LONG $0xf572ddc5; BYTE $0x05
+ LONG $0xdbfeddc5
+ LONG $0x397de3c4; WORD $0x01f4
+ LONG $0xecc6c8c5; BYTE $0x88
+ LONG $0xe470f9c5; BYTE $0x88
+ LONG $0xe4fed1c5
+ LONG $0xec70f9c5; BYTE $0x55
+ LONG $0xe4fed1c5
+ LONG $0xe07ef9c5
+ MOVQ AX, CX
+ IMULQ DI, CX
+ SHRQ $0x2f, CX
+ LONG $0xfff1c969; WORD $0x0000
+ WORD $0xc829
+ LONG $0x397de3c4; WORD $0x01dc
+ LONG $0xdbfed9c5
+ LONG $0xe370f9c5; BYTE $0xee
+ LONG $0xdcfee1c5
+ LONG $0xe370f9c5; BYTE $0x55
+ LONG $0xdbfed9c5
+ LONG $0xd97ef9c5
+ MOVQ CX, R8
+ IMULQ DI, R8
+ SHRQ $0x2f, R8
+ LONG $0xf1c06945; WORD $0x00ff; BYTE $0x00
+ WORD $0x2944; BYTE $0xc1
+ CMPQ DX, $0x1f
+ JBE tail_check
-LBB0_6:
- LONG $0xe06ef9c5 // VMOVD AX, X4 // vmovd xmm4, eax
- LONG $0xd96ef9c5 // VMOVD CX, X3 // vmovd xmm3, ecx
- CMPQ DX, $0x15b0 // <-- // cmp rdx, 5552
- LONG $0x15b0b841; WORD $0x0000 // MOVL $0x15b0, R8 // mov r8d, 5552
- LONG $0xc2420f4c // CMOVB DX, R8 // cmovb r8, rdx
- WORD $0x8944; BYTE $0xc0 // MOVL R8, AX // mov eax, r8d
- LONG $0x001fe025; BYTE $0x00 // ANDL $0x1fe0, AX // and eax, 8160
- JE LBB0_7 // <-- // je .LBB0_7
- ADDQ $-0x20, R8 // <-- // add r8, -32
- LONG $0xedefd1c5 // VPXOR X5, X5, X5 // vpxor xmm5, xmm5, xmm5
- LONG $0x20c0f641 // TESTL $0x20, R8 // test r8b, 32
- JNE LBB0_9 // <-- // jne .LBB0_9
- LONG $0x2e6ffec5 // VMOVDQU 0(SI), Y5 // vmovdqu ymm5, ymmword ptr [rsi]
- ADDQ $0x20, SI // <-- // add rsi, 32
- LEAQ -0x20(AX), CX // <-- // lea rcx, [rax - 32]
- LONG $0xf0f6d5c5 // VPSADBW Y0, Y5, Y6 // vpsadbw ymm6, ymm5, ymm0
- LONG $0xf4fecdc5 // VPADDD Y4, Y6, Y6 // vpaddd ymm6, ymm6, ymm4
- LONG $0x0455e2c4; BYTE $0xe9 // VPMADDUBSW Y1, Y5, Y5 // vpmaddubsw ymm5, ymm5, ymm1
- LONG $0xeaf5d5c5 // VPMADDWD Y2, Y5, Y5 // vpmaddwd ymm5, ymm5, ymm2
- LONG $0xdbfed5c5 // VPADDD Y3, Y5, Y3 // vpaddd ymm3, ymm5, ymm3
- LONG $0xec6ffdc5 // VMOVDQA Y4, Y5 // vmovdqa ymm5, ymm4
- LONG $0xe66ffdc5 // VMOVDQA Y6, Y4 // vmovdqa ymm4, ymm6
- CMPQ R8, $0x20 // <-- // cmp r8, 32
- JAE LBB0_12 // <-- // jae .LBB0_12
- JMP LBB0_14 // <-- // jmp .LBB0_14
+block_loop_setup:
+ LONG $0xe06ef9c5
+ LONG $0xd96ef9c5
+ CMPQ DX, $0x15b0
+ LONG $0x15b0b841; WORD $0x0000
+ LONG $0xc2420f4c
+ WORD $0x8944; BYTE $0xc0
+ LONG $0x001fe025; BYTE $0x00
+ JE block_accum_init
+ ADDQ $-0x20, R8
+ LONG $0xedefd1c5
+ LONG $0x20c0f641
+ JNE block_loop_entry
+ LONG $0x2e6ffec5
+ ADDQ $0x20, SI
+ LEAQ -0x20(AX), CX
+ LONG $0xf0f6d5c5
+ LONG $0xf4fecdc5
+ LONG $0x0455e2c4; BYTE $0xe9
+ LONG $0xeaf5d5c5
+ LONG $0xdbfed5c5
+ LONG $0xec6ffdc5
+ LONG $0xe66ffdc5
+ CMPQ R8, $0x20
+ JAE block_loop_64
+ JMP block_reduce
-LBB0_9:
- MOVQ AX, CX // <-- // mov rcx, rax
- CMPQ R8, $0x20 // <-- // cmp r8, 32
- JB LBB0_14 // <-- // jb .LBB0_14
+block_loop_entry:
+ MOVQ AX, CX
+ CMPQ R8, $0x20
+ JB block_reduce
-LBB0_12:
- LONG $0x366ffec5 // VMOVDQU 0(SI), Y6 // vmovdqu ymm6, ymmword ptr [rsi]
- LONG $0x7e6ffec5; BYTE $0x20 // VMOVDQU 0x20(SI), Y7 // vmovdqu ymm7, ymmword ptr [rsi + 32]
- LONG $0xc0f64dc5 // VPSADBW Y0, Y6, Y8 // vpsadbw ymm8, ymm6, ymm0
- LONG $0xc4fe3dc5 // VPADDD Y4, Y8, Y8 // vpaddd ymm8, ymm8, ymm4
- LONG $0xecfed5c5 // VPADDD Y4, Y5, Y5 // vpaddd ymm5, ymm5, ymm4
- LONG $0x044de2c4; BYTE $0xe1 // VPMADDUBSW Y1, Y6, Y4 // vpmaddubsw ymm4, ymm6, ymm1
- LONG $0xe2f5ddc5 // VPMADDWD Y2, Y4, Y4 // vpmaddwd ymm4, ymm4, ymm2
- LONG $0xdbfeddc5 // VPADDD Y3, Y4, Y3 // vpaddd ymm3, ymm4, ymm3
- ADDQ $0x40, SI // <-- // add rsi, 64
- LONG $0xe0f6c5c5 // VPSADBW Y0, Y7, Y4 // vpsadbw ymm4, ymm7, ymm0
- LONG $0xe4febdc5 // VPADDD Y4, Y8, Y4 // vpaddd ymm4, ymm8, ymm4
- LONG $0xedfebdc5 // VPADDD Y5, Y8, Y5 // vpaddd ymm5, ymm8, ymm5
- LONG $0x0445e2c4; BYTE $0xf1 // VPMADDUBSW Y1, Y7, Y6 // vpmaddubsw ymm6, ymm7, ymm1
- LONG $0xf2f5cdc5 // VPMADDWD Y2, Y6, Y6 // vpmaddwd ymm6, ymm6, ymm2
- LONG $0xdbfecdc5 // VPADDD Y3, Y6, Y3 // vpaddd ymm3, ymm6, ymm3
- ADDQ $-0x40, CX // <-- // add rcx, -64
- JNE LBB0_12 // <-- // jne .LBB0_12
- LONG $0xf46ffdc5 // VMOVDQA Y4, Y6 // vmovdqa ymm6, ymm4
- JMP LBB0_14 // <-- // jmp .LBB0_14
+block_loop_64:
+ LONG $0x366ffec5
+ LONG $0x7e6ffec5; BYTE $0x20
+ LONG $0xc0f64dc5
+ LONG $0xc4fe3dc5
+ LONG $0xecfed5c5
+ LONG $0x044de2c4; BYTE $0xe1
+ LONG $0xe2f5ddc5
+ LONG $0xdbfeddc5
+ ADDQ $0x40, SI
+ LONG $0xe0f6c5c5
+ LONG $0xe4febdc5
+ LONG $0xedfebdc5
+ LONG $0x0445e2c4; BYTE $0xf1
+ LONG $0xf2f5cdc5
+ LONG $0xdbfecdc5
+ ADDQ $-0x40, CX
+ JNE block_loop_64
+ LONG $0xf46ffdc5
+ JMP block_reduce
-LBB0_1:
- LONG $0x000001b8; BYTE $0x00 // MOVL $0x1, AX // mov eax, 1
+return_one:
+ LONG $0x000001b8; BYTE $0x00
-LBB0_2:
- MOVL AX, ret+32(FP) // <--
- RET // <-- // ret
+return_result:
+ MOVL AX, ret+32(FP)
+ RET
-LBB0_15:
- WORD $0x8548; BYTE $0xd2 // TESTQ DX, DX // test rdx, rdx
- JE LBB0_16 // <-- // je .LBB0_16
+tail_check:
+ WORD $0x8548; BYTE $0xd2
+ JE return_no_tail
-LBB0_17:
- CMPQ DX, $0x10 // <-- // cmp rdx, 16
- JB LBB0_20 // <-- // jb .LBB0_20
- WORD $0xb60f; BYTE $0x3e // MOVZX 0(SI), DI // movzx edi, byte ptr [rsi]
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- LONG $0x017eb60f // MOVZX 0x1(SI), DI // movzx edi, byte ptr [rsi + 1]
- WORD $0xc701 // ADDL AX, DI // add edi, eax
- WORD $0xf901 // ADDL DI, CX // add ecx, edi
- LONG $0x0246b60f // MOVZX 0x2(SI), AX // movzx eax, byte ptr [rsi + 2]
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- LONG $0x037eb60f // MOVZX 0x3(SI), DI // movzx edi, byte ptr [rsi + 3]
- WORD $0xc701 // ADDL AX, DI // add edi, eax
- WORD $0xf901 // ADDL DI, CX // add ecx, edi
- LONG $0x0446b60f // MOVZX 0x4(SI), AX // movzx eax, byte ptr [rsi + 4]
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- LONG $0x057eb60f // MOVZX 0x5(SI), DI // movzx edi, byte ptr [rsi + 5]
- WORD $0xc701 // ADDL AX, DI // add edi, eax
- WORD $0xf901 // ADDL DI, CX // add ecx, edi
- LONG $0x0646b60f // MOVZX 0x6(SI), AX // movzx eax, byte ptr [rsi + 6]
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- LONG $0x077eb60f // MOVZX 0x7(SI), DI // movzx edi, byte ptr [rsi + 7]
- WORD $0xc701 // ADDL AX, DI // add edi, eax
- WORD $0xf901 // ADDL DI, CX // add ecx, edi
- LONG $0x0846b60f // MOVZX 0x8(SI), AX // movzx eax, byte ptr [rsi + 8]
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- LONG $0x097eb60f // MOVZX 0x9(SI), DI // movzx edi, byte ptr [rsi + 9]
- WORD $0xc701 // ADDL AX, DI // add edi, eax
- WORD $0xf901 // ADDL DI, CX // add ecx, edi
- LONG $0x0a46b60f // MOVZX 0xa(SI), AX // movzx eax, byte ptr [rsi + 10]
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- LONG $0x0b7eb60f // MOVZX 0xb(SI), DI // movzx edi, byte ptr [rsi + 11]
- WORD $0xc701 // ADDL AX, DI // add edi, eax
- WORD $0xf901 // ADDL DI, CX // add ecx, edi
- LONG $0x0c46b60f // MOVZX 0xc(SI), AX // movzx eax, byte ptr [rsi + 12]
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- LONG $0x0d7eb60f // MOVZX 0xd(SI), DI // movzx edi, byte ptr [rsi + 13]
- WORD $0xc701 // ADDL AX, DI // add edi, eax
- WORD $0xf901 // ADDL DI, CX // add ecx, edi
- LONG $0x46b60f44; BYTE $0x0e // MOVZX 0xe(SI), R8 // movzx r8d, byte ptr [rsi + 14]
- WORD $0x0141; BYTE $0xf8 // ADDL DI, R8 // add r8d, edi
- WORD $0x0144; BYTE $0xc1 // ADDL R8, CX // add ecx, r8d
- LONG $0x0f46b60f // MOVZX 0xf(SI), AX // movzx eax, byte ptr [rsi + 15]
- WORD $0x0144; BYTE $0xc0 // ADDL R8, AX // add eax, r8d
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- ADDQ $-0x10, DX // <-- // add rdx, -16
- JE LBB0_27 // <-- // je .LBB0_27
- ADDQ $0x10, SI // <-- // add rsi, 16
+tail16_check:
+ CMPQ DX, $0x10
+ JB tail_bytes_setup
+ WORD $0xb60f; BYTE $0x3e
+ WORD $0xf801
+ WORD $0xc101
+ LONG $0x017eb60f
+ WORD $0xc701
+ WORD $0xf901
+ LONG $0x0246b60f
+ WORD $0xf801
+ WORD $0xc101
+ LONG $0x037eb60f
+ WORD $0xc701
+ WORD $0xf901
+ LONG $0x0446b60f
+ WORD $0xf801
+ WORD $0xc101
+ LONG $0x057eb60f
+ WORD $0xc701
+ WORD $0xf901
+ LONG $0x0646b60f
+ WORD $0xf801
+ WORD $0xc101
+ LONG $0x077eb60f
+ WORD $0xc701
+ WORD $0xf901
+ LONG $0x0846b60f
+ WORD $0xf801
+ WORD $0xc101
+ LONG $0x097eb60f
+ WORD $0xc701
+ WORD $0xf901
+ LONG $0x0a46b60f
+ WORD $0xf801
+ WORD $0xc101
+ LONG $0x0b7eb60f
+ WORD $0xc701
+ WORD $0xf901
+ LONG $0x0c46b60f
+ WORD $0xf801
+ WORD $0xc101
+ LONG $0x0d7eb60f
+ WORD $0xc701
+ WORD $0xf901
+ LONG $0x46b60f44; BYTE $0x0e
+ WORD $0x0141; BYTE $0xf8
+ WORD $0x0144; BYTE $0xc1
+ LONG $0x0f46b60f
+ WORD $0x0144; BYTE $0xc0
+ WORD $0xc101
+ ADDQ $-0x10, DX
+ JE final_reduce
+ ADDQ $0x10, SI
-LBB0_20:
- LEAQ -0x1(DX), DI // <-- // lea rdi, [rdx - 1]
- MOVQ DX, R9 // <-- // mov r9, rdx
- ANDQ $0x3, R9 // <-- // and r9, 3
- JE LBB0_24 // <-- // je .LBB0_24
- XORL R8, R8 // <-- // xor r8d, r8d
+tail_bytes_setup:
+ LEAQ -0x1(DX), DI
+ MOVQ DX, R9
+ ANDQ $0x3, R9
+ JE tail_dword_setup
+ XORL R8, R8
-LBB0_22:
- LONG $0x14b60f46; BYTE $0x06 // MOVZX 0(SI)(R8*1), R10 // movzx r10d, byte ptr [rsi + r8]
- WORD $0x0144; BYTE $0xd0 // ADDL R10, AX // add eax, r10d
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- INCQ R8 // <-- // inc r8
- CMPQ R9, R8 // <-- // cmp r9, r8
- JNE LBB0_22 // <-- // jne .LBB0_22
- ADDQ R8, SI // <-- // add rsi, r8
- SUBQ R8, DX // <-- // sub rdx, r8
+tail_byte_loop:
+ LONG $0x14b60f46; BYTE $0x06
+ WORD $0x0144; BYTE $0xd0
+ WORD $0xc101
+ INCQ R8
+ CMPQ R9, R8
+ JNE tail_byte_loop
+ ADDQ R8, SI
+ SUBQ R8, DX
-LBB0_24:
- CMPQ DI, $0x3 // <-- // cmp rdi, 3
- JB LBB0_27 // <-- // jb .LBB0_27
- XORL DI, DI // <-- // xor edi, edi
+tail_dword_setup:
+ CMPQ DI, $0x3
+ JB final_reduce
+ XORL DI, DI
-LBB0_26:
- LONG $0x04b60f44; BYTE $0x3e // MOVZX 0(SI)(DI*1), R8 // movzx r8d, byte ptr [rsi + rdi]
- WORD $0x0141; BYTE $0xc0 // ADDL AX, R8 // add r8d, eax
- WORD $0x0144; BYTE $0xc1 // ADDL R8, CX // add ecx, r8d
- LONG $0x3e44b60f; BYTE $0x01 // MOVZX 0x1(SI)(DI*1), AX // movzx eax, byte ptr [rsi + rdi + 1]
- WORD $0x0144; BYTE $0xc0 // ADDL R8, AX // add eax, r8d
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- LONG $0x44b60f44; WORD $0x023e // MOVZX 0x2(SI)(DI*1), R8 // movzx r8d, byte ptr [rsi + rdi + 2]
- WORD $0x0141; BYTE $0xc0 // ADDL AX, R8 // add r8d, eax
- WORD $0x0144; BYTE $0xc1 // ADDL R8, CX // add ecx, r8d
- LONG $0x3e44b60f; BYTE $0x03 // MOVZX 0x3(SI)(DI*1), AX // movzx eax, byte ptr [rsi + rdi + 3]
- WORD $0x0144; BYTE $0xc0 // ADDL R8, AX // add eax, r8d
- WORD $0xc101 // ADDL AX, CX // add ecx, eax
- ADDQ $0x4, DI // <-- // add rdi, 4
- CMPQ DX, DI // <-- // cmp rdx, rdi
- JNE LBB0_26 // <-- // jne .LBB0_26
+tail_dword_loop:
+ LONG $0x04b60f44; BYTE $0x3e
+ WORD $0x0141; BYTE $0xc0
+ WORD $0x0144; BYTE $0xc1
+ LONG $0x3e44b60f; BYTE $0x01
+ WORD $0x0144; BYTE $0xc0
+ WORD $0xc101
+ LONG $0x44b60f44; WORD $0x023e
+ WORD $0x0141; BYTE $0xc0
+ WORD $0x0144; BYTE $0xc1
+ LONG $0x3e44b60f; BYTE $0x03
+ WORD $0x0144; BYTE $0xc0
+ WORD $0xc101
+ ADDQ $0x4, DI
+ CMPQ DX, DI
+ JNE tail_dword_loop
-LBB0_27:
- LONG $0x000f908d; WORD $0xffff // LEAL -0xfff1(AX), DX // lea edx, [rax - 65521]
- CMPL AX, $0xfff1 // <-- // cmp eax, 65521
- WORD $0x420f; BYTE $0xd0 // CMOVB AX, DX // cmovb edx, eax
- WORD $0xc889 // MOVL CX, AX // mov eax, ecx
- LONG $0x078071be; BYTE $0x80 // MOVL $-0x7ff87f8f, SI // mov esi, 2147975281
- IMULQ AX, SI // <-- // imul rsi, rax
- SHRQ $0x2f, SI // <-- // shr rsi, 47
- LONG $0xfff1c669; WORD $0x0000 // IMULL $0xfff1, SI, AX // imul eax, esi, 65521
- WORD $0xc129 // SUBL AX, CX // sub ecx, eax
- WORD $0xe1c1; BYTE $0x10 // SHLL $0x10, CX // shl ecx, 16
- WORD $0xd109 // ORL DX, CX // or ecx, edx
- WORD $0xc889 // MOVL CX, AX // mov eax, ecx
- NOP // (skipped) // mov rsp, rbp
- NOP // (skipped) // pop rbp
- VZEROUPPER // <-- // vzeroupper
- MOVL AX, ret+32(FP) // <--
- RET // <-- // ret
+final_reduce:
+ LONG $0x000f908d; WORD $0xffff
+ CMPL AX, $0xfff1
+ WORD $0x420f; BYTE $0xd0
+ WORD $0xc889
+ LONG $0x078071be; BYTE $0x80
+ IMULQ AX, SI
+ SHRQ $0x2f, SI
+ LONG $0xfff1c669; WORD $0x0000
+ WORD $0xc129
+ WORD $0xe1c1; BYTE $0x10
+ WORD $0xd109
+ WORD $0xc889
+ NOP
+ NOP
+ VZEROUPPER
+ MOVL AX, ret+32(FP)
+ RET
-LBB0_16:
- WORD $0xe1c1; BYTE $0x10 // SHLL $0x10, CX // shl ecx, 16
- WORD $0xc809 // ORL CX, AX // or eax, ecx
- NOP // (skipped) // mov rsp, rbp
- NOP // (skipped) // pop rbp
- VZEROUPPER // <-- // vzeroupper
- MOVL AX, ret+32(FP) // <--
- RET // <-- // ret
+return_no_tail:
+ WORD $0xe1c1; BYTE $0x10
+ WORD $0xc809
+ NOP
+ NOP
+ VZEROUPPER
+ MOVL AX, ret+32(FP)
+ RET
--- a/internal/adler32/adler32_neon.s
+++ b/internal/adler32/adler32_neon.s
@@ -17,192 +17,192 @@
MOVD buf_base+8(FP), R1
MOVD buf_len+16(FP), R2
MOVD buf_cap+24(FP), R3
- NOP // (skipped) // stp x29, x30, [sp, #-16]!
- ANDS $15, R1, R10 // <-- // ands x10, x1, #0xf
- ANDW $65535, R0, R8 // <-- // and w8, w0, #0xffff
- LSRW $16, R0, R9 // <-- // lsr w9, w0, #16
- NOP // (skipped) // mov x29, sp
- BEQ LBB0_4 // <-- // b.eq .LBB0_4
- ADD $1, R1, R11 // <-- // add x11, x1, #1
- MOVD R1, R12 // <-- // mov x12, x1
+ NOP
+ ANDS $15, R1, R10
+ ANDW $65535, R0, R8
+ LSRW $16, R0, R9
+ NOP
+ BEQ vector_loop_setup
+ ADD $1, R1, R11
+ MOVD R1, R12
-LBB0_2:
- WORD $0x3840158d // MOVBU.P 1(R12), R13 // ldrb w13, [x12], #1
- SUB $1, R2, R2 // <-- // sub x2, x2, #1
- TST $15, R11 // <-- // tst x11, #0xf
- ADD $1, R11, R11 // <-- // add x11, x11, #1
- ADDW R13, R8, R8 // <-- // add w8, w8, w13
- ADDW R9, R8, R9 // <-- // add w9, w8, w9
- BNE LBB0_2 // <-- // b.ne .LBB0_2
- MOVW $32881, R11 // <-- // mov w11, #32881
- MOVW $65521, R13 // <-- // mov w13, #65521
- MOVKW $(32775<<16), R11 // <-- // movk w11, #32775, lsl #16
- MOVW $4294901775, R12 // <-- // mov w12, #-65521
- MOVW $65520, R14 // <-- // mov w14, #65520
- SUB R10, R1, R10 // <-- // sub x10, x1, x10
- UMULL R11, R9, R11 // <-- // umull x11, w9, w11
- ADDW R12, R8, R12 // <-- // add w12, w8, w12
- CMPW R14, R8 // <-- // cmp w8, w14
- ADD $16, R10, R1 // <-- // add x1, x10, #16
- LSR $47, R11, R11 // <-- // lsr x11, x11, #47
- CSELW HI, R12, R8, R8 // <-- // csel w8, w12, w8, hi
- MSUBW R13, R9, R11, R9 // <-- // msub w9, w11, w13, w9
+align_loop:
+ WORD $0x3840158d
+ SUB $1, R2, R2
+ TST $15, R11
+ ADD $1, R11, R11
+ ADDW R13, R8, R8
+ ADDW R9, R8, R9
+ BNE align_loop
+ MOVW $32881, R11
+ MOVW $65521, R13
+ MOVKW $(32775<<16), R11
+ MOVW $4294901775, R12
+ MOVW $65520, R14
+ SUB R10, R1, R10
+ UMULL R11, R9, R11
+ ADDW R12, R8, R12
+ CMPW R14, R8
+ ADD $16, R10, R1
+ LSR $47, R11, R11
+ CSELW HI, R12, R8, R8
+ MSUBW R13, R9, R11, R9
-LBB0_4:
- AND $31, R2, R10 // <-- // and x10, x2, #0x1f
- CMP $32, R2 // <-- // cmp x2, #32
- BCC LBB0_9 // <-- // b.lo .LBB0_9
- MOVD $mult_table<>(SB), R11 // <-- // adrp x11, mult_table
- ADD $0, R11, R11 // <-- // add x11, x11, :lo12:mult_table
- MOVW $32881, R14 // <-- // mov w14, #32881
- MOVW $173, R12 // <-- // mov w12, #173
- MOVD $137438953440, R13 // <-- // mov x13, #137438953440
- MOVKW $(32775<<16), R14 // <-- // movk w14, #32775, lsl #16
- VLD1 (R11), [V0.H8, V1.H8, V2.H8, V3.H8] // <-- // ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x11]
- LSR $5, R2, R11 // <-- // lsr x11, x2, #5
- MOVW $65521, R15 // <-- // mov w15, #65521
- VEXT $8, V0.B16, V0.B16, V4.B16 // <-- // ext v4.16b, v0.16b, v0.16b, #8
- VEXT $8, V1.B16, V1.B16, V5.B16 // <-- // ext v5.16b, v1.16b, v1.16b, #8
- VEXT $8, V2.B16, V2.B16, V6.B16 // <-- // ext v6.16b, v2.16b, v2.16b, #8
- VEXT $8, V3.B16, V3.B16, V7.B16 // <-- // ext v7.16b, v3.16b, v3.16b, #8
+vector_loop_setup:
+ AND $31, R2, R10
+ CMP $32, R2
+ BCC tail_entry
+ MOVD $mult_table<>(SB), R11
+ ADD $0, R11, R11
+ MOVW $32881, R14
+ MOVW $173, R12
+ MOVD $137438953440, R13
+ MOVKW $(32775<<16), R14
+ VLD1 (R11), [V0.H8, V1.H8, V2.H8, V3.H8]
+ LSR $5, R2, R11
+ MOVW $65521, R15
+ VEXT $8, V0.B16, V0.B16, V4.B16
+ VEXT $8, V1.B16, V1.B16, V5.B16
+ VEXT $8, V2.B16, V2.B16, V6.B16
+ VEXT $8, V3.B16, V3.B16, V7.B16
-LBB0_6:
- CMP $173, R11 // <-- // cmp x11, #173
- MOVD R1, R2 // <-- // mov x2, x1
- CSEL LO, R11, R12, R16 // <-- // csel x16, x11, x12, lo
- WORD $0x6f00e414 // VMOVI $0, V20.D2 // movi v20.2d, #0000000000000000
- MULW R16, R8, R0 // <-- // mul w0, w8, w16
- ADD R16<<5, R13, R17 // <-- // add x17, x13, x16, lsl #5
- WORD $0x6f00e410 // VMOVI $0, V16.D2 // movi v16.2d, #0000000000000000
- AND $137438953440, R17, R17 // <-- // and x17, x17, #0x1fffffffe0
- WORD $0x6f00e412 // VMOVI $0, V18.D2 // movi v18.2d, #0000000000000000
- WORD $0x6f00e413 // VMOVI $0, V19.D2 // movi v19.2d, #0000000000000000
- WORD $0x6f00e415 // VMOVI $0, V21.D2 // movi v21.2d, #0000000000000000
- VMOV R0, V20.S[3] // <-- // mov v20.s[3], w0
- MOVW R16, R0 // <-- // mov w0, w16
- WORD $0x6f00e411 // VMOVI $0, V17.D2 // movi v17.2d, #0000000000000000
+vector_outer_loop:
+ CMP $173, R11
+ MOVD R1, R2
+ CSEL LO, R11, R12, R16
+ WORD $0x6f00e414
+ MULW R16, R8, R0
+ ADD R16<<5, R13, R17
+ WORD $0x6f00e410
+ AND $137438953440, R17, R17
+ WORD $0x6f00e412
+ WORD $0x6f00e413
+ WORD $0x6f00e415
+ VMOV R0, V20.S[3]
+ MOVW R16, R0
+ WORD $0x6f00e411
-LBB0_7:
- WORD $0xacc15857 // FLDPQ.P 32(R2), (F23, F22) // ldp q23, q22, [x2], #32
- SUBSW $1, R0, R0 // <-- // subs w0, w0, #1
- VADD V17.S4, V20.S4, V20.S4 // <-- // add v20.4s, v20.4s, v17.4s
- WORD $0x2e3712b5 // VUADDW V23.B8, V21.H8, V21.H8 // uaddw v21.8h, v21.8h, v23.8b
- WORD $0x6e371273 // VUADDW2 V23.B16, V19.H8, V19.H8 // uaddw2 v19.8h, v19.8h, v23.16b
- WORD $0x6e202ad8 // VUADDLP V22.B16, V24.H8 // uaddlp v24.8h, v22.16b
- WORD $0x2e361252 // VUADDW V22.B8, V18.H8, V18.H8 // uaddw v18.8h, v18.8h, v22.8b
- WORD $0x6e361210 // VUADDW2 V22.B16, V16.H8, V16.H8 // uaddw2 v16.8h, v16.8h, v22.16b
- WORD $0x6e206af8 // VUADALP V23.B16, V24.H8 // uadalp v24.8h, v23.16b
- WORD $0x6e606b11 // VUADALP V24.H8, V17.S4 // uadalp v17.4s, v24.8h
- BNE LBB0_7 // <-- // b.ne .LBB0_7
- VSHL $5, V20.S4, V20.S4 // <-- // shl v20.4s, v20.4s, #5
- ADD R17, R1, R17 // <-- // add x17, x1, x17
- SUBS R16, R11, R11 // <-- // subs x11, x11, x16
- ADD $32, R17, R1 // <-- // add x1, x17, #32
- WORD $0x2e6082b4 // VUMLAL V0.H4, V21.H4, V20.S4 // umlal v20.4s, v21.4h, v0.4h
- VEXT $8, V21.B16, V21.B16, V21.B16 // <-- // ext v21.16b, v21.16b, v21.16b, #8
- WORD $0x2e6482b4 // VUMLAL V4.H4, V21.H4, V20.S4 // umlal v20.4s, v21.4h, v4.4h
- VEXT $8, V19.B16, V19.B16, V21.B16 // <-- // ext v21.16b, v19.16b, v19.16b, #8
- WORD $0x2e618274 // VUMLAL V1.H4, V19.H4, V20.S4 // umlal v20.4s, v19.4h, v1.4h
- VEXT $8, V18.B16, V18.B16, V19.B16 // <-- // ext v19.16b, v18.16b, v18.16b, #8
- WORD $0x2e6582b4 // VUMLAL V5.H4, V21.H4, V20.S4 // umlal v20.4s, v21.4h, v5.4h
- WORD $0x2e628254 // VUMLAL V2.H4, V18.H4, V20.S4 // umlal v20.4s, v18.4h, v2.4h
- WORD $0x2e668274 // VUMLAL V6.H4, V19.H4, V20.S4 // umlal v20.4s, v19.4h, v6.4h
- WORD $0x2e638214 // VUMLAL V3.H4, V16.H4, V20.S4 // umlal v20.4s, v16.4h, v3.4h
- VEXT $8, V16.B16, V16.B16, V16.B16 // <-- // ext v16.16b, v16.16b, v16.16b, #8
- WORD $0x2e678214 // VUMLAL V7.H4, V16.H4, V20.S4 // umlal v20.4s, v16.4h, v7.4h
- WORD $0x4eb1be30 // VADDP V17.S4, V17.S4, V16.S4 // addp v16.4s, v17.4s, v17.4s
- WORD $0x4eb4be91 // VADDP V20.S4, V20.S4, V17.S4 // addp v17.4s, v20.4s, v20.4s
- WORD $0x0eb1be10 // VADDP V17.S2, V16.S2, V16.S2 // addp v16.2s, v16.2s, v17.2s
- VMOV V16.S[1], R0 // <-- // mov w0, v16.s[1]
- FMOVS F16, R2 // <-- // fmov w2, s16
- ADDW R8, R2, R8 // <-- // add w8, w2, w8
- ADDW R9, R0, R9 // <-- // add w9, w0, w9
- UMULL R14, R8, R0 // <-- // umull x0, w8, w14
- UMULL R14, R9, R2 // <-- // umull x2, w9, w14
- LSR $47, R0, R0 // <-- // lsr x0, x0, #47
- LSR $47, R2, R2 // <-- // lsr x2, x2, #47
- MSUBW R15, R8, R0, R8 // <-- // msub w8, w0, w15, w8
- MSUBW R15, R9, R2, R9 // <-- // msub w9, w2, w15, w9
- BNE LBB0_6 // <-- // b.ne .LBB0_6
+vector_inner_loop:
+ WORD $0xacc15857
+ SUBSW $1, R0, R0
+ VADD V17.S4, V20.S4, V20.S4
+ WORD $0x2e3712b5
+ WORD $0x6e371273
+ WORD $0x6e202ad8
+ WORD $0x2e361252
+ WORD $0x6e361210
+ WORD $0x6e206af8
+ WORD $0x6e606b11
+ BNE vector_inner_loop
+ VSHL $5, V20.S4, V20.S4
+ ADD R17, R1, R17
+ SUBS R16, R11, R11
+ ADD $32, R17, R1
+ WORD $0x2e6082b4
+ VEXT $8, V21.B16, V21.B16, V21.B16
+ WORD $0x2e6482b4
+ VEXT $8, V19.B16, V19.B16, V21.B16
+ WORD $0x2e618274
+ VEXT $8, V18.B16, V18.B16, V19.B16
+ WORD $0x2e6582b4
+ WORD $0x2e628254
+ WORD $0x2e668274
+ WORD $0x2e638214
+ VEXT $8, V16.B16, V16.B16, V16.B16
+ WORD $0x2e678214
+ WORD $0x4eb1be30
+ WORD $0x4eb4be91
+ WORD $0x0eb1be10
+ VMOV V16.S[1], R0
+ FMOVS F16, R2
+ ADDW R8, R2, R8
+ ADDW R9, R0, R9
+ UMULL R14, R8, R0
+ UMULL R14, R9, R2
+ LSR $47, R0, R0
+ LSR $47, R2, R2
+ MSUBW R15, R8, R0, R8
+ MSUBW R15, R9, R2, R9
+ BNE vector_outer_loop
-LBB0_9:
- CBZ R10, LBB0_15 // <-- // cbz x10, .LBB0_15
- CMP $16, R10 // <-- // cmp x10, #16
- BCC LBB0_13 // <-- // b.lo .LBB0_13
- WORD $0x3940002b // MOVBU (R1), R11 // ldrb w11, [x1]
- SUBS $16, R10, R10 // <-- // subs x10, x10, #16
- WORD $0x3940042c // MOVBU 1(R1), R12 // ldrb w12, [x1, #1]
- WORD $0x3940082d // MOVBU 2(R1), R13 // ldrb w13, [x1, #2]
- ADDW R11, R8, R8 // <-- // add w8, w8, w11
- WORD $0x39400c2b // MOVBU 3(R1), R11 // ldrb w11, [x1, #3]
- ADDW R9, R8, R9 // <-- // add w9, w8, w9
- ADDW R12, R8, R8 // <-- // add w8, w8, w12
- WORD $0x3940102c // MOVBU 4(R1), R12 // ldrb w12, [x1, #4]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R13, R8, R8 // <-- // add w8, w8, w13
- WORD $0x3940142d // MOVBU 5(R1), R13 // ldrb w13, [x1, #5]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R11, R8, R8 // <-- // add w8, w8, w11
- WORD $0x3940182b // MOVBU 6(R1), R11 // ldrb w11, [x1, #6]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R12, R8, R8 // <-- // add w8, w8, w12
- WORD $0x39401c2c // MOVBU 7(R1), R12 // ldrb w12, [x1, #7]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R13, R8, R8 // <-- // add w8, w8, w13
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R11, R8, R8 // <-- // add w8, w8, w11
- WORD $0x3940202b // MOVBU 8(R1), R11 // ldrb w11, [x1, #8]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R12, R8, R8 // <-- // add w8, w8, w12
- WORD $0x3940242c // MOVBU 9(R1), R12 // ldrb w12, [x1, #9]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- WORD $0x3940382d // MOVBU 14(R1), R13 // ldrb w13, [x1, #14]
- ADDW R11, R8, R8 // <-- // add w8, w8, w11
- WORD $0x3940282b // MOVBU 10(R1), R11 // ldrb w11, [x1, #10]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R12, R8, R8 // <-- // add w8, w8, w12
- WORD $0x39402c2c // MOVBU 11(R1), R12 // ldrb w12, [x1, #11]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R11, R8, R8 // <-- // add w8, w8, w11
- WORD $0x3940302b // MOVBU 12(R1), R11 // ldrb w11, [x1, #12]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R12, R8, R8 // <-- // add w8, w8, w12
- WORD $0x3940342c // MOVBU 13(R1), R12 // ldrb w12, [x1, #13]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R11, R8, R8 // <-- // add w8, w8, w11
- WORD $0x39403c2b // MOVBU 15(R1), R11 // ldrb w11, [x1, #15]
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R12, R8, R8 // <-- // add w8, w8, w12
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R13, R8, R8 // <-- // add w8, w8, w13
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- ADDW R11, R8, R8 // <-- // add w8, w8, w11
- ADDW R8, R9, R9 // <-- // add w9, w9, w8
- BEQ LBB0_14 // <-- // b.eq .LBB0_14
- ADD $16, R1, R1 // <-- // add x1, x1, #16
+tail_entry:
+ CBZ R10, return_result
+ CMP $16, R10
+ BCC tail_byte_loop
+ WORD $0x3940002b
+ SUBS $16, R10, R10
+ WORD $0x3940042c
+ WORD $0x3940082d
+ ADDW R11, R8, R8
+ WORD $0x39400c2b
+ ADDW R9, R8, R9
+ ADDW R12, R8, R8
+ WORD $0x3940102c
+ ADDW R8, R9, R9
+ ADDW R13, R8, R8
+ WORD $0x3940142d
+ ADDW R8, R9, R9
+ ADDW R11, R8, R8
+ WORD $0x3940182b
+ ADDW R8, R9, R9
+ ADDW R12, R8, R8
+ WORD $0x39401c2c
+ ADDW R8, R9, R9
+ ADDW R13, R8, R8
+ ADDW R8, R9, R9
+ ADDW R11, R8, R8
+ WORD $0x3940202b
+ ADDW R8, R9, R9
+ ADDW R12, R8, R8
+ WORD $0x3940242c
+ ADDW R8, R9, R9
+ WORD $0x3940382d
+ ADDW R11, R8, R8
+ WORD $0x3940282b
+ ADDW R8, R9, R9
+ ADDW R12, R8, R8
+ WORD $0x39402c2c
+ ADDW R8, R9, R9
+ ADDW R11, R8, R8
+ WORD $0x3940302b
+ ADDW R8, R9, R9
+ ADDW R12, R8, R8
+ WORD $0x3940342c
+ ADDW R8, R9, R9
+ ADDW R11, R8, R8
+ WORD $0x39403c2b
+ ADDW R8, R9, R9
+ ADDW R12, R8, R8
+ ADDW R8, R9, R9
+ ADDW R13, R8, R8
+ ADDW R8, R9, R9
+ ADDW R11, R8, R8
+ ADDW R8, R9, R9
+ BEQ final_reduce
+ ADD $16, R1, R1
-LBB0_13:
- WORD $0x3840142b // MOVBU.P 1(R1), R11 // ldrb w11, [x1], #1
- SUBS $1, R10, R10 // <-- // subs x10, x10, #1
- ADDW R11, R8, R8 // <-- // add w8, w8, w11
- ADDW R9, R8, R9 // <-- // add w9, w8, w9
- BNE LBB0_13 // <-- // b.ne .LBB0_13
+tail_byte_loop:
+ WORD $0x3840142b
+ SUBS $1, R10, R10
+ ADDW R11, R8, R8
+ ADDW R9, R8, R9
+ BNE tail_byte_loop
-LBB0_14:
- MOVW $32881, R10 // <-- // mov w10, #32881
- MOVW $65521, R12 // <-- // mov w12, #65521
- MOVKW $(32775<<16), R10 // <-- // movk w10, #32775, lsl #16
- MOVW $4294901775, R11 // <-- // mov w11, #-65521
- MOVW $65520, R13 // <-- // mov w13, #65520
- ADDW R11, R8, R11 // <-- // add w11, w8, w11
- UMULL R10, R9, R10 // <-- // umull x10, w9, w10
- CMPW R13, R8 // <-- // cmp w8, w13
- CSELW HI, R11, R8, R8 // <-- // csel w8, w11, w8, hi
- LSR $47, R10, R10 // <-- // lsr x10, x10, #47
- MSUBW R12, R9, R10, R9 // <-- // msub w9, w10, w12, w9
+final_reduce:
+ MOVW $32881, R10
+ MOVW $65521, R12
+ MOVKW $(32775<<16), R10
+ MOVW $4294901775, R11
+ MOVW $65520, R13
+ ADDW R11, R8, R11
+ UMULL R10, R9, R10
+ CMPW R13, R8
+ CSELW HI, R11, R8, R8
+ LSR $47, R10, R10
+ MSUBW R12, R9, R10, R9
-LBB0_15:
- ORRW R9<<16, R8, R0 // <-- // orr w0, w8, w9, lsl #16
- NOP // (skipped) // ldp x29, x30, [sp], #16
- MOVW R0, ret+32(FP) // <--
- RET // <-- // ret
+return_result:
+ ORRW R9<<16, R8, R0
+ NOP
+ MOVW R0, ret+32(FP)
+ RET
--- a/internal/adler32/adler32_sse3.s
+++ b/internal/adler32/adler32_sse3.s
@@ -19,196 +19,196 @@
MOVQ buf_base+8(FP), SI
MOVQ buf_len+16(FP), DX
MOVQ buf_cap+24(FP), CX
- NOP // (skipped) // push rbp
- NOP // (skipped) // mov rbp, rsp
- NOP // (skipped) // and rsp, -8
- WORD $0xf889 // MOVL DI, AX // mov eax, edi
- LONG $0xc8b70f44 // MOVZX AX, R9 // movzx r9d, ax
- WORD $0xe8c1; BYTE $0x10 // SHRL $0x10, AX // shr eax, 16
- WORD $0xd189 // MOVL DX, CX // mov ecx, edx
- WORD $0xe183; BYTE $0x1f // ANDL $0x1f, CX // and ecx, 31
- CMPQ DX, $0x20 // <-- // cmp rdx, 32
- JAE LBB0_2 // <-- // jae .LBB0_2
- WORD $0x8944; BYTE $0xcf // MOVL R9, DI // mov edi, r9d
- JMP LBB0_6 // <-- // jmp .LBB0_6
+ NOP
+ NOP
+ NOP
+ WORD $0xf889
+ LONG $0xc8b70f44
+ WORD $0xe8c1; BYTE $0x10
+ WORD $0xd189
+ WORD $0xe183; BYTE $0x1f
+ CMPQ DX, $0x20
+ JAE block_loop_setup
+ WORD $0x8944; BYTE $0xcf
+ JMP tail_entry
-LBB0_2:
- SHRQ $0x5, DX // <-- // shr rdx, 5
- LONG $0xc0ef0f66 // PXOR X0, X0 // pxor xmm0, xmm0
- MOVO LCPI0_0<>(SB), X1 // <-- // movdqa xmm1, xmmword ptr [rip + .LCPI0_0]
- MOVO LCPI0_1<>(SB), X2 // <-- // movdqa xmm2, xmmword ptr [rip + .LCPI0_1]
- MOVO LCPI0_2<>(SB), X3 // <-- // movdqa xmm3, xmmword ptr [rip + .LCPI0_2]
- LONG $0x8071b841; WORD $0x8007 // MOVL $-0x7ff87f8f, R8 // mov r8d, 2147975281
+block_loop_setup:
+ SHRQ $0x5, DX
+ LONG $0xc0ef0f66
+ MOVO LCPI0_0<>(SB), X1
+ MOVO LCPI0_1<>(SB), X2
+ MOVO LCPI0_2<>(SB), X3
+ LONG $0x8071b841; WORD $0x8007
-LBB0_3:
- CMPQ DX, $0xad // <-- // cmp rdx, 173
- LONG $0x00adba41; WORD $0x0000 // MOVL $0xad, R10 // mov r10d, 173
- LONG $0xd2420f4c // CMOVB DX, R10 // cmovb r10, rdx
- WORD $0x8944; BYTE $0xcf // MOVL R9, DI // mov edi, r9d
- LONG $0xfaaf0f41 // IMULL R10, DI // imul edi, r10d
- LONG $0xef6e0f66 // MOVD DI, X5 // movd xmm5, edi
- LONG $0xe06e0f66 // MOVD AX, X4 // movd xmm4, eax
- WORD $0x8944; BYTE $0xd0 // MOVL R10, AX // mov eax, r10d
- LONG $0xf6ef0f66 // PXOR X6, X6 // pxor xmm6, xmm6
+block_outer_loop:
+ CMPQ DX, $0xad
+ LONG $0x00adba41; WORD $0x0000
+ LONG $0xd2420f4c
+ WORD $0x8944; BYTE $0xcf
+ LONG $0xfaaf0f41
+ LONG $0xef6e0f66
+ LONG $0xe06e0f66
+ WORD $0x8944; BYTE $0xd0
+ LONG $0xf6ef0f66
-LBB0_4:
- LONG $0x3e6f0ff3 // MOVDQU 0(SI), X7 // movdqu xmm7, xmmword ptr [rsi]
- LONG $0x6f0f4466; BYTE $0xc7 // MOVDQA X7, X8 // movdqa xmm8, xmm7
- LONG $0x04380f66; BYTE $0xf9 // PMADDUBSW X1, X7 // pmaddubsw xmm7, xmm1
- LONG $0xfaf50f66 // PMADDWD X2, X7 // pmaddwd xmm7, xmm2
- LONG $0xfcfe0f66 // PADDD X4, X7 // paddd xmm7, xmm4
- LONG $0x666f0ff3; BYTE $0x10 // MOVDQU 0x10(SI), X4 // movdqu xmm4, xmmword ptr [rsi + 16]
- LONG $0xeefe0f66 // PADDD X6, X5 // paddd xmm5, xmm6
- LONG $0xf60f4466; BYTE $0xc0 // PSADBW X0, X8 // psadbw xmm8, xmm0
- LONG $0xfe0f4466; BYTE $0xc6 // PADDD X6, X8 // paddd xmm8, xmm6
- LONG $0xf46f0f66 // MOVDQA X4, X6 // movdqa xmm6, xmm4
- LONG $0xf0f60f66 // PSADBW X0, X6 // psadbw xmm6, xmm0
- LONG $0xfe0f4166; BYTE $0xf0 // PADDD X8, X6 // paddd xmm6, xmm8
- LONG $0x04380f66; BYTE $0xe3 // PMADDUBSW X3, X4 // pmaddubsw xmm4, xmm3
- LONG $0xe2f50f66 // PMADDWD X2, X4 // pmaddwd xmm4, xmm2
- LONG $0xe7fe0f66 // PADDD X7, X4 // paddd xmm4, xmm7
- ADDQ $0x20, SI // <-- // add rsi, 32
- WORD $0xc8ff // DECL AX // dec eax
- JNE LBB0_4 // <-- // jne .LBB0_4
- LONG $0xf5720f66; BYTE $0x05 // PSLLD $0x5, X5 // pslld xmm5, 5
- LONG $0xe5fe0f66 // PADDD X5, X4 // paddd xmm4, xmm5
- LONG $0xee700f66; BYTE $0xb1 // PSHUFD $0xb1, X6, X5 // pshufd xmm5, xmm6, 177
- LONG $0xeefe0f66 // PADDD X6, X5 // paddd xmm5, xmm6
- LONG $0xf5700f66; BYTE $0xee // PSHUFD $0xee, X5, X6 // pshufd xmm6, xmm5, 238
- LONG $0xf5fe0f66 // PADDD X5, X6 // paddd xmm6, xmm5
- LONG $0xf77e0f66 // MOVD X6, DI // movd edi, xmm6
- WORD $0x0144; BYTE $0xcf // ADDL R9, DI // add edi, r9d
- LONG $0xec700f66; BYTE $0xb1 // PSHUFD $0xb1, X4, X5 // pshufd xmm5, xmm4, 177
- LONG $0xecfe0f66 // PADDD X4, X5 // paddd xmm5, xmm4
- LONG $0xe5700f66; BYTE $0xee // PSHUFD $0xee, X5, X4 // pshufd xmm4, xmm5, 238
- LONG $0xe5fe0f66 // PADDD X5, X4 // paddd xmm4, xmm5
- LONG $0xe07e0f66 // MOVD X4, AX // movd eax, xmm4
- MOVQ DI, R9 // <-- // mov r9, rdi
- IMULQ R8, R9 // <-- // imul r9, r8
- SHRQ $0x2f, R9 // <-- // shr r9, 47
- LONG $0xf1c96945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R9, R9 // imul r9d, r9d, 65521
- WORD $0x2944; BYTE $0xcf // SUBL R9, DI // sub edi, r9d
- MOVQ AX, R9 // <-- // mov r9, rax
- IMULQ R8, R9 // <-- // imul r9, r8
- SHRQ $0x2f, R9 // <-- // shr r9, 47
- LONG $0xf1c96945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R9, R9 // imul r9d, r9d, 65521
- WORD $0x2944; BYTE $0xc8 // SUBL R9, AX // sub eax, r9d
- WORD $0x8941; BYTE $0xf9 // MOVL DI, R9 // mov r9d, edi
- SUBQ R10, DX // <-- // sub rdx, r10
- JNE LBB0_3 // <-- // jne .LBB0_3
+block_inner_loop:
+ LONG $0x3e6f0ff3
+ LONG $0x6f0f4466; BYTE $0xc7
+ LONG $0x04380f66; BYTE $0xf9
+ LONG $0xfaf50f66
+ LONG $0xfcfe0f66
+ LONG $0x666f0ff3; BYTE $0x10
+ LONG $0xeefe0f66
+ LONG $0xf60f4466; BYTE $0xc0
+ LONG $0xfe0f4466; BYTE $0xc6
+ LONG $0xf46f0f66
+ LONG $0xf0f60f66
+ LONG $0xfe0f4166; BYTE $0xf0
+ LONG $0x04380f66; BYTE $0xe3
+ LONG $0xe2f50f66
+ LONG $0xe7fe0f66
+ ADDQ $0x20, SI
+ WORD $0xc8ff
+ JNE block_inner_loop
+ LONG $0xf5720f66; BYTE $0x05
+ LONG $0xe5fe0f66
+ LONG $0xee700f66; BYTE $0xb1
+ LONG $0xeefe0f66
+ LONG $0xf5700f66; BYTE $0xee
+ LONG $0xf5fe0f66
+ LONG $0xf77e0f66
+ WORD $0x0144; BYTE $0xcf
+ LONG $0xec700f66; BYTE $0xb1
+ LONG $0xecfe0f66
+ LONG $0xe5700f66; BYTE $0xee
+ LONG $0xe5fe0f66
+ LONG $0xe07e0f66
+ MOVQ DI, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ LONG $0xf1c96945; WORD $0x00ff; BYTE $0x00
+ WORD $0x2944; BYTE $0xcf
+ MOVQ AX, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ LONG $0xf1c96945; WORD $0x00ff; BYTE $0x00
+ WORD $0x2944; BYTE $0xc8
+ WORD $0x8941; BYTE $0xf9
+ SUBQ R10, DX
+ JNE block_outer_loop
-LBB0_6:
- WORD $0x8548; BYTE $0xc9 // TESTQ CX, CX // test rcx, rcx
- JE LBB0_18 // <-- // je .LBB0_18
- CMPL CX, $0x10 // <-- // cmp ecx, 16
- JB LBB0_10 // <-- // jb .LBB0_10
- WORD $0xb60f; BYTE $0x16 // MOVZX 0(SI), DX // movzx edx, byte ptr [rsi]
- WORD $0xd701 // ADDL DX, DI // add edi, edx
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- LONG $0x0156b60f // MOVZX 0x1(SI), DX // movzx edx, byte ptr [rsi + 1]
- WORD $0xfa01 // ADDL DI, DX // add edx, edi
- WORD $0xd001 // ADDL DX, AX // add eax, edx
- LONG $0x027eb60f // MOVZX 0x2(SI), DI // movzx edi, byte ptr [rsi + 2]
- WORD $0xd701 // ADDL DX, DI // add edi, edx
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- LONG $0x0356b60f // MOVZX 0x3(SI), DX // movzx edx, byte ptr [rsi + 3]
- WORD $0xfa01 // ADDL DI, DX // add edx, edi
- WORD $0xd001 // ADDL DX, AX // add eax, edx
- LONG $0x047eb60f // MOVZX 0x4(SI), DI // movzx edi, byte ptr [rsi + 4]
- WORD $0xd701 // ADDL DX, DI // add edi, edx
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- LONG $0x0556b60f // MOVZX 0x5(SI), DX // movzx edx, byte ptr [rsi + 5]
- WORD $0xfa01 // ADDL DI, DX // add edx, edi
- WORD $0xd001 // ADDL DX, AX // add eax, edx
- LONG $0x067eb60f // MOVZX 0x6(SI), DI // movzx edi, byte ptr [rsi + 6]
- WORD $0xd701 // ADDL DX, DI // add edi, edx
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- LONG $0x0756b60f // MOVZX 0x7(SI), DX // movzx edx, byte ptr [rsi + 7]
- WORD $0xfa01 // ADDL DI, DX // add edx, edi
- WORD $0xd001 // ADDL DX, AX // add eax, edx
- LONG $0x087eb60f // MOVZX 0x8(SI), DI // movzx edi, byte ptr [rsi + 8]
- WORD $0xd701 // ADDL DX, DI // add edi, edx
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- LONG $0x0956b60f // MOVZX 0x9(SI), DX // movzx edx, byte ptr [rsi + 9]
- WORD $0xfa01 // ADDL DI, DX // add edx, edi
- WORD $0xd001 // ADDL DX, AX // add eax, edx
- LONG $0x0a7eb60f // MOVZX 0xa(SI), DI // movzx edi, byte ptr [rsi + 10]
- WORD $0xd701 // ADDL DX, DI // add edi, edx
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- LONG $0x0b56b60f // MOVZX 0xb(SI), DX // movzx edx, byte ptr [rsi + 11]
- WORD $0xfa01 // ADDL DI, DX // add edx, edi
- WORD $0xd001 // ADDL DX, AX // add eax, edx
- LONG $0x0c7eb60f // MOVZX 0xc(SI), DI // movzx edi, byte ptr [rsi + 12]
- WORD $0xd701 // ADDL DX, DI // add edi, edx
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- LONG $0x0d56b60f // MOVZX 0xd(SI), DX // movzx edx, byte ptr [rsi + 13]
- WORD $0xfa01 // ADDL DI, DX // add edx, edi
- WORD $0xd001 // ADDL DX, AX // add eax, edx
- LONG $0x46b60f44; BYTE $0x0e // MOVZX 0xe(SI), R8 // movzx r8d, byte ptr [rsi + 14]
- WORD $0x0141; BYTE $0xd0 // ADDL DX, R8 // add r8d, edx
- WORD $0x0144; BYTE $0xc0 // ADDL R8, AX // add eax, r8d
- LONG $0x0f7eb60f // MOVZX 0xf(SI), DI // movzx edi, byte ptr [rsi + 15]
- WORD $0x0144; BYTE $0xc7 // ADDL R8, DI // add edi, r8d
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- ADDQ $-0x10, CX // <-- // add rcx, -16
- JE LBB0_17 // <-- // je .LBB0_17
- ADDQ $0x10, SI // <-- // add rsi, 16
+tail_entry:
+ WORD $0x8548; BYTE $0xc9
+ JE return_result
+ CMPL CX, $0x10
+ JB tail_bytes_setup
+ WORD $0xb60f; BYTE $0x16
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0156b60f
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x027eb60f
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0356b60f
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x047eb60f
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0556b60f
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x067eb60f
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0756b60f
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x087eb60f
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0956b60f
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x0a7eb60f
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0b56b60f
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x0c7eb60f
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0d56b60f
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x46b60f44; BYTE $0x0e
+ WORD $0x0141; BYTE $0xd0
+ WORD $0x0144; BYTE $0xc0
+ LONG $0x0f7eb60f
+ WORD $0x0144; BYTE $0xc7
+ WORD $0xf801
+ ADDQ $-0x10, CX
+ JE final_reduce
+ ADDQ $0x10, SI
-LBB0_10:
- LEAQ -0x1(CX), DX // <-- // lea rdx, [rcx - 1]
- MOVQ CX, R9 // <-- // mov r9, rcx
- ANDQ $0x3, R9 // <-- // and r9, 3
- JE LBB0_14 // <-- // je .LBB0_14
- XORL R8, R8 // <-- // xor r8d, r8d
+tail_bytes_setup:
+ LEAQ -0x1(CX), DX
+ MOVQ CX, R9
+ ANDQ $0x3, R9
+ JE tail_dword_setup
+ XORL R8, R8
-LBB0_12:
- LONG $0x14b60f46; BYTE $0x06 // MOVZX 0(SI)(R8*1), R10 // movzx r10d, byte ptr [rsi + r8]
- WORD $0x0144; BYTE $0xd7 // ADDL R10, DI // add edi, r10d
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- INCQ R8 // <-- // inc r8
- CMPQ R9, R8 // <-- // cmp r9, r8
- JNE LBB0_12 // <-- // jne .LBB0_12
- ADDQ R8, SI // <-- // add rsi, r8
- SUBQ R8, CX // <-- // sub rcx, r8
+tail_byte_loop:
+ LONG $0x14b60f46; BYTE $0x06
+ WORD $0x0144; BYTE $0xd7
+ WORD $0xf801
+ INCQ R8
+ CMPQ R9, R8
+ JNE tail_byte_loop
+ ADDQ R8, SI
+ SUBQ R8, CX
-LBB0_14:
- CMPQ DX, $0x3 // <-- // cmp rdx, 3
- JB LBB0_17 // <-- // jb .LBB0_17
- XORL DX, DX // <-- // xor edx, edx
+tail_dword_setup:
+ CMPQ DX, $0x3
+ JB final_reduce
+ XORL DX, DX
-LBB0_16:
- LONG $0x04b60f44; BYTE $0x16 // MOVZX 0(SI)(DX*1), R8 // movzx r8d, byte ptr [rsi + rdx]
- WORD $0x0141; BYTE $0xf8 // ADDL DI, R8 // add r8d, edi
- WORD $0x0144; BYTE $0xc0 // ADDL R8, AX // add eax, r8d
- LONG $0x167cb60f; BYTE $0x01 // MOVZX 0x1(SI)(DX*1), DI // movzx edi, byte ptr [rsi + rdx + 1]
- WORD $0x0144; BYTE $0xc7 // ADDL R8, DI // add edi, r8d
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- LONG $0x44b60f44; WORD $0x0216 // MOVZX 0x2(SI)(DX*1), R8 // movzx r8d, byte ptr [rsi + rdx + 2]
- WORD $0x0141; BYTE $0xf8 // ADDL DI, R8 // add r8d, edi
- WORD $0x0144; BYTE $0xc0 // ADDL R8, AX // add eax, r8d
- LONG $0x167cb60f; BYTE $0x03 // MOVZX 0x3(SI)(DX*1), DI // movzx edi, byte ptr [rsi + rdx + 3]
- WORD $0x0144; BYTE $0xc7 // ADDL R8, DI // add edi, r8d
- WORD $0xf801 // ADDL DI, AX // add eax, edi
- ADDQ $0x4, DX // <-- // add rdx, 4
- CMPQ CX, DX // <-- // cmp rcx, rdx
- JNE LBB0_16 // <-- // jne .LBB0_16
+tail_dword_loop:
+ LONG $0x04b60f44; BYTE $0x16
+ WORD $0x0141; BYTE $0xf8
+ WORD $0x0144; BYTE $0xc0
+ LONG $0x167cb60f; BYTE $0x01
+ WORD $0x0144; BYTE $0xc7
+ WORD $0xf801
+ LONG $0x44b60f44; WORD $0x0216
+ WORD $0x0141; BYTE $0xf8
+ WORD $0x0144; BYTE $0xc0
+ LONG $0x167cb60f; BYTE $0x03
+ WORD $0x0144; BYTE $0xc7
+ WORD $0xf801
+ ADDQ $0x4, DX
+ CMPQ CX, DX
+ JNE tail_dword_loop
-LBB0_17:
- LONG $0x000f8f8d; WORD $0xffff // LEAL -0xfff1(DI), CX // lea ecx, [rdi - 65521]
- CMPL DI, $0xfff1 // <-- // cmp edi, 65521
- WORD $0x420f; BYTE $0xcf // CMOVB DI, CX // cmovb ecx, edi
- WORD $0xc289 // MOVL AX, DX // mov edx, eax
- LONG $0x078071be; BYTE $0x80 // MOVL $-0x7ff87f8f, SI // mov esi, 2147975281
- IMULQ DX, SI // <-- // imul rsi, rdx
- SHRQ $0x2f, SI // <-- // shr rsi, 47
- LONG $0xfff1d669; WORD $0x0000 // IMULL $0xfff1, SI, DX // imul edx, esi, 65521
- WORD $0xd029 // SUBL DX, AX // sub eax, edx
- WORD $0xcf89 // MOVL CX, DI // mov edi, ecx
+final_reduce:
+ LONG $0x000f8f8d; WORD $0xffff
+ CMPL DI, $0xfff1
+ WORD $0x420f; BYTE $0xcf
+ WORD $0xc289
+ LONG $0x078071be; BYTE $0x80
+ IMULQ DX, SI
+ SHRQ $0x2f, SI
+ LONG $0xfff1d669; WORD $0x0000
+ WORD $0xd029
+ WORD $0xcf89
-LBB0_18:
- WORD $0xe0c1; BYTE $0x10 // SHLL $0x10, AX // shl eax, 16
- WORD $0xf809 // ORL DI, AX // or eax, edi
- NOP // (skipped) // mov rsp, rbp
- NOP // (skipped) // pop rbp
- MOVL AX, ret+32(FP) // <--
- RET // <-- // ret
+return_result:
+ WORD $0xe0c1; BYTE $0x10
+ WORD $0xf809
+ NOP
+ NOP
+ MOVL AX, ret+32(FP)
+ RET
--
⑨