shithub: furgit

Download patch

ref: 4d930c1478f6b9e4851e84e139b4cdf4450fb963
parent: fafbd2f4802df3b858e0e12c3472a4498c20854b
author: Runxi Yu <me@runxiyu.org>
date: Fri Dec 19 17:51:57 EST 2025

adler32: Use meaningful label names and remove generated comments

--- a/internal/adler32/adler32_avx2.s
+++ b/internal/adler32/adler32_avx2.s
@@ -22,242 +22,242 @@
 	MOVQ         buf_base+8(FP), SI
 	MOVQ         buf_len+16(FP), DX
 	MOVQ         buf_cap+24(FP), CX
-	WORD         $0x8548; BYTE $0xf6     // TESTQ SI, SI                         // test	rsi, rsi
-	JE           LBB0_1                  // <--                                  // je	.LBB0_1
-	WORD         $0xf889                 // MOVL DI, AX                          // mov	eax, edi
-	WORD         $0x8548; BYTE $0xd2     // TESTQ DX, DX                         // test	rdx, rdx
-	JE           LBB0_2                  // <--                                  // je	.LBB0_2
-	NOP                                  // (skipped)                            // push	rbp
-	NOP                                  // (skipped)                            // mov	rbp, rsp
-	NOP                                  // (skipped)                            // and	rsp, -8
-	WORD         $0xc189                 // MOVL AX, CX                          // mov	ecx, eax
-	WORD         $0xe9c1; BYTE $0x10     // SHRL $0x10, CX                       // shr	ecx, 16
-	WORD         $0xb70f; BYTE $0xc0     // MOVZX AX, AX                         // movzx	eax, ax
-	CMPQ         DX, $0x20               // <--                                  // cmp	rdx, 32
-	JB           LBB0_17                 // <--                                  // jb	.LBB0_17
-	LONG         $0x078071bf; BYTE $0x80 // MOVL $-0x7ff87f8f, DI                // mov	edi, 2147975281
-	LONG         $0xc0eff9c5             // VPXOR X0, X0, X0                     // vpxor	xmm0, xmm0, xmm0
-	VMOVDQA      LCPI0_0<>(SB), Y1       // <--                                  // vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_0]
-	VPBROADCASTW LCPI0_2<>(SB), Y2       // <--                                  // vpbroadcastw	ymm2, word ptr [rip + .LCPI0_2]
-	JMP          LBB0_6                  // <--                                  // jmp	.LBB0_6
+	WORD         $0x8548; BYTE $0xf6     // TESTQ SI, SI: is buf_base nil?
+	JE           return_one              // nil buffer: return 1
+	WORD         $0xf889                 // MOVL DI, AX (DI is loaded above this hunk; presumably the incoming checksum - confirm)
+	WORD         $0x8548; BYTE $0xd2     // TESTQ DX, DX: is buf_len zero?
+	JE           return_result           // empty buffer: return AX unchanged
+	NOP                                  // stands in for the compiler's skipped "push rbp"
+	NOP                                  // stands in for the skipped "mov rbp, rsp"
+	NOP                                  // stands in for the skipped "and rsp, -8"
+	WORD         $0xc189                 // MOVL AX, CX (overwrites the buf_cap value loaded above)
+	WORD         $0xe9c1; BYTE $0x10     // SHRL $0x10, CX: CX = upper 16 bits of the input value
+	WORD         $0xb70f; BYTE $0xc0     // MOVZX AX, AX: AX = lower 16 bits of the input value
+	CMPQ         DX, $0x20               // fewer than 32 bytes?
+	JB           tail16_check            // yes: skip the 32-byte vector blocks
+	LONG         $0x078071bf; BYTE $0x80 // MOVL $0x80078071, DI: multiplier used with SHRQ $47 for the mod-65521 reductions
+	LONG         $0xc0eff9c5             // VPXOR X0, X0, X0: zero operand for the VPSADBW instructions
+	VMOVDQA      LCPI0_0<>(SB), Y1       // constant vector (defined outside this hunk) used as the VPMADDUBSW multiplier
+	VPBROADCASTW LCPI0_2<>(SB), Y2       // broadcast 16-bit constant (defined outside this hunk) used as the VPMADDWD multiplier
+	JMP          block_loop_setup
 
-LBB0_7:
-	LONG $0xf46ffdc5 // VMOVDQA Y4, Y6                       // vmovdqa	ymm6, ymm4
-	LONG $0xedefd1c5 // VPXOR X5, X5, X5                     // vpxor	xmm5, xmm5, xmm5
+block_accum_init: // taken when AX & 0x1fe0 == 0 in block_loop_setup; falls through into block_reduce
+	LONG $0xf46ffdc5 // VMOVDQA Y4, Y6: Y6 = Y4, the accumulator block_reduce sums into AX
+	LONG $0xedefd1c5 // VPXOR X5, X5, X5: Y5 = 0
 
-LBB0_14:
-	SUBQ  AX, DX                                // <--                                  // sub	rdx, rax
-	LONG  $0xf572ddc5; BYTE $0x05               // ?                                    // vpslld	ymm4, ymm5, 5
-	LONG  $0xdbfeddc5                           // VPADDD Y3, Y4, Y3                    // vpaddd	ymm3, ymm4, ymm3
-	LONG  $0x397de3c4; WORD $0x01f4             // VEXTRACTI128 $0x1, Y6, X4            // vextracti128	xmm4, ymm6, 1
-	LONG  $0xecc6c8c5; BYTE $0x88               // VSHUFPS $-0x78, X4, X6, X5           // vshufps	xmm5, xmm6, xmm4, 136
-	LONG  $0xe470f9c5; BYTE $0x88               // VPSHUFD $-0x78, X4, X4               // vpshufd	xmm4, xmm4, 136
-	LONG  $0xe4fed1c5                           // VPADDD X4, X5, X4                    // vpaddd	xmm4, xmm5, xmm4
-	LONG  $0xec70f9c5; BYTE $0x55               // VPSHUFD $0x55, X4, X5                // vpshufd	xmm5, xmm4, 85
-	LONG  $0xe4fed1c5                           // VPADDD X4, X5, X4                    // vpaddd	xmm4, xmm5, xmm4
-	LONG  $0xe07ef9c5                           // VMOVD X4, AX                         // vmovd	eax, xmm4
-	MOVQ  AX, CX                                // <--                                  // mov	rcx, rax
-	IMULQ DI, CX                                // <--                                  // imul	rcx, rdi
-	SHRQ  $0x2f, CX                             // <--                                  // shr	rcx, 47
-	LONG  $0xfff1c969; WORD $0x0000             // IMULL $0xfff1, CX, CX                // imul	ecx, ecx, 65521
-	WORD  $0xc829                               // SUBL CX, AX                          // sub	eax, ecx
-	LONG  $0x397de3c4; WORD $0x01dc             // VEXTRACTI128 $0x1, Y3, X4            // vextracti128	xmm4, ymm3, 1
-	LONG  $0xdbfed9c5                           // VPADDD X3, X4, X3                    // vpaddd	xmm3, xmm4, xmm3
-	LONG  $0xe370f9c5; BYTE $0xee               // VPSHUFD $-0x12, X3, X4               // vpshufd	xmm4, xmm3, 238
-	LONG  $0xdcfee1c5                           // VPADDD X4, X3, X3                    // vpaddd	xmm3, xmm3, xmm4
-	LONG  $0xe370f9c5; BYTE $0x55               // VPSHUFD $0x55, X3, X4                // vpshufd	xmm4, xmm3, 85
-	LONG  $0xdbfed9c5                           // VPADDD X3, X4, X3                    // vpaddd	xmm3, xmm4, xmm3
-	LONG  $0xd97ef9c5                           // VMOVD X3, CX                         // vmovd	ecx, xmm3
-	MOVQ  CX, R8                                // <--                                  // mov	r8, rcx
-	IMULQ DI, R8                                // <--                                  // imul	r8, rdi
-	SHRQ  $0x2f, R8                             // <--                                  // shr	r8, 47
-	LONG  $0xf1c06945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R8, R8                // imul	r8d, r8d, 65521
-	WORD  $0x2944; BYTE $0xc1                   // SUBL R8, CX                          // sub	ecx, r8d
-	CMPQ  DX, $0x1f                             // <--                                  // cmp	rdx, 31
-	JBE   LBB0_15                               // <--                                  // jbe	.LBB0_15
+block_reduce: // fold the vector accumulators Y6 and Y3 into scalars AX and CX, each reduced mod 65521
+	SUBQ  AX, DX                                // DX -= AX: AX still holds the chunk byte count from block_loop_setup
+	LONG  $0xf572ddc5; BYTE $0x05               // VPSLLD $5, Y5, Y4: Y4 = Y5 << 5 in every dword
+	LONG  $0xdbfeddc5                           // VPADDD Y3, Y4, Y3: Y3 += Y4
+	LONG  $0x397de3c4; WORD $0x01f4             // VEXTRACTI128 $0x1, Y6, X4: X4 = upper 128 bits of Y6
+	LONG  $0xecc6c8c5; BYTE $0x88               // VSHUFPS $0x88, X4, X6, X5: X5 = dwords 0,2 of X6 followed by dwords 0,2 of X4
+	LONG  $0xe470f9c5; BYTE $0x88               // VPSHUFD $0x88, X4, X4: X4 = its own dwords 0,2,0,2
+	LONG  $0xe4fed1c5                           // VPADDD X4, X5, X4
+	LONG  $0xec70f9c5; BYTE $0x55               // VPSHUFD $0x55, X4, X5: broadcast dword 1
+	LONG  $0xe4fed1c5                           // VPADDD X4, X5, X4: dword 0 = sum of the even dwords of Y6
+	LONG  $0xe07ef9c5                           // VMOVD X4, AX
+	MOVQ  AX, CX                                // start AX %= 65521: quotient = (AX * DI) >> 47, DI = 0x80078071
+	IMULQ DI, CX
+	SHRQ  $0x2f, CX
+	LONG  $0xfff1c969; WORD $0x0000             // IMULL $0xfff1, CX, CX: CX = quotient * 65521
+	WORD  $0xc829                               // SUBL CX, AX: AX = remainder
+	LONG  $0x397de3c4; WORD $0x01dc             // VEXTRACTI128 $0x1, Y3, X4: X4 = upper 128 bits of Y3
+	LONG  $0xdbfed9c5                           // VPADDD X3, X4, X3
+	LONG  $0xe370f9c5; BYTE $0xee               // VPSHUFD $0xee, X3, X4: X4 = dwords 2,3,2,3 of X3
+	LONG  $0xdcfee1c5                           // VPADDD X4, X3, X3
+	LONG  $0xe370f9c5; BYTE $0x55               // VPSHUFD $0x55, X3, X4: broadcast dword 1
+	LONG  $0xdbfed9c5                           // VPADDD X3, X4, X3: dword 0 = sum of all eight dwords of Y3
+	LONG  $0xd97ef9c5                           // VMOVD X3, CX
+	MOVQ  CX, R8                                // same mod-65521 sequence for CX, using R8 for the quotient
+	IMULQ DI, R8
+	SHRQ  $0x2f, R8
+	LONG  $0xf1c06945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R8, R8
+	WORD  $0x2944; BYTE $0xc1                   // SUBL R8, CX
+	CMPQ  DX, $0x1f                             // 31 bytes or fewer left?
+	JBE   tail_check                            // yes: leave the vector path; otherwise fall through to block_loop_setup
 
-LBB0_6:
-	LONG $0xe06ef9c5               // VMOVD AX, X4                         // vmovd	xmm4, eax
-	LONG $0xd96ef9c5               // VMOVD CX, X3                         // vmovd	xmm3, ecx
-	CMPQ DX, $0x15b0               // <--                                  // cmp	rdx, 5552
-	LONG $0x15b0b841; WORD $0x0000 // MOVL $0x15b0, R8                     // mov	r8d, 5552
-	LONG $0xc2420f4c               // CMOVB DX, R8                         // cmovb	r8, rdx
-	WORD $0x8944; BYTE $0xc0       // MOVL R8, AX                          // mov	eax, r8d
-	LONG $0x001fe025; BYTE $0x00   // ANDL $0x1fe0, AX                     // and	eax, 8160
-	JE   LBB0_7                    // <--                                  // je	.LBB0_7
-	ADDQ $-0x20, R8                // <--                                  // add	r8, -32
-	LONG $0xedefd1c5               // VPXOR X5, X5, X5                     // vpxor	xmm5, xmm5, xmm5
-	LONG $0x20c0f641               // TESTL $0x20, R8                      // test	r8b, 32
-	JNE  LBB0_9                    // <--                                  // jne	.LBB0_9
-	LONG $0x2e6ffec5               // VMOVDQU 0(SI), Y5                    // vmovdqu	ymm5, ymmword ptr [rsi]
-	ADDQ $0x20, SI                 // <--                                  // add	rsi, 32
-	LEAQ -0x20(AX), CX             // <--                                  // lea	rcx, [rax - 32]
-	LONG $0xf0f6d5c5               // VPSADBW Y0, Y5, Y6                   // vpsadbw	ymm6, ymm5, ymm0
-	LONG $0xf4fecdc5               // VPADDD Y4, Y6, Y6                    // vpaddd	ymm6, ymm6, ymm4
-	LONG $0x0455e2c4; BYTE $0xe9   // VPMADDUBSW Y1, Y5, Y5                // vpmaddubsw	ymm5, ymm5, ymm1
-	LONG $0xeaf5d5c5               // VPMADDWD Y2, Y5, Y5                  // vpmaddwd	ymm5, ymm5, ymm2
-	LONG $0xdbfed5c5               // VPADDD Y3, Y5, Y3                    // vpaddd	ymm3, ymm5, ymm3
-	LONG $0xec6ffdc5               // VMOVDQA Y4, Y5                       // vmovdqa	ymm5, ymm4
-	LONG $0xe66ffdc5               // VMOVDQA Y6, Y4                       // vmovdqa	ymm4, ymm6
-	CMPQ R8, $0x20                 // <--                                  // cmp	r8, 32
-	JAE  LBB0_12                   // <--                                  // jae	.LBB0_12
-	JMP  LBB0_14                   // <--                                  // jmp	.LBB0_14
+block_loop_setup: // start one chunk of at most 5552 bytes: seed accumulators, size the chunk, peel one 32-byte block if needed
+	LONG $0xe06ef9c5               // VMOVD AX, X4: seed the byte-sum accumulator with the running AX
+	LONG $0xd96ef9c5               // VMOVD CX, X3: seed the weighted-sum accumulator with the running CX
+	CMPQ DX, $0x15b0               // remaining length vs 5552
+	LONG $0x15b0b841; WORD $0x0000 // MOVL $0x15b0, R8
+	LONG $0xc2420f4c               // CMOVB DX, R8: R8 = min(DX, 5552)
+	WORD $0x8944; BYTE $0xc0       // MOVL R8, AX
+	LONG $0x001fe025; BYTE $0x00   // ANDL $0x1fe0, AX: AX = chunk size rounded down to a multiple of 32
+	JE   block_accum_init          // no whole 32-byte block in this chunk
+	ADDQ $-0x20, R8                // R8 -= 32
+	LONG $0xedefd1c5               // VPXOR X5, X5, X5: Y5 = 0
+	LONG $0x20c0f641               // TESTL $0x20, R8 (test r8b, 32): bit 5 of R8-32
+	JNE  block_loop_entry          // bit set: go straight to the 64-byte loop with CX = AX
+	LONG $0x2e6ffec5               // VMOVDQU 0(SI), Y5: load one 32-byte block
+	ADDQ $0x20, SI                 // advance the read pointer past it
+	LEAQ -0x20(AX), CX             // CX = bytes left for the 64-byte loop
+	LONG $0xf0f6d5c5               // VPSADBW Y0, Y5, Y6: byte sums per 64-bit lane (Y0 is zero)
+	LONG $0xf4fecdc5               // VPADDD Y4, Y6, Y6: Y6 = byte sums + seeded Y4
+	LONG $0x0455e2c4; BYTE $0xe9   // VPMADDUBSW Y1, Y5, Y5: multiply bytes by the Y1 constants, add adjacent pairs
+	LONG $0xeaf5d5c5               // VPMADDWD Y2, Y5, Y5: multiply words by Y2, add adjacent pairs into dwords
+	LONG $0xdbfed5c5               // VPADDD Y3, Y5, Y3: Y3 += weighted sums
+	LONG $0xec6ffdc5               // VMOVDQA Y4, Y5: Y5 = byte-sum accumulator as it was before this block
+	LONG $0xe66ffdc5               // VMOVDQA Y6, Y4: Y4 = updated byte-sum accumulator
+	CMPQ R8, $0x20                 // R8 has already had 32 subtracted
+	JAE  block_loop_64             // more blocks remain in this chunk
+	JMP  block_reduce              // chunk was a single block
 
-LBB0_9:
-	MOVQ AX, CX    // <--                                  // mov	rcx, rax
-	CMPQ R8, $0x20 // <--                                  // cmp	r8, 32
-	JB   LBB0_14   // <--                                  // jb	.LBB0_14
+block_loop_entry: // entered when no single 32-byte block was peeled; falls through into block_loop_64
+	MOVQ AX, CX // CX = byte count for the 64-byte loop
+	CMPQ R8, $0x20 // R8 has already had 32 subtracted in block_loop_setup
+	JB   block_reduce // nothing for the 64-byte loop to do
 
-LBB0_12:
-	LONG $0x366ffec5             // VMOVDQU 0(SI), Y6                    // vmovdqu	ymm6, ymmword ptr [rsi]
-	LONG $0x7e6ffec5; BYTE $0x20 // VMOVDQU 0x20(SI), Y7                 // vmovdqu	ymm7, ymmword ptr [rsi + 32]
-	LONG $0xc0f64dc5             // VPSADBW Y0, Y6, Y8                   // vpsadbw	ymm8, ymm6, ymm0
-	LONG $0xc4fe3dc5             // VPADDD Y4, Y8, Y8                    // vpaddd	ymm8, ymm8, ymm4
-	LONG $0xecfed5c5             // VPADDD Y4, Y5, Y5                    // vpaddd	ymm5, ymm5, ymm4
-	LONG $0x044de2c4; BYTE $0xe1 // VPMADDUBSW Y1, Y6, Y4                // vpmaddubsw	ymm4, ymm6, ymm1
-	LONG $0xe2f5ddc5             // VPMADDWD Y2, Y4, Y4                  // vpmaddwd	ymm4, ymm4, ymm2
-	LONG $0xdbfeddc5             // VPADDD Y3, Y4, Y3                    // vpaddd	ymm3, ymm4, ymm3
-	ADDQ $0x40, SI               // <--                                  // add	rsi, 64
-	LONG $0xe0f6c5c5             // VPSADBW Y0, Y7, Y4                   // vpsadbw	ymm4, ymm7, ymm0
-	LONG $0xe4febdc5             // VPADDD Y4, Y8, Y4                    // vpaddd	ymm4, ymm8, ymm4
-	LONG $0xedfebdc5             // VPADDD Y5, Y8, Y5                    // vpaddd	ymm5, ymm8, ymm5
-	LONG $0x0445e2c4; BYTE $0xf1 // VPMADDUBSW Y1, Y7, Y6                // vpmaddubsw	ymm6, ymm7, ymm1
-	LONG $0xf2f5cdc5             // VPMADDWD Y2, Y6, Y6                  // vpmaddwd	ymm6, ymm6, ymm2
-	LONG $0xdbfecdc5             // VPADDD Y3, Y6, Y3                    // vpaddd	ymm3, ymm6, ymm3
-	ADDQ $-0x40, CX              // <--                                  // add	rcx, -64
-	JNE  LBB0_12                 // <--                                  // jne	.LBB0_12
-	LONG $0xf46ffdc5             // VMOVDQA Y4, Y6                       // vmovdqa	ymm6, ymm4
-	JMP  LBB0_14                 // <--                                  // jmp	.LBB0_14
+block_loop_64: // main vector loop: consumes 64 bytes (two 32-byte blocks) per iteration until CX reaches zero
+	LONG $0x366ffec5             // VMOVDQU 0(SI), Y6: first 32-byte block
+	LONG $0x7e6ffec5; BYTE $0x20 // VMOVDQU 0x20(SI), Y7: second 32-byte block
+	LONG $0xc0f64dc5             // VPSADBW Y0, Y6, Y8: byte sums of the first block
+	LONG $0xc4fe3dc5             // VPADDD Y4, Y8, Y8: Y8 = Y4 + first-block byte sums
+	LONG $0xecfed5c5             // VPADDD Y4, Y5, Y5: Y5 += Y4 (byte-sum accumulator before the first block)
+	LONG $0x044de2c4; BYTE $0xe1 // VPMADDUBSW Y1, Y6, Y4: first block bytes times the Y1 constants
+	LONG $0xe2f5ddc5             // VPMADDWD Y2, Y4, Y4
+	LONG $0xdbfeddc5             // VPADDD Y3, Y4, Y3: Y3 += first-block weighted sums
+	ADDQ $0x40, SI               // advance the read pointer by 64 bytes
+	LONG $0xe0f6c5c5             // VPSADBW Y0, Y7, Y4: byte sums of the second block
+	LONG $0xe4febdc5             // VPADDD Y4, Y8, Y4: Y4 = Y8 + second-block byte sums
+	LONG $0xedfebdc5             // VPADDD Y5, Y8, Y5: Y5 += Y8 (byte-sum accumulator before the second block)
+	LONG $0x0445e2c4; BYTE $0xf1 // VPMADDUBSW Y1, Y7, Y6: second block bytes times the Y1 constants
+	LONG $0xf2f5cdc5             // VPMADDWD Y2, Y6, Y6
+	LONG $0xdbfecdc5             // VPADDD Y3, Y6, Y3: Y3 += second-block weighted sums
+	ADDQ $-0x40, CX              // 64 fewer bytes to go; sets ZF when the chunk is done
+	JNE  block_loop_64
+	LONG $0xf46ffdc5             // VMOVDQA Y4, Y6: block_reduce reads the byte-sum accumulator from Y6
+	JMP  block_reduce
 
-LBB0_1:
-	LONG $0x000001b8; BYTE $0x00 // MOVL $0x1, AX                        // mov	eax, 1
+return_one: // nil buf_base: result is 1; falls through into return_result
+	LONG $0x000001b8; BYTE $0x00 // MOVL $0x1, AX
 
-LBB0_2:
-	MOVL AX, ret+32(FP) // <--
-	RET                 // <--                                  // ret
+return_result: // early exit taken before any vector register is written, so there is no VZEROUPPER here
+	MOVL AX, ret+32(FP) // store AX as the 32-bit return value
+	RET
 
-LBB0_15:
-	WORD $0x8548; BYTE $0xd2 // TESTQ DX, DX                         // test	rdx, rdx
-	JE   LBB0_16             // <--                                  // je	.LBB0_16
+tail_check: // reached from block_reduce with at most 31 bytes left
+	WORD $0x8548; BYTE $0xd2 // TESTQ DX, DX: any bytes left?
+	JE   return_no_tail // none: AX and CX are already reduced; otherwise fall through into tail16_check
 
-LBB0_17:
-	CMPQ DX, $0x10               // <--                                  // cmp	rdx, 16
-	JB   LBB0_20                 // <--                                  // jb	.LBB0_20
-	WORD $0xb60f; BYTE $0x3e     // MOVZX 0(SI), DI                      // movzx	edi, byte ptr [rsi]
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
-	LONG $0x017eb60f             // MOVZX 0x1(SI), DI                    // movzx	edi, byte ptr [rsi + 1]
-	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
-	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
-	LONG $0x0246b60f             // MOVZX 0x2(SI), AX                    // movzx	eax, byte ptr [rsi + 2]
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
-	LONG $0x037eb60f             // MOVZX 0x3(SI), DI                    // movzx	edi, byte ptr [rsi + 3]
-	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
-	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
-	LONG $0x0446b60f             // MOVZX 0x4(SI), AX                    // movzx	eax, byte ptr [rsi + 4]
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
-	LONG $0x057eb60f             // MOVZX 0x5(SI), DI                    // movzx	edi, byte ptr [rsi + 5]
-	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
-	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
-	LONG $0x0646b60f             // MOVZX 0x6(SI), AX                    // movzx	eax, byte ptr [rsi + 6]
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
-	LONG $0x077eb60f             // MOVZX 0x7(SI), DI                    // movzx	edi, byte ptr [rsi + 7]
-	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
-	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
-	LONG $0x0846b60f             // MOVZX 0x8(SI), AX                    // movzx	eax, byte ptr [rsi + 8]
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
-	LONG $0x097eb60f             // MOVZX 0x9(SI), DI                    // movzx	edi, byte ptr [rsi + 9]
-	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
-	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
-	LONG $0x0a46b60f             // MOVZX 0xa(SI), AX                    // movzx	eax, byte ptr [rsi + 10]
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
-	LONG $0x0b7eb60f             // MOVZX 0xb(SI), DI                    // movzx	edi, byte ptr [rsi + 11]
-	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
-	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
-	LONG $0x0c46b60f             // MOVZX 0xc(SI), AX                    // movzx	eax, byte ptr [rsi + 12]
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
-	LONG $0x0d7eb60f             // MOVZX 0xd(SI), DI                    // movzx	edi, byte ptr [rsi + 13]
-	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
-	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
-	LONG $0x46b60f44; BYTE $0x0e // MOVZX 0xe(SI), R8                    // movzx	r8d, byte ptr [rsi + 14]
-	WORD $0x0141; BYTE $0xf8     // ADDL DI, R8                          // add	r8d, edi
-	WORD $0x0144; BYTE $0xc1     // ADDL R8, CX                          // add	ecx, r8d
-	LONG $0x0f46b60f             // MOVZX 0xf(SI), AX                    // movzx	eax, byte ptr [rsi + 15]
-	WORD $0x0144; BYTE $0xc0     // ADDL R8, AX                          // add	eax, r8d
-	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
-	ADDQ $-0x10, DX              // <--                                  // add	rdx, -16
-	JE   LBB0_27                 // <--                                  // je	.LBB0_27
-	ADDQ $0x10, SI               // <--                                  // add	rsi, 16
+tail16_check: // scalar path: if at least 16 bytes remain, process 16 unrolled; DI is scratch from here on
+	CMPQ DX, $0x10
+	JB   tail_bytes_setup        // fewer than 16 bytes: go to the short tail loops
+	WORD $0xb60f; BYTE $0x3e     // MOVZX 0(SI), DI: byte 0
+	WORD $0xf801                 // ADDL DI, AX: running byte sum (alternates between AX and DI below)
+	WORD $0xc101                 // ADDL AX, CX: CX += running byte sum
+	LONG $0x017eb60f             // MOVZX 0x1(SI), DI: byte 1
+	WORD $0xc701                 // ADDL AX, DI
+	WORD $0xf901                 // ADDL DI, CX
+	LONG $0x0246b60f             // MOVZX 0x2(SI), AX: byte 2
+	WORD $0xf801                 // ADDL DI, AX
+	WORD $0xc101                 // ADDL AX, CX
+	LONG $0x037eb60f             // MOVZX 0x3(SI), DI: byte 3
+	WORD $0xc701                 // ADDL AX, DI
+	WORD $0xf901                 // ADDL DI, CX
+	LONG $0x0446b60f             // MOVZX 0x4(SI), AX: byte 4
+	WORD $0xf801                 // ADDL DI, AX
+	WORD $0xc101                 // ADDL AX, CX
+	LONG $0x057eb60f             // MOVZX 0x5(SI), DI: byte 5
+	WORD $0xc701                 // ADDL AX, DI
+	WORD $0xf901                 // ADDL DI, CX
+	LONG $0x0646b60f             // MOVZX 0x6(SI), AX: byte 6
+	WORD $0xf801                 // ADDL DI, AX
+	WORD $0xc101                 // ADDL AX, CX
+	LONG $0x077eb60f             // MOVZX 0x7(SI), DI: byte 7
+	WORD $0xc701                 // ADDL AX, DI
+	WORD $0xf901                 // ADDL DI, CX
+	LONG $0x0846b60f             // MOVZX 0x8(SI), AX: byte 8
+	WORD $0xf801                 // ADDL DI, AX
+	WORD $0xc101                 // ADDL AX, CX
+	LONG $0x097eb60f             // MOVZX 0x9(SI), DI: byte 9
+	WORD $0xc701                 // ADDL AX, DI
+	WORD $0xf901                 // ADDL DI, CX
+	LONG $0x0a46b60f             // MOVZX 0xa(SI), AX: byte 10
+	WORD $0xf801                 // ADDL DI, AX
+	WORD $0xc101                 // ADDL AX, CX
+	LONG $0x0b7eb60f             // MOVZX 0xb(SI), DI: byte 11
+	WORD $0xc701                 // ADDL AX, DI
+	WORD $0xf901                 // ADDL DI, CX
+	LONG $0x0c46b60f             // MOVZX 0xc(SI), AX: byte 12
+	WORD $0xf801                 // ADDL DI, AX
+	WORD $0xc101                 // ADDL AX, CX
+	LONG $0x0d7eb60f             // MOVZX 0xd(SI), DI: byte 13
+	WORD $0xc701                 // ADDL AX, DI
+	WORD $0xf901                 // ADDL DI, CX
+	LONG $0x46b60f44; BYTE $0x0e // MOVZX 0xe(SI), R8: byte 14
+	WORD $0x0141; BYTE $0xf8     // ADDL DI, R8
+	WORD $0x0144; BYTE $0xc1     // ADDL R8, CX
+	LONG $0x0f46b60f             // MOVZX 0xf(SI), AX: byte 15
+	WORD $0x0144; BYTE $0xc0     // ADDL R8, AX: running byte sum ends up back in AX
+	WORD $0xc101                 // ADDL AX, CX
+	ADDQ $-0x10, DX              // 16 bytes consumed
+	JE   final_reduce            // nothing left
+	ADDQ $0x10, SI               // advance the read pointer; falls through into tail_bytes_setup
 
-LBB0_20:
-	LEAQ -0x1(DX), DI // <--                                  // lea	rdi, [rdx - 1]
-	MOVQ DX, R9       // <--                                  // mov	r9, rdx
-	ANDQ $0x3, R9     // <--                                  // and	r9, 3
-	JE   LBB0_24      // <--                                  // je	.LBB0_24
-	XORL R8, R8       // <--                                  // xor	r8d, r8d
+tail_bytes_setup: // split the remaining DX bytes into DX%4 single bytes followed by groups of four
+	LEAQ -0x1(DX), DI // DI = DX - 1, tested in tail_dword_setup
+	MOVQ DX, R9
+	ANDQ $0x3, R9 // R9 = DX % 4
+	JE   tail_dword_setup // already a multiple of 4: skip the byte loop
+	XORL R8, R8 // R8 = 0: byte index for tail_byte_loop
 
-LBB0_22:
-	LONG $0x14b60f46; BYTE $0x06 // MOVZX 0(SI)(R8*1), R10               // movzx	r10d, byte ptr [rsi + r8]
-	WORD $0x0144; BYTE $0xd0     // ADDL R10, AX                         // add	eax, r10d
-	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
-	INCQ R8                      // <--                                  // inc	r8
-	CMPQ R9, R8                  // <--                                  // cmp	r9, r8
-	JNE  LBB0_22                 // <--                                  // jne	.LBB0_22
-	ADDQ R8, SI                  // <--                                  // add	rsi, r8
-	SUBQ R8, DX                  // <--                                  // sub	rdx, r8
+tail_byte_loop: // process R9 bytes one at a time, indexed by R8
+	LONG $0x14b60f46; BYTE $0x06 // MOVZX 0(SI)(R8*1), R10: load byte at SI+R8
+	WORD $0x0144; BYTE $0xd0     // ADDL R10, AX: AX += byte
+	WORD $0xc101                 // ADDL AX, CX: CX += AX
+	INCQ R8
+	CMPQ R9, R8
+	JNE  tail_byte_loop
+	ADDQ R8, SI                  // advance the read pointer past the bytes just consumed
+	SUBQ R8, DX                  // DX is now a multiple of 4; falls through into tail_dword_setup
 
-LBB0_24:
-	CMPQ DI, $0x3 // <--                                  // cmp	rdi, 3
-	JB   LBB0_27  // <--                                  // jb	.LBB0_27
-	XORL DI, DI   // <--                                  // xor	edi, edi
+tail_dword_setup: // decide whether any 4-byte groups remain
+	CMPQ DI, $0x3 // DI = (length on entry to tail_bytes_setup) - 1
+	JB   final_reduce // that length was 3 or less, so the byte loop already consumed everything
+	XORL DI, DI // DI = 0: byte index for tail_dword_loop
 
-LBB0_26:
-	LONG $0x04b60f44; BYTE $0x3e   // MOVZX 0(SI)(DI*1), R8                // movzx	r8d, byte ptr [rsi + rdi]
-	WORD $0x0141; BYTE $0xc0       // ADDL AX, R8                          // add	r8d, eax
-	WORD $0x0144; BYTE $0xc1       // ADDL R8, CX                          // add	ecx, r8d
-	LONG $0x3e44b60f; BYTE $0x01   // MOVZX 0x1(SI)(DI*1), AX              // movzx	eax, byte ptr [rsi + rdi + 1]
-	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX                          // add	eax, r8d
-	WORD $0xc101                   // ADDL AX, CX                          // add	ecx, eax
-	LONG $0x44b60f44; WORD $0x023e // MOVZX 0x2(SI)(DI*1), R8              // movzx	r8d, byte ptr [rsi + rdi + 2]
-	WORD $0x0141; BYTE $0xc0       // ADDL AX, R8                          // add	r8d, eax
-	WORD $0x0144; BYTE $0xc1       // ADDL R8, CX                          // add	ecx, r8d
-	LONG $0x3e44b60f; BYTE $0x03   // MOVZX 0x3(SI)(DI*1), AX              // movzx	eax, byte ptr [rsi + rdi + 3]
-	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX                          // add	eax, r8d
-	WORD $0xc101                   // ADDL AX, CX                          // add	ecx, eax
-	ADDQ $0x4, DI                  // <--                                  // add	rdi, 4
-	CMPQ DX, DI                    // <--                                  // cmp	rdx, rdi
-	JNE  LBB0_26                   // <--                                  // jne	.LBB0_26
+tail_dword_loop: // process four bytes per iteration, indexed by DI, until DI == DX; falls through into final_reduce
+	LONG $0x04b60f44; BYTE $0x3e   // MOVZX 0(SI)(DI*1), R8: byte 0 of the group
+	WORD $0x0141; BYTE $0xc0       // ADDL AX, R8: running byte sum (alternates between R8 and AX)
+	WORD $0x0144; BYTE $0xc1       // ADDL R8, CX
+	LONG $0x3e44b60f; BYTE $0x01   // MOVZX 0x1(SI)(DI*1), AX: byte 1
+	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX
+	WORD $0xc101                   // ADDL AX, CX
+	LONG $0x44b60f44; WORD $0x023e // MOVZX 0x2(SI)(DI*1), R8: byte 2
+	WORD $0x0141; BYTE $0xc0       // ADDL AX, R8
+	WORD $0x0144; BYTE $0xc1       // ADDL R8, CX
+	LONG $0x3e44b60f; BYTE $0x03   // MOVZX 0x3(SI)(DI*1), AX: byte 3
+	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX: running byte sum ends up back in AX
+	WORD $0xc101                   // ADDL AX, CX
+	ADDQ $0x4, DI
+	CMPQ DX, DI
+	JNE  tail_dword_loop
 
-LBB0_27:
-	LONG  $0x000f908d; WORD $0xffff // LEAL -0xfff1(AX), DX                 // lea	edx, [rax - 65521]
-	CMPL  AX, $0xfff1               // <--                                  // cmp	eax, 65521
-	WORD  $0x420f; BYTE $0xd0       // CMOVB AX, DX                         // cmovb	edx, eax
-	WORD  $0xc889                   // MOVL CX, AX                          // mov	eax, ecx
-	LONG  $0x078071be; BYTE $0x80   // MOVL $-0x7ff87f8f, SI                // mov	esi, 2147975281
-	IMULQ AX, SI                    // <--                                  // imul	rsi, rax
-	SHRQ  $0x2f, SI                 // <--                                  // shr	rsi, 47
-	LONG  $0xfff1c669; WORD $0x0000 // IMULL $0xfff1, SI, AX                // imul	eax, esi, 65521
-	WORD  $0xc129                   // SUBL AX, CX                          // sub	ecx, eax
-	WORD  $0xe1c1; BYTE $0x10       // SHLL $0x10, CX                       // shl	ecx, 16
-	WORD  $0xd109                   // ORL DX, CX                           // or	ecx, edx
-	WORD  $0xc889                   // MOVL CX, AX                          // mov	eax, ecx
-	NOP                             // (skipped)                            // mov	rsp, rbp
-	NOP                             // (skipped)                            // pop	rbp
-	VZEROUPPER                      // <--                                  // vzeroupper
-	MOVL  AX, ret+32(FP)            // <--
-	RET                             // <--                                  // ret
+final_reduce: // reduce AX and CX after the scalar tail, pack them as (CX << 16) | AX, and return
+	LONG  $0x000f908d; WORD $0xffff // LEAL -0xfff1(AX), DX: DX = AX - 65521
+	CMPL  AX, $0xfff1
+	WORD  $0x420f; BYTE $0xd0       // CMOVB AX, DX: DX = AX when AX < 65521 (a single conditional subtract)
+	WORD  $0xc889                   // MOVL CX, AX
+	LONG  $0x078071be; BYTE $0x80   // MOVL $0x80078071, SI: reload the multiplier (the tail code reuses DI as scratch)
+	IMULQ AX, SI                    // SI = CX * 0x80078071
+	SHRQ  $0x2f, SI                 // SI = quotient CX / 65521
+	LONG  $0xfff1c669; WORD $0x0000 // IMULL $0xfff1, SI, AX: AX = quotient * 65521
+	WORD  $0xc129                   // SUBL AX, CX: CX = CX mod 65521
+	WORD  $0xe1c1; BYTE $0x10       // SHLL $0x10, CX
+	WORD  $0xd109                   // ORL DX, CX: CX = (CX << 16) | DX
+	WORD  $0xc889                   // MOVL CX, AX
+	NOP                             // stands in for the compiler's skipped "mov rsp, rbp"
+	NOP                             // stands in for the skipped "pop rbp"
+	VZEROUPPER                      // clear upper YMM state before returning
+	MOVL  AX, ret+32(FP)            // store the packed result as the return value
+	RET
 
-LBB0_16:
-	WORD $0xe1c1; BYTE $0x10 // SHLL $0x10, CX                       // shl	ecx, 16
-	WORD $0xc809             // ORL CX, AX                           // or	eax, ecx
-	NOP                      // (skipped)                            // mov	rsp, rbp
-	NOP                      // (skipped)                            // pop	rbp
-	VZEROUPPER               // <--                                  // vzeroupper
-	MOVL AX, ret+32(FP)      // <--
-	RET                      // <--                                  // ret
+return_no_tail: // reached from tail_check with no bytes left; AX and CX were already reduced in block_reduce
+	WORD $0xe1c1; BYTE $0x10 // SHLL $0x10, CX
+	WORD $0xc809             // ORL CX, AX: AX = (CX << 16) | AX
+	NOP                      // stands in for the compiler's skipped "mov rsp, rbp"
+	NOP                      // stands in for the skipped "pop rbp"
+	VZEROUPPER               // clear upper YMM state before returning
+	MOVL AX, ret+32(FP)      // store the packed result as the return value
+	RET
--- a/internal/adler32/adler32_neon.s
+++ b/internal/adler32/adler32_neon.s
@@ -17,192 +17,192 @@
 	MOVD buf_base+8(FP), R1
 	MOVD buf_len+16(FP), R2
 	MOVD buf_cap+24(FP), R3
-	NOP                     // (skipped)                            // stp	x29, x30, [sp, #-16]!
-	ANDS $15, R1, R10       // <--                                  // ands	x10, x1, #0xf
-	ANDW $65535, R0, R8     // <--                                  // and	w8, w0, #0xffff
-	LSRW $16, R0, R9        // <--                                  // lsr	w9, w0, #16
-	NOP                     // (skipped)                            // mov	x29, sp
-	BEQ  LBB0_4             // <--                                  // b.eq	.LBB0_4
-	ADD  $1, R1, R11        // <--                                  // add	x11, x1, #1
-	MOVD R1, R12            // <--                                  // mov	x12, x1
+	NOP                   // stands in for the compiler's skipped "stp x29, x30, [sp, #-16]!"
+	ANDS $15, R1, R10     // R10 = buf_base & 15; Z is set when the buffer is 16-byte aligned
+	ANDW $65535, R0, R8   // R8 = low 16 bits of R0 (R0 is loaded above this hunk; presumably the incoming checksum - confirm)
+	LSRW $16, R0, R9      // R9 = high 16 bits of R0
+	NOP                   // stands in for the skipped "mov x29, sp"
+	BEQ  vector_loop_setup // already aligned: skip the byte-wise alignment loop
+	ADD  $1, R1, R11      // R11 = address following the first byte; align_loop tests it for alignment
+	MOVD R1, R12          // R12 = read cursor for align_loop
 
-LBB0_2:
-	WORD  $0x3840158d       // MOVBU.P 1(R12), R13                  // ldrb	w13, [x12], #1
-	SUB   $1, R2, R2        // <--                                  // sub	x2, x2, #1
-	TST   $15, R11          // <--                                  // tst	x11, #0xf
-	ADD   $1, R11, R11      // <--                                  // add	x11, x11, #1
-	ADDW  R13, R8, R8       // <--                                  // add	w8, w8, w13
-	ADDW  R9, R8, R9        // <--                                  // add	w9, w8, w9
-	BNE   LBB0_2            // <--                                  // b.ne	.LBB0_2
-	MOVW  $32881, R11       // <--                                  // mov	w11, #32881
-	MOVW  $65521, R13       // <--                                  // mov	w13, #65521
-	MOVKW $(32775<<16), R11 // <--                                  // movk	w11, #32775, lsl #16
-	MOVW  $4294901775, R12  // <--                                  // mov	w12, #-65521
-	MOVW  $65520, R14       // <--                                  // mov	w14, #65520
-	SUB   R10, R1, R10      // <--                                  // sub	x10, x1, x10
-	UMULL R11, R9, R11      // <--                                  // umull	x11, w9, w11
-	ADDW  R12, R8, R12      // <--                                  // add	w12, w8, w12
-	CMPW  R14, R8           // <--                                  // cmp	w8, w14
-	ADD   $16, R10, R1      // <--                                  // add	x1, x10, #16
-	LSR   $47, R11, R11     // <--                                  // lsr	x11, x11, #47
-	CSELW HI, R12, R8, R8   // <--                                  // csel	w8, w12, w8, hi
-	MSUBW R13, R9, R11, R9  // <--                                  // msub	w9, w11, w13, w9
+align_loop: // consume bytes one at a time until the next address is 16-byte aligned, then reduce R8 and R9 mod 65521
+	WORD  $0x3840158d       // MOVBU.P 1(R12), R13: load one byte, post-increment the cursor
+	SUB   $1, R2, R2        // one byte fewer remaining; NOTE(review): R2 is never tested in this loop - presumably the caller guarantees enough bytes to reach alignment, confirm
+	TST   $15, R11          // is the address after this byte 16-byte aligned?
+	ADD   $1, R11, R11
+	ADDW  R13, R8, R8       // R8 += byte
+	ADDW  R9, R8, R9        // R9 += R8
+	BNE   align_loop        // flags come from TST; the ADD/ADDW forms above do not set them
+	MOVW  $32881, R11       // low half of 0x80078071
+	MOVW  $65521, R13
+	MOVKW $(32775<<16), R11 // R11 = 0x80078071, multiplier for the divide by 65521
+	MOVW  $4294901775, R12  // R12 = -65521 as a 32-bit value
+	MOVW  $65520, R14
+	SUB   R10, R1, R10      // R10 = buf_base rounded down to 16 bytes
+	UMULL R11, R9, R11      // R11 = R9 * 0x80078071 (64-bit product)
+	ADDW  R12, R8, R12      // R12 = R8 - 65521
+	CMPW  R14, R8           // R8 > 65520?
+	ADD   $16, R10, R1      // R1 = first 16-byte-aligned address past buf_base
+	LSR   $47, R11, R11     // R11 = quotient R9 / 65521
+	CSELW HI, R12, R8, R8   // R8 = R8 - 65521 when R8 > 65520, else unchanged
+	MSUBW R13, R9, R11, R9  // R9 -= quotient * 65521; falls through into vector_loop_setup
 
-LBB0_4:
-	AND   $31, R2, R10                        // <--                                  // and	x10, x2, #0x1f
-	CMP   $32, R2                             // <--                                  // cmp	x2, #32
-	BCC   LBB0_9                              // <--                                  // b.lo	.LBB0_9
-	MOVD  $mult_table<>(SB), R11              // <--                                  // adrp	x11, mult_table
-	ADD   $0, R11, R11                        // <--                                  // add	x11, x11, :lo12:mult_table
-	MOVW  $32881, R14                         // <--                                  // mov	w14, #32881
-	MOVW  $173, R12                           // <--                                  // mov	w12, #173
-	MOVD  $137438953440, R13                  // <--                                  // mov	x13, #137438953440
-	MOVKW $(32775<<16), R14                   // <--                                  // movk	w14, #32775, lsl #16
-	VLD1  (R11), [V0.H8, V1.H8, V2.H8, V3.H8] // <--                                  // ld1	{ v0.8h, v1.8h, v2.8h, v3.8h }, [x11]
-	LSR   $5, R2, R11                         // <--                                  // lsr	x11, x2, #5
-	MOVW  $65521, R15                         // <--                                  // mov	w15, #65521
-	VEXT  $8, V0.B16, V0.B16, V4.B16          // <--                                  // ext	v4.16b, v0.16b, v0.16b, #8
-	VEXT  $8, V1.B16, V1.B16, V5.B16          // <--                                  // ext	v5.16b, v1.16b, v1.16b, #8
-	VEXT  $8, V2.B16, V2.B16, V6.B16          // <--                                  // ext	v6.16b, v2.16b, v2.16b, #8
-	VEXT  $8, V3.B16, V3.B16, V7.B16          // <--                                  // ext	v7.16b, v3.16b, v3.16b, #8
+vector_loop_setup: // load constants and the multiplier table for the 32-byte vector loop, or branch away when under 32 bytes remain
+	AND   $31, R2, R10                        // R10 = remaining length mod 32
+	CMP   $32, R2
+	BCC   tail_entry                          // fewer than 32 bytes: skip the vector loop
+	MOVD  $mult_table<>(SB), R11              // address of mult_table (defined outside this hunk)
+	ADD   $0, R11, R11                        // left over from the compiler's adrp/add :lo12: pair; adds zero
+	MOVW  $32881, R14                         // low half of 0x80078071
+	MOVW  $173, R12                           // cap on 32-byte blocks per vector_outer_loop pass
+	MOVD  $137438953440, R13                  // 0x1fffffffe0, used to form the per-pass pointer advance
+	MOVKW $(32775<<16), R14                   // R14 = 0x80078071
+	VLD1  (R11), [V0.H8, V1.H8, V2.H8, V3.H8] // load 64 bytes of mult_table into V0-V3
+	LSR   $5, R2, R11                         // R11 = number of whole 32-byte blocks
+	MOVW  $65521, R15
+	VEXT  $8, V0.B16, V0.B16, V4.B16          // V4 = V0 with its 64-bit halves swapped
+	VEXT  $8, V1.B16, V1.B16, V5.B16          // V5 = V1 with its 64-bit halves swapped
+	VEXT  $8, V2.B16, V2.B16, V6.B16          // V6 = V2 with its 64-bit halves swapped
+	VEXT  $8, V3.B16, V3.B16, V7.B16          // V7 = V3 with its 64-bit halves swapped
 
-LBB0_6:
-	CMP  $173, R11               // <--                                  // cmp	x11, #173
-	MOVD R1, R2                  // <--                                  // mov	x2, x1
-	CSEL LO, R11, R12, R16       // <--                                  // csel	x16, x11, x12, lo
-	WORD $0x6f00e414             // VMOVI $0, V20.D2                     // movi	v20.2d, #0000000000000000
-	MULW R16, R8, R0             // <--                                  // mul	w0, w8, w16
-	ADD  R16<<5, R13, R17        // <--                                  // add	x17, x13, x16, lsl #5
-	WORD $0x6f00e410             // VMOVI $0, V16.D2                     // movi	v16.2d, #0000000000000000
-	AND  $137438953440, R17, R17 // <--                                  // and	x17, x17, #0x1fffffffe0
-	WORD $0x6f00e412             // VMOVI $0, V18.D2                     // movi	v18.2d, #0000000000000000
-	WORD $0x6f00e413             // VMOVI $0, V19.D2                     // movi	v19.2d, #0000000000000000
-	WORD $0x6f00e415             // VMOVI $0, V21.D2                     // movi	v21.2d, #0000000000000000
-	VMOV R0, V20.S[3]            // <--                                  // mov	v20.s[3], w0
-	MOVW R16, R0                 // <--                                  // mov	w0, w16
-	WORD $0x6f00e411             // VMOVI $0, V17.D2                     // movi	v17.2d, #0000000000000000
+vector_outer_loop: // start one pass of at most 173 blocks: pick the block count and clear the vector accumulators
+	CMP  $173, R11
+	MOVD R1, R2                  // R2 = read cursor for this pass
+	CSEL LO, R11, R12, R16       // R16 = min(R11, 173) blocks in this pass
+	WORD $0x6f00e414             // VMOVI $0, V20.D2
+	MULW R16, R8, R0             // R0 = R8 * R16
+	ADD  R16<<5, R13, R17        // R17 = R16*32 + 0x1fffffffe0
+	WORD $0x6f00e410             // VMOVI $0, V16.D2
+	AND  $137438953440, R17, R17 // R17 = (R16 - 1) * 32 after masking
+	WORD $0x6f00e412             // VMOVI $0, V18.D2
+	WORD $0x6f00e413             // VMOVI $0, V19.D2
+	WORD $0x6f00e415             // VMOVI $0, V21.D2
+	VMOV R0, V20.S[3]            // lane 3 of V20 = R8 * R16
+	MOVW R16, R0                 // R0 = block counter for the loop that follows
+	WORD $0x6f00e411             // VMOVI $0, V17.D2
 
-LBB0_7:
-	WORD  $0xacc15857                   // FLDPQ.P 32(R2), (F23, F22)           // ldp	q23, q22, [x2], #32
-	SUBSW $1, R0, R0                    // <--                                  // subs	w0, w0, #1
-	VADD  V17.S4, V20.S4, V20.S4        // <--                                  // add	v20.4s, v20.4s, v17.4s
-	WORD  $0x2e3712b5                   // VUADDW V23.B8, V21.H8, V21.H8        // uaddw	v21.8h, v21.8h, v23.8b
-	WORD  $0x6e371273                   // VUADDW2 V23.B16, V19.H8, V19.H8      // uaddw2	v19.8h, v19.8h, v23.16b
-	WORD  $0x6e202ad8                   // VUADDLP V22.B16, V24.H8              // uaddlp	v24.8h, v22.16b
-	WORD  $0x2e361252                   // VUADDW V22.B8, V18.H8, V18.H8        // uaddw	v18.8h, v18.8h, v22.8b
-	WORD  $0x6e361210                   // VUADDW2 V22.B16, V16.H8, V16.H8      // uaddw2	v16.8h, v16.8h, v22.16b
-	WORD  $0x6e206af8                   // VUADALP V23.B16, V24.H8              // uadalp	v24.8h, v23.16b
-	WORD  $0x6e606b11                   // VUADALP V24.H8, V17.S4               // uadalp	v17.4s, v24.8h
-	BNE   LBB0_7                        // <--                                  // b.ne	.LBB0_7
-	VSHL  $5, V20.S4, V20.S4            // <--                                  // shl	v20.4s, v20.4s, #5
-	ADD   R17, R1, R17                  // <--                                  // add	x17, x1, x17
-	SUBS  R16, R11, R11                 // <--                                  // subs	x11, x11, x16
-	ADD   $32, R17, R1                  // <--                                  // add	x1, x17, #32
-	WORD  $0x2e6082b4                   // VUMLAL V0.H4, V21.H4, V20.S4         // umlal	v20.4s, v21.4h, v0.4h
-	VEXT  $8, V21.B16, V21.B16, V21.B16 // <--                                  // ext	v21.16b, v21.16b, v21.16b, #8
-	WORD  $0x2e6482b4                   // VUMLAL V4.H4, V21.H4, V20.S4         // umlal	v20.4s, v21.4h, v4.4h
-	VEXT  $8, V19.B16, V19.B16, V21.B16 // <--                                  // ext	v21.16b, v19.16b, v19.16b, #8
-	WORD  $0x2e618274                   // VUMLAL V1.H4, V19.H4, V20.S4         // umlal	v20.4s, v19.4h, v1.4h
-	VEXT  $8, V18.B16, V18.B16, V19.B16 // <--                                  // ext	v19.16b, v18.16b, v18.16b, #8
-	WORD  $0x2e6582b4                   // VUMLAL V5.H4, V21.H4, V20.S4         // umlal	v20.4s, v21.4h, v5.4h
-	WORD  $0x2e628254                   // VUMLAL V2.H4, V18.H4, V20.S4         // umlal	v20.4s, v18.4h, v2.4h
-	WORD  $0x2e668274                   // VUMLAL V6.H4, V19.H4, V20.S4         // umlal	v20.4s, v19.4h, v6.4h
-	WORD  $0x2e638214                   // VUMLAL V3.H4, V16.H4, V20.S4         // umlal	v20.4s, v16.4h, v3.4h
-	VEXT  $8, V16.B16, V16.B16, V16.B16 // <--                                  // ext	v16.16b, v16.16b, v16.16b, #8
-	WORD  $0x2e678214                   // VUMLAL V7.H4, V16.H4, V20.S4         // umlal	v20.4s, v16.4h, v7.4h
-	WORD  $0x4eb1be30                   // VADDP V17.S4, V17.S4, V16.S4         // addp	v16.4s, v17.4s, v17.4s
-	WORD  $0x4eb4be91                   // VADDP V20.S4, V20.S4, V17.S4         // addp	v17.4s, v20.4s, v20.4s
-	WORD  $0x0eb1be10                   // VADDP V17.S2, V16.S2, V16.S2         // addp	v16.2s, v16.2s, v17.2s
-	VMOV  V16.S[1], R0                  // <--                                  // mov	w0, v16.s[1]
-	FMOVS F16, R2                       // <--                                  // fmov	w2, s16
-	ADDW  R8, R2, R8                    // <--                                  // add	w8, w2, w8
-	ADDW  R9, R0, R9                    // <--                                  // add	w9, w0, w9
-	UMULL R14, R8, R0                   // <--                                  // umull	x0, w8, w14
-	UMULL R14, R9, R2                   // <--                                  // umull	x2, w9, w14
-	LSR   $47, R0, R0                   // <--                                  // lsr	x0, x0, #47
-	LSR   $47, R2, R2                   // <--                                  // lsr	x2, x2, #47
-	MSUBW R15, R8, R0, R8               // <--                                  // msub	w8, w0, w15, w8
-	MSUBW R15, R9, R2, R9               // <--                                  // msub	w9, w2, w15, w9
-	BNE   LBB0_6                        // <--                                  // b.ne	.LBB0_6
+vector_inner_loop:
+	WORD  $0xacc15857                   // ldp q23, q22, [x2], #32 - load 32 bytes into V23/V22 and advance R2
+	SUBSW $1, R0, R0                    // count down; these flags drive the BNE below (nothing in between sets NZCV)
+	VADD  V17.S4, V20.S4, V20.S4        // V20 += V17, using V17 as it was before this iteration's update
+	WORD  $0x2e3712b5                   // uaddw v21.8h, v21.8h, v23.8b
+	WORD  $0x6e371273                   // uaddw2 v19.8h, v19.8h, v23.16b
+	WORD  $0x6e202ad8                   // uaddlp v24.8h, v22.16b
+	WORD  $0x2e361252                   // uaddw v18.8h, v18.8h, v22.8b
+	WORD  $0x6e361210                   // uaddw2 v16.8h, v16.8h, v22.16b
+	WORD  $0x6e206af8                   // uadalp v24.8h, v23.16b
+	WORD  $0x6e606b11                   // uadalp v17.4s, v24.8h - V17 accumulates the byte sums of all 32 bytes
+	BNE   vector_inner_loop
+	VSHL  $5, V20.S4, V20.S4            // every 32-bit lane of V20 is multiplied by 32
+	ADD   R17, R1, R17
+	SUBS  R16, R11, R11                 // R11 -= R16; nothing below sets NZCV, so the closing BNE tests this result
+	ADD   $32, R17, R1                  // R1 = old R1 + R17 + 32
+	WORD  $0x2e6082b4                   // umlal v20.4s, v21.4h, v0.4h (V0-V7 are loaded before this hunk; presumably per-byte weights - confirm)
+	VEXT  $8, V21.B16, V21.B16, V21.B16 // swap the 64-bit halves of V21 so the next UMLAL reads its upper four lanes
+	WORD  $0x2e6482b4                   // umlal v20.4s, v21.4h, v4.4h
+	VEXT  $8, V19.B16, V19.B16, V21.B16 // V21 = V19 with its 64-bit halves swapped
+	WORD  $0x2e618274                   // umlal v20.4s, v19.4h, v1.4h
+	VEXT  $8, V18.B16, V18.B16, V19.B16 // V19 = V18 with its 64-bit halves swapped
+	WORD  $0x2e6582b4                   // umlal v20.4s, v21.4h, v5.4h
+	WORD  $0x2e628254                   // umlal v20.4s, v18.4h, v2.4h
+	WORD  $0x2e668274                   // umlal v20.4s, v19.4h, v6.4h
+	WORD  $0x2e638214                   // umlal v20.4s, v16.4h, v3.4h
+	VEXT  $8, V16.B16, V16.B16, V16.B16 // swap the 64-bit halves of V16
+	WORD  $0x2e678214                   // umlal v20.4s, v16.4h, v7.4h
+	WORD  $0x4eb1be30                   // addp v16.4s, v17.4s, v17.4s
+	WORD  $0x4eb4be91                   // addp v17.4s, v20.4s, v20.4s
+	WORD  $0x0eb1be10                   // addp v16.2s, v16.2s, v17.2s - S[0] = sum of the old V17 lanes, S[1] = sum of the V20 lanes
+	VMOV  V16.S[1], R0                  // R0 = horizontal sum of V20
+	FMOVS F16, R2                       // R2 = lane 0 of V16 = horizontal sum of V17
+	ADDW  R8, R2, R8                    // R8 += R2
+	ADDW  R9, R0, R9                    // R9 += R0
+	UMULL R14, R8, R0                   // R0 = R8 * R14 as a 64-bit product (R14 is set before this hunk)
+	UMULL R14, R9, R2                   // R2 = R9 * R14
+	LSR   $47, R0, R0
+	LSR   $47, R2, R2
+	MSUBW R15, R8, R0, R8               // R8 -= R0 * R15 (R15 is set before this hunk; presumably 65521 with R14 = 0x80078071, as in final_reduce - confirm)
+	MSUBW R15, R9, R2, R9               // R9 -= R2 * R15
+	BNE   vector_outer_loop             // repeat while R11 != 0 (flags from the SUBS above)
 
-LBB0_9:
-	CBZ  R10, LBB0_15  // <--                                  // cbz	x10, .LBB0_15
-	CMP  $16, R10      // <--                                  // cmp	x10, #16
-	BCC  LBB0_13       // <--                                  // b.lo	.LBB0_13
-	WORD $0x3940002b   // MOVBU (R1), R11                      // ldrb	w11, [x1]
-	SUBS $16, R10, R10 // <--                                  // subs	x10, x10, #16
-	WORD $0x3940042c   // MOVBU 1(R1), R12                     // ldrb	w12, [x1, #1]
-	WORD $0x3940082d   // MOVBU 2(R1), R13                     // ldrb	w13, [x1, #2]
-	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
-	WORD $0x39400c2b   // MOVBU 3(R1), R11                     // ldrb	w11, [x1, #3]
-	ADDW R9, R8, R9    // <--                                  // add	w9, w8, w9
-	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
-	WORD $0x3940102c   // MOVBU 4(R1), R12                     // ldrb	w12, [x1, #4]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R13, R8, R8   // <--                                  // add	w8, w8, w13
-	WORD $0x3940142d   // MOVBU 5(R1), R13                     // ldrb	w13, [x1, #5]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
-	WORD $0x3940182b   // MOVBU 6(R1), R11                     // ldrb	w11, [x1, #6]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
-	WORD $0x39401c2c   // MOVBU 7(R1), R12                     // ldrb	w12, [x1, #7]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R13, R8, R8   // <--                                  // add	w8, w8, w13
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
-	WORD $0x3940202b   // MOVBU 8(R1), R11                     // ldrb	w11, [x1, #8]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
-	WORD $0x3940242c   // MOVBU 9(R1), R12                     // ldrb	w12, [x1, #9]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	WORD $0x3940382d   // MOVBU 14(R1), R13                    // ldrb	w13, [x1, #14]
-	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
-	WORD $0x3940282b   // MOVBU 10(R1), R11                    // ldrb	w11, [x1, #10]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
-	WORD $0x39402c2c   // MOVBU 11(R1), R12                    // ldrb	w12, [x1, #11]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
-	WORD $0x3940302b   // MOVBU 12(R1), R11                    // ldrb	w11, [x1, #12]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
-	WORD $0x3940342c   // MOVBU 13(R1), R12                    // ldrb	w12, [x1, #13]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
-	WORD $0x39403c2b   // MOVBU 15(R1), R11                    // ldrb	w11, [x1, #15]
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R13, R8, R8   // <--                                  // add	w8, w8, w13
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
-	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
-	BEQ  LBB0_14       // <--                                  // b.eq	.LBB0_14
-	ADD  $16, R1, R1   // <--                                  // add	x1, x1, #16
+tail_entry:
+	CBZ  R10, return_result // R10 counts the bytes still to process; none left, so go pack the result
+	CMP  $16, R10
+	BCC  tail_byte_loop     // fewer than 16 bytes: handle them one at a time
+	WORD $0x3940002b        // ldrb w11, [x1]
+	SUBS $16, R10, R10      // R10 -= 16; nothing later in this run sets flags, so the BEQ at the end tests this result
+	WORD $0x3940042c        // ldrb w12, [x1, #1]
+	WORD $0x3940082d        // ldrb w13, [x1, #2]
+	ADDW R11, R8, R8        // R8 += byte 0
+	WORD $0x39400c2b        // ldrb w11, [x1, #3]
+	ADDW R9, R8, R9         // R9 += R8 (repeated after every byte below)
+	ADDW R12, R8, R8        // R8 += byte 1
+	WORD $0x3940102c        // ldrb w12, [x1, #4]
+	ADDW R8, R9, R9
+	ADDW R13, R8, R8        // R8 += byte 2
+	WORD $0x3940142d        // ldrb w13, [x1, #5]
+	ADDW R8, R9, R9
+	ADDW R11, R8, R8        // R8 += byte 3
+	WORD $0x3940182b        // ldrb w11, [x1, #6]
+	ADDW R8, R9, R9
+	ADDW R12, R8, R8        // R8 += byte 4
+	WORD $0x39401c2c        // ldrb w12, [x1, #7]
+	ADDW R8, R9, R9
+	ADDW R13, R8, R8        // R8 += byte 5
+	ADDW R8, R9, R9
+	ADDW R11, R8, R8        // R8 += byte 6
+	WORD $0x3940202b        // ldrb w11, [x1, #8]
+	ADDW R8, R9, R9
+	ADDW R12, R8, R8        // R8 += byte 7
+	WORD $0x3940242c        // ldrb w12, [x1, #9]
+	ADDW R8, R9, R9
+	WORD $0x3940382d        // ldrb w13, [x1, #14] - loaded early, consumed near the end of the run
+	ADDW R11, R8, R8        // R8 += byte 8
+	WORD $0x3940282b        // ldrb w11, [x1, #10]
+	ADDW R8, R9, R9
+	ADDW R12, R8, R8        // R8 += byte 9
+	WORD $0x39402c2c        // ldrb w12, [x1, #11]
+	ADDW R8, R9, R9
+	ADDW R11, R8, R8        // R8 += byte 10
+	WORD $0x3940302b        // ldrb w11, [x1, #12]
+	ADDW R8, R9, R9
+	ADDW R12, R8, R8        // R8 += byte 11
+	WORD $0x3940342c        // ldrb w12, [x1, #13]
+	ADDW R8, R9, R9
+	ADDW R11, R8, R8        // R8 += byte 12
+	WORD $0x39403c2b        // ldrb w11, [x1, #15]
+	ADDW R8, R9, R9
+	ADDW R12, R8, R8        // R8 += byte 13
+	ADDW R8, R9, R9
+	ADDW R13, R8, R8        // R8 += byte 14
+	ADDW R8, R9, R9
+	ADDW R11, R8, R8        // R8 += byte 15
+	ADDW R8, R9, R9
+	BEQ  final_reduce       // exactly 16 bytes were left: skip the byte loop
+	ADD  $16, R1, R1        // step past the 16 bytes just consumed
 
-LBB0_13:
-	WORD $0x3840142b  // MOVBU.P 1(R1), R11                   // ldrb	w11, [x1], #1
-	SUBS $1, R10, R10 // <--                                  // subs	x10, x10, #1
-	ADDW R11, R8, R8  // <--                                  // add	w8, w8, w11
-	ADDW R9, R8, R9   // <--                                  // add	w9, w8, w9
-	BNE  LBB0_13      // <--                                  // b.ne	.LBB0_13
+tail_byte_loop:
+	WORD $0x3840142b  // ldrb w11, [x1], #1 - load one byte and post-increment R1
+	SUBS $1, R10, R10 // one byte fewer to go; ADDW does not set flags, so the BNE below tests this result
+	ADDW R11, R8, R8  // R8 += byte
+	ADDW R9, R8, R9   // R9 += R8
+	BNE  tail_byte_loop
 
-LBB0_14:
-	MOVW  $32881, R10       // <--                                  // mov	w10, #32881
-	MOVW  $65521, R12       // <--                                  // mov	w12, #65521
-	MOVKW $(32775<<16), R10 // <--                                  // movk	w10, #32775, lsl #16
-	MOVW  $4294901775, R11  // <--                                  // mov	w11, #-65521
-	MOVW  $65520, R13       // <--                                  // mov	w13, #65520
-	ADDW  R11, R8, R11      // <--                                  // add	w11, w8, w11
-	UMULL R10, R9, R10      // <--                                  // umull	x10, w9, w10
-	CMPW  R13, R8           // <--                                  // cmp	w8, w13
-	CSELW HI, R11, R8, R8   // <--                                  // csel	w8, w11, w8, hi
-	LSR   $47, R10, R10     // <--                                  // lsr	x10, x10, #47
-	MSUBW R12, R9, R10, R9  // <--                                  // msub	w9, w10, w12, w9
+final_reduce:
+	MOVW  $32881, R10       // low half (0x8071) of the reciprocal constant
+	MOVW  $65521, R12       // the modulus used below
+	MOVKW $(32775<<16), R10 // R10 = 0x80078071
+	MOVW  $4294901775, R11  // 0xffff000f, i.e. -65521 as a 32-bit value
+	MOVW  $65520, R13
+	ADDW  R11, R8, R11      // R11 = R8 - 65521
+	UMULL R10, R9, R10      // R10 = R9 * 0x80078071 as a 64-bit product
+	CMPW  R13, R8           // compare R8 with 65520
+	CSELW HI, R11, R8, R8   // if R8 > 65520 then R8 = R8 - 65521 (a single conditional subtraction)
+	LSR   $47, R10, R10     // R10 = quotient R9 / 65521 by reciprocal multiplication
+	MSUBW R12, R9, R10, R9  // R9 = R9 - R10*65521, i.e. R9 mod 65521
 
-LBB0_15:
-	ORRW R9<<16, R8, R0 // <--                                  // orr	w0, w8, w9, lsl #16
-	NOP                 // (skipped)                            // ldp	x29, x30, [sp], #16
-	MOVW R0, ret+32(FP) // <--
-	RET                 // <--                                  // ret
+return_result:
+	ORRW R9<<16, R8, R0 // result = R8 | R9<<16 (R9 in the high half, R8 in the low half)
+	NOP                 // stands in for the native epilogue (ldp x29, x30, [sp], #16), which is not emitted here
+	MOVW R0, ret+32(FP) // store the 32-bit result in the Go return slot
+	RET
--- a/internal/adler32/adler32_sse3.s
+++ b/internal/adler32/adler32_sse3.s
@@ -19,196 +19,196 @@
 	MOVQ    buf_base+8(FP), SI
 	MOVQ    buf_len+16(FP), DX
 	MOVQ    buf_cap+24(FP), CX
-	NOP                         // (skipped)                            // push	rbp
-	NOP                         // (skipped)                            // mov	rbp, rsp
-	NOP                         // (skipped)                            // and	rsp, -8
-	WORD    $0xf889             // MOVL DI, AX                          // mov	eax, edi
-	LONG    $0xc8b70f44         // MOVZX AX, R9                         // movzx	r9d, ax
-	WORD    $0xe8c1; BYTE $0x10 // SHRL $0x10, AX                       // shr	eax, 16
-	WORD    $0xd189             // MOVL DX, CX                          // mov	ecx, edx
-	WORD    $0xe183; BYTE $0x1f // ANDL $0x1f, CX                       // and	ecx, 31
-	CMPQ    DX, $0x20           // <--                                  // cmp	rdx, 32
-	JAE     LBB0_2              // <--                                  // jae	.LBB0_2
-	WORD    $0x8944; BYTE $0xcf // MOVL R9, DI                          // mov	edi, r9d
-	JMP     LBB0_6              // <--                                  // jmp	.LBB0_6
+	NOP                         // stands in for the native prologue instruction push rbp, which is not emitted here
+	NOP                         // likewise for mov rbp, rsp
+	NOP                         // likewise for and rsp, -8
+	WORD    $0xf889             // mov eax, edi (DI is loaded before this hunk)
+	LONG    $0xc8b70f44         // movzx r9d, ax - R9 = low 16 bits of the incoming value
+	WORD    $0xe8c1; BYTE $0x10 // shr eax, 16 - AX = high 16 bits of the incoming value
+	WORD    $0xd189             // mov ecx, edx - overwrites the buf_cap value loaded into CX above
+	WORD    $0xe183; BYTE $0x1f // and ecx, 31 - CX = buf_len mod 32, the bytes left after whole 32-byte blocks
+	CMPQ    DX, $0x20
+	JAE     block_loop_setup    // at least one full 32-byte block: take the SIMD path
+	WORD    $0x8944; BYTE $0xcf // mov edi, r9d - the tail code keeps the low running sum in DI
+	JMP     tail_entry
 
-LBB0_2:
-	SHRQ $0x5, DX                  // <--                                  // shr	rdx, 5
-	LONG $0xc0ef0f66               // PXOR X0, X0                          // pxor	xmm0, xmm0
-	MOVO LCPI0_0<>(SB), X1         // <--                                  // movdqa	xmm1, xmmword ptr [rip + .LCPI0_0]
-	MOVO LCPI0_1<>(SB), X2         // <--                                  // movdqa	xmm2, xmmword ptr [rip + .LCPI0_1]
-	MOVO LCPI0_2<>(SB), X3         // <--                                  // movdqa	xmm3, xmmword ptr [rip + .LCPI0_2]
-	LONG $0x8071b841; WORD $0x8007 // MOVL $-0x7ff87f8f, R8                // mov	r8d, 2147975281
+block_loop_setup:
+	SHRQ $0x5, DX                  // DX = buf_len / 32 = number of whole 32-byte blocks
+	LONG $0xc0ef0f66               // pxor xmm0, xmm0 - X0 = 0, the second operand of every PSADBW in the loop
+	MOVO LCPI0_0<>(SB), X1         // constant defined outside this hunk, applied by PMADDUBSW to bytes 0..15 of each block
+	MOVO LCPI0_1<>(SB), X2         // constant defined outside this hunk, the PMADDWD operand for both halves
+	MOVO LCPI0_2<>(SB), X3         // constant defined outside this hunk, applied by PMADDUBSW to bytes 16..31 of each block
+	LONG $0x8071b841; WORD $0x8007 // mov r8d, 0x80078071 - reciprocal constant for the divide-by-65521 sequences
 
-LBB0_3:
-	CMPQ DX, $0xad                 // <--                                  // cmp	rdx, 173
-	LONG $0x00adba41; WORD $0x0000 // MOVL $0xad, R10                      // mov	r10d, 173
-	LONG $0xd2420f4c               // CMOVB DX, R10                        // cmovb	r10, rdx
-	WORD $0x8944; BYTE $0xcf       // MOVL R9, DI                          // mov	edi, r9d
-	LONG $0xfaaf0f41               // IMULL R10, DI                        // imul	edi, r10d
-	LONG $0xef6e0f66               // MOVD DI, X5                          // movd	xmm5, edi
-	LONG $0xe06e0f66               // MOVD AX, X4                          // movd	xmm4, eax
-	WORD $0x8944; BYTE $0xd0       // MOVL R10, AX                         // mov	eax, r10d
-	LONG $0xf6ef0f66               // PXOR X6, X6                          // pxor	xmm6, xmm6
+block_outer_loop:
+	CMPQ DX, $0xad                 // compare the blocks remaining with 173
+	LONG $0x00adba41; WORD $0x0000 // mov r10d, 173 (MOV leaves the flags from the CMPQ intact)
+	LONG $0xd2420f4c               // cmovb r10, rdx - R10 = min(DX, 173), the blocks handled in this pass
+	WORD $0x8944; BYTE $0xcf       // mov edi, r9d
+	LONG $0xfaaf0f41               // imul edi, r10d - DI = R9 * R10
+	LONG $0xef6e0f66               // movd xmm5, edi - X5 lane 0 = R9 * R10 (shifted left by 5 after the inner loop)
+	LONG $0xe06e0f66               // movd xmm4, eax - X4 lane 0 = AX, the incoming high running sum
+	WORD $0x8944; BYTE $0xd0       // mov eax, r10d - AX now serves as the inner-loop counter
+	LONG $0xf6ef0f66               // pxor xmm6, xmm6 - clear the byte-sum accumulator
 
-LBB0_4:
-	LONG  $0x3e6f0ff3                           // MOVDQU 0(SI), X7                     // movdqu	xmm7, xmmword ptr [rsi]
-	LONG  $0x6f0f4466; BYTE $0xc7               // MOVDQA X7, X8                        // movdqa	xmm8, xmm7
-	LONG  $0x04380f66; BYTE $0xf9               // PMADDUBSW X1, X7                     // pmaddubsw	xmm7, xmm1
-	LONG  $0xfaf50f66                           // PMADDWD X2, X7                       // pmaddwd	xmm7, xmm2
-	LONG  $0xfcfe0f66                           // PADDD X4, X7                         // paddd	xmm7, xmm4
-	LONG  $0x666f0ff3; BYTE $0x10               // MOVDQU 0x10(SI), X4                  // movdqu	xmm4, xmmword ptr [rsi + 16]
-	LONG  $0xeefe0f66                           // PADDD X6, X5                         // paddd	xmm5, xmm6
-	LONG  $0xf60f4466; BYTE $0xc0               // PSADBW X0, X8                        // psadbw	xmm8, xmm0
-	LONG  $0xfe0f4466; BYTE $0xc6               // PADDD X6, X8                         // paddd	xmm8, xmm6
-	LONG  $0xf46f0f66                           // MOVDQA X4, X6                        // movdqa	xmm6, xmm4
-	LONG  $0xf0f60f66                           // PSADBW X0, X6                        // psadbw	xmm6, xmm0
-	LONG  $0xfe0f4166; BYTE $0xf0               // PADDD X8, X6                         // paddd	xmm6, xmm8
-	LONG  $0x04380f66; BYTE $0xe3               // PMADDUBSW X3, X4                     // pmaddubsw	xmm4, xmm3
-	LONG  $0xe2f50f66                           // PMADDWD X2, X4                       // pmaddwd	xmm4, xmm2
-	LONG  $0xe7fe0f66                           // PADDD X7, X4                         // paddd	xmm4, xmm7
-	ADDQ  $0x20, SI                             // <--                                  // add	rsi, 32
-	WORD  $0xc8ff                               // DECL AX                              // dec	eax
-	JNE   LBB0_4                                // <--                                  // jne	.LBB0_4
-	LONG  $0xf5720f66; BYTE $0x05               // PSLLD $0x5, X5                       // pslld	xmm5, 5
-	LONG  $0xe5fe0f66                           // PADDD X5, X4                         // paddd	xmm4, xmm5
-	LONG  $0xee700f66; BYTE $0xb1               // PSHUFD $0xb1, X6, X5                 // pshufd	xmm5, xmm6, 177
-	LONG  $0xeefe0f66                           // PADDD X6, X5                         // paddd	xmm5, xmm6
-	LONG  $0xf5700f66; BYTE $0xee               // PSHUFD $0xee, X5, X6                 // pshufd	xmm6, xmm5, 238
-	LONG  $0xf5fe0f66                           // PADDD X5, X6                         // paddd	xmm6, xmm5
-	LONG  $0xf77e0f66                           // MOVD X6, DI                          // movd	edi, xmm6
-	WORD  $0x0144; BYTE $0xcf                   // ADDL R9, DI                          // add	edi, r9d
-	LONG  $0xec700f66; BYTE $0xb1               // PSHUFD $0xb1, X4, X5                 // pshufd	xmm5, xmm4, 177
-	LONG  $0xecfe0f66                           // PADDD X4, X5                         // paddd	xmm5, xmm4
-	LONG  $0xe5700f66; BYTE $0xee               // PSHUFD $0xee, X5, X4                 // pshufd	xmm4, xmm5, 238
-	LONG  $0xe5fe0f66                           // PADDD X5, X4                         // paddd	xmm4, xmm5
-	LONG  $0xe07e0f66                           // MOVD X4, AX                          // movd	eax, xmm4
-	MOVQ  DI, R9                                // <--                                  // mov	r9, rdi
-	IMULQ R8, R9                                // <--                                  // imul	r9, r8
-	SHRQ  $0x2f, R9                             // <--                                  // shr	r9, 47
-	LONG  $0xf1c96945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R9, R9                // imul	r9d, r9d, 65521
-	WORD  $0x2944; BYTE $0xcf                   // SUBL R9, DI                          // sub	edi, r9d
-	MOVQ  AX, R9                                // <--                                  // mov	r9, rax
-	IMULQ R8, R9                                // <--                                  // imul	r9, r8
-	SHRQ  $0x2f, R9                             // <--                                  // shr	r9, 47
-	LONG  $0xf1c96945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R9, R9                // imul	r9d, r9d, 65521
-	WORD  $0x2944; BYTE $0xc8                   // SUBL R9, AX                          // sub	eax, r9d
-	WORD  $0x8941; BYTE $0xf9                   // MOVL DI, R9                          // mov	r9d, edi
-	SUBQ  R10, DX                               // <--                                  // sub	rdx, r10
-	JNE   LBB0_3                                // <--                                  // jne	.LBB0_3
+block_inner_loop:
+	LONG  $0x3e6f0ff3                           // movdqu xmm7, [rsi] - bytes 0..15 of the block
+	LONG  $0x6f0f4466; BYTE $0xc7               // movdqa xmm8, xmm7
+	LONG  $0x04380f66; BYTE $0xf9               // pmaddubsw xmm7, xmm1
+	LONG  $0xfaf50f66                           // pmaddwd xmm7, xmm2
+	LONG  $0xfcfe0f66                           // paddd xmm7, xmm4 - add the running accumulator X4
+	LONG  $0x666f0ff3; BYTE $0x10               // movdqu xmm4, [rsi + 16] - bytes 16..31 of the block
+	LONG  $0xeefe0f66                           // paddd xmm5, xmm6 - X5 += byte sums accumulated before this block
+	LONG  $0xf60f4466; BYTE $0xc0               // psadbw xmm8, xmm0 - byte sums of bytes 0..15
+	LONG  $0xfe0f4466; BYTE $0xc6               // paddd xmm8, xmm6
+	LONG  $0xf46f0f66                           // movdqa xmm6, xmm4
+	LONG  $0xf0f60f66                           // psadbw xmm6, xmm0 - byte sums of bytes 16..31
+	LONG  $0xfe0f4166; BYTE $0xf0               // paddd xmm6, xmm8 - X6 = byte sums accumulated so far
+	LONG  $0x04380f66; BYTE $0xe3               // pmaddubsw xmm4, xmm3
+	LONG  $0xe2f50f66                           // pmaddwd xmm4, xmm2
+	LONG  $0xe7fe0f66                           // paddd xmm4, xmm7 - X4 = updated running accumulator
+	ADDQ  $0x20, SI                             // advance the input pointer by one block
+	WORD  $0xc8ff                               // dec eax - one block fewer in this pass
+	JNE   block_inner_loop
+	LONG  $0xf5720f66; BYTE $0x05               // pslld xmm5, 5 - multiply X5 by 32
+	LONG  $0xe5fe0f66                           // paddd xmm4, xmm5
+	LONG  $0xee700f66; BYTE $0xb1               // pshufd xmm5, xmm6, 177 - start of the horizontal add of X6
+	LONG  $0xeefe0f66                           // paddd xmm5, xmm6
+	LONG  $0xf5700f66; BYTE $0xee               // pshufd xmm6, xmm5, 238
+	LONG  $0xf5fe0f66                           // paddd xmm6, xmm5
+	LONG  $0xf77e0f66                           // movd edi, xmm6 - DI = sum of the four lanes of X6
+	WORD  $0x0144; BYTE $0xcf                   // add edi, r9d - add the low running sum carried in R9
+	LONG  $0xec700f66; BYTE $0xb1               // pshufd xmm5, xmm4, 177 - start of the horizontal add of X4
+	LONG  $0xecfe0f66                           // paddd xmm5, xmm4
+	LONG  $0xe5700f66; BYTE $0xee               // pshufd xmm4, xmm5, 238
+	LONG  $0xe5fe0f66                           // paddd xmm4, xmm5
+	LONG  $0xe07e0f66                           // movd eax, xmm4 - AX = sum of the four lanes of X4
+	MOVQ  DI, R9
+	IMULQ R8, R9                                // R9 = DI * 0x80078071
+	SHRQ  $0x2f, R9                             // R9 = quotient DI / 65521
+	LONG  $0xf1c96945; WORD $0x00ff; BYTE $0x00 // imul r9d, r9d, 65521
+	WORD  $0x2944; BYTE $0xcf                   // sub edi, r9d - DI = DI mod 65521
+	MOVQ  AX, R9
+	IMULQ R8, R9                                // R9 = AX * 0x80078071
+	SHRQ  $0x2f, R9                             // R9 = quotient AX / 65521
+	LONG  $0xf1c96945; WORD $0x00ff; BYTE $0x00 // imul r9d, r9d, 65521
+	WORD  $0x2944; BYTE $0xc8                   // sub eax, r9d - AX = AX mod 65521
+	WORD  $0x8941; BYTE $0xf9                   // mov r9d, edi - carry the reduced low sum into the next pass
+	SUBQ  R10, DX                               // blocks remaining -= blocks done in this pass
+	JNE   block_outer_loop
 
-LBB0_6:
-	WORD $0x8548; BYTE $0xc9     // TESTQ CX, CX                         // test	rcx, rcx
-	JE   LBB0_18                 // <--                                  // je	.LBB0_18
-	CMPL CX, $0x10               // <--                                  // cmp	ecx, 16
-	JB   LBB0_10                 // <--                                  // jb	.LBB0_10
-	WORD $0xb60f; BYTE $0x16     // MOVZX 0(SI), DX                      // movzx	edx, byte ptr [rsi]
-	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	LONG $0x0156b60f             // MOVZX 0x1(SI), DX                    // movzx	edx, byte ptr [rsi + 1]
-	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
-	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
-	LONG $0x027eb60f             // MOVZX 0x2(SI), DI                    // movzx	edi, byte ptr [rsi + 2]
-	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	LONG $0x0356b60f             // MOVZX 0x3(SI), DX                    // movzx	edx, byte ptr [rsi + 3]
-	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
-	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
-	LONG $0x047eb60f             // MOVZX 0x4(SI), DI                    // movzx	edi, byte ptr [rsi + 4]
-	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	LONG $0x0556b60f             // MOVZX 0x5(SI), DX                    // movzx	edx, byte ptr [rsi + 5]
-	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
-	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
-	LONG $0x067eb60f             // MOVZX 0x6(SI), DI                    // movzx	edi, byte ptr [rsi + 6]
-	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	LONG $0x0756b60f             // MOVZX 0x7(SI), DX                    // movzx	edx, byte ptr [rsi + 7]
-	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
-	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
-	LONG $0x087eb60f             // MOVZX 0x8(SI), DI                    // movzx	edi, byte ptr [rsi + 8]
-	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	LONG $0x0956b60f             // MOVZX 0x9(SI), DX                    // movzx	edx, byte ptr [rsi + 9]
-	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
-	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
-	LONG $0x0a7eb60f             // MOVZX 0xa(SI), DI                    // movzx	edi, byte ptr [rsi + 10]
-	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	LONG $0x0b56b60f             // MOVZX 0xb(SI), DX                    // movzx	edx, byte ptr [rsi + 11]
-	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
-	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
-	LONG $0x0c7eb60f             // MOVZX 0xc(SI), DI                    // movzx	edi, byte ptr [rsi + 12]
-	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	LONG $0x0d56b60f             // MOVZX 0xd(SI), DX                    // movzx	edx, byte ptr [rsi + 13]
-	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
-	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
-	LONG $0x46b60f44; BYTE $0x0e // MOVZX 0xe(SI), R8                    // movzx	r8d, byte ptr [rsi + 14]
-	WORD $0x0141; BYTE $0xd0     // ADDL DX, R8                          // add	r8d, edx
-	WORD $0x0144; BYTE $0xc0     // ADDL R8, AX                          // add	eax, r8d
-	LONG $0x0f7eb60f             // MOVZX 0xf(SI), DI                    // movzx	edi, byte ptr [rsi + 15]
-	WORD $0x0144; BYTE $0xc7     // ADDL R8, DI                          // add	edi, r8d
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	ADDQ $-0x10, CX              // <--                                  // add	rcx, -16
-	JE   LBB0_17                 // <--                                  // je	.LBB0_17
-	ADDQ $0x10, SI               // <--                                  // add	rsi, 16
+tail_entry:
+	WORD $0x8548; BYTE $0xc9     // test rcx, rcx - CX counts the bytes still to process
+	JE   return_result           // nothing left: go pack the result
+	CMPL CX, $0x10
+	JB   tail_bytes_setup        // fewer than 16 bytes left
+	WORD $0xb60f; BYTE $0x16     // movzx edx, byte [rsi]
+	WORD $0xd701                 // add edi, edx - low sum += byte 0
+	WORD $0xf801                 // add eax, edi - high sum += low sum
+	LONG $0x0156b60f             // movzx edx, byte [rsi + 1]
+	WORD $0xfa01                 // add edx, edi - from here the low sum alternates between DX and DI
+	WORD $0xd001                 // add eax, edx
+	LONG $0x027eb60f             // movzx edi, byte [rsi + 2]
+	WORD $0xd701                 // add edi, edx
+	WORD $0xf801                 // add eax, edi
+	LONG $0x0356b60f             // movzx edx, byte [rsi + 3]
+	WORD $0xfa01                 // add edx, edi
+	WORD $0xd001                 // add eax, edx
+	LONG $0x047eb60f             // movzx edi, byte [rsi + 4]
+	WORD $0xd701                 // add edi, edx
+	WORD $0xf801                 // add eax, edi
+	LONG $0x0556b60f             // movzx edx, byte [rsi + 5]
+	WORD $0xfa01                 // add edx, edi
+	WORD $0xd001                 // add eax, edx
+	LONG $0x067eb60f             // movzx edi, byte [rsi + 6]
+	WORD $0xd701                 // add edi, edx
+	WORD $0xf801                 // add eax, edi
+	LONG $0x0756b60f             // movzx edx, byte [rsi + 7]
+	WORD $0xfa01                 // add edx, edi
+	WORD $0xd001                 // add eax, edx
+	LONG $0x087eb60f             // movzx edi, byte [rsi + 8]
+	WORD $0xd701                 // add edi, edx
+	WORD $0xf801                 // add eax, edi
+	LONG $0x0956b60f             // movzx edx, byte [rsi + 9]
+	WORD $0xfa01                 // add edx, edi
+	WORD $0xd001                 // add eax, edx
+	LONG $0x0a7eb60f             // movzx edi, byte [rsi + 10]
+	WORD $0xd701                 // add edi, edx
+	WORD $0xf801                 // add eax, edi
+	LONG $0x0b56b60f             // movzx edx, byte [rsi + 11]
+	WORD $0xfa01                 // add edx, edi
+	WORD $0xd001                 // add eax, edx
+	LONG $0x0c7eb60f             // movzx edi, byte [rsi + 12]
+	WORD $0xd701                 // add edi, edx
+	WORD $0xf801                 // add eax, edi
+	LONG $0x0d56b60f             // movzx edx, byte [rsi + 13]
+	WORD $0xfa01                 // add edx, edi
+	WORD $0xd001                 // add eax, edx
+	LONG $0x46b60f44; BYTE $0x0e // movzx r8d, byte [rsi + 14]
+	WORD $0x0141; BYTE $0xd0     // add r8d, edx
+	WORD $0x0144; BYTE $0xc0     // add eax, r8d
+	LONG $0x0f7eb60f             // movzx edi, byte [rsi + 15]
+	WORD $0x0144; BYTE $0xc7     // add edi, r8d - the low sum ends up back in DI
+	WORD $0xf801                 // add eax, edi
+	ADDQ $-0x10, CX              // 16 bytes consumed; the flags feed the JE below
+	JE   final_reduce            // exactly 16 bytes were left
+	ADDQ $0x10, SI               // step past the 16 bytes just consumed
 
-LBB0_10:
-	LEAQ -0x1(CX), DX // <--                                  // lea	rdx, [rcx - 1]
-	MOVQ CX, R9       // <--                                  // mov	r9, rcx
-	ANDQ $0x3, R9     // <--                                  // and	r9, 3
-	JE   LBB0_14      // <--                                  // je	.LBB0_14
-	XORL R8, R8       // <--                                  // xor	r8d, r8d
+tail_bytes_setup:
+	LEAQ -0x1(CX), DX     // DX = CX - 1, tested in tail_dword_setup
+	MOVQ CX, R9
+	ANDQ $0x3, R9         // R9 = CX mod 4, the bytes handled singly before the 4-at-a-time loop
+	JE   tail_dword_setup // CX is already a multiple of 4
+	XORL R8, R8           // R8 = byte index for tail_byte_loop
 
-LBB0_12:
-	LONG $0x14b60f46; BYTE $0x06 // MOVZX 0(SI)(R8*1), R10               // movzx	r10d, byte ptr [rsi + r8]
-	WORD $0x0144; BYTE $0xd7     // ADDL R10, DI                         // add	edi, r10d
-	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
-	INCQ R8                      // <--                                  // inc	r8
-	CMPQ R9, R8                  // <--                                  // cmp	r9, r8
-	JNE  LBB0_12                 // <--                                  // jne	.LBB0_12
-	ADDQ R8, SI                  // <--                                  // add	rsi, r8
-	SUBQ R8, CX                  // <--                                  // sub	rcx, r8
+tail_byte_loop:
+	LONG $0x14b60f46; BYTE $0x06 // movzx r10d, byte [rsi + r8]
+	WORD $0x0144; BYTE $0xd7     // add edi, r10d - low sum += byte
+	WORD $0xf801                 // add eax, edi - high sum += low sum
+	INCQ R8
+	CMPQ R9, R8                  // stop once R8 reaches CX mod 4
+	JNE  tail_byte_loop
+	ADDQ R8, SI                  // advance the pointer past the bytes just consumed
+	SUBQ R8, CX                  // CX is now a multiple of 4
 
-LBB0_14:
-	CMPQ DX, $0x3 // <--                                  // cmp	rdx, 3
-	JB   LBB0_17  // <--                                  // jb	.LBB0_17
-	XORL DX, DX   // <--                                  // xor	edx, edx
+tail_dword_setup:
+	CMPQ DX, $0x3     // DX holds (count - 1) from tail_bytes_setup, so below 3 means at most 3 bytes were left
+	JB   final_reduce // the byte loop already consumed them all
+	XORL DX, DX       // DX = byte index for tail_dword_loop
 
-LBB0_16:
-	LONG $0x04b60f44; BYTE $0x16   // MOVZX 0(SI)(DX*1), R8                // movzx	r8d, byte ptr [rsi + rdx]
-	WORD $0x0141; BYTE $0xf8       // ADDL DI, R8                          // add	r8d, edi
-	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX                          // add	eax, r8d
-	LONG $0x167cb60f; BYTE $0x01   // MOVZX 0x1(SI)(DX*1), DI              // movzx	edi, byte ptr [rsi + rdx + 1]
-	WORD $0x0144; BYTE $0xc7       // ADDL R8, DI                          // add	edi, r8d
-	WORD $0xf801                   // ADDL DI, AX                          // add	eax, edi
-	LONG $0x44b60f44; WORD $0x0216 // MOVZX 0x2(SI)(DX*1), R8              // movzx	r8d, byte ptr [rsi + rdx + 2]
-	WORD $0x0141; BYTE $0xf8       // ADDL DI, R8                          // add	r8d, edi
-	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX                          // add	eax, r8d
-	LONG $0x167cb60f; BYTE $0x03   // MOVZX 0x3(SI)(DX*1), DI              // movzx	edi, byte ptr [rsi + rdx + 3]
-	WORD $0x0144; BYTE $0xc7       // ADDL R8, DI                          // add	edi, r8d
-	WORD $0xf801                   // ADDL DI, AX                          // add	eax, edi
-	ADDQ $0x4, DX                  // <--                                  // add	rdx, 4
-	CMPQ CX, DX                    // <--                                  // cmp	rcx, rdx
-	JNE  LBB0_16                   // <--                                  // jne	.LBB0_16
+tail_dword_loop:
+	LONG $0x04b60f44; BYTE $0x16   // movzx r8d, byte [rsi + rdx]
+	WORD $0x0141; BYTE $0xf8       // add r8d, edi - the low sum alternates between R8 and DI
+	WORD $0x0144; BYTE $0xc0       // add eax, r8d - high sum += low sum
+	LONG $0x167cb60f; BYTE $0x01   // movzx edi, byte [rsi + rdx + 1]
+	WORD $0x0144; BYTE $0xc7       // add edi, r8d
+	WORD $0xf801                   // add eax, edi
+	LONG $0x44b60f44; WORD $0x0216 // movzx r8d, byte [rsi + rdx + 2]
+	WORD $0x0141; BYTE $0xf8       // add r8d, edi
+	WORD $0x0144; BYTE $0xc0       // add eax, r8d
+	LONG $0x167cb60f; BYTE $0x03   // movzx edi, byte [rsi + rdx + 3]
+	WORD $0x0144; BYTE $0xc7       // add edi, r8d - the low sum ends up back in DI
+	WORD $0xf801                   // add eax, edi
+	ADDQ $0x4, DX                  // four bytes consumed
+	CMPQ CX, DX                    // stop once the index reaches the remaining count
+	JNE  tail_dword_loop
 
-LBB0_17:
-	LONG  $0x000f8f8d; WORD $0xffff // LEAL -0xfff1(DI), CX                 // lea	ecx, [rdi - 65521]
-	CMPL  DI, $0xfff1               // <--                                  // cmp	edi, 65521
-	WORD  $0x420f; BYTE $0xcf       // CMOVB DI, CX                         // cmovb	ecx, edi
-	WORD  $0xc289                   // MOVL AX, DX                          // mov	edx, eax
-	LONG  $0x078071be; BYTE $0x80   // MOVL $-0x7ff87f8f, SI                // mov	esi, 2147975281
-	IMULQ DX, SI                    // <--                                  // imul	rsi, rdx
-	SHRQ  $0x2f, SI                 // <--                                  // shr	rsi, 47
-	LONG  $0xfff1d669; WORD $0x0000 // IMULL $0xfff1, SI, DX                // imul	edx, esi, 65521
-	WORD  $0xd029                   // SUBL DX, AX                          // sub	eax, edx
-	WORD  $0xcf89                   // MOVL CX, DI                          // mov	edi, ecx
+final_reduce:
+	LONG  $0x000f8f8d; WORD $0xffff // lea ecx, [rdi - 65521]
+	CMPL  DI, $0xfff1               // compare the low sum with 65521
+	WORD  $0x420f; BYTE $0xcf       // cmovb ecx, edi - CX = DI if DI < 65521, else DI - 65521 (a single conditional subtraction)
+	WORD  $0xc289                   // mov edx, eax - zero-extended copy of the high sum
+	LONG  $0x078071be; BYTE $0x80   // mov esi, 0x80078071 - reciprocal constant for division by 65521
+	IMULQ DX, SI                    // SI = high sum * 0x80078071
+	SHRQ  $0x2f, SI                 // SI = quotient high sum / 65521
+	LONG  $0xfff1d669; WORD $0x0000 // imul edx, esi, 65521
+	WORD  $0xd029                   // sub eax, edx - AX = high sum mod 65521
+	WORD  $0xcf89                   // mov edi, ecx - DI = reduced low sum
 
-LBB0_18:
-	WORD $0xe0c1; BYTE $0x10 // SHLL $0x10, AX                       // shl	eax, 16
-	WORD $0xf809             // ORL DI, AX                           // or	eax, edi
-	NOP                      // (skipped)                            // mov	rsp, rbp
-	NOP                      // (skipped)                            // pop	rbp
-	MOVL AX, ret+32(FP)      // <--
-	RET                      // <--                                  // ret
+return_result:
+	WORD $0xe0c1; BYTE $0x10 // shl eax, 16 - move the high sum into the top half
+	WORD $0xf809             // or eax, edi - result = AX<<16 | DI
+	NOP                      // stands in for the native epilogue instruction mov rsp, rbp, which is not emitted here
+	NOP                      // likewise for pop rbp
+	MOVL AX, ret+32(FP)      // store the 32-bit result in the Go return slot
+	RET
--