shithub: furgit

ref: 5e26980abad6374917aafad7e6c8120cd2e55d63
parent: e0ce3837ee2ea643a4cd7eb5e7b068aae0b3910d
author: Runxi Yu <me@runxiyu.org>
date: Sun Feb 22 06:38:33 EST 2026

adler32: Import
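
Adler-32 keeps two 16-bit sums modulo 65521: s1 is the sum of all bytes
seen so far and s2 is the running sum of the successive s1 values; the
checksum is s2<<16 | s1. The package exposes the same surface as the
standard hash/adler32 (New and Checksum) and uses AVX2/SSE3 on amd64 and
NEON on arm64 for inputs of 64 bytes or more, falling back to the scalar
update loop otherwise. A minimal usage sketch follows; the import path is
hypothetical, since internal packages are only importable from inside
this module:

	package main

	import (
		"fmt"

		// Hypothetical import path, shown for illustration only.
		"example.invalid/furgit/internal/adler32"
	)

	func main() {
		data := []byte("hello, adler32")

		// One-shot checksum: dispatches to the SIMD kernels when the
		// CPU supports them, otherwise uses the scalar update.
		fmt.Printf("%08x\n", adler32.Checksum(data))

		// Streaming use through the hash.Hash32 interface.
		h := adler32.New()
		h.Write(data[:5])
		h.Write(data[5:])
		fmt.Printf("%08x\n", h.Sum32())
	}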

--- /dev/null
+++ b/internal/adler32/LICENSE
@@ -1,0 +1,30 @@
+Copyright (c) 2024, Michal Hruby
+Copyright (c) 2017 The Chromium Authors. All rights reserved.
+Copyright (c) 1995-2024 Mark Adler
+Copyright (c) 1995-2024 Jean-loup Gailly
+Copyright (c) 2022 Adam Stylinski
+
+BSD 2-Clause License
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
--- /dev/null
+++ b/internal/adler32/LICENSE.ZLIB
@@ -1,0 +1,17 @@
+Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
+
+This software is provided 'as-is', without any express or implied
+warranty.  In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
--- /dev/null
+++ b/internal/adler32/README
@@ -1,0 +1,1 @@
+This package was mostly copied from github.com/mhr3/adler32-simd.
--- /dev/null
+++ b/internal/adler32/adler32_amd64.go
@@ -1,0 +1,93 @@
+//go:build amd64 && !purego
+
+package adler32
+
+import (
+	"encoding/binary"
+	"errors"
+	"hash"
+	"hash/adler32"
+
+	"golang.org/x/sys/cpu"
+)
+
+// The size of an Adler-32 checksum in bytes.
+const Size = 4
+
+var (
+	hasSSE3 = cpu.X86.HasSSE3
+	hasAVX2 = cpu.X86.HasAVX2
+)
+
+// digest represents the partial evaluation of a checksum.
+// The low 16 bits are s1, the high 16 bits are s2.
+type digest uint32
+
+func (d *digest) Reset() { *d = 1 }
+
+// New returns a new hash.Hash32 computing the Adler-32 checksum.
+func New() hash.Hash32 {
+	if !hasSSE3 {
+		return adler32.New()
+	}
+	d := new(digest)
+	d.Reset()
+	return d
+}
+
+func (d *digest) MarshalBinary() ([]byte, error) {
+	b := make([]byte, 0, marshaledSize)
+	b = append(b, magic...)
+	b = binary.BigEndian.AppendUint32(b, uint32(*d))
+	return b, nil
+}
+
+func (d *digest) UnmarshalBinary(b []byte) error {
+	if len(b) < len(magic) || string(b[:len(magic)]) != magic {
+		return errors.New("hash/adler32: invalid hash state identifier")
+	}
+	if len(b) != marshaledSize {
+		return errors.New("hash/adler32: invalid hash state size")
+	}
+	*d = digest(binary.BigEndian.Uint32(b[len(magic):]))
+	return nil
+}
+
+func (d *digest) Size() int { return Size }
+
+func (d *digest) BlockSize() int { return 4 }
+
+func (d *digest) Write(data []byte) (nn int, err error) {
+	if len(data) >= 64 {
+		var h uint32
+		if hasAVX2 {
+			h = adler32_avx2(uint32(*d), data)
+		} else {
+			h = adler32_sse3(uint32(*d), data)
+		}
+		*d = digest(h)
+	} else {
+		h := update(uint32(*d), data)
+		*d = digest(h)
+	}
+	return len(data), nil
+}
+
+func (d *digest) Sum32() uint32 { return uint32(*d) }
+
+func (d *digest) Sum(in []byte) []byte {
+	s := uint32(*d)
+	return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s))
+}
+
+// Checksum returns the Adler-32 checksum of data.
+func Checksum(data []byte) uint32 {
+	if !hasSSE3 || len(data) < 64 {
+		return update(1, data)
+	}
+
+	if hasAVX2 {
+		return adler32_avx2(1, data)
+	}
+	return adler32_sse3(1, data)
+}
--- /dev/null
+++ b/internal/adler32/adler32_arm64.go
@@ -1,0 +1,73 @@
+//go:build arm64 && !purego
+
+package adler32
+
+import (
+	"encoding/binary"
+	"errors"
+	"hash"
+)
+
+// The size of an Adler-32 checksum in bytes.
+const Size = 4
+
+// digest represents the partial evaluation of a checksum.
+// The low 16 bits are s1, the high 16 bits are s2.
+type digest uint32
+
+func (d *digest) Reset() { *d = 1 }
+
+// New returns a new hash.Hash32 computing the Adler-32 checksum.
+func New() hash.Hash32 {
+	d := new(digest)
+	d.Reset()
+	return d
+}
+
+func (d *digest) MarshalBinary() ([]byte, error) {
+	b := make([]byte, 0, marshaledSize)
+	b = append(b, magic...)
+	b = binary.BigEndian.AppendUint32(b, uint32(*d))
+	return b, nil
+}
+
+func (d *digest) UnmarshalBinary(b []byte) error {
+	if len(b) < len(magic) || string(b[:len(magic)]) != magic {
+		return errors.New("hash/adler32: invalid hash state identifier")
+	}
+	if len(b) != marshaledSize {
+		return errors.New("hash/adler32: invalid hash state size")
+	}
+	*d = digest(binary.BigEndian.Uint32(b[len(magic):]))
+	return nil
+}
+
+func (d *digest) Size() int { return Size }
+
+func (d *digest) BlockSize() int { return 4 }
+
+func (d *digest) Write(data []byte) (nn int, err error) {
+	if len(data) >= 64 {
+		h := adler32_neon(uint32(*d), data)
+		*d = digest(h)
+	} else {
+		h := update(uint32(*d), data)
+		*d = digest(h)
+	}
+	return len(data), nil
+}
+
+func (d *digest) Sum32() uint32 { return uint32(*d) }
+
+func (d *digest) Sum(in []byte) []byte {
+	s := uint32(*d)
+	return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s))
+}
+
+// Checksum returns the Adler-32 checksum of data.
+func Checksum(data []byte) uint32 {
+	if len(data) >= 64 {
+		return adler32_neon(1, data)
+	}
+	return update(1, data)
+}
--- /dev/null
+++ b/internal/adler32/adler32_avx2.go
@@ -1,0 +1,6 @@
+//go:build !purego && amd64
+
+package adler32
+
+//go:noescape
+func adler32_avx2(in uint32, buf []byte) uint32
--- /dev/null
+++ b/internal/adler32/adler32_avx2.s
@@ -1,0 +1,263 @@
+//go:build !purego && amd64
+
+#include "textflag.h"
+
+DATA LCPI0_0<>+0x00(SB)/8, $0x191a1b1c1d1e1f20
+DATA LCPI0_0<>+0x08(SB)/8, $0x1112131415161718
+DATA LCPI0_0<>+0x10(SB)/8, $0x090a0b0c0d0e0f10
+DATA LCPI0_0<>+0x18(SB)/8, $0x0102030405060708
+GLOBL LCPI0_0<>(SB), (RODATA|NOPTR), $32
+
+DATA LCPI0_1<>+0x00(SB)/8, $0x0001000100010001
+DATA LCPI0_1<>+0x08(SB)/8, $0x0001000100010001
+DATA LCPI0_1<>+0x10(SB)/8, $0x0001000100010001
+DATA LCPI0_1<>+0x18(SB)/8, $0x0001000100010001
+GLOBL LCPI0_1<>(SB), (RODATA|NOPTR), $32
+
+DATA LCPI0_2<>+0x00(SB)/2, $0x0001
+GLOBL LCPI0_2<>(SB), (RODATA|NOPTR), $2
+
+TEXT ·adler32_avx2(SB), NOSPLIT, $0-36
+	MOVLQZX      in+0(FP), DI
+	MOVQ         buf_base+8(FP), SI
+	MOVQ         buf_len+16(FP), DX
+	MOVQ         buf_cap+24(FP), CX
+	WORD         $0x8548; BYTE $0xf6     // TESTQ SI, SI                         // test	rsi, rsi
+	JE           LBB0_1                  // <--                                  // je	.LBB0_1
+	WORD         $0xf889                 // MOVL DI, AX                          // mov	eax, edi
+	WORD         $0x8548; BYTE $0xd2     // TESTQ DX, DX                         // test	rdx, rdx
+	JE           LBB0_2                  // <--                                  // je	.LBB0_2
+	NOP                                  // (skipped)                            // push	rbp
+	NOP                                  // (skipped)                            // mov	rbp, rsp
+	NOP                                  // (skipped)                            // and	rsp, -8
+	WORD         $0xc189                 // MOVL AX, CX                          // mov	ecx, eax
+	WORD         $0xe9c1; BYTE $0x10     // SHRL $0x10, CX                       // shr	ecx, 16
+	WORD         $0xb70f; BYTE $0xc0     // MOVZX AX, AX                         // movzx	eax, ax
+	CMPQ         DX, $0x20               // <--                                  // cmp	rdx, 32
+	JB           LBB0_17                 // <--                                  // jb	.LBB0_17
+	LONG         $0x078071bf; BYTE $0x80 // MOVL $-0x7ff87f8f, DI                // mov	edi, 2147975281
+	LONG         $0xc0eff9c5             // VPXOR X0, X0, X0                     // vpxor	xmm0, xmm0, xmm0
+	VMOVDQA      LCPI0_0<>(SB), Y1       // <--                                  // vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_0]
+	VPBROADCASTW LCPI0_2<>(SB), Y2       // <--                                  // vpbroadcastw	ymm2, word ptr [rip + .LCPI0_2]
+	JMP          LBB0_6                  // <--                                  // jmp	.LBB0_6
+
+LBB0_7:
+	LONG $0xf46ffdc5 // VMOVDQA Y4, Y6                       // vmovdqa	ymm6, ymm4
+	LONG $0xedefd1c5 // VPXOR X5, X5, X5                     // vpxor	xmm5, xmm5, xmm5
+
+LBB0_14:
+	SUBQ  AX, DX                                // <--                                  // sub	rdx, rax
+	LONG  $0xf572ddc5; BYTE $0x05               // VPSLLD $0x5, Y5, Y4                  // vpslld	ymm4, ymm5, 5
+	LONG  $0xdbfeddc5                           // VPADDD Y3, Y4, Y3                    // vpaddd	ymm3, ymm4, ymm3
+	LONG  $0x397de3c4; WORD $0x01f4             // VEXTRACTI128 $0x1, Y6, X4            // vextracti128	xmm4, ymm6, 1
+	LONG  $0xecc6c8c5; BYTE $0x88               // VSHUFPS $-0x78, X4, X6, X5           // vshufps	xmm5, xmm6, xmm4, 136
+	LONG  $0xe470f9c5; BYTE $0x88               // VPSHUFD $-0x78, X4, X4               // vpshufd	xmm4, xmm4, 136
+	LONG  $0xe4fed1c5                           // VPADDD X4, X5, X4                    // vpaddd	xmm4, xmm5, xmm4
+	LONG  $0xec70f9c5; BYTE $0x55               // VPSHUFD $0x55, X4, X5                // vpshufd	xmm5, xmm4, 85
+	LONG  $0xe4fed1c5                           // VPADDD X4, X5, X4                    // vpaddd	xmm4, xmm5, xmm4
+	LONG  $0xe07ef9c5                           // VMOVD X4, AX                         // vmovd	eax, xmm4
+	MOVQ  AX, CX                                // <--                                  // mov	rcx, rax
+	IMULQ DI, CX                                // <--                                  // imul	rcx, rdi
+	SHRQ  $0x2f, CX                             // <--                                  // shr	rcx, 47
+	LONG  $0xfff1c969; WORD $0x0000             // IMULL $0xfff1, CX, CX                // imul	ecx, ecx, 65521
+	WORD  $0xc829                               // SUBL CX, AX                          // sub	eax, ecx
+	LONG  $0x397de3c4; WORD $0x01dc             // VEXTRACTI128 $0x1, Y3, X4            // vextracti128	xmm4, ymm3, 1
+	LONG  $0xdbfed9c5                           // VPADDD X3, X4, X3                    // vpaddd	xmm3, xmm4, xmm3
+	LONG  $0xe370f9c5; BYTE $0xee               // VPSHUFD $-0x12, X3, X4               // vpshufd	xmm4, xmm3, 238
+	LONG  $0xdcfee1c5                           // VPADDD X4, X3, X3                    // vpaddd	xmm3, xmm3, xmm4
+	LONG  $0xe370f9c5; BYTE $0x55               // VPSHUFD $0x55, X3, X4                // vpshufd	xmm4, xmm3, 85
+	LONG  $0xdbfed9c5                           // VPADDD X3, X4, X3                    // vpaddd	xmm3, xmm4, xmm3
+	LONG  $0xd97ef9c5                           // VMOVD X3, CX                         // vmovd	ecx, xmm3
+	MOVQ  CX, R8                                // <--                                  // mov	r8, rcx
+	IMULQ DI, R8                                // <--                                  // imul	r8, rdi
+	SHRQ  $0x2f, R8                             // <--                                  // shr	r8, 47
+	LONG  $0xf1c06945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R8, R8                // imul	r8d, r8d, 65521
+	WORD  $0x2944; BYTE $0xc1                   // SUBL R8, CX                          // sub	ecx, r8d
+	CMPQ  DX, $0x1f                             // <--                                  // cmp	rdx, 31
+	JBE   LBB0_15                               // <--                                  // jbe	.LBB0_15
+
+LBB0_6:
+	LONG $0xe06ef9c5               // VMOVD AX, X4                         // vmovd	xmm4, eax
+	LONG $0xd96ef9c5               // VMOVD CX, X3                         // vmovd	xmm3, ecx
+	CMPQ DX, $0x15b0               // <--                                  // cmp	rdx, 5552
+	LONG $0x15b0b841; WORD $0x0000 // MOVL $0x15b0, R8                     // mov	r8d, 5552
+	LONG $0xc2420f4c               // CMOVB DX, R8                         // cmovb	r8, rdx
+	WORD $0x8944; BYTE $0xc0       // MOVL R8, AX                          // mov	eax, r8d
+	LONG $0x001fe025; BYTE $0x00   // ANDL $0x1fe0, AX                     // and	eax, 8160
+	JE   LBB0_7                    // <--                                  // je	.LBB0_7
+	ADDQ $-0x20, R8                // <--                                  // add	r8, -32
+	LONG $0xedefd1c5               // VPXOR X5, X5, X5                     // vpxor	xmm5, xmm5, xmm5
+	LONG $0x20c0f641               // TESTL $0x20, R8                      // test	r8b, 32
+	JNE  LBB0_9                    // <--                                  // jne	.LBB0_9
+	LONG $0x2e6ffec5               // VMOVDQU 0(SI), Y5                    // vmovdqu	ymm5, ymmword ptr [rsi]
+	ADDQ $0x20, SI                 // <--                                  // add	rsi, 32
+	LEAQ -0x20(AX), CX             // <--                                  // lea	rcx, [rax - 32]
+	LONG $0xf0f6d5c5               // VPSADBW Y0, Y5, Y6                   // vpsadbw	ymm6, ymm5, ymm0
+	LONG $0xf4fecdc5               // VPADDD Y4, Y6, Y6                    // vpaddd	ymm6, ymm6, ymm4
+	LONG $0x0455e2c4; BYTE $0xe9   // VPMADDUBSW Y1, Y5, Y5                // vpmaddubsw	ymm5, ymm5, ymm1
+	LONG $0xeaf5d5c5               // VPMADDWD Y2, Y5, Y5                  // vpmaddwd	ymm5, ymm5, ymm2
+	LONG $0xdbfed5c5               // VPADDD Y3, Y5, Y3                    // vpaddd	ymm3, ymm5, ymm3
+	LONG $0xec6ffdc5               // VMOVDQA Y4, Y5                       // vmovdqa	ymm5, ymm4
+	LONG $0xe66ffdc5               // VMOVDQA Y6, Y4                       // vmovdqa	ymm4, ymm6
+	CMPQ R8, $0x20                 // <--                                  // cmp	r8, 32
+	JAE  LBB0_12                   // <--                                  // jae	.LBB0_12
+	JMP  LBB0_14                   // <--                                  // jmp	.LBB0_14
+
+LBB0_9:
+	MOVQ AX, CX    // <--                                  // mov	rcx, rax
+	CMPQ R8, $0x20 // <--                                  // cmp	r8, 32
+	JB   LBB0_14   // <--                                  // jb	.LBB0_14
+
+LBB0_12:
+	LONG $0x366ffec5             // VMOVDQU 0(SI), Y6                    // vmovdqu	ymm6, ymmword ptr [rsi]
+	LONG $0x7e6ffec5; BYTE $0x20 // VMOVDQU 0x20(SI), Y7                 // vmovdqu	ymm7, ymmword ptr [rsi + 32]
+	LONG $0xc0f64dc5             // VPSADBW Y0, Y6, Y8                   // vpsadbw	ymm8, ymm6, ymm0
+	LONG $0xc4fe3dc5             // VPADDD Y4, Y8, Y8                    // vpaddd	ymm8, ymm8, ymm4
+	LONG $0xecfed5c5             // VPADDD Y4, Y5, Y5                    // vpaddd	ymm5, ymm5, ymm4
+	LONG $0x044de2c4; BYTE $0xe1 // VPMADDUBSW Y1, Y6, Y4                // vpmaddubsw	ymm4, ymm6, ymm1
+	LONG $0xe2f5ddc5             // VPMADDWD Y2, Y4, Y4                  // vpmaddwd	ymm4, ymm4, ymm2
+	LONG $0xdbfeddc5             // VPADDD Y3, Y4, Y3                    // vpaddd	ymm3, ymm4, ymm3
+	ADDQ $0x40, SI               // <--                                  // add	rsi, 64
+	LONG $0xe0f6c5c5             // VPSADBW Y0, Y7, Y4                   // vpsadbw	ymm4, ymm7, ymm0
+	LONG $0xe4febdc5             // VPADDD Y4, Y8, Y4                    // vpaddd	ymm4, ymm8, ymm4
+	LONG $0xedfebdc5             // VPADDD Y5, Y8, Y5                    // vpaddd	ymm5, ymm8, ymm5
+	LONG $0x0445e2c4; BYTE $0xf1 // VPMADDUBSW Y1, Y7, Y6                // vpmaddubsw	ymm6, ymm7, ymm1
+	LONG $0xf2f5cdc5             // VPMADDWD Y2, Y6, Y6                  // vpmaddwd	ymm6, ymm6, ymm2
+	LONG $0xdbfecdc5             // VPADDD Y3, Y6, Y3                    // vpaddd	ymm3, ymm6, ymm3
+	ADDQ $-0x40, CX              // <--                                  // add	rcx, -64
+	JNE  LBB0_12                 // <--                                  // jne	.LBB0_12
+	LONG $0xf46ffdc5             // VMOVDQA Y4, Y6                       // vmovdqa	ymm6, ymm4
+	JMP  LBB0_14                 // <--                                  // jmp	.LBB0_14
+
+LBB0_1:
+	LONG $0x000001b8; BYTE $0x00 // MOVL $0x1, AX                        // mov	eax, 1
+
+LBB0_2:
+	MOVL AX, ret+32(FP) // <--
+	RET                 // <--                                  // ret
+
+LBB0_15:
+	WORD $0x8548; BYTE $0xd2 // TESTQ DX, DX                         // test	rdx, rdx
+	JE   LBB0_16             // <--                                  // je	.LBB0_16
+
+LBB0_17:
+	CMPQ DX, $0x10               // <--                                  // cmp	rdx, 16
+	JB   LBB0_20                 // <--                                  // jb	.LBB0_20
+	WORD $0xb60f; BYTE $0x3e     // MOVZX 0(SI), DI                      // movzx	edi, byte ptr [rsi]
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
+	LONG $0x017eb60f             // MOVZX 0x1(SI), DI                    // movzx	edi, byte ptr [rsi + 1]
+	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
+	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
+	LONG $0x0246b60f             // MOVZX 0x2(SI), AX                    // movzx	eax, byte ptr [rsi + 2]
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
+	LONG $0x037eb60f             // MOVZX 0x3(SI), DI                    // movzx	edi, byte ptr [rsi + 3]
+	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
+	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
+	LONG $0x0446b60f             // MOVZX 0x4(SI), AX                    // movzx	eax, byte ptr [rsi + 4]
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
+	LONG $0x057eb60f             // MOVZX 0x5(SI), DI                    // movzx	edi, byte ptr [rsi + 5]
+	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
+	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
+	LONG $0x0646b60f             // MOVZX 0x6(SI), AX                    // movzx	eax, byte ptr [rsi + 6]
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
+	LONG $0x077eb60f             // MOVZX 0x7(SI), DI                    // movzx	edi, byte ptr [rsi + 7]
+	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
+	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
+	LONG $0x0846b60f             // MOVZX 0x8(SI), AX                    // movzx	eax, byte ptr [rsi + 8]
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
+	LONG $0x097eb60f             // MOVZX 0x9(SI), DI                    // movzx	edi, byte ptr [rsi + 9]
+	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
+	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
+	LONG $0x0a46b60f             // MOVZX 0xa(SI), AX                    // movzx	eax, byte ptr [rsi + 10]
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
+	LONG $0x0b7eb60f             // MOVZX 0xb(SI), DI                    // movzx	edi, byte ptr [rsi + 11]
+	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
+	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
+	LONG $0x0c46b60f             // MOVZX 0xc(SI), AX                    // movzx	eax, byte ptr [rsi + 12]
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
+	LONG $0x0d7eb60f             // MOVZX 0xd(SI), DI                    // movzx	edi, byte ptr [rsi + 13]
+	WORD $0xc701                 // ADDL AX, DI                          // add	edi, eax
+	WORD $0xf901                 // ADDL DI, CX                          // add	ecx, edi
+	LONG $0x46b60f44; BYTE $0x0e // MOVZX 0xe(SI), R8                    // movzx	r8d, byte ptr [rsi + 14]
+	WORD $0x0141; BYTE $0xf8     // ADDL DI, R8                          // add	r8d, edi
+	WORD $0x0144; BYTE $0xc1     // ADDL R8, CX                          // add	ecx, r8d
+	LONG $0x0f46b60f             // MOVZX 0xf(SI), AX                    // movzx	eax, byte ptr [rsi + 15]
+	WORD $0x0144; BYTE $0xc0     // ADDL R8, AX                          // add	eax, r8d
+	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
+	ADDQ $-0x10, DX              // <--                                  // add	rdx, -16
+	JE   LBB0_27                 // <--                                  // je	.LBB0_27
+	ADDQ $0x10, SI               // <--                                  // add	rsi, 16
+
+LBB0_20:
+	LEAQ -0x1(DX), DI // <--                                  // lea	rdi, [rdx - 1]
+	MOVQ DX, R9       // <--                                  // mov	r9, rdx
+	ANDQ $0x3, R9     // <--                                  // and	r9, 3
+	JE   LBB0_24      // <--                                  // je	.LBB0_24
+	XORL R8, R8       // <--                                  // xor	r8d, r8d
+
+LBB0_22:
+	LONG $0x14b60f46; BYTE $0x06 // MOVZX 0(SI)(R8*1), R10               // movzx	r10d, byte ptr [rsi + r8]
+	WORD $0x0144; BYTE $0xd0     // ADDL R10, AX                         // add	eax, r10d
+	WORD $0xc101                 // ADDL AX, CX                          // add	ecx, eax
+	INCQ R8                      // <--                                  // inc	r8
+	CMPQ R9, R8                  // <--                                  // cmp	r9, r8
+	JNE  LBB0_22                 // <--                                  // jne	.LBB0_22
+	ADDQ R8, SI                  // <--                                  // add	rsi, r8
+	SUBQ R8, DX                  // <--                                  // sub	rdx, r8
+
+LBB0_24:
+	CMPQ DI, $0x3 // <--                                  // cmp	rdi, 3
+	JB   LBB0_27  // <--                                  // jb	.LBB0_27
+	XORL DI, DI   // <--                                  // xor	edi, edi
+
+LBB0_26:
+	LONG $0x04b60f44; BYTE $0x3e   // MOVZX 0(SI)(DI*1), R8                // movzx	r8d, byte ptr [rsi + rdi]
+	WORD $0x0141; BYTE $0xc0       // ADDL AX, R8                          // add	r8d, eax
+	WORD $0x0144; BYTE $0xc1       // ADDL R8, CX                          // add	ecx, r8d
+	LONG $0x3e44b60f; BYTE $0x01   // MOVZX 0x1(SI)(DI*1), AX              // movzx	eax, byte ptr [rsi + rdi + 1]
+	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX                          // add	eax, r8d
+	WORD $0xc101                   // ADDL AX, CX                          // add	ecx, eax
+	LONG $0x44b60f44; WORD $0x023e // MOVZX 0x2(SI)(DI*1), R8              // movzx	r8d, byte ptr [rsi + rdi + 2]
+	WORD $0x0141; BYTE $0xc0       // ADDL AX, R8                          // add	r8d, eax
+	WORD $0x0144; BYTE $0xc1       // ADDL R8, CX                          // add	ecx, r8d
+	LONG $0x3e44b60f; BYTE $0x03   // MOVZX 0x3(SI)(DI*1), AX              // movzx	eax, byte ptr [rsi + rdi + 3]
+	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX                          // add	eax, r8d
+	WORD $0xc101                   // ADDL AX, CX                          // add	ecx, eax
+	ADDQ $0x4, DI                  // <--                                  // add	rdi, 4
+	CMPQ DX, DI                    // <--                                  // cmp	rdx, rdi
+	JNE  LBB0_26                   // <--                                  // jne	.LBB0_26
+
+LBB0_27:
+	LONG  $0x000f908d; WORD $0xffff // LEAL -0xfff1(AX), DX                 // lea	edx, [rax - 65521]
+	CMPL  AX, $0xfff1               // <--                                  // cmp	eax, 65521
+	WORD  $0x420f; BYTE $0xd0       // CMOVB AX, DX                         // cmovb	edx, eax
+	WORD  $0xc889                   // MOVL CX, AX                          // mov	eax, ecx
+	LONG  $0x078071be; BYTE $0x80   // MOVL $-0x7ff87f8f, SI                // mov	esi, 2147975281
+	IMULQ AX, SI                    // <--                                  // imul	rsi, rax
+	SHRQ  $0x2f, SI                 // <--                                  // shr	rsi, 47
+	LONG  $0xfff1c669; WORD $0x0000 // IMULL $0xfff1, SI, AX                // imul	eax, esi, 65521
+	WORD  $0xc129                   // SUBL AX, CX                          // sub	ecx, eax
+	WORD  $0xe1c1; BYTE $0x10       // SHLL $0x10, CX                       // shl	ecx, 16
+	WORD  $0xd109                   // ORL DX, CX                           // or	ecx, edx
+	WORD  $0xc889                   // MOVL CX, AX                          // mov	eax, ecx
+	NOP                             // (skipped)                            // mov	rsp, rbp
+	NOP                             // (skipped)                            // pop	rbp
+	VZEROUPPER                      // <--                                  // vzeroupper
+	MOVL  AX, ret+32(FP)            // <--
+	RET                             // <--                                  // ret
+
+LBB0_16:
+	WORD $0xe1c1; BYTE $0x10 // SHLL $0x10, CX                       // shl	ecx, 16
+	WORD $0xc809             // ORL CX, AX                           // or	eax, ecx
+	NOP                      // (skipped)                            // mov	rsp, rbp
+	NOP                      // (skipped)                            // pop	rbp
+	VZEROUPPER               // <--                                  // vzeroupper
+	MOVL AX, ret+32(FP)      // <--
+	RET                      // <--                                  // ret
--- /dev/null
+++ b/internal/adler32/adler32_fallback.go
@@ -1,0 +1,19 @@
+//go:build (!arm64 && !amd64) || purego
+
+package adler32
+
+import (
+	"hash"
+	"hash/adler32"
+)
+
+// The size of an Adler-32 checksum in bytes.
+const Size = 4
+
+// New returns a new hash.Hash32 computing the Adler-32 checksum.
+func New() hash.Hash32 {
+	return adler32.New()
+}
+
+// Checksum returns the Adler-32 checksum of data.
+func Checksum(data []byte) uint32 { return adler32.Checksum(data) }
--- /dev/null
+++ b/internal/adler32/adler32_generic.go
@@ -1,0 +1,44 @@
+package adler32
+
+const (
+	// mod is the largest prime that is less than 65536.
+	mod = 65521
+	// nmax is the largest n such that
+	// 255 * n * (n+1) / 2 + (n+1) * (mod-1) <= 2^32-1.
+	// It is mentioned in RFC 1950 (search for "5552").
+	nmax = 5552
+
+	// binary representation compatible with standard library.
+	magic         = "adl\x01"
+	marshaledSize = len(magic) + 4
+)
+
+// Add p to the running checksum d.
+func update(d uint32, p []byte) uint32 {
+	s1, s2 := d&0xffff, d>>16
+	for len(p) > 0 {
+		var q []byte
+		if len(p) > nmax {
+			p, q = p[:nmax], p[nmax:]
+		}
+		for len(p) >= 4 {
+			s1 += uint32(p[0])
+			s2 += s1
+			s1 += uint32(p[1])
+			s2 += s1
+			s1 += uint32(p[2])
+			s2 += s1
+			s1 += uint32(p[3])
+			s2 += s1
+			p = p[4:]
+		}
+		for _, x := range p {
+			s1 += uint32(x)
+			s2 += s1
+		}
+		s1 %= mod
+		s2 %= mod
+		p = q
+	}
+	return s2<<16 | s1
+}
--- /dev/null
+++ b/internal/adler32/adler32_neon.go
@@ -1,0 +1,6 @@
+//go:build !purego && arm64
+
+package adler32
+
+//go:noescape
+func adler32_neon(in uint32, buf []byte) uint32
--- /dev/null
+++ b/internal/adler32/adler32_neon.s
@@ -1,0 +1,208 @@
+//go:build !purego && arm64
+
+#include "textflag.h"
+
+DATA mult_table<>+0x00(SB)/8, $0x001d001e001f0020
+DATA mult_table<>+0x08(SB)/8, $0x0019001a001b001c
+DATA mult_table<>+0x10(SB)/8, $0x0015001600170018
+DATA mult_table<>+0x18(SB)/8, $0x0011001200130014
+DATA mult_table<>+0x20(SB)/8, $0x000d000e000f0010
+DATA mult_table<>+0x28(SB)/8, $0x0009000a000b000c
+DATA mult_table<>+0x30(SB)/8, $0x0005000600070008
+DATA mult_table<>+0x38(SB)/8, $0x0001000200030004
+GLOBL mult_table<>(SB), (RODATA|NOPTR), $64
+
+TEXT ·adler32_neon(SB), NOSPLIT, $0-36
+	MOVW in+0(FP), R0
+	MOVD buf_base+8(FP), R1
+	MOVD buf_len+16(FP), R2
+	MOVD buf_cap+24(FP), R3
+	NOP                     // (skipped)                            // stp	x29, x30, [sp, #-16]!
+	ANDS $15, R1, R10       // <--                                  // ands	x10, x1, #0xf
+	ANDW $65535, R0, R8     // <--                                  // and	w8, w0, #0xffff
+	LSRW $16, R0, R9        // <--                                  // lsr	w9, w0, #16
+	NOP                     // (skipped)                            // mov	x29, sp
+	BEQ  LBB0_4             // <--                                  // b.eq	.LBB0_4
+	ADD  $1, R1, R11        // <--                                  // add	x11, x1, #1
+	MOVD R1, R12            // <--                                  // mov	x12, x1
+
+LBB0_2:
+	WORD  $0x3840158d       // MOVBU.P 1(R12), R13                  // ldrb	w13, [x12], #1
+	SUB   $1, R2, R2        // <--                                  // sub	x2, x2, #1
+	TST   $15, R11          // <--                                  // tst	x11, #0xf
+	ADD   $1, R11, R11      // <--                                  // add	x11, x11, #1
+	ADDW  R13, R8, R8       // <--                                  // add	w8, w8, w13
+	ADDW  R9, R8, R9        // <--                                  // add	w9, w8, w9
+	BNE   LBB0_2            // <--                                  // b.ne	.LBB0_2
+	MOVW  $32881, R11       // <--                                  // mov	w11, #32881
+	MOVW  $65521, R13       // <--                                  // mov	w13, #65521
+	MOVKW $(32775<<16), R11 // <--                                  // movk	w11, #32775, lsl #16
+	MOVW  $4294901775, R12  // <--                                  // mov	w12, #-65521
+	MOVW  $65520, R14       // <--                                  // mov	w14, #65520
+	SUB   R10, R1, R10      // <--                                  // sub	x10, x1, x10
+	UMULL R11, R9, R11      // <--                                  // umull	x11, w9, w11
+	ADDW  R12, R8, R12      // <--                                  // add	w12, w8, w12
+	CMPW  R14, R8           // <--                                  // cmp	w8, w14
+	ADD   $16, R10, R1      // <--                                  // add	x1, x10, #16
+	LSR   $47, R11, R11     // <--                                  // lsr	x11, x11, #47
+	CSELW HI, R12, R8, R8   // <--                                  // csel	w8, w12, w8, hi
+	MSUBW R13, R9, R11, R9  // <--                                  // msub	w9, w11, w13, w9
+
+LBB0_4:
+	AND   $31, R2, R10                        // <--                                  // and	x10, x2, #0x1f
+	CMP   $32, R2                             // <--                                  // cmp	x2, #32
+	BCC   LBB0_9                              // <--                                  // b.lo	.LBB0_9
+	MOVD  $mult_table<>(SB), R11              // <--                                  // adrp	x11, mult_table
+	ADD   $0, R11, R11                        // <--                                  // add	x11, x11, :lo12:mult_table
+	MOVW  $32881, R14                         // <--                                  // mov	w14, #32881
+	MOVW  $173, R12                           // <--                                  // mov	w12, #173
+	MOVD  $137438953440, R13                  // <--                                  // mov	x13, #137438953440
+	MOVKW $(32775<<16), R14                   // <--                                  // movk	w14, #32775, lsl #16
+	VLD1  (R11), [V0.H8, V1.H8, V2.H8, V3.H8] // <--                                  // ld1	{ v0.8h, v1.8h, v2.8h, v3.8h }, [x11]
+	LSR   $5, R2, R11                         // <--                                  // lsr	x11, x2, #5
+	MOVW  $65521, R15                         // <--                                  // mov	w15, #65521
+	VEXT  $8, V0.B16, V0.B16, V4.B16          // <--                                  // ext	v4.16b, v0.16b, v0.16b, #8
+	VEXT  $8, V1.B16, V1.B16, V5.B16          // <--                                  // ext	v5.16b, v1.16b, v1.16b, #8
+	VEXT  $8, V2.B16, V2.B16, V6.B16          // <--                                  // ext	v6.16b, v2.16b, v2.16b, #8
+	VEXT  $8, V3.B16, V3.B16, V7.B16          // <--                                  // ext	v7.16b, v3.16b, v3.16b, #8
+
+LBB0_6:
+	CMP  $173, R11               // <--                                  // cmp	x11, #173
+	MOVD R1, R2                  // <--                                  // mov	x2, x1
+	CSEL LO, R11, R12, R16       // <--                                  // csel	x16, x11, x12, lo
+	WORD $0x6f00e414             // VMOVI $0, V20.D2                     // movi	v20.2d, #0000000000000000
+	MULW R16, R8, R0             // <--                                  // mul	w0, w8, w16
+	ADD  R16<<5, R13, R17        // <--                                  // add	x17, x13, x16, lsl #5
+	WORD $0x6f00e410             // VMOVI $0, V16.D2                     // movi	v16.2d, #0000000000000000
+	AND  $137438953440, R17, R17 // <--                                  // and	x17, x17, #0x1fffffffe0
+	WORD $0x6f00e412             // VMOVI $0, V18.D2                     // movi	v18.2d, #0000000000000000
+	WORD $0x6f00e413             // VMOVI $0, V19.D2                     // movi	v19.2d, #0000000000000000
+	WORD $0x6f00e415             // VMOVI $0, V21.D2                     // movi	v21.2d, #0000000000000000
+	VMOV R0, V20.S[3]            // <--                                  // mov	v20.s[3], w0
+	MOVW R16, R0                 // <--                                  // mov	w0, w16
+	WORD $0x6f00e411             // VMOVI $0, V17.D2                     // movi	v17.2d, #0000000000000000
+
+LBB0_7:
+	WORD  $0xacc15857                   // FLDPQ.P 32(R2), (F23, F22)           // ldp	q23, q22, [x2], #32
+	SUBSW $1, R0, R0                    // <--                                  // subs	w0, w0, #1
+	VADD  V17.S4, V20.S4, V20.S4        // <--                                  // add	v20.4s, v20.4s, v17.4s
+	WORD  $0x2e3712b5                   // VUADDW V23.B8, V21.H8, V21.H8        // uaddw	v21.8h, v21.8h, v23.8b
+	WORD  $0x6e371273                   // VUADDW2 V23.B16, V19.H8, V19.H8      // uaddw2	v19.8h, v19.8h, v23.16b
+	WORD  $0x6e202ad8                   // VUADDLP V22.B16, V24.H8              // uaddlp	v24.8h, v22.16b
+	WORD  $0x2e361252                   // VUADDW V22.B8, V18.H8, V18.H8        // uaddw	v18.8h, v18.8h, v22.8b
+	WORD  $0x6e361210                   // VUADDW2 V22.B16, V16.H8, V16.H8      // uaddw2	v16.8h, v16.8h, v22.16b
+	WORD  $0x6e206af8                   // VUADALP V23.B16, V24.H8              // uadalp	v24.8h, v23.16b
+	WORD  $0x6e606b11                   // VUADALP V24.H8, V17.S4               // uadalp	v17.4s, v24.8h
+	BNE   LBB0_7                        // <--                                  // b.ne	.LBB0_7
+	VSHL  $5, V20.S4, V20.S4            // <--                                  // shl	v20.4s, v20.4s, #5
+	ADD   R17, R1, R17                  // <--                                  // add	x17, x1, x17
+	SUBS  R16, R11, R11                 // <--                                  // subs	x11, x11, x16
+	ADD   $32, R17, R1                  // <--                                  // add	x1, x17, #32
+	WORD  $0x2e6082b4                   // VUMLAL V0.H4, V21.H4, V20.S4         // umlal	v20.4s, v21.4h, v0.4h
+	VEXT  $8, V21.B16, V21.B16, V21.B16 // <--                                  // ext	v21.16b, v21.16b, v21.16b, #8
+	WORD  $0x2e6482b4                   // VUMLAL V4.H4, V21.H4, V20.S4         // umlal	v20.4s, v21.4h, v4.4h
+	VEXT  $8, V19.B16, V19.B16, V21.B16 // <--                                  // ext	v21.16b, v19.16b, v19.16b, #8
+	WORD  $0x2e618274                   // VUMLAL V1.H4, V19.H4, V20.S4         // umlal	v20.4s, v19.4h, v1.4h
+	VEXT  $8, V18.B16, V18.B16, V19.B16 // <--                                  // ext	v19.16b, v18.16b, v18.16b, #8
+	WORD  $0x2e6582b4                   // VUMLAL V5.H4, V21.H4, V20.S4         // umlal	v20.4s, v21.4h, v5.4h
+	WORD  $0x2e628254                   // VUMLAL V2.H4, V18.H4, V20.S4         // umlal	v20.4s, v18.4h, v2.4h
+	WORD  $0x2e668274                   // VUMLAL V6.H4, V19.H4, V20.S4         // umlal	v20.4s, v19.4h, v6.4h
+	WORD  $0x2e638214                   // VUMLAL V3.H4, V16.H4, V20.S4         // umlal	v20.4s, v16.4h, v3.4h
+	VEXT  $8, V16.B16, V16.B16, V16.B16 // <--                                  // ext	v16.16b, v16.16b, v16.16b, #8
+	WORD  $0x2e678214                   // VUMLAL V7.H4, V16.H4, V20.S4         // umlal	v20.4s, v16.4h, v7.4h
+	WORD  $0x4eb1be30                   // VADDP V17.S4, V17.S4, V16.S4         // addp	v16.4s, v17.4s, v17.4s
+	WORD  $0x4eb4be91                   // VADDP V20.S4, V20.S4, V17.S4         // addp	v17.4s, v20.4s, v20.4s
+	WORD  $0x0eb1be10                   // VADDP V17.S2, V16.S2, V16.S2         // addp	v16.2s, v16.2s, v17.2s
+	VMOV  V16.S[1], R0                  // <--                                  // mov	w0, v16.s[1]
+	FMOVS F16, R2                       // <--                                  // fmov	w2, s16
+	ADDW  R8, R2, R8                    // <--                                  // add	w8, w2, w8
+	ADDW  R9, R0, R9                    // <--                                  // add	w9, w0, w9
+	UMULL R14, R8, R0                   // <--                                  // umull	x0, w8, w14
+	UMULL R14, R9, R2                   // <--                                  // umull	x2, w9, w14
+	LSR   $47, R0, R0                   // <--                                  // lsr	x0, x0, #47
+	LSR   $47, R2, R2                   // <--                                  // lsr	x2, x2, #47
+	MSUBW R15, R8, R0, R8               // <--                                  // msub	w8, w0, w15, w8
+	MSUBW R15, R9, R2, R9               // <--                                  // msub	w9, w2, w15, w9
+	BNE   LBB0_6                        // <--                                  // b.ne	.LBB0_6
+
+LBB0_9:
+	CBZ  R10, LBB0_15  // <--                                  // cbz	x10, .LBB0_15
+	CMP  $16, R10      // <--                                  // cmp	x10, #16
+	BCC  LBB0_13       // <--                                  // b.lo	.LBB0_13
+	WORD $0x3940002b   // MOVBU (R1), R11                      // ldrb	w11, [x1]
+	SUBS $16, R10, R10 // <--                                  // subs	x10, x10, #16
+	WORD $0x3940042c   // MOVBU 1(R1), R12                     // ldrb	w12, [x1, #1]
+	WORD $0x3940082d   // MOVBU 2(R1), R13                     // ldrb	w13, [x1, #2]
+	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
+	WORD $0x39400c2b   // MOVBU 3(R1), R11                     // ldrb	w11, [x1, #3]
+	ADDW R9, R8, R9    // <--                                  // add	w9, w8, w9
+	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
+	WORD $0x3940102c   // MOVBU 4(R1), R12                     // ldrb	w12, [x1, #4]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R13, R8, R8   // <--                                  // add	w8, w8, w13
+	WORD $0x3940142d   // MOVBU 5(R1), R13                     // ldrb	w13, [x1, #5]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
+	WORD $0x3940182b   // MOVBU 6(R1), R11                     // ldrb	w11, [x1, #6]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
+	WORD $0x39401c2c   // MOVBU 7(R1), R12                     // ldrb	w12, [x1, #7]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R13, R8, R8   // <--                                  // add	w8, w8, w13
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
+	WORD $0x3940202b   // MOVBU 8(R1), R11                     // ldrb	w11, [x1, #8]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
+	WORD $0x3940242c   // MOVBU 9(R1), R12                     // ldrb	w12, [x1, #9]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	WORD $0x3940382d   // MOVBU 14(R1), R13                    // ldrb	w13, [x1, #14]
+	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
+	WORD $0x3940282b   // MOVBU 10(R1), R11                    // ldrb	w11, [x1, #10]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
+	WORD $0x39402c2c   // MOVBU 11(R1), R12                    // ldrb	w12, [x1, #11]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
+	WORD $0x3940302b   // MOVBU 12(R1), R11                    // ldrb	w11, [x1, #12]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
+	WORD $0x3940342c   // MOVBU 13(R1), R12                    // ldrb	w12, [x1, #13]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
+	WORD $0x39403c2b   // MOVBU 15(R1), R11                    // ldrb	w11, [x1, #15]
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R12, R8, R8   // <--                                  // add	w8, w8, w12
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R13, R8, R8   // <--                                  // add	w8, w8, w13
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	ADDW R11, R8, R8   // <--                                  // add	w8, w8, w11
+	ADDW R8, R9, R9    // <--                                  // add	w9, w9, w8
+	BEQ  LBB0_14       // <--                                  // b.eq	.LBB0_14
+	ADD  $16, R1, R1   // <--                                  // add	x1, x1, #16
+
+LBB0_13:
+	WORD $0x3840142b  // MOVBU.P 1(R1), R11                   // ldrb	w11, [x1], #1
+	SUBS $1, R10, R10 // <--                                  // subs	x10, x10, #1
+	ADDW R11, R8, R8  // <--                                  // add	w8, w8, w11
+	ADDW R9, R8, R9   // <--                                  // add	w9, w8, w9
+	BNE  LBB0_13      // <--                                  // b.ne	.LBB0_13
+
+LBB0_14:
+	MOVW  $32881, R10       // <--                                  // mov	w10, #32881
+	MOVW  $65521, R12       // <--                                  // mov	w12, #65521
+	MOVKW $(32775<<16), R10 // <--                                  // movk	w10, #32775, lsl #16
+	MOVW  $4294901775, R11  // <--                                  // mov	w11, #-65521
+	MOVW  $65520, R13       // <--                                  // mov	w13, #65520
+	ADDW  R11, R8, R11      // <--                                  // add	w11, w8, w11
+	UMULL R10, R9, R10      // <--                                  // umull	x10, w9, w10
+	CMPW  R13, R8           // <--                                  // cmp	w8, w13
+	CSELW HI, R11, R8, R8   // <--                                  // csel	w8, w11, w8, hi
+	LSR   $47, R10, R10     // <--                                  // lsr	x10, x10, #47
+	MSUBW R12, R9, R10, R9  // <--                                  // msub	w9, w10, w12, w9
+
+LBB0_15:
+	ORRW R9<<16, R8, R0 // <--                                  // orr	w0, w8, w9, lsl #16
+	NOP                 // (skipped)                            // ldp	x29, x30, [sp], #16
+	MOVW R0, ret+32(FP) // <--
+	RET                 // <--                                  // ret
--- /dev/null
+++ b/internal/adler32/adler32_sse3.go
@@ -1,0 +1,6 @@
+//go:build !purego && amd64
+
+package adler32
+
+//go:noescape
+func adler32_sse3(in uint32, buf []byte) uint32
--- /dev/null
+++ b/internal/adler32/adler32_sse3.s
@@ -1,0 +1,214 @@
+//go:build !purego && amd64
+
+#include "textflag.h"
+
+DATA LCPI0_0<>+0x00(SB)/8, $0x191a1b1c1d1e1f20
+DATA LCPI0_0<>+0x08(SB)/8, $0x1112131415161718
+GLOBL LCPI0_0<>(SB), (RODATA|NOPTR), $16
+
+DATA LCPI0_1<>+0x00(SB)/8, $0x0001000100010001
+DATA LCPI0_1<>+0x08(SB)/8, $0x0001000100010001
+GLOBL LCPI0_1<>(SB), (RODATA|NOPTR), $16
+
+DATA LCPI0_2<>+0x00(SB)/8, $0x090a0b0c0d0e0f10
+DATA LCPI0_2<>+0x08(SB)/8, $0x0102030405060708
+GLOBL LCPI0_2<>(SB), (RODATA|NOPTR), $16
+
+TEXT ·adler32_sse3(SB), NOSPLIT, $0-36
+	MOVLQZX in+0(FP), DI
+	MOVQ    buf_base+8(FP), SI
+	MOVQ    buf_len+16(FP), DX
+	MOVQ    buf_cap+24(FP), CX
+	NOP                         // (skipped)                            // push	rbp
+	NOP                         // (skipped)                            // mov	rbp, rsp
+	NOP                         // (skipped)                            // and	rsp, -8
+	WORD    $0xf889             // MOVL DI, AX                          // mov	eax, edi
+	LONG    $0xc8b70f44         // MOVZX AX, R9                         // movzx	r9d, ax
+	WORD    $0xe8c1; BYTE $0x10 // SHRL $0x10, AX                       // shr	eax, 16
+	WORD    $0xd189             // MOVL DX, CX                          // mov	ecx, edx
+	WORD    $0xe183; BYTE $0x1f // ANDL $0x1f, CX                       // and	ecx, 31
+	CMPQ    DX, $0x20           // <--                                  // cmp	rdx, 32
+	JAE     LBB0_2              // <--                                  // jae	.LBB0_2
+	WORD    $0x8944; BYTE $0xcf // MOVL R9, DI                          // mov	edi, r9d
+	JMP     LBB0_6              // <--                                  // jmp	.LBB0_6
+
+LBB0_2:
+	SHRQ $0x5, DX                  // <--                                  // shr	rdx, 5
+	LONG $0xc0ef0f66               // PXOR X0, X0                          // pxor	xmm0, xmm0
+	MOVO LCPI0_0<>(SB), X1         // <--                                  // movdqa	xmm1, xmmword ptr [rip + .LCPI0_0]
+	MOVO LCPI0_1<>(SB), X2         // <--                                  // movdqa	xmm2, xmmword ptr [rip + .LCPI0_1]
+	MOVO LCPI0_2<>(SB), X3         // <--                                  // movdqa	xmm3, xmmword ptr [rip + .LCPI0_2]
+	LONG $0x8071b841; WORD $0x8007 // MOVL $-0x7ff87f8f, R8                // mov	r8d, 2147975281
+
+LBB0_3:
+	CMPQ DX, $0xad                 // <--                                  // cmp	rdx, 173
+	LONG $0x00adba41; WORD $0x0000 // MOVL $0xad, R10                      // mov	r10d, 173
+	LONG $0xd2420f4c               // CMOVB DX, R10                        // cmovb	r10, rdx
+	WORD $0x8944; BYTE $0xcf       // MOVL R9, DI                          // mov	edi, r9d
+	LONG $0xfaaf0f41               // IMULL R10, DI                        // imul	edi, r10d
+	LONG $0xef6e0f66               // MOVD DI, X5                          // movd	xmm5, edi
+	LONG $0xe06e0f66               // MOVD AX, X4                          // movd	xmm4, eax
+	WORD $0x8944; BYTE $0xd0       // MOVL R10, AX                         // mov	eax, r10d
+	LONG $0xf6ef0f66               // PXOR X6, X6                          // pxor	xmm6, xmm6
+
+LBB0_4:
+	LONG  $0x3e6f0ff3                           // MOVDQU 0(SI), X7                     // movdqu	xmm7, xmmword ptr [rsi]
+	LONG  $0x6f0f4466; BYTE $0xc7               // MOVDQA X7, X8                        // movdqa	xmm8, xmm7
+	LONG  $0x04380f66; BYTE $0xf9               // PMADDUBSW X1, X7                     // pmaddubsw	xmm7, xmm1
+	LONG  $0xfaf50f66                           // PMADDWD X2, X7                       // pmaddwd	xmm7, xmm2
+	LONG  $0xfcfe0f66                           // PADDD X4, X7                         // paddd	xmm7, xmm4
+	LONG  $0x666f0ff3; BYTE $0x10               // MOVDQU 0x10(SI), X4                  // movdqu	xmm4, xmmword ptr [rsi + 16]
+	LONG  $0xeefe0f66                           // PADDD X6, X5                         // paddd	xmm5, xmm6
+	LONG  $0xf60f4466; BYTE $0xc0               // PSADBW X0, X8                        // psadbw	xmm8, xmm0
+	LONG  $0xfe0f4466; BYTE $0xc6               // PADDD X6, X8                         // paddd	xmm8, xmm6
+	LONG  $0xf46f0f66                           // MOVDQA X4, X6                        // movdqa	xmm6, xmm4
+	LONG  $0xf0f60f66                           // PSADBW X0, X6                        // psadbw	xmm6, xmm0
+	LONG  $0xfe0f4166; BYTE $0xf0               // PADDD X8, X6                         // paddd	xmm6, xmm8
+	LONG  $0x04380f66; BYTE $0xe3               // PMADDUBSW X3, X4                     // pmaddubsw	xmm4, xmm3
+	LONG  $0xe2f50f66                           // PMADDWD X2, X4                       // pmaddwd	xmm4, xmm2
+	LONG  $0xe7fe0f66                           // PADDD X7, X4                         // paddd	xmm4, xmm7
+	ADDQ  $0x20, SI                             // <--                                  // add	rsi, 32
+	WORD  $0xc8ff                               // DECL AX                              // dec	eax
+	JNE   LBB0_4                                // <--                                  // jne	.LBB0_4
+	LONG  $0xf5720f66; BYTE $0x05               // PSLLD $0x5, X5                       // pslld	xmm5, 5
+	LONG  $0xe5fe0f66                           // PADDD X5, X4                         // paddd	xmm4, xmm5
+	LONG  $0xee700f66; BYTE $0xb1               // PSHUFD $0xb1, X6, X5                 // pshufd	xmm5, xmm6, 177
+	LONG  $0xeefe0f66                           // PADDD X6, X5                         // paddd	xmm5, xmm6
+	LONG  $0xf5700f66; BYTE $0xee               // PSHUFD $0xee, X5, X6                 // pshufd	xmm6, xmm5, 238
+	LONG  $0xf5fe0f66                           // PADDD X5, X6                         // paddd	xmm6, xmm5
+	LONG  $0xf77e0f66                           // MOVD X6, DI                          // movd	edi, xmm6
+	WORD  $0x0144; BYTE $0xcf                   // ADDL R9, DI                          // add	edi, r9d
+	LONG  $0xec700f66; BYTE $0xb1               // PSHUFD $0xb1, X4, X5                 // pshufd	xmm5, xmm4, 177
+	LONG  $0xecfe0f66                           // PADDD X4, X5                         // paddd	xmm5, xmm4
+	LONG  $0xe5700f66; BYTE $0xee               // PSHUFD $0xee, X5, X4                 // pshufd	xmm4, xmm5, 238
+	LONG  $0xe5fe0f66                           // PADDD X5, X4                         // paddd	xmm4, xmm5
+	LONG  $0xe07e0f66                           // MOVD X4, AX                          // movd	eax, xmm4
+	MOVQ  DI, R9                                // <--                                  // mov	r9, rdi
+	IMULQ R8, R9                                // <--                                  // imul	r9, r8
+	SHRQ  $0x2f, R9                             // <--                                  // shr	r9, 47
+	LONG  $0xf1c96945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R9, R9                // imul	r9d, r9d, 65521
+	WORD  $0x2944; BYTE $0xcf                   // SUBL R9, DI                          // sub	edi, r9d
+	MOVQ  AX, R9                                // <--                                  // mov	r9, rax
+	IMULQ R8, R9                                // <--                                  // imul	r9, r8
+	SHRQ  $0x2f, R9                             // <--                                  // shr	r9, 47
+	LONG  $0xf1c96945; WORD $0x00ff; BYTE $0x00 // IMULL $0xfff1, R9, R9                // imul	r9d, r9d, 65521
+	WORD  $0x2944; BYTE $0xc8                   // SUBL R9, AX                          // sub	eax, r9d
+	WORD  $0x8941; BYTE $0xf9                   // MOVL DI, R9                          // mov	r9d, edi
+	SUBQ  R10, DX                               // <--                                  // sub	rdx, r10
+	JNE   LBB0_3                                // <--                                  // jne	.LBB0_3
+
+LBB0_6:
+	WORD $0x8548; BYTE $0xc9     // TESTQ CX, CX                         // test	rcx, rcx
+	JE   LBB0_18                 // <--                                  // je	.LBB0_18
+	CMPL CX, $0x10               // <--                                  // cmp	ecx, 16
+	JB   LBB0_10                 // <--                                  // jb	.LBB0_10
+	WORD $0xb60f; BYTE $0x16     // MOVZX 0(SI), DX                      // movzx	edx, byte ptr [rsi]
+	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	LONG $0x0156b60f             // MOVZX 0x1(SI), DX                    // movzx	edx, byte ptr [rsi + 1]
+	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
+	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
+	LONG $0x027eb60f             // MOVZX 0x2(SI), DI                    // movzx	edi, byte ptr [rsi + 2]
+	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	LONG $0x0356b60f             // MOVZX 0x3(SI), DX                    // movzx	edx, byte ptr [rsi + 3]
+	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
+	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
+	LONG $0x047eb60f             // MOVZX 0x4(SI), DI                    // movzx	edi, byte ptr [rsi + 4]
+	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	LONG $0x0556b60f             // MOVZX 0x5(SI), DX                    // movzx	edx, byte ptr [rsi + 5]
+	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
+	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
+	LONG $0x067eb60f             // MOVZX 0x6(SI), DI                    // movzx	edi, byte ptr [rsi + 6]
+	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	LONG $0x0756b60f             // MOVZX 0x7(SI), DX                    // movzx	edx, byte ptr [rsi + 7]
+	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
+	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
+	LONG $0x087eb60f             // MOVZX 0x8(SI), DI                    // movzx	edi, byte ptr [rsi + 8]
+	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	LONG $0x0956b60f             // MOVZX 0x9(SI), DX                    // movzx	edx, byte ptr [rsi + 9]
+	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
+	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
+	LONG $0x0a7eb60f             // MOVZX 0xa(SI), DI                    // movzx	edi, byte ptr [rsi + 10]
+	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	LONG $0x0b56b60f             // MOVZX 0xb(SI), DX                    // movzx	edx, byte ptr [rsi + 11]
+	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
+	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
+	LONG $0x0c7eb60f             // MOVZX 0xc(SI), DI                    // movzx	edi, byte ptr [rsi + 12]
+	WORD $0xd701                 // ADDL DX, DI                          // add	edi, edx
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	LONG $0x0d56b60f             // MOVZX 0xd(SI), DX                    // movzx	edx, byte ptr [rsi + 13]
+	WORD $0xfa01                 // ADDL DI, DX                          // add	edx, edi
+	WORD $0xd001                 // ADDL DX, AX                          // add	eax, edx
+	LONG $0x46b60f44; BYTE $0x0e // MOVZX 0xe(SI), R8                    // movzx	r8d, byte ptr [rsi + 14]
+	WORD $0x0141; BYTE $0xd0     // ADDL DX, R8                          // add	r8d, edx
+	WORD $0x0144; BYTE $0xc0     // ADDL R8, AX                          // add	eax, r8d
+	LONG $0x0f7eb60f             // MOVZX 0xf(SI), DI                    // movzx	edi, byte ptr [rsi + 15]
+	WORD $0x0144; BYTE $0xc7     // ADDL R8, DI                          // add	edi, r8d
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	ADDQ $-0x10, CX              // <--                                  // add	rcx, -16
+	JE   LBB0_17                 // <--                                  // je	.LBB0_17
+	ADDQ $0x10, SI               // <--                                  // add	rsi, 16
+
+LBB0_10:
+	LEAQ -0x1(CX), DX // <--                                  // lea	rdx, [rcx - 1]
+	MOVQ CX, R9       // <--                                  // mov	r9, rcx
+	ANDQ $0x3, R9     // <--                                  // and	r9, 3
+	JE   LBB0_14      // <--                                  // je	.LBB0_14
+	XORL R8, R8       // <--                                  // xor	r8d, r8d
+
+LBB0_12:
+	LONG $0x14b60f46; BYTE $0x06 // MOVZX 0(SI)(R8*1), R10               // movzx	r10d, byte ptr [rsi + r8]
+	WORD $0x0144; BYTE $0xd7     // ADDL R10, DI                         // add	edi, r10d
+	WORD $0xf801                 // ADDL DI, AX                          // add	eax, edi
+	INCQ R8                      // <--                                  // inc	r8
+	CMPQ R9, R8                  // <--                                  // cmp	r9, r8
+	JNE  LBB0_12                 // <--                                  // jne	.LBB0_12
+	ADDQ R8, SI                  // <--                                  // add	rsi, r8
+	SUBQ R8, CX                  // <--                                  // sub	rcx, r8
+
+LBB0_14:
+	CMPQ DX, $0x3 // <--                                  // cmp	rdx, 3
+	JB   LBB0_17  // <--                                  // jb	.LBB0_17
+	XORL DX, DX   // <--                                  // xor	edx, edx
+
+LBB0_16:
+	LONG $0x04b60f44; BYTE $0x16   // MOVZX 0(SI)(DX*1), R8                // movzx	r8d, byte ptr [rsi + rdx]
+	WORD $0x0141; BYTE $0xf8       // ADDL DI, R8                          // add	r8d, edi
+	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX                          // add	eax, r8d
+	LONG $0x167cb60f; BYTE $0x01   // MOVZX 0x1(SI)(DX*1), DI              // movzx	edi, byte ptr [rsi + rdx + 1]
+	WORD $0x0144; BYTE $0xc7       // ADDL R8, DI                          // add	edi, r8d
+	WORD $0xf801                   // ADDL DI, AX                          // add	eax, edi
+	LONG $0x44b60f44; WORD $0x0216 // MOVZX 0x2(SI)(DX*1), R8              // movzx	r8d, byte ptr [rsi + rdx + 2]
+	WORD $0x0141; BYTE $0xf8       // ADDL DI, R8                          // add	r8d, edi
+	WORD $0x0144; BYTE $0xc0       // ADDL R8, AX                          // add	eax, r8d
+	LONG $0x167cb60f; BYTE $0x03   // MOVZX 0x3(SI)(DX*1), DI              // movzx	edi, byte ptr [rsi + rdx + 3]
+	WORD $0x0144; BYTE $0xc7       // ADDL R8, DI                          // add	edi, r8d
+	WORD $0xf801                   // ADDL DI, AX                          // add	eax, edi
+	ADDQ $0x4, DX                  // <--                                  // add	rdx, 4
+	CMPQ CX, DX                    // <--                                  // cmp	rcx, rdx
+	JNE  LBB0_16                   // <--                                  // jne	.LBB0_16
+
+LBB0_17:
+	LONG  $0x000f8f8d; WORD $0xffff // LEAL -0xfff1(DI), CX                 // lea	ecx, [rdi - 65521]
+	CMPL  DI, $0xfff1               // <--                                  // cmp	edi, 65521
+	WORD  $0x420f; BYTE $0xcf       // CMOVB DI, CX                         // cmovb	ecx, edi
+	WORD  $0xc289                   // MOVL AX, DX                          // mov	edx, eax
+	LONG  $0x078071be; BYTE $0x80   // MOVL $-0x7ff87f8f, SI                // mov	esi, 2147975281
+	IMULQ DX, SI                    // <--                                  // imul	rsi, rdx
+	SHRQ  $0x2f, SI                 // <--                                  // shr	rsi, 47
+	LONG  $0xfff1d669; WORD $0x0000 // IMULL $0xfff1, SI, DX                // imul	edx, esi, 65521
+	WORD  $0xd029                   // SUBL DX, AX                          // sub	eax, edx
+	WORD  $0xcf89                   // MOVL CX, DI                          // mov	edi, ecx
+
+LBB0_18:
+	WORD $0xe0c1; BYTE $0x10 // SHLL $0x10, AX                       // shl	eax, 16
+	WORD $0xf809             // ORL DI, AX                           // or	eax, edi
+	NOP                      // (skipped)                            // mov	rsp, rbp
+	NOP                      // (skipped)                            // pop	rbp
+	MOVL AX, ret+32(FP)      // <--
+	RET                      // <--                                  // ret
--- /dev/null
+++ b/internal/adler32/bench_test.go
@@ -1,0 +1,22 @@
+package adler32
+
+import (
+	"testing"
+)
+
+const benchmarkSize = 64 * 1024
+
+var data = make([]byte, benchmarkSize)
+
+func init() {
+	for i := range benchmarkSize {
+		data[i] = byte(i % 256)
+	}
+}
+
+func BenchmarkChecksum(b *testing.B) {
+	b.ReportAllocs()
+	for range b.N {
+		Checksum(data)
+	}
+}
--