ref: c85eeaccb831afddedac7bd5f1ba163ebac79fc1
parent: ba327e76c5b110044ec8ebb75630e79506dfbed8
author: Runxi Yu <runxiyu@umich.edu>
date: Tue Jan 6 17:31:52 EST 2026
Revert "adler32: Drop SSE3 support" This reverts commit ba327e76c5b110044ec8ebb75630e79506dfbed8 and re-adds SSE3 support at the request of Noisytoot.
--- a/internal/adler32/adler32_amd64.go
+++ b/internal/adler32/adler32_amd64.go
@@ -15,6 +15,7 @@
const Size = 4
var (
+ hasSSE3 = cpu.X86.HasSSSE3 // gates adler32_sse3, which executes PMADDUBSW (an SSSE3 instruction), so SSE3 alone is not sufficient
hasAVX2 = cpu.X86.HasAVX2
)
@@ -26,7 +27,7 @@
// New returns a new hash.Hash32 computing the Adler-32 checksum.
func New() hash.Hash32 {- if !hasAVX2 {+ if !hasSSE3 {return adler32.New()
}
d := new(digest)
@@ -59,7 +60,11 @@
func (d *digest) Write(data []byte) (nn int, err error) { if len(data) >= 64 {var h uint32
- h = adler32_avx2(uint32(*d), data)
+ if hasAVX2 {+ h = adler32_avx2(uint32(*d), data)
+ } else {+ h = adler32_sse3(uint32(*d), data)
+ }
*d = digest(h)
} else {h := update(uint32(*d), data)
@@ -77,9 +82,12 @@
// Checksum returns the Adler-32 checksum of data.
func Checksum(data []byte) uint32 {- if !hasAVX2 || len(data) < 64 {+ if !hasSSE3 || len(data) < 64 {return update(1, data)
}
- return adler32_avx2(1, data)
+ if hasAVX2 {+ return adler32_avx2(1, data)
+ }
+ return adler32_sse3(1, data)
}
--- /dev/null
+++ b/internal/adler32/adler32_sse3.go
@@ -1,0 +1,6 @@
+//go:build !purego && amd64
+
+package adler32
+
+//go:noescape
+func adler32_sse3(in uint32, buf []byte) uint32 // no Go body: implemented by TEXT ·adler32_sse3 in adler32_sse3.s
--- /dev/null
+++ b/internal/adler32/adler32_sse3.s
@@ -1,0 +1,214 @@
+//go:build !purego && amd64
+
+#include "textflag.h"
+
+DATA weights_17_32<>+0x00(SB)/8, $0x191a1b1c1d1e1f20
+DATA weights_17_32<>+0x08(SB)/8, $0x1112131415161718
+GLOBL weights_17_32<>(SB), (RODATA|NOPTR), $16
+
+DATA ones_u16<>+0x00(SB)/8, $0x0001000100010001
+DATA ones_u16<>+0x08(SB)/8, $0x0001000100010001
+GLOBL ones_u16<>(SB), (RODATA|NOPTR), $16
+
+DATA weights_1_16<>+0x00(SB)/8, $0x090a0b0c0d0e0f10
+DATA weights_1_16<>+0x08(SB)/8, $0x0102030405060708
+GLOBL weights_1_16<>(SB), (RODATA|NOPTR), $16
+
+TEXT ·adler32_sse3(SB), NOSPLIT, $0-36 // func adler32_sse3(in uint32, buf []byte) uint32; 32-byte blocks via SIMD, then a scalar tail
+ MOVLQZX in+0(FP), DI // DI = in, zero-extended
+ MOVQ buf_base+8(FP), SI // SI = pointer to the current input byte
+ MOVQ buf_len+16(FP), DX // DX = len(buf)
+ MOVQ buf_cap+24(FP), CX // CX = cap(buf); overwritten below before any use
+ NOP
+ NOP
+ NOP
+ WORD $0xf889 // mov eax, edi
+ LONG $0xc8b70f44 // movzx r9d, ax: R9 = low 16 bits of in (running byte sum)
+ WORD $0xe8c1; BYTE $0x10 // shr eax, 16: AX = high 16 bits of in (running sum of sums)
+ WORD $0xd189 // mov ecx, edx
+ WORD $0xe183; BYTE $0x1f // and ecx, 31: CX = len % 32, the bytes left for the scalar tail
+ CMPQ DX, $0x20
+ JAE block_loop_setup // at least one full 32-byte block: take the SIMD path
+ WORD $0x8944; BYTE $0xcf // mov edi, r9d: the tail keeps the low sum in DI
+ JMP tail_entry
+
+block_loop_setup:
+ SHRQ $0x5, DX // DX = number of 32-byte blocks
+ LONG $0xc0ef0f66 // pxor xmm0, xmm0: zero operand for PSADBW
+ MOVO weights_17_32<>(SB), X1 // byte weights 32..17, applied to the first 16 bytes of a block
+ MOVO ones_u16<>(SB), X2 // eight 16-bit ones, PMADDWD operand that widens word sums to dwords
+ MOVO weights_1_16<>(SB), X3 // byte weights 16..1, applied to the second 16 bytes of a block
+ LONG $0x8071b841; WORD $0x8007 // mov r8d, 0x80078071: multiplier for the (x*M)>>47 quotient used below
+
+block_outer_loop:
+ CMPQ DX, $0xad
+ LONG $0x00adba41; WORD $0x0000 // mov r10d, 173
+ LONG $0xd2420f4c // cmovb r10, rdx: R10 = min(173, remaining blocks) handled before the next reduction
+ WORD $0x8944; BYTE $0xcf // mov edi, r9d
+ LONG $0xfaaf0f41 // imul edi, r10d: low sum times the block count of this batch
+ LONG $0xef6e0f66 // movd xmm5, edi
+ LONG $0xe06e0f66 // movd xmm4, eax: seed the vector accumulator with the high sum
+ WORD $0x8944; BYTE $0xd0 // mov eax, r10d: AX becomes the inner-loop counter
+ LONG $0xf6ef0f66 // pxor xmm6, xmm6: clear the byte-sum accumulator
+
+block_inner_loop:
+ LONG $0x3e6f0ff3 // movdqu xmm7, [rsi]: first 16 bytes of the block
+ LONG $0x6f0f4466; BYTE $0xc7 // movdqa xmm8, xmm7
+ LONG $0x04380f66; BYTE $0xf9 // pmaddubsw xmm7, xmm1 (SSSE3 encoding 66 0F 38 04)
+ LONG $0xfaf50f66 // pmaddwd xmm7, xmm2
+ LONG $0xfcfe0f66 // paddd xmm7, xmm4
+ LONG $0x666f0ff3; BYTE $0x10 // movdqu xmm4, [rsi+16]: second 16 bytes of the block
+ LONG $0xeefe0f66 // paddd xmm5, xmm6
+ LONG $0xf60f4466; BYTE $0xc0 // psadbw xmm8, xmm0: horizontal byte sums of the first half
+ LONG $0xfe0f4466; BYTE $0xc6 // paddd xmm8, xmm6
+ LONG $0xf46f0f66 // movdqa xmm6, xmm4
+ LONG $0xf0f60f66 // psadbw xmm6, xmm0: horizontal byte sums of the second half
+ LONG $0xfe0f4166; BYTE $0xf0 // paddd xmm6, xmm8
+ LONG $0x04380f66; BYTE $0xe3 // pmaddubsw xmm4, xmm3 (SSSE3 encoding 66 0F 38 04)
+ LONG $0xe2f50f66 // pmaddwd xmm4, xmm2
+ LONG $0xe7fe0f66 // paddd xmm4, xmm7
+ ADDQ $0x20, SI // advance the input pointer by one block
+ WORD $0xc8ff // dec eax
+ JNE block_inner_loop
+ LONG $0xf5720f66; BYTE $0x05 // pslld xmm5, 5: multiply the accumulated per-block sums by 32
+ LONG $0xe5fe0f66 // paddd xmm4, xmm5
+ LONG $0xee700f66; BYTE $0xb1 // pshufd xmm5, xmm6, 0xb1
+ LONG $0xeefe0f66 // paddd xmm5, xmm6
+ LONG $0xf5700f66; BYTE $0xee // pshufd xmm6, xmm5, 0xee
+ LONG $0xf5fe0f66 // paddd xmm6, xmm5
+ LONG $0xf77e0f66 // movd edi, xmm6: horizontal total of the byte sums
+ WORD $0x0144; BYTE $0xcf // add edi, r9d: add the low sum carried into this batch
+ LONG $0xec700f66; BYTE $0xb1 // pshufd xmm5, xmm4, 0xb1
+ LONG $0xecfe0f66 // paddd xmm5, xmm4
+ LONG $0xe5700f66; BYTE $0xee // pshufd xmm4, xmm5, 0xee
+ LONG $0xe5fe0f66 // paddd xmm4, xmm5
+ LONG $0xe07e0f66 // movd eax, xmm4: horizontal total of the weighted sums
+ MOVQ DI, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9 // R9 = (DI * 0x80078071) >> 47
+ LONG $0xf1c96945; WORD $0x00ff; BYTE $0x00 // imul r9d, r9d, 0xfff1 (65521)
+ WORD $0x2944; BYTE $0xcf // sub edi, r9d: DI -= quotient * 65521
+ MOVQ AX, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9 // R9 = (AX * 0x80078071) >> 47
+ LONG $0xf1c96945; WORD $0x00ff; BYTE $0x00 // imul r9d, r9d, 0xfff1 (65521)
+ WORD $0x2944; BYTE $0xc8 // sub eax, r9d: AX -= quotient * 65521
+ WORD $0x8941; BYTE $0xf9 // mov r9d, edi: carry the reduced low sum into the next batch
+ SUBQ R10, DX // blocks still to process
+ JNE block_outer_loop
+
+tail_entry:
+ WORD $0x8548; BYTE $0xc9 // test rcx, rcx
+ JE return_result // no tail bytes: skip straight to packing the result
+ CMPL CX, $0x10
+ JB tail_bytes_setup // fewer than 16 tail bytes: skip the unrolled run
+ WORD $0xb60f; BYTE $0x16 // movzx edx, byte [rsi]; the next 16 bytes are handled fully unrolled
+ WORD $0xd701 // add edi, edx: low sum += byte
+ WORD $0xf801 // add eax, edi: high sum += low sum
+ LONG $0x0156b60f // movzx edx, byte [rsi+1]
+ WORD $0xfa01 // add edx, edi: the low sum alternates between DI and DX from here on
+ WORD $0xd001 // add eax, edx
+ LONG $0x027eb60f // movzx edi, byte [rsi+2]
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0356b60f // movzx edx, byte [rsi+3]
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x047eb60f // movzx edi, byte [rsi+4]
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0556b60f // movzx edx, byte [rsi+5]
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x067eb60f // movzx edi, byte [rsi+6]
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0756b60f // movzx edx, byte [rsi+7]
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x087eb60f // movzx edi, byte [rsi+8]
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0956b60f // movzx edx, byte [rsi+9]
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x0a7eb60f // movzx edi, byte [rsi+10]
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0b56b60f // movzx edx, byte [rsi+11]
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x0c7eb60f // movzx edi, byte [rsi+12]
+ WORD $0xd701
+ WORD $0xf801
+ LONG $0x0d56b60f // movzx edx, byte [rsi+13]
+ WORD $0xfa01
+ WORD $0xd001
+ LONG $0x46b60f44; BYTE $0x0e // movzx r8d, byte [rsi+14]
+ WORD $0x0141; BYTE $0xd0 // add r8d, edx
+ WORD $0x0144; BYTE $0xc0 // add eax, r8d
+ LONG $0x0f7eb60f // movzx edi, byte [rsi+15]
+ WORD $0x0144; BYTE $0xc7 // add edi, r8d: the low sum ends up back in DI
+ WORD $0xf801 // add eax, edi
+ ADDQ $-0x10, CX // 16 tail bytes consumed
+ JE final_reduce
+ ADDQ $0x10, SI
+
+tail_bytes_setup:
+ LEAQ -0x1(CX), DX // DX = remaining - 1, tested at tail_dword_setup
+ MOVQ CX, R9
+ ANDQ $0x3, R9 // R9 = remaining % 4, handled one byte at a time
+ JE tail_dword_setup
+ XORL R8, R8 // R8 = byte index
+
+tail_byte_loop:
+ LONG $0x14b60f46; BYTE $0x06 // movzx r10d, byte [rsi+r8]
+ WORD $0x0144; BYTE $0xd7 // add edi, r10d
+ WORD $0xf801 // add eax, edi
+ INCQ R8
+ CMPQ R9, R8
+ JNE tail_byte_loop
+ ADDQ R8, SI // skip the bytes just consumed
+ SUBQ R8, CX
+
+tail_dword_setup:
+ CMPQ DX, $0x3
+ JB final_reduce // DX (remaining - 1 on entry to the tail) < 3: no group of four left
+ XORL DX, DX // DX = byte index, advanced by 4 per iteration
+
+tail_dword_loop:
+ LONG $0x04b60f44; BYTE $0x16 // movzx r8d, byte [rsi+rdx]
+ WORD $0x0141; BYTE $0xf8 // add r8d, edi
+ WORD $0x0144; BYTE $0xc0 // add eax, r8d
+ LONG $0x167cb60f; BYTE $0x01 // movzx edi, byte [rsi+rdx+1]
+ WORD $0x0144; BYTE $0xc7 // add edi, r8d
+ WORD $0xf801 // add eax, edi
+ LONG $0x44b60f44; WORD $0x0216 // movzx r8d, byte [rsi+rdx+2]
+ WORD $0x0141; BYTE $0xf8 // add r8d, edi
+ WORD $0x0144; BYTE $0xc0 // add eax, r8d
+ LONG $0x167cb60f; BYTE $0x03 // movzx edi, byte [rsi+rdx+3]
+ WORD $0x0144; BYTE $0xc7 // add edi, r8d
+ WORD $0xf801 // add eax, edi
+ ADDQ $0x4, DX
+ CMPQ CX, DX
+ JNE tail_dword_loop
+
+final_reduce:
+ LONG $0x000f8f8d; WORD $0xffff // lea ecx, [rdi-65521]
+ CMPL DI, $0xfff1
+ WORD $0x420f; BYTE $0xcf // cmovb ecx, edi: CX = DI if DI < 65521, else DI - 65521
+ WORD $0xc289 // mov edx, eax
+ LONG $0x078071be; BYTE $0x80 // mov esi, 0x80078071
+ IMULQ DX, SI
+ SHRQ $0x2f, SI // SI = (AX * 0x80078071) >> 47
+ LONG $0xfff1d669; WORD $0x0000 // imul edx, esi, 0xfff1 (65521)
+ WORD $0xd029 // sub eax, edx: AX -= quotient * 65521
+ WORD $0xcf89 // mov edi, ecx
+
+return_result:
+ WORD $0xe0c1; BYTE $0x10 // shl eax, 16
+ WORD $0xf809 // or eax, edi: result = (high sum << 16) | low sum
+ NOP
+ NOP
+ MOVL AX, ret+32(FP)
+ RET
--
⑨