shithub: furgit

ref: 1fa0d2bcfa7aebdcec8644f53acc58465c109b72
dir: /internal/adler32/adler32_avx2.s/

View raw version
//go:build !purego && amd64

#include "textflag.h"

// weights_1_32: 32 read-only bytes that count down from 32 (at offset 0) to
// 1 (at offset 31). Each DATA word below is little-endian, so the low byte of
// the immediate is the byte at the lowest address.
DATA weights_1_32<>+0x00(SB)/4, $0x1d1e1f20
DATA weights_1_32<>+0x04(SB)/4, $0x191a1b1c
DATA weights_1_32<>+0x08(SB)/4, $0x15161718
DATA weights_1_32<>+0x0c(SB)/4, $0x11121314
DATA weights_1_32<>+0x10(SB)/4, $0x0d0e0f10
DATA weights_1_32<>+0x14(SB)/4, $0x090a0b0c
DATA weights_1_32<>+0x18(SB)/4, $0x05060708
DATA weights_1_32<>+0x1c(SB)/4, $0x01020304
GLOBL weights_1_32<>(SB), (RODATA|NOPTR), $32

// ones_u16: 32 read-only bytes holding sixteen 16-bit words, each equal to 1.
// NOTE(review): no instruction in the visible part of this file references
// this symbol; confirm whether it is still needed before removing it.
DATA ones_u16<>+0x00(SB)/4, $0x00010001
DATA ones_u16<>+0x04(SB)/4, $0x00010001
DATA ones_u16<>+0x08(SB)/4, $0x00010001
DATA ones_u16<>+0x0c(SB)/4, $0x00010001
DATA ones_u16<>+0x10(SB)/4, $0x00010001
DATA ones_u16<>+0x14(SB)/4, $0x00010001
DATA ones_u16<>+0x18(SB)/4, $0x00010001
DATA ones_u16<>+0x1c(SB)/4, $0x00010001
GLOBL ones_u16<>(SB), (RODATA|NOPTR), $32

// one_u16: a single little-endian 16-bit word with value 1 (low byte first).
DATA one_u16<>+0x00(SB)/1, $0x01
DATA one_u16<>+0x01(SB)/1, $0x00
GLOBL one_u16<>(SB), (RODATA|NOPTR), $2

// adler32_avx2 folds buf into the Adler-32 state `in` and returns the updated
// state. Input is consumed in chunks of at most 0x15b0 (5552) bytes, rounded
// down to a multiple of 32, using AVX2; whatever is left (< 32 bytes) is
// handled by scalar code.
//
// The Go stub is not part of this file. The frame layout used below
// (in+0, buf_base+8, buf_len+16, buf_cap+24, ret+32, 36 argument bytes)
// suggests
//     func adler32_avx2(in uint32, buf []byte) uint32
// TODO(review): confirm against the Go declaration.
//
// Special cases:
//   len(buf) == 0               -> returns `in` unchanged (nil slices included)
//   buf base == nil, len != 0   -> returns 1 without reading memory
//
// Most instructions are raw WORD/LONG/BYTE encodings. The comment next to
// each one is its decoded form in Intel operand order (dst, src).
//
// Register roles:
//   AX   s1, the low 16-bit half of the state
//   CX   s2, the high 16-bit half of the state (also a loop counter inside
//        the vector loop, where s2 lives in Y3)
//   SI   read pointer into buf
//   DX   bytes still to process
//   DI   0x80078071 during the vector phase, scratch in the scalar tail
//   R8   size of the current chunk / scratch
//   Y0   all zero            Y1   weights_1_32 (32, 31, ..., 1)
//   Y2   sixteen 16-bit 1s   Y3   s2 partial sums (8 dwords)
//   Y4   s1 partial sums     Y5   sum of the s1 vectors seen before each block
//   Y6, Y7, Y8  temporaries
//
// Reduction modulo 65521 (0xfff1) is done without a divide:
//   q = (x * 0x80078071) >> 47;  x -= q * 0xfff1
TEXT ·adler32_avx2(SB), NOSPLIT, $0-36
	MOVLQZX      in+0(FP), DI
	MOVQ         buf_base+8(FP), SI
	MOVQ         buf_len+16(FP), DX
	WORD         $0xf889                     // mov   eax, edi          ; result defaults to `in`
	WORD         $0x8548; BYTE $0xd2         // test  rdx, rdx
	JE           return_result               // empty input leaves the state unchanged
	WORD         $0x8548; BYTE $0xf6         // test  rsi, rsi
	JE           return_one                  // nil base with non-zero length: return 1
	NOP
	NOP
	NOP
	WORD         $0xc189                     // mov   ecx, eax
	WORD         $0xe9c1; BYTE $0x10         // shr   ecx, 16           ; s2 = in >> 16
	WORD         $0xb70f; BYTE $0xc0         // movzx eax, ax           ; s1 = in & 0xffff
	CMPQ         DX, $0x20
	JB           tail16_check                // fewer than 32 bytes: scalar path only
	LONG         $0x078071bf; BYTE $0x80     // mov   edi, 0x80078071   ; multiplier for mod 65521
	LONG         $0xc0eff9c5                 // vpxor xmm0, xmm0, xmm0  ; Y0 = 0
	VMOVDQA      weights_1_32<>(SB), Y1      // aligned 32-byte load of the weight table
	VPBROADCASTW one_u16<>(SB), Y2           // every 16-bit lane of Y2 = 1
	JMP          block_loop_setup

// Reached only when the rounded chunk size is 0. Every entry to
// block_loop_setup has DX >= 32, so this is not expected to run.
block_accum_init:
	LONG $0xf46ffdc5                         // vmovdqa ymm6, ymm4
	LONG $0xedefd1c5                         // vpxor   xmm5, xmm5, xmm5

// End of a chunk: fold the vector accumulators back into AX (s1) and CX (s2),
// each reduced modulo 65521. On entry Y6 holds the s1 vector.
block_reduce:
	SUBQ  AX, DX                             // remaining -= chunk size (AX still holds it)
	LONG  $0xf572ddc5; BYTE $0x05            // vpslld  ymm4, ymm5, 5      ; 32 * (sum of earlier s1 vectors)
	LONG  $0xdbfeddc5                        // vpaddd  ymm3, ymm4, ymm3   ; ... added into s2
	LONG  $0x397de3c4; WORD $0x01f4          // vextracti128 xmm4, ymm6, 1
	LONG  $0xecc6c8c5; BYTE $0x88            // vshufps xmm5, xmm6, xmm4, 0x88 ; dwords 0,2 of each half
	LONG  $0xe470f9c5; BYTE $0x88            // vpshufd xmm4, xmm4, 0x88
	LONG  $0xe4fed1c5                        // vpaddd  xmm4, xmm5, xmm4
	LONG  $0xec70f9c5; BYTE $0x55            // vpshufd xmm5, xmm4, 0x55
	LONG  $0xe4fed1c5                        // vpaddd  xmm4, xmm5, xmm4   ; dword 0 = horizontal s1 sum
	LONG  $0xe07ef9c5                        // vmovd   eax, xmm4
	MOVQ  AX, CX
	IMULQ DI, CX
	SHRQ  $0x2f, CX                          // rcx = s1 / 65521
	LONG  $0xfff1c969; WORD $0x0000          // imul ecx, ecx, 0xfff1
	WORD  $0xc829                            // sub  eax, ecx              ; s1 %= 65521
	LONG  $0x397de3c4; WORD $0x01dc          // vextracti128 xmm4, ymm3, 1
	LONG  $0xdbfed9c5                        // vpaddd  xmm3, xmm4, xmm3
	LONG  $0xe370f9c5; BYTE $0xee            // vpshufd xmm4, xmm3, 0xee
	LONG  $0xdcfee1c5                        // vpaddd  xmm3, xmm3, xmm4
	LONG  $0xe370f9c5; BYTE $0x55            // vpshufd xmm4, xmm3, 0x55
	LONG  $0xdbfed9c5                        // vpaddd  xmm3, xmm4, xmm3   ; dword 0 = horizontal s2 sum
	LONG  $0xd97ef9c5                        // vmovd   ecx, xmm3
	MOVQ  CX, R8
	IMULQ DI, R8
	SHRQ  $0x2f, R8                          // r8 = s2 / 65521
	LONG  $0xf1c06945; WORD $0x00ff; BYTE $0x00 // imul r8d, r8d, 0xfff1
	WORD  $0x2944; BYTE $0xc1                // sub  ecx, r8d              ; s2 %= 65521
	CMPQ  DX, $0x1f
	JBE   tail_check                         // fewer than 32 bytes left: leave the vector path

// Start of a chunk: seed the accumulators from AX/CX and work out how many
// bytes this chunk covers: min(DX, 0x15b0) rounded down to a multiple of 32.
block_loop_setup:
	LONG $0xe06ef9c5                         // vmovd xmm4, eax            ; s1 into lane 0 of Y4
	LONG $0xd96ef9c5                         // vmovd xmm3, ecx            ; s2 into lane 0 of Y3
	CMPQ DX, $0x15b0
	LONG $0x15b0b841; WORD $0x0000           // mov   r8d, 0x15b0          ; (mov leaves flags intact)
	LONG $0xc2420f4c                         // cmovb r8, rdx              ; r8 = min(DX, 0x15b0)
	WORD $0x8944; BYTE $0xc0                 // mov   eax, r8d
	LONG $0x001fe025; BYTE $0x00             // and   eax, 0x1fe0          ; AX = chunk size in bytes
	JE   block_accum_init
	ADDQ $-0x20, R8
	LONG $0xedefd1c5                         // vpxor xmm5, xmm5, xmm5     ; Y5 = 0
	LONG $0x20c0f641                         // test  r8b, 0x20
	JNE  block_loop_entry                    // even number of 32-byte blocks: go straight to the 64-byte loop
	// Odd number of blocks: process one 32-byte block on its own first.
	LONG $0x2e6ffec5                         // vmovdqu    ymm5, [rsi]
	ADDQ $0x20, SI
	LEAQ -0x20(AX), CX                       // CX = bytes left for the 64-byte loop
	LONG $0xf0f6d5c5                         // vpsadbw    ymm6, ymm5, ymm0 ; byte sums per 8-byte group
	LONG $0xf4fecdc5                         // vpaddd     ymm6, ymm6, ymm4 ; new s1 vector
	LONG $0x0455e2c4; BYTE $0xe9             // vpmaddubsw ymm5, ymm5, ymm1 ; bytes * weights, pairwise add
	LONG $0xeaf5d5c5                         // vpmaddwd   ymm5, ymm5, ymm2 ; widen to dwords
	LONG $0xdbfed5c5                         // vpaddd     ymm3, ymm5, ymm3 ; s2 += weighted sum
	LONG $0xec6ffdc5                         // vmovdqa    ymm5, ymm4       ; Y5 = s1 vector before this block
	LONG $0xe66ffdc5                         // vmovdqa    ymm4, ymm6       ; Y4 = s1 vector after this block
	CMPQ R8, $0x20
	JAE  block_loop_64
	JMP  block_reduce

block_loop_entry:
	MOVQ AX, CX                              // CX = bytes for the 64-byte loop
	CMPQ R8, $0x20
	JB   block_reduce

// Main loop: two 32-byte blocks per iteration, CX counts down in steps of 64.
block_loop_64:
	LONG $0x366ffec5                         // vmovdqu    ymm6, [rsi]
	LONG $0x7e6ffec5; BYTE $0x20             // vmovdqu    ymm7, [rsi+0x20]
	LONG $0xc0f64dc5                         // vpsadbw    ymm8, ymm6, ymm0
	LONG $0xc4fe3dc5                         // vpaddd     ymm8, ymm8, ymm4 ; s1 after first block
	LONG $0xecfed5c5                         // vpaddd     ymm5, ymm5, ymm4 ; Y5 += s1 before first block
	LONG $0x044de2c4; BYTE $0xe1             // vpmaddubsw ymm4, ymm6, ymm1
	LONG $0xe2f5ddc5                         // vpmaddwd   ymm4, ymm4, ymm2
	LONG $0xdbfeddc5                         // vpaddd     ymm3, ymm4, ymm3 ; s2 += weighted first block
	ADDQ $0x40, SI
	LONG $0xe0f6c5c5                         // vpsadbw    ymm4, ymm7, ymm0
	LONG $0xe4febdc5                         // vpaddd     ymm4, ymm8, ymm4 ; s1 after second block
	LONG $0xedfebdc5                         // vpaddd     ymm5, ymm8, ymm5 ; Y5 += s1 before second block
	LONG $0x0445e2c4; BYTE $0xf1             // vpmaddubsw ymm6, ymm7, ymm1
	LONG $0xf2f5cdc5                         // vpmaddwd   ymm6, ymm6, ymm2
	LONG $0xdbfecdc5                         // vpaddd     ymm3, ymm6, ymm3 ; s2 += weighted second block
	ADDQ $-0x40, CX
	JNE  block_loop_64
	LONG $0xf46ffdc5                         // vmovdqa    ymm6, ymm4       ; block_reduce expects s1 in Y6
	JMP  block_reduce

return_one:
	LONG $0x000001b8; BYTE $0x00             // mov eax, 1

// Early exits taken before any vector register has been written.
return_result:
	MOVL AX, ret+32(FP)
	RET

tail_check:
	WORD $0x8548; BYTE $0xd2                 // test rdx, rdx
	JE   return_no_tail                      // nothing left after the vector phase

// Scalar tail (DX is 1..31 here). If at least 16 bytes remain, an unrolled run
// handles 16 of them; the running s1 alternates between EAX and EDI (R8D for
// byte 14) and every step adds it into ECX (s2).
tail16_check:
	CMPQ DX, $0x10
	JB   tail_bytes_setup
	WORD $0xb60f; BYTE $0x3e                 // movzx edi, byte [rsi]
	WORD $0xf801                             // add   eax, edi
	WORD $0xc101                             // add   ecx, eax
	LONG $0x017eb60f                         // movzx edi, byte [rsi+1]
	WORD $0xc701                             // add   edi, eax
	WORD $0xf901                             // add   ecx, edi
	LONG $0x0246b60f                         // movzx eax, byte [rsi+2]
	WORD $0xf801                             // add   eax, edi
	WORD $0xc101                             // add   ecx, eax
	LONG $0x037eb60f                         // movzx edi, byte [rsi+3]
	WORD $0xc701                             // add   edi, eax
	WORD $0xf901                             // add   ecx, edi
	LONG $0x0446b60f                         // movzx eax, byte [rsi+4]
	WORD $0xf801                             // add   eax, edi
	WORD $0xc101                             // add   ecx, eax
	LONG $0x057eb60f                         // movzx edi, byte [rsi+5]
	WORD $0xc701                             // add   edi, eax
	WORD $0xf901                             // add   ecx, edi
	LONG $0x0646b60f                         // movzx eax, byte [rsi+6]
	WORD $0xf801                             // add   eax, edi
	WORD $0xc101                             // add   ecx, eax
	LONG $0x077eb60f                         // movzx edi, byte [rsi+7]
	WORD $0xc701                             // add   edi, eax
	WORD $0xf901                             // add   ecx, edi
	LONG $0x0846b60f                         // movzx eax, byte [rsi+8]
	WORD $0xf801                             // add   eax, edi
	WORD $0xc101                             // add   ecx, eax
	LONG $0x097eb60f                         // movzx edi, byte [rsi+9]
	WORD $0xc701                             // add   edi, eax
	WORD $0xf901                             // add   ecx, edi
	LONG $0x0a46b60f                         // movzx eax, byte [rsi+10]
	WORD $0xf801                             // add   eax, edi
	WORD $0xc101                             // add   ecx, eax
	LONG $0x0b7eb60f                         // movzx edi, byte [rsi+11]
	WORD $0xc701                             // add   edi, eax
	WORD $0xf901                             // add   ecx, edi
	LONG $0x0c46b60f                         // movzx eax, byte [rsi+12]
	WORD $0xf801                             // add   eax, edi
	WORD $0xc101                             // add   ecx, eax
	LONG $0x0d7eb60f                         // movzx edi, byte [rsi+13]
	WORD $0xc701                             // add   edi, eax
	WORD $0xf901                             // add   ecx, edi
	LONG $0x46b60f44; BYTE $0x0e             // movzx r8d, byte [rsi+14]
	WORD $0x0141; BYTE $0xf8                 // add   r8d, edi
	WORD $0x0144; BYTE $0xc1                 // add   ecx, r8d
	LONG $0x0f46b60f                         // movzx eax, byte [rsi+15]
	WORD $0x0144; BYTE $0xc0                 // add   eax, r8d          ; s1 is back in EAX
	WORD $0xc101                             // add   ecx, eax
	ADDQ $-0x10, DX
	JE   final_reduce
	ADDQ $0x10, SI

// 1..15 bytes remain. First consume DX & 3 bytes one at a time so that the
// rest is a multiple of four.
tail_bytes_setup:
	LEAQ -0x1(DX), DI                        // DI = remaining - 1, tested in tail_dword_setup
	MOVQ DX, R9
	ANDQ $0x3, R9                            // R9 = number of single-byte steps
	JE   tail_dword_setup
	XORL R8, R8                              // R8 = byte index

tail_byte_loop:
	LONG $0x14b60f46; BYTE $0x06             // movzx r10d, byte [rsi+r8]
	WORD $0x0144; BYTE $0xd0                 // add   eax, r10d
	WORD $0xc101                             // add   ecx, eax
	INCQ R8
	CMPQ R9, R8
	JNE  tail_byte_loop
	ADDQ R8, SI
	SUBQ R8, DX

tail_dword_setup:
	CMPQ DI, $0x3
	JB   final_reduce                        // the tail had fewer than 4 bytes: all consumed above
	XORL DI, DI                              // DI = byte index, advanced by 4 per iteration

// Four bytes per iteration; s1 alternates between R8D and EAX.
tail_dword_loop:
	LONG $0x04b60f44; BYTE $0x3e             // movzx r8d, byte [rsi+rdi]
	WORD $0x0141; BYTE $0xc0                 // add   r8d, eax
	WORD $0x0144; BYTE $0xc1                 // add   ecx, r8d
	LONG $0x3e44b60f; BYTE $0x01             // movzx eax, byte [rsi+rdi+1]
	WORD $0x0144; BYTE $0xc0                 // add   eax, r8d
	WORD $0xc101                             // add   ecx, eax
	LONG $0x44b60f44; WORD $0x023e           // movzx r8d, byte [rsi+rdi+2]
	WORD $0x0141; BYTE $0xc0                 // add   r8d, eax
	WORD $0x0144; BYTE $0xc1                 // add   ecx, r8d
	LONG $0x3e44b60f; BYTE $0x03             // movzx eax, byte [rsi+rdi+3]
	WORD $0x0144; BYTE $0xc0                 // add   eax, r8d
	WORD $0xc101                             // add   ecx, eax
	ADDQ $0x4, DI
	CMPQ DX, DI
	JNE  tail_dword_loop

// Reduce after the scalar tail and pack the result as (s2 << 16) | s1.
// s1 gets one conditional subtraction of 65521; s2 gets a full modulo.
final_reduce:
	LONG  $0x000f908d; WORD $0xffff          // lea   edx, [rax-0xfff1]
	CMPL  AX, $0xfff1
	WORD  $0x420f; BYTE $0xd0                // cmovb edx, eax          ; edx = s1 < 65521 ? s1 : s1 - 65521
	WORD  $0xc889                            // mov   eax, ecx
	LONG  $0x078071be; BYTE $0x80            // mov   esi, 0x80078071
	IMULQ AX, SI
	SHRQ  $0x2f, SI                          // rsi = s2 / 65521
	LONG  $0xfff1c669; WORD $0x0000          // imul  eax, esi, 0xfff1
	WORD  $0xc129                            // sub   ecx, eax          ; s2 %= 65521
	WORD  $0xe1c1; BYTE $0x10                // shl   ecx, 16
	WORD  $0xd109                            // or    ecx, edx
	WORD  $0xc889                            // mov   eax, ecx
	NOP
	NOP
	VZEROUPPER
	MOVL  AX, ret+32(FP)
	RET

// The vector phase consumed everything; AX and CX were already reduced in
// block_reduce, so only the packing is left.
return_no_tail:
	WORD $0xe1c1; BYTE $0x10                 // shl ecx, 16
	WORD $0xc809                             // or  eax, ecx
	NOP
	NOP
	VZEROUPPER
	MOVL AX, ret+32(FP)
	RET