shithub: furgit

ref: 1fa0d2bcfa7aebdcec8644f53acc58465c109b72
dir: /internal/adler32/adler32_neon.s/

View raw version
//go:build !purego && arm64

#include "textflag.h"

// mult_table holds 32 unsigned 16-bit weights, 32 down to 1, one per byte
// position of a 32-byte block. Entries are written one halfword at a time so
// that each lane value is visible; the resulting 64 bytes are the same as the
// packed 64-bit little-endian form.
DATA mult_table<>+0x00(SB)/2, $32
DATA mult_table<>+0x02(SB)/2, $31
DATA mult_table<>+0x04(SB)/2, $30
DATA mult_table<>+0x06(SB)/2, $29
DATA mult_table<>+0x08(SB)/2, $28
DATA mult_table<>+0x0a(SB)/2, $27
DATA mult_table<>+0x0c(SB)/2, $26
DATA mult_table<>+0x0e(SB)/2, $25
DATA mult_table<>+0x10(SB)/2, $24
DATA mult_table<>+0x12(SB)/2, $23
DATA mult_table<>+0x14(SB)/2, $22
DATA mult_table<>+0x16(SB)/2, $21
DATA mult_table<>+0x18(SB)/2, $20
DATA mult_table<>+0x1a(SB)/2, $19
DATA mult_table<>+0x1c(SB)/2, $18
DATA mult_table<>+0x1e(SB)/2, $17
DATA mult_table<>+0x20(SB)/2, $16
DATA mult_table<>+0x22(SB)/2, $15
DATA mult_table<>+0x24(SB)/2, $14
DATA mult_table<>+0x26(SB)/2, $13
DATA mult_table<>+0x28(SB)/2, $12
DATA mult_table<>+0x2a(SB)/2, $11
DATA mult_table<>+0x2c(SB)/2, $10
DATA mult_table<>+0x2e(SB)/2, $9
DATA mult_table<>+0x30(SB)/2, $8
DATA mult_table<>+0x32(SB)/2, $7
DATA mult_table<>+0x34(SB)/2, $6
DATA mult_table<>+0x36(SB)/2, $5
DATA mult_table<>+0x38(SB)/2, $4
DATA mult_table<>+0x3a(SB)/2, $3
DATA mult_table<>+0x3c(SB)/2, $2
DATA mult_table<>+0x3e(SB)/2, $1
GLOBL mult_table<>(SB), (RODATA|NOPTR), $64

// adler32_neon updates a packed Adler-32 state with the bytes of buf.
//
// Expected Go declaration (inferred from the FP names and the $0-36 frame;
// confirm against the Go stub):
//
//	func adler32_neon(in uint32, buf []byte) uint32
//
// The low 16 bits of in are the running byte sum (s1), the high 16 bits the
// running sum of sums (s2). The result is packed the same way.
//
// Stages:
//   1. Scalar bytes until the read pointer is 16-byte aligned.
//   2. 32-byte blocks using NEON, at most 173 blocks between reductions.
//   3. Scalar tail of (len & 31) bytes: one unrolled 16-byte step, then a
//      byte loop.
//
// Reduction modulo 65521 uses the reciprocal 0x80078071 (32881 | 32775<<16):
// q = (x * 0x80078071) >> 47, remainder = x - q*65521.
//
// Register roles:
//   R0        in; later scratch and the return value
//   R1        read pointer
//   R2        bytes remaining (stage 1); scratch afterwards
//   R8, R9    s1, s2
//   R10       misalignment (stage 1), then tail byte count
//   R11       scratch (stage 1), block count (stage 2), loaded byte (stage 3)
//   R12-R17   constants and scratch
//   V0-V7     weights 32..1, four halfwords per multiply operand
//   V16,V18,V19,V21  per-column 16-bit byte sums; V17 32-bit byte sums
//   V20       32-bit accumulator for s2; V22,V23 input bytes; V24 scratch
//
// Instructions emitted via WORD are annotated with their A64 mnemonic.
TEXT ·adler32_neon(SB), NOSPLIT, $0-36
	MOVW in+0(FP), R0
	MOVD buf_base+8(FP), R1
	MOVD buf_len+16(FP), R2
	ANDS $15, R1, R10                // R10 = pointer misalignment; Z set when already aligned
	ANDW $65535, R0, R8              // s1
	LSRW $16, R0, R9                 // s2
	BEQ  vector_loop_setup           // flags still from ANDS (ANDW/LSRW do not set them)

	// The alignment loop below always consumes 16 - R10 bytes and never looks
	// at the length. If the buffer is shorter than that, hand the whole buffer
	// to the scalar tail instead of reading past its end.
	MOVD $16, R11
	SUB  R10, R11, R11               // R11 = bytes needed to reach alignment
	CMP  R11, R2
	BCC  short_unaligned             // buf_len < bytes needed

	ADD  $1, R1, R11                 // R11 tracks the address after the byte being consumed
	MOVD R1, R12                     // R12 = load cursor

align_loop:
	WORD  $0x3840158d                // ldrb w13, [x12], #1
	SUB   $1, R2, R2
	TST   $15, R11                   // aligned after this byte?
	ADD   $1, R11, R11
	ADDW  R13, R8, R8                // s1 += byte
	ADDW  R9, R8, R9                 // s2 += s1
	BNE   align_loop                 // flags from TST
	MOVW  $32881, R11
	MOVW  $65521, R13
	MOVKW $(32775<<16), R11          // R11 = 0x80078071
	MOVW  $4294901775, R12           // 0xFFFF000F, i.e. -65521 as a 32-bit value
	MOVW  $65520, R14
	SUB   R10, R1, R10               // R10 = pointer rounded down to 16
	UMULL R11, R9, R11
	ADDW  R12, R8, R12               // R12 = s1 - 65521
	CMPW  R14, R8
	ADD   $16, R10, R1               // R1 = first aligned address
	LSR   $47, R11, R11              // R11 = s2 / 65521
	CSELW HI, R12, R8, R8            // s1 > 65520 ? s1 - 65521 : s1
	MSUBW R13, R9, R11, R9           // s2 -= R11 * 65521

vector_loop_setup:
	AND   $31, R2, R10               // R10 = bytes left over after whole 32-byte blocks
	CMP   $32, R2
	BCC   tail_entry                 // fewer than 32 bytes: no vector work
	MOVD  $mult_table<>(SB), R11
	MOVW  $32881, R14
	MOVW  $173, R12                  // maximum blocks per reduction
	MOVD  $137438953440, R13         // 2^37 - 32, used to form (n-1)*32 below
	MOVKW $(32775<<16), R14          // R14 = 0x80078071
	VLD1  (R11), [V0.H8, V1.H8, V2.H8, V3.H8] // weights 32..1
	LSR   $5, R2, R11                // R11 = number of 32-byte blocks
	MOVW  $65521, R15
	VEXT  $8, V0.B16, V0.B16, V4.B16 // V4..V7 = V0..V3 with halves swapped, so the
	VEXT  $8, V1.B16, V1.B16, V5.B16 // upper four weights of each register sit in
	VEXT  $8, V2.B16, V2.B16, V6.B16 // the low 64 bits
	VEXT  $8, V3.B16, V3.B16, V7.B16

vector_outer_loop:
	CMP  $173, R11
	MOVD R1, R2                      // R2 = load cursor for the inner loop
	CSEL LO, R11, R12, R16           // R16 = n = min(blocks, 173)
	WORD $0x6f00e414                 // movi v20.2d, #0
	MULW R16, R8, R0                 // R0 = s1 * n
	ADD  R16<<5, R13, R17
	WORD $0x6f00e410                 // movi v16.2d, #0
	AND  $137438953440, R17, R17     // R17 = (n-1)*32
	WORD $0x6f00e412                 // movi v18.2d, #0
	WORD $0x6f00e413                 // movi v19.2d, #0
	WORD $0x6f00e415                 // movi v21.2d, #0
	VMOV R0, V20.S[3]                // seed accumulator with s1*n (scaled by 32 after the loop)
	MOVW R16, R0                     // R0 = inner loop counter
	WORD $0x6f00e411                 // movi v17.2d, #0

vector_inner_loop:
	WORD  $0xacc15857                // ldp q23, q22, [x2], #32
	SUBSW $1, R0, R0
	VADD  V17.S4, V20.S4, V20.S4     // accumulator += byte sums of the previous blocks
	WORD  $0x2e3712b5                // uaddw  v21.8h, v21.8h, v23.8b
	WORD  $0x6e371273                // uaddw2 v19.8h, v19.8h, v23.16b
	WORD  $0x6e202ad8                // uaddlp v24.8h, v22.16b
	WORD  $0x2e361252                // uaddw  v18.8h, v18.8h, v22.8b
	WORD  $0x6e361210                // uaddw2 v16.8h, v16.8h, v22.16b
	WORD  $0x6e206af8                // uadalp v24.8h, v23.16b
	WORD  $0x6e606b11                // uadalp v17.4s, v24.8h
	BNE   vector_inner_loop          // flags from SUBSW
	VSHL  $5, V20.S4, V20.S4         // accumulator *= 32
	ADD   R17, R1, R17
	SUBS  R16, R11, R11              // blocks -= n; flags consumed by the BNE at the bottom
	ADD   $32, R17, R1               // R1 += n*32
	WORD  $0x2e6082b4                // umlal v20.4s, v21.4h, v0.4h
	VEXT  $8, V21.B16, V21.B16, V21.B16
	WORD  $0x2e6482b4                // umlal v20.4s, v21.4h, v4.4h
	VEXT  $8, V19.B16, V19.B16, V21.B16
	WORD  $0x2e618274                // umlal v20.4s, v19.4h, v1.4h
	VEXT  $8, V18.B16, V18.B16, V19.B16
	WORD  $0x2e6582b4                // umlal v20.4s, v21.4h, v5.4h
	WORD  $0x2e628254                // umlal v20.4s, v18.4h, v2.4h
	WORD  $0x2e668274                // umlal v20.4s, v19.4h, v6.4h
	WORD  $0x2e638214                // umlal v20.4s, v16.4h, v3.4h
	VEXT  $8, V16.B16, V16.B16, V16.B16
	WORD  $0x2e678214                // umlal v20.4s, v16.4h, v7.4h
	WORD  $0x4eb1be30                // addp v16.4s, v17.4s, v17.4s
	WORD  $0x4eb4be91                // addp v17.4s, v20.4s, v20.4s
	WORD  $0x0eb1be10                // addp v16.2s, v16.2s, v17.2s
	VMOV  V16.S[1], R0               // R0 = total of the s2 accumulator lanes
	FMOVS F16, R2                    // R2 = total of the byte-sum lanes
	ADDW  R8, R2, R8                 // s1 += byte total
	ADDW  R9, R0, R9                 // s2 += weighted total
	UMULL R14, R8, R0
	UMULL R14, R9, R2
	LSR   $47, R0, R0
	LSR   $47, R2, R2
	MSUBW R15, R8, R0, R8            // s1 %= 65521
	MSUBW R15, R9, R2, R9            // s2 %= 65521
	BNE   vector_outer_loop          // flags from SUBS above; nothing in between sets them

tail_entry:
	CBZ  R10, return_result          // nothing left over
	CMP  $16, R10
	BCC  tail_byte_loop
	// Unrolled 16-byte step: for each byte in order, s1 += byte; s2 += s1.
	// Loads are interleaved with the adds and rotate through R11/R12/R13.
	WORD $0x3940002b                 // ldrb w11, [x1]
	SUBS $16, R10, R10               // flags consumed by the BEQ after this step
	WORD $0x3940042c                 // ldrb w12, [x1, #1]
	WORD $0x3940082d                 // ldrb w13, [x1, #2]
	ADDW R11, R8, R8
	WORD $0x39400c2b                 // ldrb w11, [x1, #3]
	ADDW R9, R8, R9
	ADDW R12, R8, R8
	WORD $0x3940102c                 // ldrb w12, [x1, #4]
	ADDW R8, R9, R9
	ADDW R13, R8, R8
	WORD $0x3940142d                 // ldrb w13, [x1, #5]
	ADDW R8, R9, R9
	ADDW R11, R8, R8
	WORD $0x3940182b                 // ldrb w11, [x1, #6]
	ADDW R8, R9, R9
	ADDW R12, R8, R8
	WORD $0x39401c2c                 // ldrb w12, [x1, #7]
	ADDW R8, R9, R9
	ADDW R13, R8, R8
	ADDW R8, R9, R9
	ADDW R11, R8, R8
	WORD $0x3940202b                 // ldrb w11, [x1, #8]
	ADDW R8, R9, R9
	ADDW R12, R8, R8
	WORD $0x3940242c                 // ldrb w12, [x1, #9]
	ADDW R8, R9, R9
	WORD $0x3940382d                 // ldrb w13, [x1, #14]
	ADDW R11, R8, R8
	WORD $0x3940282b                 // ldrb w11, [x1, #10]
	ADDW R8, R9, R9
	ADDW R12, R8, R8
	WORD $0x39402c2c                 // ldrb w12, [x1, #11]
	ADDW R8, R9, R9
	ADDW R11, R8, R8
	WORD $0x3940302b                 // ldrb w11, [x1, #12]
	ADDW R8, R9, R9
	ADDW R12, R8, R8
	WORD $0x3940342c                 // ldrb w12, [x1, #13]
	ADDW R8, R9, R9
	ADDW R11, R8, R8
	WORD $0x39403c2b                 // ldrb w11, [x1, #15]
	ADDW R8, R9, R9
	ADDW R12, R8, R8
	ADDW R8, R9, R9
	ADDW R13, R8, R8
	ADDW R8, R9, R9
	ADDW R11, R8, R8
	ADDW R8, R9, R9
	BEQ  final_reduce                // exactly 16 bytes were left
	ADD  $16, R1, R1

tail_byte_loop:
	WORD $0x3840142b                 // ldrb w11, [x1], #1
	SUBS $1, R10, R10
	ADDW R11, R8, R8                 // s1 += byte
	ADDW R9, R8, R9                  // s2 += s1
	BNE  tail_byte_loop

final_reduce:
	MOVW  $32881, R10
	MOVW  $65521, R12
	MOVKW $(32775<<16), R10          // R10 = 0x80078071
	MOVW  $4294901775, R11           // 0xFFFF000F, i.e. -65521 as a 32-bit value
	MOVW  $65520, R13
	ADDW  R11, R8, R11               // R11 = s1 - 65521
	UMULL R10, R9, R10
	CMPW  R13, R8
	CSELW HI, R11, R8, R8            // s1 > 65520 ? s1 - 65521 : s1
	LSR   $47, R10, R10              // R10 = s2 / 65521
	MSUBW R12, R9, R10, R9           // s2 -= R10 * 65521

return_result:
	ORRW R9<<16, R8, R0              // pack s2:s1
	MOVW R0, ret+32(FP)
	RET

short_unaligned:
	// Unaligned buffer that ends before the next 16-byte boundary (fewer than
	// 16 bytes, possibly zero): process all of it in the scalar tail.
	MOVD R2, R10
	B    tail_entry