ref: 1fa0d2bcfa7aebdcec8644f53acc58465c109b72
dir: /internal/adler32/adler32_neon.s/
//go:build !purego && arm64

#include "textflag.h"

// mult_table holds the 32 per-byte weights used to fold one 32-byte block
// into the high Adler-32 sum, stored as little-endian uint16 lanes in the
// order 32, 31, ..., 2, 1 (byte 0 of a block gets weight 32, byte 31 gets 1).
DATA mult_table<>+0x00(SB)/8, $0x001d001e001f0020
DATA mult_table<>+0x08(SB)/8, $0x0019001a001b001c
DATA mult_table<>+0x10(SB)/8, $0x0015001600170018
DATA mult_table<>+0x18(SB)/8, $0x0011001200130014
DATA mult_table<>+0x20(SB)/8, $0x000d000e000f0010
DATA mult_table<>+0x28(SB)/8, $0x0009000a000b000c
DATA mult_table<>+0x30(SB)/8, $0x0005000600070008
DATA mult_table<>+0x38(SB)/8, $0x0001000200030004
GLOBL mult_table<>(SB), (RODATA|NOPTR), $64

// adler32_neon updates an Adler-32 checksum with the bytes of buf.
//
// Go declaration is not visible from this file; the frame layout below
// (uint32 at 0, slice header at 8/16/24, uint32 result at 32) matches
//	func adler32_neon(in uint32, buf []byte) uint32
// NOTE(review): confirm against the Go stub.
//
// Stack-based (ABI0) arguments:
//	in+0(FP)        running checksum: low 16 bits = s1, high 16 bits = s2
//	buf_base+8(FP)  pointer to the data
//	buf_len+16(FP)  number of bytes
//	ret+32(FP)      updated checksum (s2<<16 | s1)
//
// Register roles:
//	R1        current read pointer
//	R2        bytes remaining (reused as the load pointer inside the
//	          vector loop once the block/tail counts have been derived)
//	R8, R9    s1 (low sum), s2 (high sum)
//	R10       misalignment of buf, later the tail byte count (len & 31)
//	R11       scratch, later the number of 32-byte blocks remaining
//	R14, R15  0x80078071 (reciprocal of 65521, used with >>47) and 65521
//	V0-V7     weight table and its half-swapped copies
//	V16-V24   per-chunk vector accumulators
//
// Instructions the Go assembler cannot spell are emitted with WORD; the
// decoded A64 mnemonic is given next to each one.
//
// Leaf function: no stack frame, no calls.
TEXT ·adler32_neon(SB), NOSPLIT, $0-36
	MOVW in+0(FP), R0
	MOVD buf_base+8(FP), R1
	MOVD buf_len+16(FP), R2

	ANDS $15, R1, R10              // R10 = buf & 15; Z set when 16-byte aligned
	ANDW $65535, R0, R8            // s1 = in & 0xffff
	LSRW $16, R0, R9               // s2 = in >> 16
	BEQ  vector_loop_setup         // already aligned (flags from ANDS)

	// The alignment loop below consumes up to 15 bytes without looking at
	// the length. Skip it for short buffers so it can neither read past the
	// end of buf nor wrap the remaining length; unaligned data is still
	// handled correctly by the tail code (alignment is only an optimisation).
	CMP  $16, R2
	BCC  vector_loop_setup         // len < 16

	ADD  $1, R1, R11               // R11 = address following the next byte
	MOVD R1, R12                   // R12 = byte cursor

align_loop:
	WORD $0x3840158d               // ldrb w13, [x12], #1
	SUB  $1, R2, R2                // len--
	TST  $15, R11                  // did this byte end a 16-byte line?
	ADD  $1, R11, R11
	ADDW R13, R8, R8               // s1 += byte
	ADDW R9, R8, R9                // s2 += s1
	BNE  align_loop                // flags from TST

	// Reduce both sums modulo 65521 and move R1 to the aligned address.
	MOVW  $32881, R11
	MOVW  $65521, R13
	MOVKW $(32775<<16), R11        // R11 = 0x80078071
	MOVW  $4294901775, R12         // 0xffff000f == -65521 (mod 2^32)
	MOVW  $65520, R14
	SUB   R10, R1, R10             // buf rounded down to 16
	UMULL R11, R9, R11             // s2 * reciprocal
	ADDW  R12, R8, R12             // s1 - 65521
	CMPW  R14, R8
	ADD   $16, R10, R1             // R1 = first aligned address after buf
	LSR   $47, R11, R11            // R11 = s2 / 65521
	CSELW HI, R12, R8, R8          // s1 = (s1 > 65520) ? s1 - 65521 : s1
	MSUBW R13, R9, R11, R9         // s2 -= (s2 / 65521) * 65521

vector_loop_setup:
	AND $31, R2, R10               // R10 = tail byte count
	CMP $32, R2
	BCC tail_entry                 // fewer than 32 bytes: no vector work

	MOVD  $mult_table<>(SB), R11
	MOVW  $32881, R14
	MOVW  $173, R12                // maximum 32-byte blocks per chunk
	MOVD  $137438953440, R13       // 0x1fffffffe0, see R17 below
	MOVKW $(32775<<16), R14        // R14 = 0x80078071
	VLD1  (R11), [V0.H8, V1.H8, V2.H8, V3.H8]
	LSR   $5, R2, R11              // R11 = number of 32-byte blocks
	MOVW  $65521, R15

	// V4-V7 = V0-V3 with their 64-bit halves swapped, so the upper four
	// weights of each register can be used by the low-half UMLAL form.
	VEXT $8, V0.B16, V0.B16, V4.B16
	VEXT $8, V1.B16, V1.B16, V5.B16
	VEXT $8, V2.B16, V2.B16, V6.B16
	VEXT $8, V3.B16, V3.B16, V7.B16

vector_outer_loop:
	CMP  $173, R11
	MOVD R1, R2                    // R2 = load pointer for this chunk
	CSEL LO, R11, R12, R16         // R16 = n = min(blocks left, 173)
	WORD $0x6f00e414               // movi v20.2d, #0
	MULW R16, R8, R0               // R0 = s1 * n
	ADD  R16<<5, R13, R17
	WORD $0x6f00e410               // movi v16.2d, #0
	AND  $137438953440, R17, R17   // R17 = 32 * (n - 1)
	WORD $0x6f00e412               // movi v18.2d, #0
	WORD $0x6f00e413               // movi v19.2d, #0
	WORD $0x6f00e415               // movi v21.2d, #0
	VMOV R0, V20.S[3]              // seed V20 with s1 * n (scaled by 32 below)
	MOVW R16, R0                   // R0 = inner loop counter
	WORD $0x6f00e411               // movi v17.2d, #0

vector_inner_loop:
	WORD  $0xacc15857              // ldp q23, q22, [x2], #32
	SUBSW $1, R0, R0
	VADD  V17.S4, V20.S4, V20.S4   // V20 += byte sum of all earlier blocks
	WORD  $0x2e3712b5              // uaddw  v21.8h, v21.8h, v23.8b   (bytes 0-7)
	WORD  $0x6e371273              // uaddw2 v19.8h, v19.8h, v23.16b  (bytes 8-15)
	WORD  $0x6e202ad8              // uaddlp v24.8h, v22.16b
	WORD  $0x2e361252              // uaddw  v18.8h, v18.8h, v22.8b   (bytes 16-23)
	WORD  $0x6e361210              // uaddw2 v16.8h, v16.8h, v22.16b  (bytes 24-31)
	WORD  $0x6e206af8              // uadalp v24.8h, v23.16b
	WORD  $0x6e606b11              // uadalp v17.4s, v24.8h           (V17 += block byte sum)
	BNE   vector_inner_loop        // flags from SUBSW

	VSHL $5, V20.S4, V20.S4        // each earlier byte counts once per later 32-byte block
	ADD  R17, R1, R17
	SUBS R16, R11, R11             // blocks left -= n; flags consumed by the BNE below
	ADD  $32, R17, R1              // R1 += 32 * n

	// Apply the positional weights 32..1 to the per-position column sums.
	WORD $0x2e6082b4               // umlal v20.4s, v21.4h, v0.4h
	VEXT $8, V21.B16, V21.B16, V21.B16
	WORD $0x2e6482b4               // umlal v20.4s, v21.4h, v4.4h
	VEXT $8, V19.B16, V19.B16, V21.B16
	WORD $0x2e618274               // umlal v20.4s, v19.4h, v1.4h
	VEXT $8, V18.B16, V18.B16, V19.B16
	WORD $0x2e6582b4               // umlal v20.4s, v21.4h, v5.4h
	WORD $0x2e628254               // umlal v20.4s, v18.4h, v2.4h
	WORD $0x2e668274               // umlal v20.4s, v19.4h, v6.4h
	WORD $0x2e638214               // umlal v20.4s, v16.4h, v3.4h
	VEXT $8, V16.B16, V16.B16, V16.B16
	WORD $0x2e678214               // umlal v20.4s, v16.4h, v7.4h

	// Horizontal reduction: V16.S[0] = sum(V17), V16.S[1] = sum(V20).
	WORD  $0x4eb1be30              // addp v16.4s, v17.4s, v17.4s
	WORD  $0x4eb4be91              // addp v17.4s, v20.4s, v20.4s
	WORD  $0x0eb1be10              // addp v16.2s, v16.2s, v17.2s
	VMOV  V16.S[1], R0             // contribution to s2
	FMOVS F16, R2                  // contribution to s1 (lane 0)
	ADDW  R8, R2, R8               // s1 += chunk byte sum
	ADDW  R9, R0, R9               // s2 += chunk weighted sum

	// s1 %= 65521; s2 %= 65521 (multiply by reciprocal, shift, multiply-subtract).
	UMULL R14, R8, R0
	UMULL R14, R9, R2
	LSR   $47, R0, R0
	LSR   $47, R2, R2
	MSUBW R15, R8, R0, R8
	MSUBW R15, R9, R2, R9
	BNE   vector_outer_loop        // flags from SUBS above

tail_entry:
	CBZ R10, return_result         // nothing left; sums are already in range
	CMP $16, R10
	BCC tail_byte_loop             // fewer than 16 bytes: go byte by byte

	// Unrolled 16-byte step: for each byte b[i], s1 += b[i]; s2 += s1.
	// Loads are interleaved ahead of their use; the trailing comments give
	// the byte index being folded in.
	WORD $0x3940002b               // ldrb w11, [x1]
	SUBS $16, R10, R10             // flags consumed by BEQ at the end of the block
	WORD $0x3940042c               // ldrb w12, [x1, #1]
	WORD $0x3940082d               // ldrb w13, [x1, #2]
	ADDW R11, R8, R8               // s1 += b[0]
	WORD $0x39400c2b               // ldrb w11, [x1, #3]
	ADDW R9, R8, R9                // s2 += s1
	ADDW R12, R8, R8               // s1 += b[1]
	WORD $0x3940102c               // ldrb w12, [x1, #4]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R13, R8, R8               // s1 += b[2]
	WORD $0x3940142d               // ldrb w13, [x1, #5]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R11, R8, R8               // s1 += b[3]
	WORD $0x3940182b               // ldrb w11, [x1, #6]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R12, R8, R8               // s1 += b[4]
	WORD $0x39401c2c               // ldrb w12, [x1, #7]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R13, R8, R8               // s1 += b[5]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R11, R8, R8               // s1 += b[6]
	WORD $0x3940202b               // ldrb w11, [x1, #8]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R12, R8, R8               // s1 += b[7]
	WORD $0x3940242c               // ldrb w12, [x1, #9]
	ADDW R8, R9, R9                // s2 += s1
	WORD $0x3940382d               // ldrb w13, [x1, #14]
	ADDW R11, R8, R8               // s1 += b[8]
	WORD $0x3940282b               // ldrb w11, [x1, #10]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R12, R8, R8               // s1 += b[9]
	WORD $0x39402c2c               // ldrb w12, [x1, #11]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R11, R8, R8               // s1 += b[10]
	WORD $0x3940302b               // ldrb w11, [x1, #12]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R12, R8, R8               // s1 += b[11]
	WORD $0x3940342c               // ldrb w12, [x1, #13]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R11, R8, R8               // s1 += b[12]
	WORD $0x39403c2b               // ldrb w11, [x1, #15]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R12, R8, R8               // s1 += b[13]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R13, R8, R8               // s1 += b[14]
	ADDW R8, R9, R9                // s2 += s1
	ADDW R11, R8, R8               // s1 += b[15]
	ADDW R8, R9, R9                // s2 += s1
	BEQ  final_reduce              // tail was exactly 16 bytes (flags from SUBS)
	ADD  $16, R1, R1

tail_byte_loop:
	WORD $0x3840142b               // ldrb w11, [x1], #1
	SUBS $1, R10, R10
	ADDW R11, R8, R8               // s1 += byte
	ADDW R9, R8, R9                // s2 += s1
	BNE  tail_byte_loop

final_reduce:
	// At most 31 bytes were added since the sums were last in range, so a
	// single conditional subtract is enough for s1; s2 gets a full modulo.
	MOVW  $32881, R10
	MOVW  $65521, R12
	MOVKW $(32775<<16), R10        // R10 = 0x80078071
	MOVW  $4294901775, R11         // -65521 (mod 2^32)
	MOVW  $65520, R13
	ADDW  R11, R8, R11             // s1 - 65521
	UMULL R10, R9, R10
	CMPW  R13, R8
	CSELW HI, R11, R8, R8          // s1 = (s1 > 65520) ? s1 - 65521 : s1
	LSR   $47, R10, R10            // s2 / 65521
	MSUBW R12, R9, R10, R9         // s2 %= 65521

return_result:
	ORRW R9<<16, R8, R0            // result = s2<<16 | s1
	MOVW R0, ret+32(FP)
	RET