mirror of
https://github.com/octoleo/syncthing.git
synced 2025-01-23 15:18:24 +00:00
400 lines
7.4 KiB
ArmAsm
400 lines
7.4 KiB
ArmAsm
|
// Copyright 2009 The Go Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style
|
||
|
// license that can be found in the LICENSE file.
|
||
|
|
||
|
// This file provides fast assembly versions for the elementary
|
||
|
// arithmetic operations on vectors implemented in arith.go.
|
||
|
|
||
|
// ZERO_CX sets CX to 0 while leaving the flags (in particular CF) untouched.
// It is the literal byte encoding of MOVQ $0, CX. The mnemonic cannot be
// used directly because the assembler translates MOVQ $0, reg into
// XORQ reg, reg, which clears CF.
#define ZERO_CX BYTE $0x48; BYTE $0xc7; BYTE $0xc1; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00
|
||
|
|
||
|
// func mulWW(x, y Word) (z1, z0 Word)
// Full-width unsigned multiply: z1 receives the high word of x*y,
// z0 the low word.
TEXT ·mulWW(SB),7,$0
	MOVQ x+0(FP), AX
	MULQ y+8(FP)		// DX:AX = x * y
	MOVQ AX, z0+24(FP)	// low word
	MOVQ DX, z1+16(FP)	// high word
	RET
|
||
|
|
||
|
|
||
|
// func divWW(x1, x0, y Word) (q, r Word)
// Unsigned division of the double word x1:x0 by y, producing the
// quotient q and the remainder r.
TEXT ·divWW(SB),7,$0
	MOVQ x0+8(FP), AX	// low word of the dividend
	MOVQ x1+0(FP), DX	// high word of the dividend
	DIVQ y+16(FP)		// AX = quotient, DX = remainder
	MOVQ DX, r+32(FP)
	MOVQ AX, q+24(FP)
	RET
|
||
|
|
||
|
|
||
|
// func addVV(z, x, y []Word) (c Word)
// Stores x[i] + y[i] (plus the running carry) into z[i] for every
// i in [0, len(z)) and returns the final carry in c.
//
// Register use:
//	R10 = z base, R8 = x base, R9 = y base
//	SI  = i (word index), DI = n (words left, biased by -4 in the unrolled loop)
//	CX  = c (carry between iterations)
TEXT ·addVV(SB),7,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9

	MOVQ $0, SI		// i = 0
	MOVQ $0, CX		// c = 0

	// Change the JL below into JMP to bypass the unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL addVV_tail		// fewer than 4 words: go to the word-at-a-time loop

addVV_quad:	// n >= 0, at least 4 words remain
	// Four words per iteration. MOVQ leaves the flags alone, so the
	// carry chain runs through the ADCQs uninterrupted. All loads are
	// done before any store.
	RCRQ $1, CX		// CF = c
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	ADCQ 8(R9)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	ADCQ 16(R9)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	RCLQ $1, CX		// c = CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE addVV_quad		// another full group of 4 is available

addVV_tail:
	ADDQ $4, DI		// n += 4, undoing the bias
	JLE addVV_done		// n <= 0: nothing left

addVV_word:	// n > 0
	RCRQ $1, CX		// CF = c
	MOVQ 0(R8)(SI*8), R11
	ADCQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	RCLQ $1, CX		// c = CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG addVV_word		// loop while n > 0

addVV_done:
	MOVQ CX, c+72(FP)	// return c
	RET
|
||
|
|
||
|
|
||
|
// func subVV(z, x, y []Word) (c Word)
// Stores x[i] - y[i] (minus the running borrow) into z[i] for every
// i in [0, len(z)) and returns the final borrow in c.
// Structurally identical to addVV, with SBBQ in place of ADCQ.
//
// Register use:
//	R10 = z base, R8 = x base, R9 = y base
//	SI  = i (word index), DI = n (words left, biased by -4 in the unrolled loop)
//	CX  = c (borrow between iterations)
TEXT ·subVV(SB),7,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9

	MOVQ $0, SI		// i = 0
	MOVQ $0, CX		// c = 0

	// Change the JL below into JMP to bypass the unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL subVV_tail		// fewer than 4 words: go to the word-at-a-time loop

subVV_quad:	// n >= 0, at least 4 words remain
	// Four words per iteration. MOVQ leaves the flags alone, so the
	// borrow chain runs through the SBBQs uninterrupted. All loads are
	// done before any store.
	RCRQ $1, CX		// CF = c
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ 8(R8)(SI*8), R12
	SBBQ 8(R9)(SI*8), R12
	MOVQ 16(R8)(SI*8), R13
	SBBQ 16(R9)(SI*8), R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ 24(R9)(SI*8), R14
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)
	RCLQ $1, CX		// c = CF

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE subVV_quad		// another full group of 4 is available

subVV_tail:
	ADDQ $4, DI		// n += 4, undoing the bias
	JLE subVV_done		// n <= 0: nothing left

subVV_word:	// n > 0
	RCRQ $1, CX		// CF = c
	MOVQ 0(R8)(SI*8), R11
	SBBQ 0(R9)(SI*8), R11
	MOVQ R11, 0(R10)(SI*8)
	RCLQ $1, CX		// c = CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG subVV_word		// loop while n > 0

subVV_done:
	MOVQ CX, c+72(FP)	// return c
	RET
|
||
|
|
||
|
|
||
|
// func addVW(z, x []Word, y Word) (c Word)
// Adds the single word y to the vector x, writing the result to z,
// and returns the carry out of the top word in c.
//
// Register use:
//	R10 = z base, R8 = x base
//	SI  = i (word index), DI = n (words left, biased by -4 in the unrolled loop)
//	CX  = c (starts out as y, afterwards the carry between iterations)
TEXT ·addVW(SB),7,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y

	MOVQ $0, SI		// i = 0

	// Change the JL below into JMP to bypass the unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL addVW_tail		// fewer than 4 words: go to the word-at-a-time loop

addVW_quad:	// n >= 0, at least 4 words remain
	// Four words per iteration. ZERO_CX and MOVQ both leave CF alone,
	// so the carry produced by ADDQ ripples through the ADCQs.
	// All loads are done before any store.
	MOVQ 0(R8)(SI*8), R11
	ADDQ CX, R11		// x[i] + c
	ZERO_CX			// CX = 0, CF preserved
	MOVQ 8(R8)(SI*8), R12
	ADCQ $0, R12
	MOVQ 16(R8)(SI*8), R13
	ADCQ $0, R13
	MOVQ 24(R8)(SI*8), R14
	ADCQ $0, R14
	SETCS CX		// c = CF
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE addVW_quad		// another full group of 4 is available

addVW_tail:
	ADDQ $4, DI		// n += 4, undoing the bias
	JLE addVW_done		// n <= 0: nothing left

addVW_word:	// n > 0
	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
	MOVQ CX, 0(R10)(SI*8)	// z[i] = CX
	ZERO_CX			// CX = 0, CF preserved
	RCLQ $1, CX		// c = CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG addVW_word		// loop while n > 0

addVW_done:
	MOVQ CX, c+56(FP)	// return c
	RET
|
||
|
|
||
|
|
||
|
// func subVW(z, x []Word, y Word) (c Word)
// Subtracts the single word y from the vector x, writing the result
// to z, and returns the borrow out of the top word in c.
// Structurally identical to addVW, with SUBQ/SBBQ in place of ADDQ/ADCQ.
//
// Register use:
//	R10 = z base, R8 = x base
//	SI  = i (word index), DI = n (words left, biased by -4 in the unrolled loop)
//	CX  = c (starts out as y, afterwards the borrow between iterations)
TEXT ·subVW(SB),7,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), DI
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), CX	// c = y

	MOVQ $0, SI		// i = 0

	// Change the JL below into JMP to bypass the unrolled loop.
	SUBQ $4, DI		// n -= 4
	JL subVW_tail		// fewer than 4 words: go to the word-at-a-time loop

subVW_quad:	// n >= 0, at least 4 words remain
	// Four words per iteration. ZERO_CX and MOVQ both leave CF alone,
	// so the borrow produced by SUBQ ripples through the SBBQs.
	// All loads are done before any store.
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c
	ZERO_CX			// CX = 0, CF preserved
	MOVQ 8(R8)(SI*8), R12
	SBBQ $0, R12
	MOVQ 16(R8)(SI*8), R13
	SBBQ $0, R13
	MOVQ 24(R8)(SI*8), R14
	SBBQ $0, R14
	SETCS CX		// c = CF
	MOVQ R11, 0(R10)(SI*8)
	MOVQ R12, 8(R10)(SI*8)
	MOVQ R13, 16(R10)(SI*8)
	MOVQ R14, 24(R10)(SI*8)

	ADDQ $4, SI		// i += 4
	SUBQ $4, DI		// n -= 4
	JGE subVW_quad		// another full group of 4 is available

subVW_tail:
	ADDQ $4, DI		// n += 4, undoing the bias
	JLE subVW_done		// n <= 0: nothing left

subVW_word:	// n > 0
	MOVQ 0(R8)(SI*8), R11
	SUBQ CX, R11		// x[i] - c
	MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c
	ZERO_CX			// CX = 0, CF preserved
	RCLQ $1, CX		// c = CF

	ADDQ $1, SI		// i++
	SUBQ $1, DI		// n--
	JG subVW_word		// loop while n > 0

subVW_done:
	MOVQ CX, c+56(FP)	// return c
	RET
|
||
|
|
||
|
|
||
|
// func shlVU(z, x []Word, s uint) (c Word)
// Shifts the vector x left by s bits into z, walking from the top
// word down to word 0. c receives the bits shifted out of the top
// word; for an empty z, c is 0 and nothing is written.
//
// Register use:
//	R10 = z base, R8 = x base, CX = s
//	BX  = i, AX = w1 (next lower word), DX = w (word being produced)
TEXT ·shlVU(SB),7,$0
	MOVQ z_len+8(FP), BX	// i = len(z)
	SUBQ $1, BX		// i--
	JL shlVU_empty		// i < 0, i.e. len(z) <= 0

	// len(z) > 0
	MOVQ s+48(FP), CX
	MOVQ x+24(FP), R8
	MOVQ z+0(FP), R10
	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
	MOVQ $0, DX
	SHLQ CX, DX:AX		// DX = w1 >> (64-s): the bits leaving the top word
	MOVQ DX, c+56(FP)

	CMPQ BX, $0
	JLE shlVU_low		// i <= 0: only z[0] remains

shlVU_loop:	// i > 0
	MOVQ AX, DX		// w = w1
	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
	SHLQ CX, DX:AX		// DX = w<<s | w1>>(64-s)
	MOVQ DX, (R10)(BX*8)	// z[i] = DX
	SUBQ $1, BX		// i--
	JG shlVU_loop		// loop while i > 0

shlVU_low:	// i <= 0
	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R10)		// z[0] = w1<<s
	RET

shlVU_empty:
	MOVQ $0, c+56(FP)
	RET
|
||
|
|
||
|
|
||
|
// func shrVU(z, x []Word, s uint) (c Word)
// Shifts the vector x right by s bits into z, walking from word 0
// up to the top word. c receives the bits shifted out of word 0;
// for an empty z, c is 0 and nothing is written.
//
// Register use:
//	R10 = z base, R8 = x base, CX = s, R11 = n-1
//	BX  = i, AX = w1 (next higher word), DX = w (word being produced)
TEXT ·shrVU(SB),7,$0
	MOVQ z_len+8(FP), R11
	SUBQ $1, R11		// R11 = n-1
	JL shrVU_empty		// n-1 < 0, i.e. len(z) <= 0

	// len(z) > 0
	MOVQ s+48(FP), CX
	MOVQ x+24(FP), R8
	MOVQ z+0(FP), R10
	MOVQ (R8), AX		// w1 = x[0]
	MOVQ $0, DX
	SHRQ CX, DX:AX		// DX = w1 << (64-s): the bits leaving word 0
	MOVQ DX, c+56(FP)

	MOVQ $0, BX		// i = 0

shrVU_loop:
	CMPQ BX, R11
	JGE shrVU_top		// i >= n-1: only the top word remains

	// i < n-1
	MOVQ AX, DX		// w = w1
	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
	SHRQ CX, DX:AX		// DX = w>>s | w1<<(64-s)
	MOVQ DX, (R10)(BX*8)	// z[i] = DX
	ADDQ $1, BX		// i++
	JMP shrVU_loop

shrVU_top:	// i >= n-1
	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
	RET

shrVU_empty:
	MOVQ $0, c+56(FP)
	RET
|
||
|
|
||
|
|
||
|
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For each i in [0, len(z)): forms x[i]*y + c as a double word,
// stores its low word in z[i] and carries its high word into the
// next iteration. c starts as r; the final c is returned.
//
// Register use:
//	R10 = z base, R8 = x base, R9 = y, R11 = n
//	BX  = i, CX = c
TEXT ·mulAddVWW(SB),7,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), R11
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// c = r
	MOVQ $0, BX		// i = 0

mulAddVWW_loop:
	CMPQ BX, R11
	JGE mulAddVWW_done	// stop once i >= n

	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX		// propagate the carry into the high word
	MOVQ AX, (R10)(BX*8)	// z[i] = low word
	MOVQ DX, CX		// c = high word
	ADDQ $1, BX		// i++
	JMP mulAddVWW_loop

mulAddVWW_done:
	MOVQ CX, c+64(FP)
	RET
|
||
|
|
||
|
|
||
|
// func addMulVVW(z, x []Word, y Word) (c Word)
// For each i in [0, len(z)): adds x[i]*y + c into z[i] in place and
// carries the overflow word into the next iteration. c starts at 0;
// the final c is returned.
//
// Register use:
//	R10 = z base, R8 = x base, R9 = y, R11 = n
//	BX  = i, CX = c
TEXT ·addMulVVW(SB),7,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), R11
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ $0, CX		// c = 0
	MOVQ $0, BX		// i = 0

addMulVVW_loop:
	CMPQ BX, R11
	JGE addMulVVW_done	// stop once i >= n

	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i] * y
	ADDQ CX, AX		// low word += c
	ADCQ $0, DX		// propagate the carry into the high word
	ADDQ AX, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DX		// propagate that carry as well
	MOVQ DX, CX		// c = high word
	ADDQ $1, BX		// i++
	JMP addMulVVW_loop

addMulVVW_done:
	MOVQ CX, c+56(FP)
	RET
|
||
|
|
||
|
|
||
|
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
// Long division of a vector by the single word y, from the top word
// down: each step divides (r:x[i]) by y, stores the quotient in z[i]
// and keeps the remainder as r for the next step. r starts as xn;
// the final remainder is returned.
//
// Register use:
//	R10 = z base, R8 = x base, R9 = y
//	BX  = i, DX = r (running remainder), AX = current word / quotient
TEXT ·divWVW(SB),7,$0
	MOVQ z+0(FP), R10
	MOVQ z_len+8(FP), BX	// i = len(z)
	MOVQ xn+24(FP), DX	// r = xn
	MOVQ x+32(FP), R8
	MOVQ y+56(FP), R9

divWVW_loop:
	SUBQ $1, BX		// i--
	JL divWVW_done		// stop once i < 0

	MOVQ (R8)(BX*8), AX
	DIVQ R9			// AX = (r:x[i]) / y, DX = remainder
	MOVQ AX, (R10)(BX*8)	// z[i] = quotient
	JMP divWVW_loop

divWVW_done:
	MOVQ DX, r+64(FP)
	RET
|
||
|
|
||
|
// func bitLen(x Word) (n int)
// Returns the number of bits needed to represent x: the index of the
// highest set bit plus one, or 0 when x is 0.
TEXT ·bitLen(SB),7,$0
	BSRQ x+0(FP), AX	// AX = index of the highest set bit; ZF is set when x == 0
	JNE bitLen_nonzero

	// x == 0
	MOVQ $0, n+8(FP)
	RET

bitLen_nonzero:
	ADDQ $1, AX		// bit index -> bit count
	MOVQ AX, n+8(FP)
	RET
|