vendor: Update vendor dir from a80c0fda

This commit is contained in:
Jakob Borg 2019-01-20 08:50:40 +01:00
parent 00fa77dd47
commit 29e4b417f2
8 changed files with 68 additions and 47 deletions

View File

@ -21,3 +21,4 @@ matrix:
script: script:
- diff -au <(gofmt -d .) <(printf "") - diff -au <(gofmt -d .) <(printf "")
- go test -race -v ./... - go test -race -v ./...
- go tool vet -asmdecl .

View File

@ -1,6 +1,6 @@
# sha256-simd # sha256-simd
Accelerate SHA256 computations in pure Go using AVX512 and AVX2 for Intel and ARM64 for ARM. On AVX512 it provides an up to 8x improvement (over 3 GB/s per core) in comparison to AVX2. Accelerate SHA256 computations in pure Go using AVX512, SHA Extensions and AVX2 for Intel and ARM64 for ARM. On AVX512 it provides an up to 8x improvement (over 3 GB/s per core) in comparison to AVX2. SHA Extensions give a performance boost of close to 4x over AVX2.
## Introduction ## Introduction
@ -8,7 +8,19 @@ This package is designed as a replacement for `crypto/sha256`. For Intel CPUs it
This package uses Golang assembly. The AVX512 version is based on the Intel's "multi-buffer crypto library for IPSec" whereas the other Intel implementations are described in "Fast SHA-256 Implementations on Intel Architecture Processors" by J. Guilford et al. This package uses Golang assembly. The AVX512 version is based on the Intel's "multi-buffer crypto library for IPSec" whereas the other Intel implementations are described in "Fast SHA-256 Implementations on Intel Architecture Processors" by J. Guilford et al.
## New: Support for AVX512 ## New: Support for Intel SHA Extensions
Support for the Intel SHA Extensions has been added by Kristofer Peterson (@svenski123), originally developed for spacemeshos [here](https://github.com/spacemeshos/POET/issues/23). On CPUs that support them (known so far: Intel Celeron J3455 and AMD Ryzen) they give a significant boost in performance (with thanks to @AudriusButkevicius for reporting the results; full results [here](https://github.com/minio/sha256-simd/pull/37#issuecomment-451607827)).
```
$ benchcmp avx2.txt sha-ext.txt
benchmark AVX2 MB/s SHA Ext MB/s speedup
BenchmarkHash5M 514.40 1975.17 3.84x
```
Thanks to Kristofer Peterson, we also added further performance changes, such as optimized padding and endian conversions, which sped up all implementations: the Intel SHA Extensions alone doubled performance for small sizes, while the other changes increased everything by roughly 50%.
## Support for AVX512
We have added support for AVX512 which results in an up to 8x performance improvement over AVX2 (3.0 GHz Xeon Platinum 8124M CPU): We have added support for AVX512 which results in an up to 8x performance improvement over AVX2 (3.0 GHz Xeon Platinum 8124M CPU):
@ -66,6 +78,7 @@ Below is the speed in MB/s for a single core (ranked fast to slow) for blocks la
| Processor | SIMD | Speed (MB/s) | | Processor | SIMD | Speed (MB/s) |
| --------------------------------- | ------- | ------------:| | --------------------------------- | ------- | ------------:|
| 3.0 GHz Intel Xeon Platinum 8124M | AVX512 | 3498 | | 3.0 GHz Intel Xeon Platinum 8124M | AVX512 | 3498 |
| 3.7 GHz AMD Ryzen 7 2700X | SHA Ext | 1979 |
| 1.2 GHz ARM Cortex-A53 | ARM64 | 638 | | 1.2 GHz ARM Cortex-A53 | ARM64 | 638 |
| 3.0 GHz Intel Xeon Platinum 8124M | AVX2 | 449 | | 3.0 GHz Intel Xeon Platinum 8124M | AVX2 | 449 |
| 3.1 GHz Intel Core i7 | AVX | 362 | | 3.1 GHz Intel Core i7 | AVX | 362 |

View File

@ -32,8 +32,6 @@
// equivalents // equivalents
// //
#include "textflag.h"
DATA K256<>+0x000(SB)/8, $0x71374491428a2f98 DATA K256<>+0x000(SB)/8, $0x71374491428a2f98
DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf
DATA K256<>+0x010(SB)/8, $0x71374491428a2f98 DATA K256<>+0x010(SB)/8, $0x71374491428a2f98
@ -114,16 +112,25 @@ DATA K256<>+0x258(SB)/8, $0x0b0a090803020100
GLOBL K256<>(SB), 8, $608 GLOBL K256<>(SB), 8, $608
// func blockAvx2(h []uint32, message []uint8) // We need 0x220 stack space aligned on a 512 boundary, so for the
TEXT ·blockAvx2(SB), 7, $0 // worstcase-aligned SP we need twice this amount, being 1088 (=0x440)
//
// SP aligned end-aligned stacksize
// 100013d0 10001400 10001620 592
// 100013d8 10001400 10001620 584
// 100013e0 10001600 10001820 1088
// 100013e8 10001600 10001820 1080
MOVQ ctx+0(FP), DI // DI: &h // func blockAvx2(h []uint32, message []uint8)
MOVQ inp+24(FP), SI // SI: &message TEXT ·blockAvx2(SB),$1088-48
MOVQ inplength+32(FP), DX // len(message)
MOVQ h+0(FP), DI // DI: &h
MOVQ message_base+24(FP), SI // SI: &message
MOVQ message_len+32(FP), DX // len(message)
ADDQ SI, DX // end pointer of input ADDQ SI, DX // end pointer of input
MOVQ SP, R11 // copy stack pointer MOVQ SP, R11 // copy stack pointer
SUBQ $0x220, SP // sp -= 0x220 ADDQ $0x220, SP // sp += 0x220
ANDQ $0xfffffffffffffc00, SP // align stack frame ANDQ $0xfffffffffffffe00, SP // align stack frame
ADDQ $0x1c0, SP ADDQ $0x1c0, SP
MOVQ DI, 0x40(SP) // save ctx MOVQ DI, 0x40(SP) // save ctx
MOVQ SI, 0x48(SP) // save input MOVQ SI, 0x48(SP) // save input
@ -1435,7 +1442,7 @@ loop2:
done: done:
MOVQ BP, SP MOVQ BP, SP
MOVQ 0x58(SP), SP MOVQ 0x58(SP), SP // restore saved stack pointer
WORD $0xf8c5; BYTE $0x77 // vzeroupper WORD $0xf8c5; BYTE $0x77 // vzeroupper
RET RET

View File

@ -2,7 +2,7 @@ TEXT ·sha256X16Avx512(SB), 7, $0
MOVQ digests+0(FP), DI MOVQ digests+0(FP), DI
MOVQ scratch+8(FP), R12 MOVQ scratch+8(FP), R12
MOVQ mask_len+32(FP), SI MOVQ mask_len+32(FP), SI
MOVQ r14+24(FP), R13 MOVQ mask_base+24(FP), R13
MOVQ (R13), R14 MOVQ (R13), R14
LONG $0x92fbc1c4; BYTE $0xce LONG $0x92fbc1c4; BYTE $0xce
LEAQ inputs+48(FP), AX LEAQ inputs+48(FP), AX

View File

@ -232,15 +232,15 @@
ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
// func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64) // func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
TEXT ·blockAvx(SB), 7, $0 TEXT ·blockAvx(SB), 7, $0-80
MOVQ h+0(FP), SI // SI: &h MOVQ h+0(FP), SI // SI: &h
MOVQ message+24(FP), R8 // &message MOVQ message_base+24(FP), R8 // &message
MOVQ lenmessage+32(FP), R9 // length of message MOVQ message_len+32(FP), R9 // length of message
CMPQ R9, $0 CMPQ R9, $0
JEQ done_hash JEQ done_hash
ADDQ R8, R9 ADDQ R8, R9
MOVQ R9, _inp_end+64(FP) // store end of message MOVQ R9, reserved2+64(FP) // store end of message
// Register definition // Register definition
// a --> eax // a --> eax
@ -269,7 +269,7 @@ TEXT ·blockAvx(SB), 7, $0
MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00 MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
MOVQ message+24(FP), SI // SI: &message MOVQ message_base+24(FP), SI // SI: &message
loop0: loop0:
LEAQ constants<>(SB), BP LEAQ constants<>(SB), BP
@ -284,25 +284,25 @@ loop0:
MOVOU 3*16(SI), X7 MOVOU 3*16(SI), X7
LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13 LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13
MOVQ SI, _inp+72(FP) MOVQ SI, reserved3+72(FP)
MOVD $0x3, DI MOVD $0x3, DI
// schedule 48 input dwords, by doing 3 rounds of 16 each // schedule 48 input dwords, by doing 3 rounds of 16 each
loop1: loop1:
LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */ LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP] /* Add 2nd constant to message */ LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP] /* Add 3rd constant to message */ LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP] /* Add 3rd constant to message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP] /* Add 4th constant to message */ LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP] /* Add 4th constant to message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
ADDQ $64, BP ADDQ $64, BP
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
@ -313,14 +313,14 @@ loop1:
loop2: loop2:
LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */ LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48) DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52) DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56) DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60) DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60)
LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP] /* Add 2nd constant to message */ LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
ADDQ $32, BP ADDQ $32, BP
DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48) DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52) DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
@ -351,9 +351,9 @@ loop2:
ADDL (7*4)(SI), R11 // H7 = h + H7 ADDL (7*4)(SI), R11 // H7 = h + H7
MOVL R11, (7*4)(SI) MOVL R11, (7*4)(SI)
MOVQ _inp+72(FP), SI MOVQ reserved3+72(FP), SI
ADDQ $64, SI ADDQ $64, SI
CMPQ _inp_end+64(FP), SI CMPQ reserved2+64(FP), SI
JNE loop0 JNE loop0
done_hash: done_hash:

View File

@ -2,7 +2,7 @@
// SHA intrinsic version of SHA256 // SHA intrinsic version of SHA256
// Minio Cloud Storage, (C) 2018 Minio, Inc. // Kristofer Peterson, (C) 2018.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.

View File

@ -244,15 +244,15 @@
ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
// func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64) // func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
TEXT ·blockSsse(SB), 7, $0 TEXT ·blockSsse(SB), 7, $0-80
MOVQ h+0(FP), SI // SI: &h MOVQ h+0(FP), SI // SI: &h
MOVQ message+24(FP), R8 // &message MOVQ message_base+24(FP), R8 // &message
MOVQ lenmessage+32(FP), R9 // length of message MOVQ message_len+32(FP), R9 // length of message
CMPQ R9, $0 CMPQ R9, $0
JEQ done_hash JEQ done_hash
ADDQ R8, R9 ADDQ R8, R9
MOVQ R9, _inp_end+64(FP) // store end of message MOVQ R9, reserved2+64(FP) // store end of message
// Register definition // Register definition
// a --> eax // a --> eax
@ -281,7 +281,7 @@ TEXT ·blockSsse(SB), 7, $0
MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00 MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
MOVQ message+24(FP), SI // SI: &message MOVQ message_base+24(FP), SI // SI: &message
loop0: loop0:
LEAQ constants<>(SB), BP LEAQ constants<>(SB), BP
@ -296,7 +296,7 @@ loop0:
MOVOU 3*16(SI), X7 MOVOU 3*16(SI), X7
LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13 LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13
MOVQ SI, _inp+72(FP) MOVQ SI, reserved3+72(FP)
MOVD $0x3, DI MOVD $0x3, DI
// Align // Align
@ -306,22 +306,22 @@ loop0:
loop1: loop1:
MOVOU X4, X9 MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */ LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
MOVOU X4, X9 MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */ LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
MOVOU X4, X9 MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP] /* Add 3rd constant to message */ LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP] /* Add 3rd constant to message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11) FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
MOVOU X4, X9 MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP] /* Add 4th constant to message */ LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP] /* Add 4th constant to message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
ADDQ $64, BP ADDQ $64, BP
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8) FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
@ -333,7 +333,7 @@ loop1:
loop2: loop2:
MOVOU X4, X9 MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */ LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48) DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52) DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56) DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
@ -341,7 +341,7 @@ loop2:
MOVOU X5, X9 MOVOU X5, X9
LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */ LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, _xfer+48(FP) MOVOU X9, reserved0+48(FP)
ADDQ $32, BP ADDQ $32, BP
DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48) DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52) DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
@ -372,9 +372,9 @@ loop2:
ADDL (7*4)(SI), R11 // H7 = h + H7 ADDL (7*4)(SI), R11 // H7 = h + H7
MOVL R11, (7*4)(SI) MOVL R11, (7*4)(SI)
MOVQ _inp+72(FP), SI MOVQ reserved3+72(FP), SI
ADDQ $64, SI ADDQ $64, SI
CMPQ _inp_end+64(FP), SI CMPQ reserved2+64(FP), SI
JNE loop0 JNE loop0
done_hash: done_hash:

2
vendor/modules.txt vendored
View File

@ -95,7 +95,7 @@ github.com/lib/pq
github.com/lib/pq/oid github.com/lib/pq/oid
# github.com/matttproud/golang_protobuf_extensions v1.0.1 # github.com/matttproud/golang_protobuf_extensions v1.0.1
github.com/matttproud/golang_protobuf_extensions/pbutil github.com/matttproud/golang_protobuf_extensions/pbutil
# github.com/minio/sha256-simd v0.0.0-20190104231041-e529fa194128 # github.com/minio/sha256-simd v0.0.0-20190117184323-cc1980cb0338
github.com/minio/sha256-simd github.com/minio/sha256-simd
# github.com/oschwald/geoip2-golang v1.1.0 # github.com/oschwald/geoip2-golang v1.1.0
github.com/oschwald/geoip2-golang github.com/oschwald/geoip2-golang