vendor: Update vendor dir from a80c0fda

This commit is contained in:
Jakob Borg 2019-01-20 08:50:40 +01:00
parent 00fa77dd47
commit 29e4b417f2
8 changed files with 68 additions and 47 deletions

View File

@ -21,3 +21,4 @@ matrix:
script:
- diff -au <(gofmt -d .) <(printf "")
- go test -race -v ./...
- go tool vet -asmdecl .

View File

@ -1,6 +1,6 @@
# sha256-simd
Accelerate SHA256 computations in pure Go using AVX512 and AVX2 for Intel and ARM64 for ARM. On AVX512 it provides an up to 8x improvement (over 3 GB/s per core) in comparison to AVX2.
Accelerate SHA256 computations in pure Go using AVX512, SHA Extensions and AVX2 for Intel and ARM64 for ARM. On AVX512 it provides an up to 8x improvement (over 3 GB/s per core) in comparison to AVX2. SHA Extensions give a performance boost of close to 4x over AVX2.
## Introduction
@ -8,7 +8,19 @@ This package is designed as a replacement for `crypto/sha256`. For Intel CPUs it
This package uses Golang assembly. The AVX512 version is based on Intel's "multi-buffer crypto library for IPSec", whereas the other Intel implementations are described in "Fast SHA-256 Implementations on Intel Architecture Processors" by J. Guilford et al.
## New: Support for AVX512
## New: Support for Intel SHA Extensions
Support for the Intel SHA Extensions has been added by Kristofer Peterson (@svenski123), originally developed for spacemeshos [here](https://github.com/spacemeshos/POET/issues/23). On CPUs that support it (known thus far: Intel Celeron J3455 and AMD Ryzen) it gives a significant boost in performance (with thanks to @AudriusButkevicius for reporting the results; full results [here](https://github.com/minio/sha256-simd/pull/37#issuecomment-451607827)).
```
$ benchcmp avx2.txt sha-ext.txt
benchmark AVX2 MB/s SHA Ext MB/s speedup
BenchmarkHash5M 514.40 1975.17 3.84x
```
Thanks to Kristofer Peterson, we also added further performance improvements, such as optimized padding and endian conversions, which sped up all implementations: the Intel SHA Extensions alone doubled performance for small sizes, while the other changes increased everything by roughly 50%.
## Support for AVX512
We have added support for AVX512 which results in an up to 8x performance improvement over AVX2 (3.0 GHz Xeon Platinum 8124M CPU):
@ -66,6 +78,7 @@ Below is the speed in MB/s for a single core (ranked fast to slow) for blocks la
| Processor | SIMD | Speed (MB/s) |
| --------------------------------- | ------- | ------------:|
| 3.0 GHz Intel Xeon Platinum 8124M | AVX512 | 3498 |
| 3.7 GHz AMD Ryzen 7 2700X | SHA Ext | 1979 |
| 1.2 GHz ARM Cortex-A53 | ARM64 | 638 |
| 3.0 GHz Intel Xeon Platinum 8124M | AVX2 | 449 |
| 3.1 GHz Intel Core i7 | AVX | 362 |

View File

@ -32,8 +32,6 @@
// equivalents
//
#include "textflag.h"
DATA K256<>+0x000(SB)/8, $0x71374491428a2f98
DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf
DATA K256<>+0x010(SB)/8, $0x71374491428a2f98
@ -114,16 +112,25 @@ DATA K256<>+0x258(SB)/8, $0x0b0a090803020100
GLOBL K256<>(SB), 8, $608
// func blockAvx2(h []uint32, message []uint8)
TEXT ·blockAvx2(SB), 7, $0
// We need 0x220 bytes of stack space aligned on a 512-byte boundary, so for
// the worst-case alignment of SP we need twice this amount, i.e. 1088 (=0x440) bytes.
//
// SP aligned end-aligned stacksize
// 100013d0 10001400 10001620 592
// 100013d8 10001400 10001620 584
// 100013e0 10001600 10001820 1088
// 100013e8 10001600 10001820 1080
MOVQ ctx+0(FP), DI // DI: &h
MOVQ inp+24(FP), SI // SI: &message
MOVQ inplength+32(FP), DX // len(message)
// func blockAvx2(h []uint32, message []uint8)
TEXT ·blockAvx2(SB),$1088-48
MOVQ h+0(FP), DI // DI: &h
MOVQ message_base+24(FP), SI // SI: &message
MOVQ message_len+32(FP), DX // len(message)
ADDQ SI, DX // end pointer of input
MOVQ SP, R11 // copy stack pointer
SUBQ $0x220, SP // sp -= 0x220
ANDQ $0xfffffffffffffc00, SP // align stack frame
ADDQ $0x220, SP // sp += 0x220
ANDQ $0xfffffffffffffe00, SP // align stack frame
ADDQ $0x1c0, SP
MOVQ DI, 0x40(SP) // save ctx
MOVQ SI, 0x48(SP) // save input
@ -1435,7 +1442,7 @@ loop2:
done:
MOVQ BP, SP
MOVQ 0x58(SP), SP
MOVQ 0x58(SP), SP // restore saved stack pointer
WORD $0xf8c5; BYTE $0x77 // vzeroupper
RET

View File

@ -2,7 +2,7 @@ TEXT ·sha256X16Avx512(SB), 7, $0
MOVQ digests+0(FP), DI
MOVQ scratch+8(FP), R12
MOVQ mask_len+32(FP), SI
MOVQ r14+24(FP), R13
MOVQ mask_base+24(FP), R13
MOVQ (R13), R14
LONG $0x92fbc1c4; BYTE $0xce
LEAQ inputs+48(FP), AX

View File

@ -232,15 +232,15 @@
ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
// func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
TEXT ·blockAvx(SB), 7, $0
TEXT ·blockAvx(SB), 7, $0-80
MOVQ h+0(FP), SI // SI: &h
MOVQ message+24(FP), R8 // &message
MOVQ lenmessage+32(FP), R9 // length of message
MOVQ h+0(FP), SI // SI: &h
MOVQ message_base+24(FP), R8 // &message
MOVQ message_len+32(FP), R9 // length of message
CMPQ R9, $0
JEQ done_hash
ADDQ R8, R9
MOVQ R9, _inp_end+64(FP) // store end of message
MOVQ R9, reserved2+64(FP) // store end of message
// Register definition
// a --> eax
@ -269,7 +269,7 @@ TEXT ·blockAvx(SB), 7, $0
MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
MOVQ message+24(FP), SI // SI: &message
MOVQ message_base+24(FP), SI // SI: &message
loop0:
LEAQ constants<>(SB), BP
@ -284,25 +284,25 @@ loop0:
MOVOU 3*16(SI), X7
LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13
MOVQ SI, _inp+72(FP)
MOVQ SI, reserved3+72(FP)
MOVD $0x3, DI
// schedule 48 input dwords, by doing 3 rounds of 16 each
loop1:
LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP] /* Add 3rd constant to message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP] /* Add 4th constant to message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
ADDQ $64, BP
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
@ -313,14 +313,14 @@ loop1:
loop2:
LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60)
LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
ADDQ $32, BP
DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
@ -351,9 +351,9 @@ loop2:
ADDL (7*4)(SI), R11 // H7 = h + H7
MOVL R11, (7*4)(SI)
MOVQ _inp+72(FP), SI
MOVQ reserved3+72(FP), SI
ADDQ $64, SI
CMPQ _inp_end+64(FP), SI
CMPQ reserved2+64(FP), SI
JNE loop0
done_hash:

View File

@ -2,7 +2,7 @@
// SHA intrinsic version of SHA256
// Minio Cloud Storage, (C) 2018 Minio, Inc.
// Kristofer Peterson, (C) 2018.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

View File

@ -244,15 +244,15 @@
ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
// func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
TEXT ·blockSsse(SB), 7, $0
TEXT ·blockSsse(SB), 7, $0-80
MOVQ h+0(FP), SI // SI: &h
MOVQ message+24(FP), R8 // &message
MOVQ lenmessage+32(FP), R9 // length of message
MOVQ h+0(FP), SI // SI: &h
MOVQ message_base+24(FP), R8 // &message
MOVQ message_len+32(FP), R9 // length of message
CMPQ R9, $0
JEQ done_hash
ADDQ R8, R9
MOVQ R9, _inp_end+64(FP) // store end of message
MOVQ R9, reserved2+64(FP) // store end of message
// Register definition
// a --> eax
@ -281,7 +281,7 @@ TEXT ·blockSsse(SB), 7, $0
MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
MOVQ message+24(FP), SI // SI: &message
MOVQ message_base+24(FP), SI // SI: &message
loop0:
LEAQ constants<>(SB), BP
@ -296,7 +296,7 @@ loop0:
MOVOU 3*16(SI), X7
LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13
MOVQ SI, _inp+72(FP)
MOVQ SI, reserved3+72(FP)
MOVD $0x3, DI
// Align
@ -306,22 +306,22 @@ loop0:
loop1:
MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP] /* Add 3rd constant to message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP] /* Add 4th constant to message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
ADDQ $64, BP
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
@ -333,7 +333,7 @@ loop1:
loop2:
MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
@ -341,7 +341,7 @@ loop2:
MOVOU X5, X9
LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, _xfer+48(FP)
MOVOU X9, reserved0+48(FP)
ADDQ $32, BP
DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
@ -372,9 +372,9 @@ loop2:
ADDL (7*4)(SI), R11 // H7 = h + H7
MOVL R11, (7*4)(SI)
MOVQ _inp+72(FP), SI
MOVQ reserved3+72(FP), SI
ADDQ $64, SI
CMPQ _inp_end+64(FP), SI
CMPQ reserved2+64(FP), SI
JNE loop0
done_hash:

2
vendor/modules.txt vendored
View File

@ -95,7 +95,7 @@ github.com/lib/pq
github.com/lib/pq/oid
# github.com/matttproud/golang_protobuf_extensions v1.0.1
github.com/matttproud/golang_protobuf_extensions/pbutil
# github.com/minio/sha256-simd v0.0.0-20190104231041-e529fa194128
# github.com/minio/sha256-simd v0.0.0-20190117184323-cc1980cb0338
github.com/minio/sha256-simd
# github.com/oschwald/geoip2-golang v1.1.0
github.com/oschwald/geoip2-golang