From cbcc3ea132e1374e43296564900e3b9d4d1c896a Mon Sep 17 00:00:00 2001 From: Audrius Butkevicius Date: Sat, 2 Sep 2017 06:04:35 +0000 Subject: [PATCH] lib/connections: Use our own fork of kcp (fixes #4063) This updates kcp and uses our own fork which: 1. Keys sessions not just by remote address, but by remote address + conversation id 2. Allows not to close connections that were passed directly to the library. 3. Resets cache key if the session gets terminated. GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/4339 LGTM: calmh --- lib/connections/kcp_dial.go | 4 +- lib/connections/kcp_listen.go | 2 +- lib/protocol/benchmark_test.go | 2 +- .../kcp-go/LICENSE | 0 .../kcp-go/crypt.go | 14 +- .../kcp-go/fec.go | 34 +- .../kcp-go/kcp.go | 44 +- .../kcp-go/sess.go | 252 ++++---- .../kcp-go/snmp.go | 0 .../kcp-go/updater.go | 24 +- .../kcp-go/xor.go | 0 vendor/github.com/templexxx/xor/LICENSE | 21 + vendor/github.com/templexxx/xor/avx2_amd64.s | 442 ++++++++++++++ vendor/github.com/templexxx/xor/nosimd.go | 116 ++++ vendor/github.com/templexxx/xor/sse2_amd64.s | 574 ++++++++++++++++++ vendor/github.com/templexxx/xor/xor.go | 49 ++ vendor/github.com/templexxx/xor/xor_amd64.go | 118 ++++ vendor/github.com/templexxx/xor/xor_other.go | 19 + vendor/manifest | 24 +- 19 files changed, 1551 insertions(+), 188 deletions(-) rename vendor/github.com/{xtaci => AudriusButkevicius}/kcp-go/LICENSE (100%) rename vendor/github.com/{xtaci => AudriusButkevicius}/kcp-go/crypt.go (94%) rename vendor/github.com/{xtaci => AudriusButkevicius}/kcp-go/fec.go (89%) rename vendor/github.com/{xtaci => AudriusButkevicius}/kcp-go/kcp.go (96%) rename vendor/github.com/{xtaci => AudriusButkevicius}/kcp-go/sess.go (79%) rename vendor/github.com/{xtaci => AudriusButkevicius}/kcp-go/snmp.go (100%) rename vendor/github.com/{xtaci => AudriusButkevicius}/kcp-go/updater.go (82%) rename vendor/github.com/{xtaci => AudriusButkevicius}/kcp-go/xor.go (100%) create mode 100644 vendor/github.com/templexxx/xor/LICENSE create mode 100644 vendor/github.com/templexxx/xor/avx2_amd64.s create mode 100644 vendor/github.com/templexxx/xor/nosimd.go create mode 100644 vendor/github.com/templexxx/xor/sse2_amd64.s create mode 100644 vendor/github.com/templexxx/xor/xor.go create mode 100644 vendor/github.com/templexxx/xor/xor_amd64.go create mode 100644 vendor/github.com/templexxx/xor/xor_other.go diff --git a/lib/connections/kcp_dial.go b/lib/connections/kcp_dial.go index d94843f22..3f923be0f 100644 --- a/lib/connections/kcp_dial.go +++ b/lib/connections/kcp_dial.go @@ -11,9 +11,9 @@ import ( "net/url" "time" + "github.com/AudriusButkevicius/kcp-go" "github.com/syncthing/syncthing/lib/config" "github.com/syncthing/syncthing/lib/protocol" - "github.com/xtaci/kcp-go" "github.com/xtaci/smux" ) @@ -38,7 +38,7 @@ func (d *kcpDialer) Dial(id protocol.DeviceID, uri *url.URL) (internalConn, erro // Try to dial via an existing listening connection // giving better changes punching through NAT. if f := getDialingFilter(); f != nil { - conn, err = kcp.NewConn(uri.Host, nil, 0, 0, f.NewConn(kcpConversationFilterPriority, &kcpConversationFilter{})) + conn, err = kcp.NewConn(uri.Host, nil, 0, 0, f.NewConn(kcpConversationFilterPriority, &kcpConversationFilter{}), false) l.Debugf("dial %s using existing conn on %s", uri.String(), conn.LocalAddr()) } else { conn, err = kcp.DialWithOptions(uri.Host, nil, 0, 0) diff --git a/lib/connections/kcp_listen.go b/lib/connections/kcp_listen.go index b4a1fc485..1bf7f95c2 100644 --- a/lib/connections/kcp_listen.go +++ b/lib/connections/kcp_listen.go @@ -14,11 +14,11 @@ import ( "sync" "time" + "github.com/AudriusButkevicius/kcp-go" "github.com/AudriusButkevicius/pfilter" "github.com/ccding/go-stun/stun" "github.com/syncthing/syncthing/lib/config" "github.com/syncthing/syncthing/lib/nat" - "github.com/xtaci/kcp-go" "github.com/xtaci/smux" ) diff --git a/lib/protocol/benchmark_test.go b/lib/protocol/benchmark_test.go index 98de4ff57..e2cfefc03 100644 --- a/lib/protocol/benchmark_test.go +++ b/lib/protocol/benchmark_test.go @@ -8,8 +8,8 @@ import ( "net" "testing" + "github.com/AudriusButkevicius/kcp-go" "github.com/syncthing/syncthing/lib/dialer" - "github.com/xtaci/kcp-go" ) func BenchmarkRequestsRawTCP(b *testing.B) { diff --git a/vendor/github.com/xtaci/kcp-go/LICENSE b/vendor/github.com/AudriusButkevicius/kcp-go/LICENSE similarity index 100% rename from vendor/github.com/xtaci/kcp-go/LICENSE rename to vendor/github.com/AudriusButkevicius/kcp-go/LICENSE diff --git a/vendor/github.com/xtaci/kcp-go/crypt.go b/vendor/github.com/AudriusButkevicius/kcp-go/crypt.go similarity index 94% rename from vendor/github.com/xtaci/kcp-go/crypt.go rename to vendor/github.com/AudriusButkevicius/kcp-go/crypt.go index 2e456b811..7f4759b80 100644 --- a/vendor/github.com/xtaci/kcp-go/crypt.go +++ b/vendor/github.com/AudriusButkevicius/kcp-go/crypt.go @@ -6,6 +6,8 @@ import ( "crypto/des" "crypto/sha1" + "github.com/templexxx/xor" + "golang.org/x/crypto/blowfish" "golang.org/x/crypto/cast5" "golang.org/x/crypto/pbkdf2" @@ -218,8 +220,8 @@ func NewSimpleXORBlockCrypt(key []byte) (BlockCrypt, error) { return c, nil } -func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) } -func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) } +func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xor.Bytes(dst, src, c.xortbl) } +func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xor.Bytes(dst, src, c.xortbl) } type noneBlockCrypt struct{} @@ -239,11 +241,11 @@ func encrypt(block cipher.Block, dst, src, buf []byte) { n := len(src) / blocksize base := 0 for i := 0; i < n; i++ { - xorWords(dst[base:], src[base:], tbl) + xor.BytesSrc1(dst[base:], src[base:], tbl) block.Encrypt(tbl, dst[base:]) base += blocksize } - xorBytes(dst[base:], src[base:], tbl) + xor.BytesSrc0(dst[base:], src[base:], tbl) } func decrypt(block cipher.Block, dst, src, buf []byte) { @@ -255,9 +257,9 @@ func decrypt(block cipher.Block, dst, src, buf []byte) { base := 0 for i := 0; i < n; i++ { block.Encrypt(next, src[base:]) - xorWords(dst[base:], src[base:], tbl) + xor.BytesSrc1(dst[base:], src[base:], tbl) tbl, next = next, tbl base += blocksize } - xorBytes(dst[base:], src[base:], tbl) + xor.BytesSrc0(dst[base:], src[base:], tbl) } diff --git a/vendor/github.com/xtaci/kcp-go/fec.go b/vendor/github.com/AudriusButkevicius/kcp-go/fec.go similarity index 89% rename from vendor/github.com/xtaci/kcp-go/fec.go rename to vendor/github.com/AudriusButkevicius/kcp-go/fec.go index 1b6db3db6..2cfd66024 100644 --- a/vendor/github.com/xtaci/kcp-go/fec.go +++ b/vendor/github.com/AudriusButkevicius/kcp-go/fec.go @@ -22,8 +22,8 @@ type ( data []byte } - // FECDecoder for decoding incoming packets - FECDecoder struct { + // fecDecoder for decoding incoming packets + fecDecoder struct { rxlimit int // queue size limit dataShards int parityShards int @@ -39,7 +39,7 @@ type ( } ) -func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder { +func newFECDecoder(rxlimit, dataShards, parityShards int) *fecDecoder { if dataShards <= 0 || parityShards <= 0 { return nil } @@ -47,7 +47,7 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder { return nil } - fec := new(FECDecoder) + fec := new(fecDecoder) fec.rxlimit = rxlimit fec.dataShards = dataShards fec.parityShards = parityShards @@ -63,7 +63,7 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder { } // decodeBytes a fec packet -func (dec *FECDecoder) decodeBytes(data []byte) fecPacket { +func (dec *fecDecoder) decodeBytes(data []byte) fecPacket { var pkt fecPacket pkt.seqid = binary.LittleEndian.Uint32(data) pkt.flag = binary.LittleEndian.Uint16(data[4:]) @@ -74,8 +74,8 @@ func (dec *FECDecoder) decodeBytes(data []byte) fecPacket { return pkt } -// Decode a fec packet -func (dec *FECDecoder) Decode(pkt fecPacket) (recovered [][]byte) { +// decode a fec packet +func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) { // insertion n := len(dec.rx) - 1 insertIdx := 0 @@ -179,7 +179,7 @@ func (dec *FECDecoder) Decode(pkt fecPacket) (recovered [][]byte) { } // free a range of fecPacket, and zero for GC recycling -func (dec *FECDecoder) freeRange(first, n int, q []fecPacket) []fecPacket { +func (dec *fecDecoder) freeRange(first, n int, q []fecPacket) []fecPacket { for i := first; i < first+n; i++ { // free xmitBuf.Put(q[i].data) } @@ -191,8 +191,8 @@ func (dec *FECDecoder) freeRange(first, n int, q []fecPacket) []fecPacket { } type ( - // FECEncoder for encoding outgoing packets - FECEncoder struct { + // fecEncoder for encoding outgoing packets + fecEncoder struct { dataShards int parityShards int shardSize int @@ -214,11 +214,11 @@ type ( } ) -func newFECEncoder(dataShards, parityShards, offset int) *FECEncoder { +func newFECEncoder(dataShards, parityShards, offset int) *fecEncoder { if dataShards <= 0 || parityShards <= 0 { return nil } - fec := new(FECEncoder) + fec := new(fecEncoder) fec.dataShards = dataShards fec.parityShards = parityShards fec.shardSize = dataShards + parityShards @@ -241,9 +241,9 @@ func newFECEncoder(dataShards, parityShards, offset int) *FECEncoder { return fec } -// Encode the packet, output parity shards if we have enough datashards -// the content of returned parityshards will change in next Encode -func (enc *FECEncoder) Encode(b []byte) (ps [][]byte) { +// encode the packet, output parity shards if we have enough datashards +// the content of returned parityshards will change in next encode +func (enc *fecEncoder) encode(b []byte) (ps [][]byte) { enc.markData(b[enc.headerOffset:]) binary.LittleEndian.PutUint16(b[enc.payloadOffset:], uint16(len(b[enc.payloadOffset:]))) @@ -290,13 +290,13 @@ func (enc *FECEncoder) Encode(b []byte) (ps [][]byte) { return } -func (enc *FECEncoder) markData(data []byte) { +func (enc *fecEncoder) markData(data []byte) { binary.LittleEndian.PutUint32(data, enc.next) binary.LittleEndian.PutUint16(data[4:], typeData) enc.next++ } -func (enc *FECEncoder) markFEC(data []byte) { +func (enc *fecEncoder) markFEC(data []byte) { binary.LittleEndian.PutUint32(data, enc.next) binary.LittleEndian.PutUint16(data[4:], typeFEC) enc.next = (enc.next + 1) % enc.paws diff --git a/vendor/github.com/xtaci/kcp-go/kcp.go b/vendor/github.com/AudriusButkevicius/kcp-go/kcp.go similarity index 96% rename from vendor/github.com/xtaci/kcp-go/kcp.go rename to vendor/github.com/AudriusButkevicius/kcp-go/kcp.go index 7a2109a01..b343adff6 100644 --- a/vendor/github.com/xtaci/kcp-go/kcp.go +++ b/vendor/github.com/AudriusButkevicius/kcp-go/kcp.go @@ -30,8 +30,8 @@ const ( IKCP_PROBE_LIMIT = 120000 // up to 120 secs to probe window ) -// Output is a closure which captures conn and calls conn.Write -type Output func(buf []byte, size int) +// output_callback is a prototype which ought capture conn and call conn.Write +type output_callback func(buf []byte, size int) /* encode 8 bits unsigned int */ func ikcp_encode8u(p []byte, c byte) []byte { @@ -91,8 +91,8 @@ func _itimediff(later, earlier uint32) int32 { return (int32)(later - earlier) } -// Segment defines a KCP segment -type Segment struct { +// segment defines a KCP segment +type segment struct { conv uint32 cmd uint8 frg uint8 @@ -108,11 +108,11 @@ type Segment struct { } // encode a segment into buffer -func (seg *Segment) encode(ptr []byte) []byte { +func (seg *segment) encode(ptr []byte) []byte { ptr = ikcp_encode32u(ptr, seg.conv) - ptr = ikcp_encode8u(ptr, uint8(seg.cmd)) - ptr = ikcp_encode8u(ptr, uint8(seg.frg)) - ptr = ikcp_encode16u(ptr, uint16(seg.wnd)) + ptr = ikcp_encode8u(ptr, seg.cmd) + ptr = ikcp_encode8u(ptr, seg.frg) + ptr = ikcp_encode16u(ptr, seg.wnd) ptr = ikcp_encode32u(ptr, seg.ts) ptr = ikcp_encode32u(ptr, seg.sn) ptr = ikcp_encode32u(ptr, seg.una) @@ -137,15 +137,15 @@ type KCP struct { fastresend int32 nocwnd, stream int32 - snd_queue []Segment - rcv_queue []Segment - snd_buf []Segment - rcv_buf []Segment + snd_queue []segment + rcv_queue []segment + snd_buf []segment + rcv_buf []segment acklist []ackItem buffer []byte - output Output + output output_callback } type ackItem struct { @@ -155,7 +155,7 @@ type ackItem struct { // NewKCP create a new kcp control object, 'conv' must equal in two endpoint // from the same connection. -func NewKCP(conv uint32, output Output) *KCP { +func NewKCP(conv uint32, output output_callback) *KCP { kcp := new(KCP) kcp.conv = conv kcp.snd_wnd = IKCP_WND_SND @@ -175,13 +175,13 @@ func NewKCP(conv uint32, output Output) *KCP { } // newSegment creates a KCP segment -func (kcp *KCP) newSegment(size int) (seg Segment) { +func (kcp *KCP) newSegment(size int) (seg segment) { seg.data = xmitBuf.Get().([]byte)[:size] return } // delSegment recycles a KCP segment -func (kcp *KCP) delSegment(seg Segment) { +func (kcp *KCP) delSegment(seg segment) { xmitBuf.Put(seg.data) } @@ -384,7 +384,7 @@ func (kcp *KCP) parse_ack(sn uint32) { if sn == seg.sn { kcp.delSegment(*seg) copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:]) - kcp.snd_buf[len(kcp.snd_buf)-1] = Segment{} + kcp.snd_buf[len(kcp.snd_buf)-1] = segment{} kcp.snd_buf = kcp.snd_buf[:len(kcp.snd_buf)-1] break } @@ -430,7 +430,7 @@ func (kcp *KCP) ack_push(sn, ts uint32) { kcp.acklist = append(kcp.acklist, ackItem{sn, ts}) } -func (kcp *KCP) parse_data(newseg Segment) { +func (kcp *KCP) parse_data(newseg segment) { sn := newseg.sn if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 || _itimediff(sn, kcp.rcv_nxt) < 0 { @@ -458,7 +458,7 @@ func (kcp *KCP) parse_data(newseg Segment) { if insert_idx == n+1 { kcp.rcv_buf = append(kcp.rcv_buf, newseg) } else { - kcp.rcv_buf = append(kcp.rcv_buf, Segment{}) + kcp.rcv_buf = append(kcp.rcv_buf, segment{}) copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:]) kcp.rcv_buf[insert_idx] = newseg } @@ -625,7 +625,7 @@ func (kcp *KCP) wnd_unused() uint16 { // flush pending data func (kcp *KCP) flush(ackOnly bool) { - var seg Segment + var seg segment seg.conv = kcp.conv seg.cmd = IKCP_CMD_ACK seg.wnd = kcp.wnd_unused() @@ -989,10 +989,10 @@ func (kcp *KCP) WaitSnd() int { } // remove front n elements from queue -func (kcp *KCP) remove_front(q []Segment, n int) []Segment { +func (kcp *KCP) remove_front(q []segment, n int) []segment { newn := copy(q, q[n:]) for i := newn; i < len(q); i++ { - q[i] = Segment{} // manual set nil for GC + q[i] = segment{} // manual set nil for GC } return q[:newn] } diff --git a/vendor/github.com/xtaci/kcp-go/sess.go b/vendor/github.com/AudriusButkevicius/kcp-go/sess.go similarity index 79% rename from vendor/github.com/xtaci/kcp-go/sess.go rename to vendor/github.com/AudriusButkevicius/kcp-go/sess.go index a72e5ca08..d2731754e 100644 --- a/vendor/github.com/xtaci/kcp-go/sess.go +++ b/vendor/github.com/AudriusButkevicius/kcp-go/sess.go @@ -3,6 +3,7 @@ package kcp import ( "crypto/rand" "encoding/binary" + "fmt" "hash/crc32" "io" "net" @@ -54,9 +55,6 @@ var ( // global packet buffer // shared among sending/receiving/FEC xmitBuf sync.Pool - - // monotonic session id - sid uint32 ) func init() { @@ -68,11 +66,12 @@ func init() { type ( // UDPSession defines a KCP session implemented by UDP UDPSession struct { - sid uint32 // session id(monotonic) - conn net.PacketConn // the underlying packet connection - kcp *KCP // KCP ARQ protocol - l *Listener // point to the Listener if it's accepted by Listener - block BlockCrypt // block encryption + updaterIdx int // record slice index in updater + conn net.PacketConn // the underlying packet connection + closeConn bool // Should we close the underlying conn once UDPSession is closed. + kcp *KCP // KCP ARQ protocol + l *Listener // point to the Listener if it's accepted by Listener + block BlockCrypt // block encryption // kcp receiving is based on packets // recvbuf turns packets into stream @@ -82,22 +81,23 @@ type ( ext []byte // FEC - fecDecoder *FECDecoder - fecEncoder *FECEncoder + fecDecoder *fecDecoder + fecEncoder *fecEncoder // settings - remote net.Addr // remote peer address - rd time.Time // read deadline - wd time.Time // write deadline - headerSize int // the overall header size added before KCP frame - updateInterval time.Duration // interval in seconds to call kcp.flush() - ackNoDelay bool // send ack immediately for each incoming packet - writeDelay bool // delay kcp.flush() for Write() for bulk transfer + remote net.Addr // remote peer address + rd time.Time // read deadline + wd time.Time // write deadline + headerSize int // the overall header size added before KCP frame + ackNoDelay bool // send ack immediately for each incoming packet + writeDelay bool // delay kcp.flush() for Write() for bulk transfer + dup int // duplicate udp packets // notifications die chan struct{} // notify session has Closed chReadEvent chan struct{} // notify Read() can be called without blocking chWriteEvent chan struct{} // notify Write() can be called without blocking + chErrorEvent chan error // notify Read() have an error isClosed bool // flag the session has Closed mu sync.Mutex @@ -113,14 +113,15 @@ type ( ) // newUDPSession create a new udp session for client or server -func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt) *UDPSession { +func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt, closeConn bool) *UDPSession { sess := new(UDPSession) - sess.sid = atomic.AddUint32(&sid, 1) sess.die = make(chan struct{}) sess.chReadEvent = make(chan struct{}, 1) sess.chWriteEvent = make(chan struct{}, 1) + sess.chErrorEvent = make(chan error, 1) sess.remote = remote sess.conn = conn + sess.closeConn = closeConn sess.l = l sess.block = block sess.recvbuf = make([]byte, mtuLimit) @@ -232,6 +233,11 @@ func (s *UDPSession) Read(b []byte) (n int, err error) { case <-s.chReadEvent: case <-c: case <-s.die: + case err = <-s.chErrorEvent: + if timeout != nil { + timeout.Stop() + } + return n, err } if timeout != nil { @@ -299,9 +305,11 @@ func (s *UDPSession) Write(b []byte) (n int, err error) { // Close closes the connection. func (s *UDPSession) Close() error { + // remove this session from updater & listener(if necessary) updater.removeSession(s) if s.l != nil { // notify listener - s.l.closeSession(s.remote) + key := fmt.Sprintf("%s/%d", s.remote.String(), s.kcp.conv) + s.l.closeSession(key) } s.mu.Lock() @@ -312,7 +320,7 @@ func (s *UDPSession) Close() error { close(s.die) s.isClosed = true atomic.AddUint64(&DefaultSnmp.CurrEstab, ^uint64(0)) - if s.l == nil { // client socket close + if s.l == nil && s.closeConn { // client socket close return s.conn.Close() } return nil @@ -393,12 +401,19 @@ func (s *UDPSession) SetACKNoDelay(nodelay bool) { s.ackNoDelay = nodelay } +// SetDUP duplicates udp packets for kcp output, for testing purpose only +func (s *UDPSession) SetDUP(dup int) { + s.mu.Lock() + defer s.mu.Unlock() + s.dup = dup +} + // SetNoDelay calls nodelay() of kcp +// https://github.com/skywind3000/kcp/blob/master/README.en.md#protocol-configuration func (s *UDPSession) SetNoDelay(nodelay, interval, resend, nc int) { s.mu.Lock() defer s.mu.Unlock() s.kcp.NoDelay(nodelay, interval, resend, nc) - s.updateInterval = time.Duration(interval) * time.Millisecond } // SetDSCP sets the 6bit DSCP field of IP header, no effect if it's accepted from Listener @@ -406,8 +421,8 @@ func (s *UDPSession) SetDSCP(dscp int) error { s.mu.Lock() defer s.mu.Unlock() if s.l == nil { - if nc, ok := s.conn.(*ConnectedUDPConn); ok { - return ipv4.NewConn(nc.Conn).SetTOS(dscp << 2) + if nc, ok := s.conn.(*connectedUDPConn); ok { + return ipv4.NewConn(nc.UDPConn).SetTOS(dscp << 2) } else if nc, ok := s.conn.(net.Conn); ok { return ipv4.NewConn(nc).SetTOS(dscp << 2) } @@ -449,51 +464,47 @@ func (s *UDPSession) SetWriteBuffer(bytes int) error { func (s *UDPSession) output(buf []byte) { var ecc [][]byte - // extend buf's header space + // 0. extend buf's header space(if necessary) ext := buf if s.headerSize > 0 { ext = s.ext[:s.headerSize+len(buf)] copy(ext[s.headerSize:], buf) } - // FEC stage + // 1. FEC encoding if s.fecEncoder != nil { - ecc = s.fecEncoder.Encode(ext) + ecc = s.fecEncoder.encode(ext) } - // encryption stage + // 2&3. crc32 & encryption if s.block != nil { io.ReadFull(rand.Reader, ext[:nonceSize]) checksum := crc32.ChecksumIEEE(ext[cryptHeaderSize:]) binary.LittleEndian.PutUint32(ext[nonceSize:], checksum) s.block.Encrypt(ext, ext) - if ecc != nil { - for k := range ecc { - io.ReadFull(rand.Reader, ecc[k][:nonceSize]) - checksum := crc32.ChecksumIEEE(ecc[k][cryptHeaderSize:]) - binary.LittleEndian.PutUint32(ecc[k][nonceSize:], checksum) - s.block.Encrypt(ecc[k], ecc[k]) - } + for k := range ecc { + io.ReadFull(rand.Reader, ecc[k][:nonceSize]) + checksum := crc32.ChecksumIEEE(ecc[k][cryptHeaderSize:]) + binary.LittleEndian.PutUint32(ecc[k][nonceSize:], checksum) + s.block.Encrypt(ecc[k], ecc[k]) } } - // WriteTo kernel + // 4. WriteTo kernel nbytes := 0 npkts := 0 - // if mrand.Intn(100) < 50 { - if n, err := s.conn.WriteTo(ext, s.remote); err == nil { - nbytes += n - npkts++ + for i := 0; i < s.dup+1; i++ { + if n, err := s.conn.WriteTo(ext, s.remote); err == nil { + nbytes += n + npkts++ + } } - // } - if ecc != nil { - for k := range ecc { - if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil { - nbytes += n - npkts++ - } + for k := range ecc { + if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil { + nbytes += n + npkts++ } } atomic.AddUint64(&DefaultSnmp.OutPkts, uint64(npkts)) @@ -507,15 +518,13 @@ func (s *UDPSession) update() (interval time.Duration) { if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) { s.notifyWriteEvent() } - interval = s.updateInterval + interval = time.Duration(s.kcp.interval) * time.Millisecond s.mu.Unlock() return } // GetConv gets conversation id of a session -func (s *UDPSession) GetConv() uint32 { - return s.kcp.conv -} +func (s *UDPSession) GetConv() uint32 { return s.kcp.conv } func (s *UDPSession) notifyReadEvent() { select { @@ -548,22 +557,21 @@ func (s *UDPSession) kcpInput(data []byte) { fecParityShards++ } - if recovers := s.fecDecoder.Decode(f); recovers != nil { - for _, r := range recovers { - if len(r) >= 2 { // must be larger than 2bytes - sz := binary.LittleEndian.Uint16(r) - if int(sz) <= len(r) && sz >= 2 { - if ret := s.kcp.Input(r[2:sz], false, s.ackNoDelay); ret == 0 { - fecRecovered++ - } else { - kcpInErrors++ - } + recovers := s.fecDecoder.decode(f) + for _, r := range recovers { + if len(r) >= 2 { // must be larger than 2bytes + sz := binary.LittleEndian.Uint16(r) + if int(sz) <= len(r) && sz >= 2 { + if ret := s.kcp.Input(r[2:sz], false, s.ackNoDelay); ret == 0 { + fecRecovered++ } else { - fecErrs++ + kcpInErrors++ } } else { fecErrs++ } + } else { + fecErrs++ } } } @@ -601,7 +609,7 @@ func (s *UDPSession) kcpInput(data []byte) { } } -func (s *UDPSession) receiver(ch chan []byte) { +func (s *UDPSession) receiver(ch chan<- []byte) { for { data := xmitBuf.Get().([]byte)[:mtuLimit] if n, _, err := s.conn.ReadFrom(data); err == nil && n >= s.headerSize+IKCP_OVERHEAD { @@ -611,6 +619,7 @@ func (s *UDPSession) receiver(ch chan []byte) { return } } else if err != nil { + s.chErrorEvent <- err return } else { atomic.AddUint64(&DefaultSnmp.InErrs, 1) @@ -658,12 +667,12 @@ type ( block BlockCrypt // block encryption dataShards int // FEC data shard parityShards int // FEC parity shard - fecDecoder *FECDecoder // FEC mock initialization + fecDecoder *fecDecoder // FEC mock initialization conn net.PacketConn // the underlying packet connection sessions map[string]*UDPSession // all sessions accepted by this Listener chAccepts chan *UDPSession // Listen() backlog - chSessionClosed chan net.Addr // session close queue + chSessionClosed chan string // session close queue headerSize int // the overall header size added before KCP frame die chan struct{} // notify the listener has closed rd atomic.Value // read deadline for Accept() @@ -679,6 +688,10 @@ type ( // monitor incoming data for all connections of server func (l *Listener) monitor() { + // cache last session + var lastKey string + var lastSession *UDPSession + chPacket := make(chan inPacket, qlen) go l.receiver(chPacket) for { @@ -703,45 +716,60 @@ func (l *Listener) monitor() { } if dataValid { - addr := from.String() - s, ok := l.sessions[addr] - if !ok { // new session - if len(l.chAccepts) < cap(l.chAccepts) { // do not let new session overwhelm accept queue - var conv uint32 - convValid := false - if l.fecDecoder != nil { - isfec := binary.LittleEndian.Uint16(data[4:]) - if isfec == typeData { - conv = binary.LittleEndian.Uint32(data[fecHeaderSizePlus2:]) - convValid = true - } - } else { - conv = binary.LittleEndian.Uint32(data) - convValid = true - } - - if convValid { - s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block) - s.kcpInput(data) - l.sessions[addr] = s - l.chAccepts <- s - } + var conv uint32 + convValid := false + if l.fecDecoder != nil { + isfec := binary.LittleEndian.Uint16(data[4:]) + if isfec == typeData { + conv = binary.LittleEndian.Uint32(data[fecHeaderSizePlus2:]) + convValid = true } } else { - s.kcpInput(data) + conv = binary.LittleEndian.Uint32(data) + convValid = true + } + + if convValid { + addr := from.String() + key := fmt.Sprintf("%s/%d", addr, conv) + var s *UDPSession + var ok bool + + // packets received from an address always come in batch. + // cache the session for next packet, without querying map. + if key == lastKey { + s, ok = lastSession, true + } else if s, ok = l.sessions[key]; ok { + lastSession = s + lastKey = addr + } + + if !ok { // new session + if len(l.chAccepts) < cap(l.chAccepts) && len(l.sessions) < 4096 { // do not let new session overwhelm accept queue and connection count + s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block, false) + s.kcpInput(data) + l.sessions[key] = s + l.chAccepts <- s + } + } else { + s.kcpInput(data) + } } } xmitBuf.Put(raw) - case deadlink := <-l.chSessionClosed: - delete(l.sessions, deadlink.String()) + case key := <-l.chSessionClosed: + if key == lastKey { + lastKey = "" + } + delete(l.sessions, key) case <-l.die: return } } } -func (l *Listener) receiver(ch chan inPacket) { +func (l *Listener) receiver(ch chan<- inPacket) { for { data := xmitBuf.Get().([]byte)[:mtuLimit] if n, from, err := l.conn.ReadFrom(data); err == nil && n >= l.headerSize+IKCP_OVERHEAD { @@ -830,9 +858,9 @@ func (l *Listener) Close() error { } // closeSession notify the listener that a session has closed -func (l *Listener) closeSession(remote net.Addr) bool { +func (l *Listener) closeSession(key string) bool { select { - case l.chSessionClosed <- remote: + case l.chSessionClosed <- key: return true case <-l.die: return false @@ -840,14 +868,10 @@ func (l *Listener) closeSession(remote net.Addr) bool { } // Addr returns the listener's network address, The Addr returned is shared by all invocations of Addr, so do not modify it. -func (l *Listener) Addr() net.Addr { - return l.conn.LocalAddr() -} +func (l *Listener) Addr() net.Addr { return l.conn.LocalAddr() } // Listen listens for incoming KCP packets addressed to the local address laddr on the network "udp", -func Listen(laddr string) (net.Listener, error) { - return ListenWithOptions(laddr, nil, 0, 0) -} +func Listen(laddr string) (net.Listener, error) { return ListenWithOptions(laddr, nil, 0, 0) } // ListenWithOptions listens for incoming KCP packets addressed to the local address laddr on the network "udp" with packet encryption, // dataShards, parityShards defines Reed-Solomon Erasure Coding parameters @@ -870,7 +894,7 @@ func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketCo l.conn = conn l.sessions = make(map[string]*UDPSession) l.chAccepts = make(chan *UDPSession, acceptBacklog) - l.chSessionClosed = make(chan net.Addr) + l.chSessionClosed = make(chan string) l.die = make(chan struct{}) l.dataShards = dataShards l.parityShards = parityShards @@ -890,9 +914,7 @@ func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketCo } // Dial connects to the remote address "raddr" on the network "udp" -func Dial(raddr string) (net.Conn, error) { - return DialWithOptions(raddr, nil, 0, 0) -} +func Dial(raddr string) (net.Conn, error) { return DialWithOptions(raddr, nil, 0, 0) } // DialWithOptions connects to the remote address "raddr" on the network "udp" with packet encryption func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int) (*UDPSession, error) { @@ -906,11 +928,11 @@ func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards in return nil, errors.Wrap(err, "net.DialUDP") } - return NewConn(raddr, block, dataShards, parityShards, &ConnectedUDPConn{udpconn, udpconn}) + return NewConn(raddr, block, dataShards, parityShards, &connectedUDPConn{udpconn}, true) } // NewConn establishes a session and talks KCP protocol over a packet connection. -func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*UDPSession, error) { +func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn, closeConn bool) (*UDPSession, error) { udpaddr, err := net.ResolveUDPAddr("udp", raddr) if err != nil { return nil, errors.Wrap(err, "net.ResolveUDPAddr") @@ -918,22 +940,16 @@ func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn var convid uint32 binary.Read(rand.Reader, binary.LittleEndian, &convid) - return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block), nil + return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block, closeConn), nil } -func currentMs() uint32 { - return uint32(time.Now().UnixNano() / int64(time.Millisecond)) -} +// returns current time in milliseconds +func currentMs() uint32 { return uint32(time.Now().UnixNano() / int64(time.Millisecond)) } -// ConnectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls +// connectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls // to Write syscalls that are 4 times faster on some OS'es. This should only be // used for connections that were produced by a net.Dial* call. -type ConnectedUDPConn struct { - *net.UDPConn - Conn net.Conn // underlying connection if any -} +type connectedUDPConn struct{ *net.UDPConn } // WriteTo redirects all writes to the Write syscall, which is 4 times faster. -func (c *ConnectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { - return c.Write(b) -} +func (c *connectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { return c.Write(b) } diff --git a/vendor/github.com/xtaci/kcp-go/snmp.go b/vendor/github.com/AudriusButkevicius/kcp-go/snmp.go similarity index 100% rename from vendor/github.com/xtaci/kcp-go/snmp.go rename to vendor/github.com/AudriusButkevicius/kcp-go/snmp.go diff --git a/vendor/github.com/xtaci/kcp-go/updater.go b/vendor/github.com/AudriusButkevicius/kcp-go/updater.go similarity index 82% rename from vendor/github.com/xtaci/kcp-go/updater.go rename to vendor/github.com/AudriusButkevicius/kcp-go/updater.go index 5bb913d8d..a5bfc6654 100644 --- a/vendor/github.com/xtaci/kcp-go/updater.go +++ b/vendor/github.com/AudriusButkevicius/kcp-go/updater.go @@ -15,15 +15,13 @@ func init() { // entry contains a session update info type entry struct { - sid uint32 - ts time.Time - s *UDPSession + ts time.Time + s *UDPSession } // a global heap managed kcp.flush() caller type updateHeap struct { entries []entry - indices map[uint32]int mu sync.Mutex chWakeUp chan struct{} } @@ -32,41 +30,40 @@ func (h *updateHeap) Len() int { return len(h.entries) } func (h *updateHeap) Less(i, j int) bool { return h.entries[i].ts.Before(h.entries[j].ts) } func (h *updateHeap) Swap(i, j int) { h.entries[i], h.entries[j] = h.entries[j], h.entries[i] - h.indices[h.entries[i].sid] = i - h.indices[h.entries[j].sid] = j + h.entries[i].s.updaterIdx = i + h.entries[j].s.updaterIdx = j } func (h *updateHeap) Push(x interface{}) { h.entries = append(h.entries, x.(entry)) n := len(h.entries) - h.indices[h.entries[n-1].sid] = n - 1 + h.entries[n-1].s.updaterIdx = n - 1 } func (h *updateHeap) Pop() interface{} { n := len(h.entries) x := h.entries[n-1] + h.entries[n-1].s.updaterIdx = -1 h.entries[n-1] = entry{} // manual set nil for GC h.entries = h.entries[0 : n-1] - delete(h.indices, x.sid) return x } func (h *updateHeap) init() { - h.indices = make(map[uint32]int) h.chWakeUp = make(chan struct{}, 1) } func (h *updateHeap) addSession(s *UDPSession) { h.mu.Lock() - heap.Push(h, entry{s.sid, time.Now(), s}) + heap.Push(h, entry{time.Now(), s}) h.mu.Unlock() h.wakeup() } func (h *updateHeap) removeSession(s *UDPSession) { h.mu.Lock() - if idx, ok := h.indices[s.sid]; ok { - heap.Remove(h, idx) + if s.updaterIdx != -1 { + heap.Remove(h, s.updaterIdx) } h.mu.Unlock() } @@ -99,7 +96,8 @@ func (h *updateHeap) updateTask() { break } } - if h.Len() > 0 { + + if hlen > 0 { timer = time.After(h.entries[0].ts.Sub(now)) } h.mu.Unlock() diff --git a/vendor/github.com/xtaci/kcp-go/xor.go b/vendor/github.com/AudriusButkevicius/kcp-go/xor.go similarity index 100% rename from vendor/github.com/xtaci/kcp-go/xor.go rename to vendor/github.com/AudriusButkevicius/kcp-go/xor.go diff --git a/vendor/github.com/templexxx/xor/LICENSE b/vendor/github.com/templexxx/xor/LICENSE new file mode 100644 index 000000000..e2c290c56 --- /dev/null +++ b/vendor/github.com/templexxx/xor/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Temple3x + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/templexxx/xor/avx2_amd64.s b/vendor/github.com/templexxx/xor/avx2_amd64.s new file mode 100644 index 000000000..1c8fe3046 --- /dev/null +++ b/vendor/github.com/templexxx/xor/avx2_amd64.s @@ -0,0 +1,442 @@ +#include "textflag.h" + +// addr of mem +#define DST BX +#define SRC SI +#define SRC0 TMP4 +#define SRC1 TMP5 + +// loop args +// num of vect +#define VECT CX +#define LEN DX +// pos of matrix +#define POS R8 + +// tmp store +// num of vect or ... +#define TMP1 R9 +// pos of matrix or ... +#define TMP2 R10 +// store addr of data/parity or ... +#define TMP3 R11 +#define TMP4 R12 +#define TMP5 R13 +#define TMP6 R14 + +// func bytesAVX2mini(dst, src0, src1 []byte, size int) +TEXT ·bytesAVX2mini(SB), NOSPLIT, $0 + MOVQ len+72(FP), LEN + CMPQ LEN, $0 + JE ret + MOVQ dst+0(FP), DST + MOVQ src0+24(FP), SRC0 + MOVQ src1+48(FP), SRC1 + TESTQ $31, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop32b: + VMOVDQU (SRC0)(POS*1), Y0 + VPXOR (SRC1)(POS*1), Y0, Y0 + VMOVDQU Y0, (DST)(POS*1) + ADDQ $32, POS + CMPQ LEN, POS + JNE loop32b + RET + +loop_1b: + MOVB -1(SRC0)(LEN*1), TMP1 + MOVB -1(SRC1)(LEN*1), TMP2 + XORB TMP1, TMP2 + MOVB TMP2, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + CMPQ LEN, $0 + JE ret + TESTQ $31, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP1 + ANDQ $31, TMP1 + +loop_8b: + MOVQ -8(SRC0)(LEN*1), TMP2 + MOVQ -8(SRC1)(LEN*1), TMP3 + XORQ TMP2, TMP3 + MOVQ TMP3, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP1 + JG loop_8b + + CMPQ LEN, $32 + JGE aligned + RET + +ret: + RET + +// func bytesAVX2small(dst, src0, src1 []byte, size int) +TEXT ·bytesAVX2small(SB), NOSPLIT, $0 + MOVQ len+72(FP), LEN + CMPQ LEN, $0 + JE ret + MOVQ dst+0(FP), DST + MOVQ src0+24(FP), SRC0 + MOVQ src1+48(FP), SRC1 + TESTQ $127, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop128b: + VMOVDQU (SRC0)(POS*1), Y0 + VMOVDQU 32(SRC0)(POS*1), Y1 + VMOVDQU 64(SRC0)(POS*1), Y2 + VMOVDQU 96(SRC0)(POS*1), Y3 + VPXOR (SRC1)(POS*1), Y0, Y0 + VPXOR 32(SRC1)(POS*1), Y1, Y1 + VPXOR 64(SRC1)(POS*1), Y2, Y2 + VPXOR 96(SRC1)(POS*1), Y3, Y3 + VMOVDQU Y0, (DST)(POS*1) + VMOVDQU Y1, 32(DST)(POS*1) + VMOVDQU Y2, 64(DST)(POS*1) + VMOVDQU Y3, 96(DST)(POS*1) + + ADDQ $128, POS + CMPQ LEN, POS + JNE loop128b + RET + +loop_1b: + MOVB -1(SRC0)(LEN*1), TMP1 + MOVB -1(SRC1)(LEN*1), TMP2 + XORB TMP1, TMP2 + MOVB TMP2, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + CMPQ LEN, $0 + JE ret + TESTQ $127, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP1 + ANDQ $127, TMP1 + +loop_8b: + MOVQ -8(SRC0)(LEN*1), TMP2 + MOVQ -8(SRC1)(LEN*1), TMP3 + XORQ TMP2, TMP3 + MOVQ TMP3, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP1 + JG loop_8b + + CMPQ LEN, $128 + JGE aligned + RET + +ret: + RET + +// func bytesAVX2big(dst, src0, src1 []byte, size int) +TEXT ·bytesAVX2big(SB), NOSPLIT, $0 + MOVQ len+72(FP), LEN + CMPQ LEN, $0 + JE ret + MOVQ dst+0(FP), DST + MOVQ src0+24(FP), SRC0 + MOVQ src1+48(FP), SRC1 + TESTQ $127, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop128b: + VMOVDQU (SRC0)(POS*1), Y0 + VMOVDQU 32(SRC0)(POS*1), Y1 + VMOVDQU 64(SRC0)(POS*1), Y2 + VMOVDQU 96(SRC0)(POS*1), Y3 + VPXOR (SRC1)(POS*1), Y0, Y0 + VPXOR 32(SRC1)(POS*1), Y1, Y1 + VPXOR 64(SRC1)(POS*1), Y2, Y2 + VPXOR 96(SRC1)(POS*1), Y3, Y3 + LONG $0xe77da1c4; WORD $0x0304 + LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 + LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 + LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 + + ADDQ $128, POS + CMPQ LEN, POS + JNE loop128b + SFENCE + RET + +loop_1b: + MOVB -1(SRC0)(LEN*1), TMP1 + MOVB -1(SRC1)(LEN*1), TMP2 + XORB TMP1, TMP2 + MOVB TMP2, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + CMPQ LEN, $0 + JE ret + TESTQ $127, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP1 + ANDQ $127, TMP1 + +loop_8b: + MOVQ -8(SRC0)(LEN*1), TMP2 + MOVQ -8(SRC1)(LEN*1), TMP3 + XORQ TMP2, TMP3 + MOVQ TMP3, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP1 + JG loop_8b + + CMPQ LEN, $128 + JGE aligned + RET + +ret: + RET + +// func matrixAVX2small(dst []byte, src [][]byte) +TEXT ·matrixAVX2small(SB), NOSPLIT, $0 + MOVQ dst+0(FP), DST + MOVQ src+24(FP), SRC + MOVQ vec+32(FP), VECT + MOVQ len+8(FP), LEN + TESTQ $127, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop128b: + MOVQ VECT, TMP1 + SUBQ $2, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ TMP3, TMP4 + VMOVDQU (TMP3)(POS*1), Y0 + VMOVDQU 32(TMP4)(POS*1), Y1 + VMOVDQU 64(TMP3)(POS*1), Y2 + VMOVDQU 96(TMP4)(POS*1), Y3 + +next_vect: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ TMP3, TMP4 + VMOVDQU (TMP3)(POS*1), Y4 + VMOVDQU 32(TMP4)(POS*1), Y5 + VMOVDQU 64(TMP3)(POS*1), Y6 + VMOVDQU 96(TMP4)(POS*1), Y7 + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + SUBQ $1, TMP1 + JGE next_vect + + VMOVDQU Y0, (DST)(POS*1) + VMOVDQU Y1, 32(DST)(POS*1) + VMOVDQU Y2, 64(DST)(POS*1) + VMOVDQU Y3, 96(DST)(POS*1) + + ADDQ $128, POS + CMPQ LEN, POS + JNE loop128b + RET + +loop_1b: + MOVQ VECT, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + SUBQ $2, TMP1 + MOVB -1(TMP3)(LEN*1), TMP5 + +next_vect_1b: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVB -1(TMP3)(LEN*1), TMP6 + XORB TMP6, TMP5 + SUBQ $1, TMP1 + JGE next_vect_1b + + MOVB TMP5, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + + CMPQ LEN, $0 + JE ret + TESTQ $127, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP4 + ANDQ $127, TMP4 + +loop_8b: + MOVQ VECT, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + SUBQ $2, TMP1 + MOVQ -8(TMP3)(LEN*1), TMP5 + +next_vect_8b: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ -8(TMP3)(LEN*1), TMP6 + XORQ TMP6, TMP5 + SUBQ $1, TMP1 + JGE next_vect_8b + + MOVQ TMP5, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP4 + JG loop_8b + + CMPQ LEN, $128 + JGE aligned + RET + +ret: + RET + +// func matrixAVX2big(dst []byte, src [][]byte) +TEXT ·matrixAVX2big(SB), NOSPLIT, $0 + MOVQ dst+0(FP), DST + MOVQ src+24(FP), SRC + MOVQ vec+32(FP), VECT + MOVQ len+8(FP), LEN + TESTQ $127, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop128b: + MOVQ VECT, TMP1 + SUBQ $2, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ TMP3, TMP4 + VMOVDQU (TMP3)(POS*1), Y0 + VMOVDQU 32(TMP4)(POS*1), Y1 + VMOVDQU 64(TMP3)(POS*1), Y2 + VMOVDQU 96(TMP4)(POS*1), Y3 + +next_vect: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ TMP3, TMP4 + VMOVDQU (TMP3)(POS*1), Y4 + VMOVDQU 32(TMP4)(POS*1), Y5 + VMOVDQU 64(TMP3)(POS*1), Y6 + VMOVDQU 96(TMP4)(POS*1), Y7 + VPXOR Y4, Y0, Y0 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + SUBQ $1, TMP1 + JGE next_vect + + LONG $0xe77da1c4; WORD $0x0304 // VMOVNTDQ go1.8 has + LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 + LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 + LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 + + ADDQ $128, POS + CMPQ LEN, POS + JNE loop128b + RET + +loop_1b: + MOVQ VECT, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + SUBQ $2, TMP1 + MOVB -1(TMP3)(LEN*1), TMP5 + +next_vect_1b: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVB -1(TMP3)(LEN*1), TMP6 + XORB TMP6, TMP5 + SUBQ $1, TMP1 + JGE next_vect_1b + + MOVB TMP5, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + + CMPQ LEN, $0 + JE ret + TESTQ $127, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP4 + ANDQ $127, TMP4 + +loop_8b: + MOVQ VECT, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + SUBQ $2, TMP1 + MOVQ -8(TMP3)(LEN*1), TMP5 + +next_vect_8b: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ -8(TMP3)(LEN*1), TMP6 + XORQ TMP6, TMP5 + SUBQ $1, TMP1 + JGE next_vect_8b + + MOVQ TMP5, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP4 + JG loop_8b + + CMPQ LEN, $128 + JGE aligned + RET + +ret: + RET + +TEXT ·hasAVX2(SB), NOSPLIT, $0 + XORQ AX, AX + XORQ CX, CX + ADDL $7, AX + CPUID + SHRQ $5, BX + ANDQ $1, BX + MOVB BX, ret+0(FP) + RET diff --git a/vendor/github.com/templexxx/xor/nosimd.go b/vendor/github.com/templexxx/xor/nosimd.go new file mode 100644 index 000000000..ebd7b754b --- /dev/null +++ b/vendor/github.com/templexxx/xor/nosimd.go @@ -0,0 +1,116 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xor + +import ( + "runtime" + "unsafe" +) + +const wordSize = int(unsafe.Sizeof(uintptr(0))) +const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" + +// xor the bytes in a and b. The destination is assumed to have enough space. +func bytesNoSIMD(dst, a, b []byte, size int) { + if supportsUnaligned { + fastXORBytes(dst, a, b, size) + } else { + // TODO(hanwen): if (dst, a, b) have common alignment + // we could still try fastXORBytes. It is not clear + // how often this happens, and it's only worth it if + // the block encryption itself is hardware + // accelerated. + safeXORBytes(dst, a, b, size) + } +} + +// split slice for cache-friendly +const unitSize = 16 * 1024 + +func matrixNoSIMD(dst []byte, src [][]byte) { + size := len(src[0]) + start := 0 + do := unitSize + for start < size { + end := start + do + if end <= size { + partNoSIMD(start, end, dst, src) + start = start + do + } else { + partNoSIMD(start, size, dst, src) + start = size + } + } +} + +// split vect will improve performance with big data by reducing cache pollution +func partNoSIMD(start, end int, dst []byte, src [][]byte) { + bytesNoSIMD(dst[start:end], src[0][start:end], src[1][start:end], end-start) + for i := 2; i < len(src); i++ { + bytesNoSIMD(dst[start:end], dst[start:end], src[i][start:end], end-start) + } +} + +// fastXORBytes xor in bulk. It only works on architectures that +// support unaligned read/writes. +func fastXORBytes(dst, a, b []byte, n int) { + w := n / wordSize + if w > 0 { + wordBytes := w * wordSize + fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes]) + } + for i := n - n%wordSize; i < n; i++ { + dst[i] = a[i] ^ b[i] + } +} + +func safeXORBytes(dst, a, b []byte, n int) { + ex := n % 8 + for i := 0; i < ex; i++ { + dst[i] = a[i] ^ b[i] + } + + for i := ex; i < n; i += 8 { + _dst := dst[i : i+8] + _a := a[i : i+8] + _b := b[i : i+8] + _dst[0] = _a[0] ^ _b[0] + _dst[1] = _a[1] ^ _b[1] + _dst[2] = _a[2] ^ _b[2] + _dst[3] = _a[3] ^ _b[3] + + _dst[4] = _a[4] ^ _b[4] + _dst[5] = _a[5] ^ _b[5] + _dst[6] = _a[6] ^ _b[6] + _dst[7] = _a[7] ^ _b[7] + } +} + +// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.) +// The arguments are assumed to be of equal length. +func fastXORWords(dst, a, b []byte) { + dw := *(*[]uintptr)(unsafe.Pointer(&dst)) + aw := *(*[]uintptr)(unsafe.Pointer(&a)) + bw := *(*[]uintptr)(unsafe.Pointer(&b)) + n := len(b) / wordSize + ex := n % 8 + for i := 0; i < ex; i++ { + dw[i] = aw[i] ^ bw[i] + } + + for i := ex; i < n; i += 8 { + _dw := dw[i : i+8] + _aw := aw[i : i+8] + _bw := bw[i : i+8] + _dw[0] = _aw[0] ^ _bw[0] + _dw[1] = _aw[1] ^ _bw[1] + _dw[2] = _aw[2] ^ _bw[2] + _dw[3] = _aw[3] ^ _bw[3] + _dw[4] = _aw[4] ^ _bw[4] + _dw[5] = _aw[5] ^ _bw[5] + _dw[6] = _aw[6] ^ _bw[6] + _dw[7] = _aw[7] ^ _bw[7] + } +} diff --git a/vendor/github.com/templexxx/xor/sse2_amd64.s b/vendor/github.com/templexxx/xor/sse2_amd64.s new file mode 100644 index 000000000..d7f702b6b --- /dev/null +++ b/vendor/github.com/templexxx/xor/sse2_amd64.s @@ -0,0 +1,574 @@ +#include "textflag.h" + +// addr of mem +#define DST BX +#define SRC SI +#define SRC0 TMP4 +#define SRC1 TMP5 + +// loop args +// num of vect +#define VECT CX +#define LEN DX +// pos of matrix +#define POS R8 + +// tmp store +// num of vect or ... +#define TMP1 R9 +// pos of matrix or ... +#define TMP2 R10 +// store addr of data/parity or ... +#define TMP3 R11 +#define TMP4 R12 +#define TMP5 R13 +#define TMP6 R14 + +// func bytesSrc0(dst, src0, src1 []byte) +TEXT ·xorSrc0(SB), NOSPLIT, $0 + MOVQ len+32(FP), LEN + CMPQ LEN, $0 + JE ret + MOVQ dst+0(FP), DST + MOVQ src0+24(FP), SRC0 + MOVQ src1+48(FP), SRC1 + TESTQ $15, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop16b: + MOVOU (SRC0)(POS*1), X0 + XORPD (SRC1)(POS*1), X0 + MOVOU X0, (DST)(POS*1) + ADDQ $16, POS + CMPQ LEN, POS + JNE loop16b + RET + +loop_1b: + MOVB -1(SRC0)(LEN*1), TMP1 + MOVB -1(SRC1)(LEN*1), TMP2 + XORB TMP1, TMP2 + MOVB TMP2, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + CMPQ LEN, $0 + JE ret + TESTQ $15, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP1 + ANDQ $15, TMP1 + +loop_8b: + MOVQ -8(SRC0)(LEN*1), TMP2 + MOVQ -8(SRC1)(LEN*1), TMP3 + XORQ TMP2, TMP3 + MOVQ TMP3, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP1 + JG loop_8b + + CMPQ LEN, $16 + JGE aligned + RET + +ret: + RET + +// func bytesSrc1(dst, src0, src1 []byte) +TEXT ·xorSrc1(SB), NOSPLIT, $0 + MOVQ len+56(FP), LEN + CMPQ LEN, $0 + JE ret + MOVQ dst+0(FP), DST + MOVQ src0+24(FP), SRC0 + MOVQ src1+48(FP), SRC1 + TESTQ $15, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop16b: + MOVOU (SRC0)(POS*1), X0 + XORPD (SRC1)(POS*1), X0 + MOVOU X0, (DST)(POS*1) + ADDQ $16, POS + CMPQ LEN, POS + JNE loop16b + RET + +loop_1b: + MOVB -1(SRC0)(LEN*1), TMP1 + MOVB -1(SRC1)(LEN*1), TMP2 + XORB TMP1, TMP2 + MOVB TMP2, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + CMPQ LEN, $0 + JE ret + TESTQ $15, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP1 + ANDQ $15, TMP1 + +loop_8b: + MOVQ -8(SRC0)(LEN*1), TMP2 + MOVQ -8(SRC1)(LEN*1), TMP3 + XORQ TMP2, TMP3 + MOVQ TMP3, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP1 + JG loop_8b + + CMPQ LEN, $16 + JGE aligned + RET + +ret: + RET + +// func bytesSSE2mini(dst, src0, src1 []byte, size int) +TEXT ·bytesSSE2mini(SB), NOSPLIT, $0 + MOVQ len+72(FP), LEN + CMPQ LEN, $0 + JE ret + MOVQ dst+0(FP), DST + MOVQ src0+24(FP), SRC0 + MOVQ src1+48(FP), SRC1 + TESTQ $15, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop16b: + MOVOU (SRC0)(POS*1), X0 + XORPD (SRC1)(POS*1), X0 + + // MOVOU (SRC1)(POS*1), X4 + // PXOR X4, X0 + MOVOU X0, (DST)(POS*1) + ADDQ $16, POS + CMPQ LEN, POS + JNE loop16b + RET + +loop_1b: + MOVB -1(SRC0)(LEN*1), TMP1 + MOVB -1(SRC1)(LEN*1), TMP2 + XORB TMP1, TMP2 + MOVB TMP2, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + CMPQ LEN, $0 + JE ret + TESTQ $15, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP1 + ANDQ $15, TMP1 + +loop_8b: + MOVQ -8(SRC0)(LEN*1), TMP2 + MOVQ -8(SRC1)(LEN*1), TMP3 + XORQ TMP2, TMP3 + MOVQ TMP3, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP1 + JG loop_8b + + CMPQ LEN, $16 + JGE aligned + RET + +ret: + RET + +// func bytesSSE2small(dst, src0, src1 []byte, size int) +TEXT ·bytesSSE2small(SB), NOSPLIT, $0 + MOVQ len+72(FP), LEN + CMPQ LEN, $0 + JE ret + MOVQ dst+0(FP), DST + MOVQ src0+24(FP), SRC0 + MOVQ src1+48(FP), SRC1 + TESTQ $63, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop64b: + MOVOU (SRC0)(POS*1), X0 + MOVOU 16(SRC0)(POS*1), X1 + MOVOU 32(SRC0)(POS*1), X2 + MOVOU 48(SRC0)(POS*1), X3 + + MOVOU (SRC1)(POS*1), X4 + MOVOU 16(SRC1)(POS*1), X5 + MOVOU 32(SRC1)(POS*1), X6 + MOVOU 48(SRC1)(POS*1), X7 + + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + + MOVOU X0, (DST)(POS*1) + MOVOU X1, 16(DST)(POS*1) + MOVOU X2, 32(DST)(POS*1) + MOVOU X3, 48(DST)(POS*1) + + ADDQ $64, POS + CMPQ LEN, POS + JNE loop64b + RET + +loop_1b: + MOVB -1(SRC0)(LEN*1), TMP1 + MOVB -1(SRC1)(LEN*1), TMP2 + XORB TMP1, TMP2 + MOVB TMP2, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + CMPQ LEN, $0 + JE ret + TESTQ $63, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP1 + ANDQ $63, TMP1 + +loop_8b: + MOVQ -8(SRC0)(LEN*1), TMP2 + MOVQ -8(SRC1)(LEN*1), TMP3 + XORQ TMP2, TMP3 + MOVQ TMP3, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP1 + JG loop_8b + + CMPQ LEN, $64 + JGE aligned + RET + +ret: + RET + +// func bytesSSE2big(dst, src0, src1 []byte, size int) +TEXT ·bytesSSE2big(SB), NOSPLIT, $0 + MOVQ len+72(FP), LEN + CMPQ LEN, $0 + JE ret + MOVQ dst+0(FP), DST + MOVQ src0+24(FP), SRC0 + MOVQ src1+48(FP), SRC1 + TESTQ $63, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop64b: + MOVOU (SRC0)(POS*1), X0 + MOVOU 16(SRC0)(POS*1), X1 + MOVOU 32(SRC0)(POS*1), X2 + MOVOU 48(SRC0)(POS*1), X3 + + MOVOU (SRC1)(POS*1), X4 + MOVOU 16(SRC1)(POS*1), X5 + MOVOU 32(SRC1)(POS*1), X6 + MOVOU 48(SRC1)(POS*1), X7 + + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + + LONG $0xe70f4266; WORD $0x0304 // MOVNTDQ + LONG $0xe70f4266; WORD $0x034c; BYTE $0x10 + LONG $0xe70f4266; WORD $0x0354; BYTE $0x20 + LONG $0xe70f4266; WORD $0x035c; BYTE $0x30 + + ADDQ $64, POS + CMPQ LEN, POS + JNE loop64b + RET + +loop_1b: + MOVB -1(SRC0)(LEN*1), TMP1 + MOVB -1(SRC1)(LEN*1), TMP2 + XORB TMP1, TMP2 + MOVB TMP2, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + CMPQ LEN, $0 + JE ret + TESTQ $63, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP1 + ANDQ $63, TMP1 + +loop_8b: + MOVQ -8(SRC0)(LEN*1), TMP2 + MOVQ -8(SRC1)(LEN*1), TMP3 + XORQ TMP2, TMP3 + MOVQ TMP3, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP1 + JG loop_8b + + CMPQ LEN, $64 + JGE aligned + RET + +ret: + RET + +// func matrixSSE2small(dst []byte, src [][]byte) +TEXT ·matrixSSE2small(SB), NOSPLIT, $0 + MOVQ dst+0(FP), DST + MOVQ src+24(FP), SRC + MOVQ vec+32(FP), VECT + MOVQ len+8(FP), LEN + TESTQ $63, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop64b: + MOVQ VECT, TMP1 + SUBQ $2, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ TMP3, TMP4 + MOVOU (TMP3)(POS*1), X0 + MOVOU 16(TMP4)(POS*1), X1 + MOVOU 32(TMP3)(POS*1), X2 + MOVOU 48(TMP4)(POS*1), X3 + +next_vect: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ TMP3, TMP4 + MOVOU (TMP3)(POS*1), X4 + MOVOU 16(TMP4)(POS*1), X5 + MOVOU 32(TMP3)(POS*1), X6 + MOVOU 48(TMP4)(POS*1), X7 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + SUBQ $1, TMP1 + JGE next_vect + + MOVOU X0, (DST)(POS*1) + MOVOU X1, 16(DST)(POS*1) + MOVOU X2, 32(DST)(POS*1) + MOVOU X3, 48(DST)(POS*1) + + ADDQ $64, POS + CMPQ LEN, POS + JNE loop64b + RET + +loop_1b: + MOVQ VECT, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + SUBQ $2, TMP1 + MOVB -1(TMP3)(LEN*1), TMP5 + +next_vect_1b: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVB -1(TMP3)(LEN*1), TMP6 + XORB TMP6, TMP5 + SUBQ $1, TMP1 + JGE next_vect_1b + + MOVB TMP5, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + + CMPQ LEN, $0 + JE ret + TESTQ $63, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP4 + ANDQ $63, TMP4 + +loop_8b: + MOVQ VECT, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + SUBQ $2, TMP1 + MOVQ -8(TMP3)(LEN*1), TMP5 + +next_vect_8b: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ -8(TMP3)(LEN*1), TMP6 + XORQ TMP6, TMP5 + SUBQ $1, TMP1 + JGE next_vect_8b + + MOVQ TMP5, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP4 + JG loop_8b + + CMPQ LEN, $64 + JGE aligned + RET + +ret: + RET + +// func matrixSSE2big(dst []byte, src [][]byte) +TEXT ·matrixSSE2big(SB), NOSPLIT, $0 + MOVQ dst+0(FP), DST + MOVQ src+24(FP), SRC + MOVQ vec+32(FP), VECT + MOVQ len+8(FP), LEN + TESTQ $63, LEN + JNZ not_aligned + +aligned: + MOVQ $0, POS + +loop64b: + MOVQ VECT, TMP1 + SUBQ $2, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ TMP3, TMP4 + MOVOU (TMP3)(POS*1), X0 + MOVOU 16(TMP4)(POS*1), X1 + MOVOU 32(TMP3)(POS*1), X2 + MOVOU 48(TMP4)(POS*1), X3 + +next_vect: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ TMP3, TMP4 + MOVOU (TMP3)(POS*1), X4 + MOVOU 16(TMP4)(POS*1), X5 + MOVOU 32(TMP3)(POS*1), X6 + MOVOU 48(TMP4)(POS*1), X7 + PXOR X4, X0 + PXOR X5, X1 + PXOR X6, X2 + PXOR X7, X3 + SUBQ $1, TMP1 + JGE next_vect + + LONG $0xe70f4266; WORD $0x0304 + LONG $0xe70f4266; WORD $0x034c; BYTE $0x10 + LONG $0xe70f4266; WORD $0x0354; BYTE $0x20 + LONG $0xe70f4266; WORD $0x035c; BYTE $0x30 + + ADDQ $64, POS + CMPQ LEN, POS + JNE loop64b + RET + +loop_1b: + MOVQ VECT, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + SUBQ $2, TMP1 + MOVB -1(TMP3)(LEN*1), TMP5 + +next_vect_1b: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVB -1(TMP3)(LEN*1), TMP6 + XORB TMP6, TMP5 + SUBQ $1, TMP1 + JGE next_vect_1b + + MOVB TMP5, -1(DST)(LEN*1) + SUBQ $1, LEN + TESTQ $7, LEN + JNZ loop_1b + + CMPQ LEN, $0 + JE ret + TESTQ $63, LEN + JZ aligned + +not_aligned: + TESTQ $7, LEN + JNE loop_1b + MOVQ LEN, TMP4 + ANDQ $63, TMP4 + +loop_8b: + MOVQ VECT, TMP1 + MOVQ $0, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + SUBQ $2, TMP1 + MOVQ -8(TMP3)(LEN*1), TMP5 + +next_vect_8b: + ADDQ $24, TMP2 + MOVQ (SRC)(TMP2*1), TMP3 + MOVQ -8(TMP3)(LEN*1), TMP6 + XORQ TMP6, TMP5 + SUBQ $1, TMP1 + JGE next_vect_8b + + MOVQ TMP5, -8(DST)(LEN*1) + SUBQ $8, LEN + SUBQ $8, TMP4 + JG loop_8b + + CMPQ LEN, $64 + JGE aligned + RET + +ret: + RET + +TEXT ·hasSSE2(SB), NOSPLIT, $0 + XORQ AX, AX + INCL AX + CPUID + SHRQ $26, DX + ANDQ $1, DX + MOVB DX, ret+0(FP) + RET + diff --git a/vendor/github.com/templexxx/xor/xor.go b/vendor/github.com/templexxx/xor/xor.go new file mode 100644 index 000000000..2fa56167b --- /dev/null +++ b/vendor/github.com/templexxx/xor/xor.go @@ -0,0 +1,49 @@ +package xor + +// SIMD Extensions +const ( + none = iota + avx2 + // first introduced by Intel with the initial version of the Pentium 4 in 2001 + // so I think we can assume all amd64 has sse2 + sse2 +) + +var extension = none + +// Bytes : chose the shortest one as xor size +// it's better to use it for big data ( > 64bytes ) +func Bytes(dst, src0, src1 []byte) { + size := len(dst) + if size > len(src0) { + size = len(src0) + } + if size > len(src1) { + size = len(src1) + } + xorBytes(dst, src0, src1, size) +} + +// BytesSameLen : all slice's length must be equal +// cut size branch, save time for small data +func BytesSameLen(dst, src0, src1 []byte) { + xorSrc1(dst, src0, src1) +} + +// BytesSrc0 : src1 >= src0, dst >= src0 +// xor src0's len bytes +func BytesSrc0(dst, src0, src1 []byte) { + xorSrc0(dst, src0, src1) +} + +// BytesSrc1 : src0 >= src1, dst >= src1 +// xor src1's len bytes +func BytesSrc1(dst, src0, src1 []byte) { + xorSrc1(dst, src0, src1) +} + +// Matrix : all slice's length must be equal && != 0 +// len(src) must >= 2 +func Matrix(dst []byte, src [][]byte) { + xorMatrix(dst, src) +} diff --git a/vendor/github.com/templexxx/xor/xor_amd64.go b/vendor/github.com/templexxx/xor/xor_amd64.go new file mode 100644 index 000000000..158797047 --- /dev/null +++ b/vendor/github.com/templexxx/xor/xor_amd64.go @@ -0,0 +1,118 @@ +package xor + +func init() { + getEXT() +} + +func getEXT() { + if hasAVX2() { + extension = avx2 + } else { + extension = sse2 + } + return +} + +func xorBytes(dst, src0, src1 []byte, size int) { + switch extension { + case avx2: + bytesAVX2(dst, src0, src1, size) + default: + bytesSSE2(dst, src0, src1, size) + } +} + +// non-temporal hint store +const nontmp = 8 * 1024 +const avx2loopsize = 128 + +func bytesAVX2(dst, src0, src1 []byte, size int) { + if size < avx2loopsize { + bytesAVX2mini(dst, src0, src1, size) + } else if size >= avx2loopsize && size <= nontmp { + bytesAVX2small(dst, src0, src1, size) + } else { + bytesAVX2big(dst, src0, src1, size) + } +} + +const sse2loopsize = 64 + +func bytesSSE2(dst, src0, src1 []byte, size int) { + if size < sse2loopsize { + bytesSSE2mini(dst, src0, src1, size) + } else if size >= sse2loopsize && size <= nontmp { + bytesSSE2small(dst, src0, src1, size) + } else { + bytesSSE2big(dst, src0, src1, size) + } +} + +func xorMatrix(dst []byte, src [][]byte) { + switch extension { + case avx2: + matrixAVX2(dst, src) + default: + matrixSSE2(dst, src) + } +} + +func matrixAVX2(dst []byte, src [][]byte) { + size := len(dst) + if size > nontmp { + matrixAVX2big(dst, src) + } else { + matrixAVX2small(dst, src) + } +} + +func matrixSSE2(dst []byte, src [][]byte) { + size := len(dst) + if size > nontmp { + matrixSSE2big(dst, src) + } else { + matrixSSE2small(dst, src) + } +} + +//go:noescape +func xorSrc0(dst, src0, src1 []byte) + +//go:noescape +func xorSrc1(dst, src0, src1 []byte) + +//go:noescape +func bytesAVX2mini(dst, src0, src1 []byte, size int) + +//go:noescape +func bytesAVX2big(dst, src0, src1 []byte, size int) + +//go:noescape +func bytesAVX2small(dst, src0, src1 []byte, size int) + +//go:noescape +func bytesSSE2mini(dst, src0, src1 []byte, size int) + +//go:noescape +func bytesSSE2small(dst, src0, src1 []byte, size int) + +//go:noescape +func bytesSSE2big(dst, src0, src1 []byte, size int) + +//go:noescape +func matrixAVX2small(dst []byte, src [][]byte) + +//go:noescape +func matrixAVX2big(dst []byte, src [][]byte) + +//go:noescape +func matrixSSE2small(dst []byte, src [][]byte) + +//go:noescape +func matrixSSE2big(dst []byte, src [][]byte) + +//go:noescape +func hasAVX2() bool + +//go:noescape +func hasSSE2() bool diff --git a/vendor/github.com/templexxx/xor/xor_other.go b/vendor/github.com/templexxx/xor/xor_other.go new file mode 100644 index 000000000..4aa2967c4 --- /dev/null +++ b/vendor/github.com/templexxx/xor/xor_other.go @@ -0,0 +1,19 @@ +// +build !amd64 noasm + +package xor + +func xorBytes(dst, src0, src1 []byte, size int) { + bytesNoSIMD(dst, src0, src1, size) +} + +func xorMatrix(dst []byte, src [][]byte) { + matrixNoSIMD(dst, src) +} + +func xorSrc0(dst, src0, src1 []byte) { + bytesNoSIMD(dst, src0, src1, len(src0)) +} + +func xorSrc1(dst, src0, src1 []byte) { + bytesNoSIMD(dst, src0, src1, len(src1)) +} diff --git a/vendor/manifest b/vendor/manifest index d9254b17e..aeb8cc2f7 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -17,6 +17,14 @@ "branch": "master", "notests": true }, + { + "importpath": "github.com/AudriusButkevicius/kcp-go", + "repository": "https://github.com/AudriusButkevicius/kcp-go", + "vcs": "git", + "revision": "0ccc04f3b8a7bdf53e2d4d6d0769adbc7cb3851a", + "branch": "master", + "notests": true + }, { "importpath": "github.com/AudriusButkevicius/pfilter", "repository": "https://github.com/AudriusButkevicius/pfilter", @@ -378,6 +386,14 @@ "path": "/leveldb", "notests": true }, + { + "importpath": "github.com/templexxx/xor", + "repository": "https://github.com/templexxx/xor", + "vcs": "git", + "revision": "42f9c041c330b560afb991153bf183c25444bcdc", + "branch": "master", + "notests": true + }, { "importpath": "github.com/thejerf/suture", "repository": "https://github.com/thejerf/suture", @@ -413,14 +429,6 @@ "path": "/qr", "notests": true }, - { - "importpath": "github.com/xtaci/kcp-go", - "repository": "https://github.com/xtaci/kcp-go", - "vcs": "git", - "revision": "0b0731ef3f184a8985edcb4ca26a4b0598c6dc1a", - "branch": "master", - "notests": true - }, { "importpath": "github.com/xtaci/smux", "repository": "https://github.com/xtaci/smux",