syncthing/lib/connections/service.go
DerRockWolf debbe726e0
lib/connections: Add syncthing_connections_active metric (fixes #9527) (#9528)
### Purpose

Adds a new metric `syncthing_connections_active` which equals to the
amount of active connections per device.

Fixes #9527 

<!--
Describe the purpose of this change. If there is an existing issue that
is
resolved by this pull request, ensure that the commit subject is on the
form
`Some short description (fixes #1234)` where 1234 is the issue number.
-->

### Testing

I've manually tested it by running syncthing with these changes locally
and examining the returned metrics from `/metrics`.
I've done the following things:
- Connect & disconnect a device
- Increase & decrease the number of connections and verify that the
value of the metric matches with the amount displayed in the GUI.

### Documentation

https://github.com/syncthing/docs/blob/main/includes/metrics-list.rst
needs to be regenerated with
[find-metrics.go](https://github.com/syncthing/docs/blob/main/_script/find-metrics/find-metrics.go)

## Authorship

Your name and email will be added automatically to the AUTHORS file
based on the commit metadata.

---------

Co-authored-by: Jakob Borg <jakob@kastelo.net>
2024-05-04 22:31:37 +02:00

1478 lines
46 KiB
Go

// Copyright (C) 2015 The Syncthing Authors.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at https://mozilla.org/MPL/2.0/.
//go:generate -command counterfeiter go run github.com/maxbrunsfeld/counterfeiter/v6
//go:generate counterfeiter -o mocks/service.go --fake-name Service . Service
package connections
import (
"context"
"crypto/rand"
"crypto/tls"
"crypto/x509"
"encoding/base32"
"encoding/binary"
"errors"
"fmt"
"io"
"math"
"net"
"net/url"
"slices"
"sort"
"strings"
stdsync "sync"
"time"
"github.com/syncthing/syncthing/lib/build"
"github.com/syncthing/syncthing/lib/config"
"github.com/syncthing/syncthing/lib/connections/registry"
"github.com/syncthing/syncthing/lib/discover"
"github.com/syncthing/syncthing/lib/events"
"github.com/syncthing/syncthing/lib/nat"
"github.com/syncthing/syncthing/lib/osutil"
"github.com/syncthing/syncthing/lib/protocol"
"github.com/syncthing/syncthing/lib/semaphore"
"github.com/syncthing/syncthing/lib/sliceutil"
"github.com/syncthing/syncthing/lib/stringutil"
"github.com/syncthing/syncthing/lib/svcutil"
"github.com/syncthing/syncthing/lib/sync"
// Registers NAT service providers
_ "github.com/syncthing/syncthing/lib/pmp"
_ "github.com/syncthing/syncthing/lib/upnp"
"github.com/thejerf/suture/v4"
"golang.org/x/time/rate"
)
var (
dialers = make(map[string]dialerFactory)
listeners = make(map[string]listenerFactory)
)
var (
// Dialers and listeners return errUnsupported (or a wrapped variant)
// when they are intentionally out of service due to configuration,
// build, etc. This is not logged loudly.
errUnsupported = errors.New("unsupported protocol")
// These are specific explanations for errUnsupported.
errDisabled = fmt.Errorf("%w: disabled by configuration", errUnsupported)
errDeprecated = fmt.Errorf("%w: deprecated", errUnsupported)
// Various reasons to reject a connection
errNetworkNotAllowed = errors.New("network not allowed")
errDeviceAlreadyConnected = errors.New("already connected to this device")
errDeviceIgnored = errors.New("device is ignored")
errConnLimitReached = errors.New("connection limit reached")
errDevicePaused = errors.New("device is paused")
// A connection is being closed to make space for better ones
errReplacingConnection = errors.New("replacing connection")
)
const (
perDeviceWarningIntv = 15 * time.Minute
tlsHandshakeTimeout = 10 * time.Second
minConnectionLoopSleep = 5 * time.Second
stdConnectionLoopSleep = time.Minute
worstDialerPriority = math.MaxInt32
recentlySeenCutoff = 7 * 24 * time.Hour
shortLivedConnectionThreshold = 5 * time.Second
dialMaxParallel = 64
dialMaxParallelPerDevice = 8
maxNumConnections = 128 // the maximum number of connections we maintain to any given device
)
// From go/src/crypto/tls/cipher_suites.go
var tlsCipherSuiteNames = map[uint16]string{
// TLS 1.2
0x0005: "TLS_RSA_WITH_RC4_128_SHA",
0x000a: "TLS_RSA_WITH_3DES_EDE_CBC_SHA",
0x002f: "TLS_RSA_WITH_AES_128_CBC_SHA",
0x0035: "TLS_RSA_WITH_AES_256_CBC_SHA",
0x003c: "TLS_RSA_WITH_AES_128_CBC_SHA256",
0x009c: "TLS_RSA_WITH_AES_128_GCM_SHA256",
0x009d: "TLS_RSA_WITH_AES_256_GCM_SHA384",
0xc007: "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
0xc009: "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
0xc00a: "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
0xc011: "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
0xc012: "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA",
0xc013: "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA",
0xc014: "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA",
0xc023: "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
0xc027: "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
0xc02f: "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256",
0xc02b: "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256",
0xc030: "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384",
0xc02c: "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384",
0xcca8: "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305",
0xcca9: "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305",
// TLS 1.3
0x1301: "TLS_AES_128_GCM_SHA256",
0x1302: "TLS_AES_256_GCM_SHA384",
0x1303: "TLS_CHACHA20_POLY1305_SHA256",
}
var tlsVersionNames = map[uint16]string{
tls.VersionTLS12: "TLS1.2",
tls.VersionTLS13: "TLS1.3",
}
// Service listens and dials all configured unconnected devices, via supported
// dialers. Successful connections are handed to the model.
type Service interface {
suture.Service
discover.AddressLister
ListenerStatus() map[string]ListenerStatusEntry
ConnectionStatus() map[string]ConnectionStatusEntry
NATType() string
}
type ListenerStatusEntry struct {
Error *string `json:"error"`
LANAddresses []string `json:"lanAddresses"`
WANAddresses []string `json:"wanAddresses"`
}
type ConnectionStatusEntry struct {
When time.Time `json:"when"`
Error *string `json:"error"`
}
type connWithHello struct {
c internalConn
hello protocol.Hello
err error
remoteID protocol.DeviceID
remoteCert *x509.Certificate
}
type service struct {
*suture.Supervisor
connectionStatusHandler
deviceConnectionTracker
cfg config.Wrapper
myID protocol.DeviceID
model Model
tlsCfg *tls.Config
discoverer discover.Finder
conns chan internalConn
hellos chan *connWithHello
bepProtocolName string
tlsDefaultCommonName string
limiter *limiter
natService *nat.Service
evLogger events.Logger
registry *registry.Registry
keyGen *protocol.KeyGenerator
lanChecker *lanChecker
dialNow chan struct{}
dialNowDevices map[protocol.DeviceID]struct{}
dialNowDevicesMut sync.Mutex
listenersMut sync.RWMutex
listeners map[string]genericListener
listenerTokens map[string]suture.ServiceToken
}
func NewService(cfg config.Wrapper, myID protocol.DeviceID, mdl Model, tlsCfg *tls.Config, discoverer discover.Finder, bepProtocolName string, tlsDefaultCommonName string, evLogger events.Logger, registry *registry.Registry, keyGen *protocol.KeyGenerator) Service {
spec := svcutil.SpecWithInfoLogger(l)
service := &service{
Supervisor: suture.New("connections.Service", spec),
connectionStatusHandler: newConnectionStatusHandler(),
cfg: cfg,
myID: myID,
model: mdl,
tlsCfg: tlsCfg,
discoverer: discoverer,
conns: make(chan internalConn),
hellos: make(chan *connWithHello),
bepProtocolName: bepProtocolName,
tlsDefaultCommonName: tlsDefaultCommonName,
limiter: newLimiter(myID, cfg),
natService: nat.NewService(myID, cfg),
evLogger: evLogger,
registry: registry,
keyGen: keyGen,
lanChecker: &lanChecker{cfg},
dialNowDevicesMut: sync.NewMutex(),
dialNow: make(chan struct{}, 1),
dialNowDevices: make(map[protocol.DeviceID]struct{}),
listenersMut: sync.NewRWMutex(),
listeners: make(map[string]genericListener),
listenerTokens: make(map[string]suture.ServiceToken),
}
cfg.Subscribe(service)
raw := cfg.RawCopy()
// Actually starts the listeners and NAT service
// Need to start this before service.connect so that any dials that
// try punch through already have a listener to cling on.
service.CommitConfiguration(raw, raw)
// There are several moving parts here; one routine per listening address
// (handled in configuration changing) to handle incoming connections,
// one routine to periodically attempt outgoing connections, one routine to
// the common handling regardless of whether the connection was
// incoming or outgoing.
service.Add(svcutil.AsService(service.connect, fmt.Sprintf("%s/connect", service)))
service.Add(svcutil.AsService(service.handleConns, fmt.Sprintf("%s/handleConns", service)))
service.Add(svcutil.AsService(service.handleHellos, fmt.Sprintf("%s/handleHellos", service)))
service.Add(service.natService)
svcutil.OnSupervisorDone(service.Supervisor, func() {
service.cfg.Unsubscribe(service.limiter)
service.cfg.Unsubscribe(service)
})
return service
}
func (s *service) handleConns(ctx context.Context) error {
for {
var c internalConn
select {
case <-ctx.Done():
return ctx.Err()
case c = <-s.conns:
}
cs := c.ConnectionState()
// We should have negotiated the next level protocol "bep/1.0" as part
// of the TLS handshake. Unfortunately this can't be a hard error,
// because there are implementations out there that don't support
// protocol negotiation (iOS for one...).
if cs.NegotiatedProtocol != s.bepProtocolName {
l.Infof("Peer at %s did not negotiate bep/1.0", c)
}
// We should have received exactly one certificate from the other
// side. If we didn't, they don't have a device ID and we drop the
// connection.
certs := cs.PeerCertificates
if cl := len(certs); cl != 1 {
l.Infof("Got peer certificate list of length %d != 1 from peer at %s; protocol error", cl, c)
c.Close()
continue
}
remoteCert := certs[0]
remoteID := protocol.NewDeviceID(remoteCert.Raw)
// The device ID should not be that of ourselves. It can happen
// though, especially in the presence of NAT hairpinning, multiple
// clients between the same NAT gateway, and global discovery.
if remoteID == s.myID {
l.Debugf("Connected to myself (%s) at %s", remoteID, c)
c.Close()
continue
}
if err := s.connectionCheckEarly(remoteID, c); err != nil {
l.Infof("Connection from %s at %s (%s) rejected: %v", remoteID, c.RemoteAddr(), c.Type(), err)
c.Close()
continue
}
_ = c.SetDeadline(time.Now().Add(20 * time.Second))
go func() {
// Exchange Hello messages with the peer.
outgoing := s.helloForDevice(remoteID)
incoming, err := protocol.ExchangeHello(c, outgoing)
// The timestamps are used to create the connection ID.
c.connectionID = newConnectionID(outgoing.Timestamp, incoming.Timestamp)
select {
case s.hellos <- &connWithHello{c, incoming, err, remoteID, remoteCert}:
case <-ctx.Done():
}
}()
}
}
func (s *service) helloForDevice(remoteID protocol.DeviceID) protocol.Hello {
hello := protocol.Hello{
ClientName: "syncthing",
ClientVersion: build.Version,
Timestamp: time.Now().UnixNano(),
}
if cfg, ok := s.cfg.Device(remoteID); ok {
hello.NumConnections = cfg.NumConnections()
// Set our name (from the config of our device ID) only if we
// already know about the other side device ID.
if myCfg, ok := s.cfg.Device(s.myID); ok {
hello.DeviceName = myCfg.Name
}
}
return hello
}
func (s *service) connectionCheckEarly(remoteID protocol.DeviceID, c internalConn) error {
if s.cfg.IgnoredDevice(remoteID) {
return errDeviceIgnored
}
if max := s.cfg.Options().ConnectionLimitMax; max > 0 && s.numConnectedDevices() >= max {
// We're not allowed to accept any more connections.
return errConnLimitReached
}
cfg, ok := s.cfg.Device(remoteID)
if !ok {
// We do go ahead exchanging hello messages to get information about the device.
return nil
}
if cfg.Paused {
return errDevicePaused
}
if len(cfg.AllowedNetworks) > 0 && !IsAllowedNetwork(c.RemoteAddr().String(), cfg.AllowedNetworks) {
// The connection is not from an allowed network.
return errNetworkNotAllowed
}
currentConns := s.numConnectionsForDevice(cfg.DeviceID)
desiredConns := s.desiredConnectionsToDevice(cfg.DeviceID)
worstPrio := s.worstConnectionPriority(remoteID)
ourUpgradeThreshold := c.priority + s.cfg.Options().ConnectionPriorityUpgradeThreshold
if currentConns >= desiredConns && ourUpgradeThreshold >= worstPrio {
l.Debugf("Not accepting connection to %s at %s: already have %d connections, desire %d", remoteID, c, currentConns, desiredConns)
return errDeviceAlreadyConnected
}
return nil
}
func (s *service) handleHellos(ctx context.Context) error {
for {
var c internalConn
var hello protocol.Hello
var err error
var remoteID protocol.DeviceID
var remoteCert *x509.Certificate
select {
case <-ctx.Done():
return ctx.Err()
case withHello := <-s.hellos:
c = withHello.c
hello = withHello.hello
err = withHello.err
remoteID = withHello.remoteID
remoteCert = withHello.remoteCert
}
if err != nil {
if protocol.IsVersionMismatch(err) {
// The error will be a relatively user friendly description
// of what's wrong with the version compatibility. By
// default identify the other side by device ID and IP.
remote := fmt.Sprintf("%v (%v)", remoteID, c.RemoteAddr())
if hello.DeviceName != "" {
// If the name was set in the hello return, use that to
// give the user more info about which device is the
// affected one. It probably says more than the remote
// IP.
remote = fmt.Sprintf("%q (%s %s, %v)", hello.DeviceName, hello.ClientName, hello.ClientVersion, remoteID)
}
msg := fmt.Sprintf("Connecting to %s: %s", remote, err)
warningFor(remoteID, msg)
} else {
// It's something else - connection reset or whatever
l.Infof("Failed to exchange Hello messages with %s at %s: %s", remoteID, c, err)
}
c.Close()
continue
}
_ = c.SetDeadline(time.Time{})
// The Model will return an error for devices that we don't want to
// have a connection with for whatever reason, for example unknown devices.
if err := s.model.OnHello(remoteID, c.RemoteAddr(), hello); err != nil {
l.Infof("Connection from %s at %s (%s) rejected: %v", remoteID, c.RemoteAddr(), c.Type(), err)
c.Close()
continue
}
deviceCfg, ok := s.cfg.Device(remoteID)
if !ok {
l.Infof("Device %s removed from config during connection attempt at %s", remoteID, c)
c.Close()
continue
}
// Verify the name on the certificate. By default we set it to
// "syncthing" when generating, but the user may have replaced
// the certificate and used another name.
certName := deviceCfg.CertName
if certName == "" {
certName = s.tlsDefaultCommonName
}
if remoteCert.Subject.CommonName == certName {
// All good. We do this check because our old style certificates
// have "syncthing" in the CommonName field and no SANs, which
// is not accepted by VerifyHostname() any more as of Go 1.15.
} else if err := remoteCert.VerifyHostname(certName); err != nil {
// Incorrect certificate name is something the user most
// likely wants to know about, since it's an advanced
// config. Warn instead of Info.
l.Warnf("Bad certificate from %s at %s: %v", remoteID, c, err)
c.Close()
continue
}
// Wrap the connection in rate limiters. The limiter itself will
// keep up with config changes to the rate and whether or not LAN
// connections are limited.
rd, wr := s.limiter.getLimiters(remoteID, c, c.IsLocal())
protoConn := protocol.NewConnection(remoteID, rd, wr, c, s.model, c, deviceCfg.Compression, s.cfg.FolderPasswords(remoteID), s.keyGen)
s.accountAddedConnection(protoConn, hello, s.cfg.Options().ConnectionPriorityUpgradeThreshold)
go func() {
<-protoConn.Closed()
s.accountRemovedConnection(protoConn)
s.dialNowDevicesMut.Lock()
s.dialNowDevices[remoteID] = struct{}{}
s.scheduleDialNow()
s.dialNowDevicesMut.Unlock()
}()
l.Infof("Established secure connection to %s at %s", remoteID.Short(), c)
s.model.AddConnection(protoConn, hello)
continue
}
}
func (s *service) connect(ctx context.Context) error {
// Map of when to earliest dial each given device + address again
nextDialAt := make(nextDialRegistry)
// Used as delay for the first few connection attempts (adjusted up to
// minConnectionLoopSleep), increased exponentially until it reaches
// stdConnectionLoopSleep, at which time the normal sleep mechanism
// kicks in.
initialRampup := time.Second
for {
cfg := s.cfg.RawCopy()
bestDialerPriority := s.bestDialerPriority(cfg)
isInitialRampup := initialRampup < stdConnectionLoopSleep
l.Debugln("Connection loop")
if isInitialRampup {
l.Debugln("Connection loop in initial rampup")
}
// Used for consistency throughout this loop run, as time passes
// while we try connections etc.
now := time.Now()
// Attempt to dial all devices that are unconnected or can be connection-upgraded
s.dialDevices(ctx, now, cfg, bestDialerPriority, nextDialAt, isInitialRampup)
var sleep time.Duration
if isInitialRampup {
// We are in the initial rampup time, so we slowly, statically
// increase the sleep time.
sleep = initialRampup
initialRampup *= 2
} else {
// The sleep time is until the next dial scheduled in nextDialAt,
// clamped by stdConnectionLoopSleep as we don't want to sleep too
// long (config changes might happen).
sleep = nextDialAt.sleepDurationAndCleanup(now)
}
// ... while making sure not to loop too quickly either.
if sleep < minConnectionLoopSleep {
sleep = minConnectionLoopSleep
}
l.Debugln("Next connection loop in", sleep)
timeout := time.NewTimer(sleep)
select {
case <-s.dialNow:
// Remove affected devices from nextDialAt to dial immediately,
// regardless of when we last dialed it (there's cool down in the
// registry for too many repeat dials).
s.dialNowDevicesMut.Lock()
for device := range s.dialNowDevices {
nextDialAt.redialDevice(device, now)
}
s.dialNowDevices = make(map[protocol.DeviceID]struct{})
s.dialNowDevicesMut.Unlock()
timeout.Stop()
case <-timeout.C:
case <-ctx.Done():
return ctx.Err()
}
}
}
func (s *service) bestDialerPriority(cfg config.Configuration) int {
bestDialerPriority := worstDialerPriority
for _, df := range dialers {
if df.Valid(cfg) != nil {
continue
}
prio := df.New(cfg.Options, s.tlsCfg, s.registry, s.lanChecker).Priority("127.0.0.1")
if prio < bestDialerPriority {
bestDialerPriority = prio
}
}
return bestDialerPriority
}
func (s *service) dialDevices(ctx context.Context, now time.Time, cfg config.Configuration, bestDialerPriority int, nextDialAt nextDialRegistry, initial bool) {
// Figure out current connection limits up front to see if there's any
// point in resolving devices and such at all.
allowAdditional := 0 // no limit
connectionLimit := cfg.Options.LowestConnectionLimit()
if connectionLimit > 0 {
current := s.numConnectedDevices()
allowAdditional = connectionLimit - current
if allowAdditional <= 0 {
l.Debugf("Skipping dial because we've reached the connection limit, current %d >= limit %d", current, connectionLimit)
return
}
}
// Get device statistics for the last seen time of each device. This
// isn't critical, so ignore the potential error.
stats, _ := s.model.DeviceStatistics()
queue := make(dialQueue, 0, len(cfg.Devices))
for _, deviceCfg := range cfg.Devices {
// Don't attempt to connect to ourselves...
if deviceCfg.DeviceID == s.myID {
continue
}
// Don't attempt to connect to paused devices...
if deviceCfg.Paused {
continue
}
// See if we are already connected and, if so, what our cutoff is
// for dialer priority.
priorityCutoff := worstDialerPriority
if currentConns := s.numConnectionsForDevice(deviceCfg.DeviceID); currentConns > 0 {
// Set the priority cutoff to the current connection's priority,
// so that we don't attempt any dialers with worse priority.
priorityCutoff = s.worstConnectionPriority(deviceCfg.DeviceID)
// Reduce the priority cutoff by the upgrade threshold, so that
// we don't attempt dialers that aren't considered a worthy upgrade.
priorityCutoff -= cfg.Options.ConnectionPriorityUpgradeThreshold
if bestDialerPriority >= priorityCutoff && currentConns >= s.desiredConnectionsToDevice(deviceCfg.DeviceID) {
// Our best dialer is not any better than what we already
// have, and we already have the desired number of
// connections to this device,so nothing to do here.
l.Debugf("Skipping dial to %s because we already have %d connections and our best dialer is not better than %d", deviceCfg.DeviceID.Short(), currentConns, priorityCutoff)
continue
}
}
dialTargets := s.resolveDialTargets(ctx, now, cfg, deviceCfg, nextDialAt, initial, priorityCutoff)
if len(dialTargets) > 0 {
queue = append(queue, dialQueueEntry{
id: deviceCfg.DeviceID,
lastSeen: stats[deviceCfg.DeviceID].LastSeen,
shortLived: stats[deviceCfg.DeviceID].LastConnectionDurationS < shortLivedConnectionThreshold.Seconds(),
targets: dialTargets,
})
}
}
// Sort the queue in an order we think will be useful (most recent
// first, deprioritising unstable devices, randomizing those we haven't
// seen in a long while). If we don't do connection limiting the sorting
// doesn't have much effect, but it may result in getting up and running
// quicker if only a subset of configured devices are actually reachable
// (by prioritizing those that were reachable recently).
queue.Sort()
// Perform dials according to the queue, stopping when we've reached the
// allowed additional number of connections (if limited).
numConns := 0
var numConnsMut stdsync.Mutex
dialSemaphore := semaphore.New(dialMaxParallel)
dialWG := new(stdsync.WaitGroup)
dialCtx, dialCancel := context.WithCancel(ctx)
defer func() {
dialWG.Wait()
dialCancel()
}()
for i := range queue {
select {
case <-dialCtx.Done():
return
default:
}
dialWG.Add(1)
go func(entry dialQueueEntry) {
defer dialWG.Done()
conn, ok := s.dialParallel(dialCtx, entry.id, entry.targets, dialSemaphore)
if !ok {
return
}
numConnsMut.Lock()
if allowAdditional == 0 || numConns < allowAdditional {
select {
case s.conns <- conn:
numConns++
if allowAdditional > 0 && numConns >= allowAdditional {
dialCancel()
}
case <-dialCtx.Done():
}
}
numConnsMut.Unlock()
}(queue[i])
}
}
func (s *service) resolveDialTargets(ctx context.Context, now time.Time, cfg config.Configuration, deviceCfg config.DeviceConfiguration, nextDialAt nextDialRegistry, initial bool, priorityCutoff int) []dialTarget {
deviceID := deviceCfg.DeviceID
addrs := s.resolveDeviceAddrs(ctx, deviceCfg)
l.Debugln("Resolved device", deviceID.Short(), "addresses:", addrs)
dialTargets := make([]dialTarget, 0, len(addrs))
for _, addr := range addrs {
// Use both device and address, as you might have two devices connected
// to the same relay
if !initial && nextDialAt.get(deviceID, addr).After(now) {
l.Debugf("Not dialing %s via %v as it's not time yet", deviceID.Short(), addr)
continue
}
// If we fail at any step before actually getting the dialer
// retry in a minute
nextDialAt.set(deviceID, addr, now.Add(time.Minute))
uri, err := url.Parse(addr)
if err != nil {
s.setConnectionStatus(addr, err)
l.Infof("Parsing dialer address %s: %v", addr, err)
continue
}
if len(deviceCfg.AllowedNetworks) > 0 {
if !IsAllowedNetwork(uri.Host, deviceCfg.AllowedNetworks) {
s.setConnectionStatus(addr, errors.New("network disallowed"))
l.Debugln("Network for", uri, "is disallowed")
continue
}
}
dialerFactory, err := getDialerFactory(cfg, uri)
if err != nil {
s.setConnectionStatus(addr, err)
}
if errors.Is(err, errUnsupported) {
l.Debugf("Dialer for %v: %v", uri, err)
continue
} else if err != nil {
l.Infof("Dialer for %v: %v", uri, err)
continue
}
dialer := dialerFactory.New(s.cfg.Options(), s.tlsCfg, s.registry, s.lanChecker)
priority := dialer.Priority(uri.Host)
currentConns := s.numConnectionsForDevice(deviceCfg.DeviceID)
if priority > priorityCutoff {
l.Debugf("Not dialing %s at %s using %s as priority is worse than current connection (%d > %d)", deviceID.Short(), addr, dialerFactory, priority, priorityCutoff)
continue
}
if currentConns > 0 && !dialer.AllowsMultiConns() {
l.Debugf("Not dialing %s at %s using %s as it does not allow multiple connections and we already have a connection", deviceID.Short(), addr, dialerFactory)
continue
}
if currentConns >= s.desiredConnectionsToDevice(deviceCfg.DeviceID) && priority == priorityCutoff {
l.Debugf("Not dialing %s at %s using %s as priority is equal and we already have %d/%d connections", deviceID.Short(), addr, dialerFactory, currentConns, deviceCfg.NumConnections)
continue
}
nextDialAt.set(deviceID, addr, now.Add(dialer.RedialFrequency()))
dialTargets = append(dialTargets, dialTarget{
addr: addr,
dialer: dialer,
priority: priority,
deviceID: deviceID,
uri: uri,
})
}
return dialTargets
}
func (s *service) resolveDeviceAddrs(ctx context.Context, cfg config.DeviceConfiguration) []string {
var addrs []string
for _, addr := range cfg.Addresses {
if addr == "dynamic" {
if s.discoverer != nil {
if t, err := s.discoverer.Lookup(ctx, cfg.DeviceID); err == nil {
addrs = append(addrs, t...)
}
}
} else {
addrs = append(addrs, addr)
}
}
return stringutil.UniqueTrimmedStrings(addrs)
}
type lanChecker struct {
cfg config.Wrapper
}
func (s *lanChecker) isLANHost(host string) bool {
// Probably we are called with an ip:port combo which we can resolve as
// a TCP address.
if addr, err := net.ResolveTCPAddr("tcp", host); err == nil {
return s.isLAN(addr)
}
// ... but this function looks general enough that someone might try
// with just an IP as well in the future so lets allow that.
if addr, err := net.ResolveIPAddr("ip", host); err == nil {
return s.isLAN(addr)
}
return false
}
func (s *lanChecker) isLAN(addr net.Addr) bool {
var ip net.IP
switch addr := addr.(type) {
case *net.IPAddr:
ip = addr.IP
case *net.TCPAddr:
ip = addr.IP
case *net.UDPAddr:
ip = addr.IP
default:
// From the standard library, just Unix sockets.
// If you invent your own, handle it.
return false
}
if ip.IsLoopback() {
return true
}
if ip.IsLinkLocalUnicast() {
return true
}
for _, lan := range s.cfg.Options().AlwaysLocalNets {
_, ipnet, err := net.ParseCIDR(lan)
if err != nil {
l.Debugln("Network", lan, "is malformed:", err)
continue
}
if ipnet.Contains(ip) {
return true
}
}
lans, err := osutil.GetLans()
if err != nil {
l.Debugln("Failed to retrieve interface IPs:", err)
priv := ip.IsPrivate()
l.Debugf("Assuming isLAN=%v for IP %v", priv, ip)
return priv
}
for _, lan := range lans {
if lan.Contains(ip) {
return true
}
}
return false
}
func (s *service) createListener(factory listenerFactory, uri *url.URL) bool {
// must be called with listenerMut held
l.Debugln("Starting listener", uri)
listener := factory.New(uri, s.cfg, s.tlsCfg, s.conns, s.natService, s.registry, s.lanChecker)
listener.OnAddressesChanged(s.logListenAddressesChangedEvent)
// Retrying a listener many times in rapid succession is unlikely to help,
// thus back off quickly. A listener may soon be functional again, e.g. due
// to a network interface coming back online - retry every minute.
spec := svcutil.SpecWithInfoLogger(l)
spec.FailureThreshold = 2
spec.FailureBackoff = time.Minute
sup := suture.New(fmt.Sprintf("listenerSupervisor@%v", listener), spec)
sup.Add(listener)
s.listeners[uri.String()] = listener
s.listenerTokens[uri.String()] = s.Add(sup)
return true
}
func (s *service) logListenAddressesChangedEvent(l ListenerAddresses) {
s.evLogger.Log(events.ListenAddressesChanged, map[string]interface{}{
"address": l.URI,
"lan": l.LANAddresses,
"wan": l.WANAddresses,
})
}
func (s *service) CommitConfiguration(from, to config.Configuration) bool {
newDevices := make(map[protocol.DeviceID]bool, len(to.Devices))
for _, dev := range to.Devices {
newDevices[dev.DeviceID] = true
registerDeviceMetrics(dev.DeviceID.String())
}
for _, dev := range from.Devices {
if !newDevices[dev.DeviceID] {
warningLimitersMut.Lock()
delete(warningLimiters, dev.DeviceID)
warningLimitersMut.Unlock()
metricDeviceActiveConnections.DeleteLabelValues(dev.DeviceID.String())
}
}
s.checkAndSignalConnectLoopOnUpdatedDevices(from, to)
s.listenersMut.Lock()
seen := make(map[string]struct{})
for _, addr := range to.Options.ListenAddresses() {
if addr == "" {
// We can get an empty address if there is an empty listener
// element in the config, indicating no listeners should be
// used. This is not an error.
continue
}
uri, err := url.Parse(addr)
if err != nil {
l.Warnf("Skipping malformed listener URL %q: %v", addr, err)
continue
}
// Make sure we always have the canonical representation of the URL.
// This is for consistency as we use it as a map key, but also to
// avoid misunderstandings. We do not just use the canonicalized
// version, because an URL that looks very similar to a human might
// mean something entirely different to the computer (e.g.,
// tcp:/127.0.0.1:22000 in fact being equivalent to tcp://:22000).
if canonical := uri.String(); canonical != addr {
l.Warnf("Skipping malformed listener URL %q (not canonical)", addr)
continue
}
if _, ok := s.listeners[addr]; ok {
seen[addr] = struct{}{}
continue
}
factory, err := getListenerFactory(to, uri)
if errors.Is(err, errUnsupported) {
l.Debugf("Listener for %v: %v", uri, err)
continue
} else if err != nil {
l.Infof("Listener for %v: %v", uri, err)
continue
}
s.createListener(factory, uri)
seen[addr] = struct{}{}
}
for addr, listener := range s.listeners {
if _, ok := seen[addr]; !ok || listener.Factory().Valid(to) != nil {
l.Debugln("Stopping listener", addr)
s.Remove(s.listenerTokens[addr])
delete(s.listenerTokens, addr)
delete(s.listeners, addr)
}
}
s.listenersMut.Unlock()
return true
}
func (s *service) checkAndSignalConnectLoopOnUpdatedDevices(from, to config.Configuration) {
oldDevices := from.DeviceMap()
dial := false
s.dialNowDevicesMut.Lock()
for _, dev := range to.Devices {
if dev.Paused {
continue
}
if oldDev, ok := oldDevices[dev.DeviceID]; !ok || oldDev.Paused {
s.dialNowDevices[dev.DeviceID] = struct{}{}
dial = true
} else if !slices.Equal(oldDev.Addresses, dev.Addresses) {
dial = true
}
}
if dial {
s.scheduleDialNow()
}
s.dialNowDevicesMut.Unlock()
}
func (s *service) scheduleDialNow() {
select {
case s.dialNow <- struct{}{}:
default:
// channel is blocked - a config update is already pending for the connection loop.
}
}
func (s *service) AllAddresses() []string {
s.listenersMut.RLock()
var addrs []string
for _, listener := range s.listeners {
for _, lanAddr := range listener.LANAddresses() {
addrs = append(addrs, lanAddr.String())
}
for _, wanAddr := range listener.WANAddresses() {
addrs = append(addrs, wanAddr.String())
}
}
s.listenersMut.RUnlock()
return stringutil.UniqueTrimmedStrings(addrs)
}
func (s *service) ExternalAddresses() []string {
if s.cfg.Options().AnnounceLANAddresses {
return s.AllAddresses()
}
s.listenersMut.RLock()
var addrs []string
for _, listener := range s.listeners {
for _, wanAddr := range listener.WANAddresses() {
addrs = append(addrs, wanAddr.String())
}
}
s.listenersMut.RUnlock()
return stringutil.UniqueTrimmedStrings(addrs)
}
func (s *service) ListenerStatus() map[string]ListenerStatusEntry {
result := make(map[string]ListenerStatusEntry)
s.listenersMut.RLock()
for addr, listener := range s.listeners {
var status ListenerStatusEntry
if err := listener.Error(); err != nil {
errStr := err.Error()
status.Error = &errStr
}
status.LANAddresses = urlsToStrings(listener.LANAddresses())
status.WANAddresses = urlsToStrings(listener.WANAddresses())
result[addr] = status
}
s.listenersMut.RUnlock()
return result
}
type connectionStatusHandler struct {
connectionStatusMut sync.RWMutex
connectionStatus map[string]ConnectionStatusEntry // address -> latest error/status
}
func newConnectionStatusHandler() connectionStatusHandler {
return connectionStatusHandler{
connectionStatusMut: sync.NewRWMutex(),
connectionStatus: make(map[string]ConnectionStatusEntry),
}
}
func (s *connectionStatusHandler) ConnectionStatus() map[string]ConnectionStatusEntry {
result := make(map[string]ConnectionStatusEntry)
s.connectionStatusMut.RLock()
for k, v := range s.connectionStatus {
result[k] = v
}
s.connectionStatusMut.RUnlock()
return result
}
func (s *connectionStatusHandler) setConnectionStatus(address string, err error) {
if errors.Is(err, context.Canceled) {
return
}
status := ConnectionStatusEntry{When: time.Now().UTC().Truncate(time.Second)}
if err != nil {
errStr := err.Error()
status.Error = &errStr
}
s.connectionStatusMut.Lock()
s.connectionStatus[address] = status
s.connectionStatusMut.Unlock()
}
func (s *service) NATType() string {
s.listenersMut.RLock()
defer s.listenersMut.RUnlock()
for _, listener := range s.listeners {
natType := listener.NATType()
if natType != "unknown" {
return natType
}
}
return "unknown"
}
func getDialerFactory(cfg config.Configuration, uri *url.URL) (dialerFactory, error) {
dialerFactory, ok := dialers[uri.Scheme]
if !ok {
return nil, fmt.Errorf("unknown address scheme %q", uri.Scheme)
}
if err := dialerFactory.Valid(cfg); err != nil {
return nil, err
}
return dialerFactory, nil
}
func getListenerFactory(cfg config.Configuration, uri *url.URL) (listenerFactory, error) {
listenerFactory, ok := listeners[uri.Scheme]
if !ok {
return nil, fmt.Errorf("unknown address scheme %q", uri.Scheme)
}
if err := listenerFactory.Valid(cfg); err != nil {
return nil, err
}
return listenerFactory, nil
}
func urlsToStrings(urls []*url.URL) []string {
strings := make([]string, len(urls))
for i, url := range urls {
strings[i] = url.String()
}
return strings
}
var (
warningLimiters = make(map[protocol.DeviceID]*rate.Limiter)
warningLimitersMut = sync.NewMutex()
)
func warningFor(dev protocol.DeviceID, msg string) {
warningLimitersMut.Lock()
defer warningLimitersMut.Unlock()
lim, ok := warningLimiters[dev]
if !ok {
lim = rate.NewLimiter(rate.Every(perDeviceWarningIntv), 1)
warningLimiters[dev] = lim
}
if lim.Allow() {
l.Warnln(msg)
}
}
func tlsTimedHandshake(tc *tls.Conn) error {
tc.SetDeadline(time.Now().Add(tlsHandshakeTimeout))
defer tc.SetDeadline(time.Time{})
return tc.Handshake()
}
// IsAllowedNetwork returns true if the given host (IP or resolvable
// hostname) is in the set of allowed networks (CIDR format only).
func IsAllowedNetwork(host string, allowed []string) bool {
if hostNoPort, _, err := net.SplitHostPort(host); err == nil {
host = hostNoPort
}
addr, err := net.ResolveIPAddr("ip", host)
if err != nil {
return false
}
for _, n := range allowed {
result := true
if strings.HasPrefix(n, "!") {
result = false
n = n[1:]
}
_, cidr, err := net.ParseCIDR(n)
if err != nil {
continue
}
if cidr.Contains(addr.IP) {
return result
}
}
return false
}
func (s *service) dialParallel(ctx context.Context, deviceID protocol.DeviceID, dialTargets []dialTarget, parentSema *semaphore.Semaphore) (internalConn, bool) {
// Group targets into buckets by priority
dialTargetBuckets := make(map[int][]dialTarget, len(dialTargets))
for _, tgt := range dialTargets {
dialTargetBuckets[tgt.priority] = append(dialTargetBuckets[tgt.priority], tgt)
}
// Get all available priorities
priorities := make([]int, 0, len(dialTargetBuckets))
for prio := range dialTargetBuckets {
priorities = append(priorities, prio)
}
// Sort the priorities so that we dial lowest first (which means highest...)
sort.Ints(priorities)
sema := semaphore.MultiSemaphore{semaphore.New(dialMaxParallelPerDevice), parentSema}
for _, prio := range priorities {
tgts := dialTargetBuckets[prio]
res := make(chan internalConn, len(tgts))
wg := stdsync.WaitGroup{}
for _, tgt := range tgts {
sema.Take(1)
wg.Add(1)
go func(tgt dialTarget) {
defer func() {
wg.Done()
sema.Give(1)
}()
conn, err := tgt.Dial(ctx)
if err == nil {
// Closes the connection on error
err = s.validateIdentity(conn, deviceID)
}
s.setConnectionStatus(tgt.addr, err)
if err != nil {
l.Debugln("dialing", deviceID, tgt.uri, "error:", err)
} else {
l.Debugln("dialing", deviceID, tgt.uri, "success:", conn)
res <- conn
}
}(tgt)
}
// Spawn a routine which will unblock main routine in case we fail
// to connect to anyone.
go func() {
wg.Wait()
close(res)
}()
// Wait for the first connection, or for channel closure.
if conn, ok := <-res; ok {
// Got a connection, means more might come back, hence spawn a
// routine that will do the discarding.
l.Debugln("connected to", deviceID, prio, "using", conn, conn.priority)
go func(deviceID protocol.DeviceID, prio int) {
wg.Wait()
l.Debugln("discarding", len(res), "connections while connecting to", deviceID, prio)
for conn := range res {
conn.Close()
}
}(deviceID, prio)
return conn, ok
}
// Failed to connect, report that fact.
l.Debugln("failed to connect to", deviceID, prio)
}
return internalConn{}, false
}
func (s *service) validateIdentity(c internalConn, expectedID protocol.DeviceID) error {
cs := c.ConnectionState()
// We should have received exactly one certificate from the other
// side. If we didn't, they don't have a device ID and we drop the
// connection.
certs := cs.PeerCertificates
if cl := len(certs); cl != 1 {
l.Infof("Got peer certificate list of length %d != 1 from peer at %s; protocol error", cl, c)
c.Close()
return fmt.Errorf("expected 1 certificate, got %d", cl)
}
remoteCert := certs[0]
remoteID := protocol.NewDeviceID(remoteCert.Raw)
// The device ID should not be that of ourselves. It can happen
// though, especially in the presence of NAT hairpinning, multiple
// clients between the same NAT gateway, and global discovery.
if remoteID == s.myID {
l.Debugf("Connected to myself (%s) at %s", remoteID, c)
c.Close()
return errors.New("connected to self")
}
// We should see the expected device ID
if !remoteID.Equals(expectedID) {
c.Close()
return fmt.Errorf("unexpected device id, expected %s got %s", expectedID, remoteID)
}
return nil
}
type nextDialRegistry map[protocol.DeviceID]nextDialDevice
type nextDialDevice struct {
nextDial map[string]time.Time
coolDownIntervalStart time.Time
attempts int
}
func (r nextDialRegistry) get(device protocol.DeviceID, addr string) time.Time {
return r[device].nextDial[addr]
}
const (
dialCoolDownInterval = 2 * time.Minute
dialCoolDownDelay = 5 * time.Minute
dialCoolDownMaxAttempts = 3
)
// redialDevice marks the device for immediate redial, unless the remote keeps
// dropping established connections. Thus we keep track of when the first forced
// re-dial happened, and how many attempts happen in the dialCoolDownInterval
// after that. If it's more than dialCoolDownMaxAttempts, don't force-redial
// that device for dialCoolDownDelay (regular dials still happen).
func (r nextDialRegistry) redialDevice(device protocol.DeviceID, now time.Time) {
dev, ok := r[device]
if !ok {
r[device] = nextDialDevice{
nextDial: make(map[string]time.Time),
coolDownIntervalStart: now,
attempts: 1,
}
return
}
if dev.attempts == 0 || now.Before(dev.coolDownIntervalStart.Add(dialCoolDownInterval)) {
if dev.attempts >= dialCoolDownMaxAttempts {
// Device has been force redialed too often - let it cool down.
return
}
if dev.attempts == 0 {
dev.coolDownIntervalStart = now
}
dev.attempts++
dev.nextDial = make(map[string]time.Time)
return
}
if dev.attempts >= dialCoolDownMaxAttempts && now.Before(dev.coolDownIntervalStart.Add(dialCoolDownDelay)) {
return // Still cooling down
}
delete(r, device)
}
func (r nextDialRegistry) set(device protocol.DeviceID, addr string, next time.Time) {
if _, ok := r[device]; !ok {
r[device] = nextDialDevice{nextDial: make(map[string]time.Time)}
}
r[device].nextDial[addr] = next
}
func (r nextDialRegistry) sleepDurationAndCleanup(now time.Time) time.Duration {
sleep := stdConnectionLoopSleep
for id, dev := range r {
for address, next := range dev.nextDial {
if next.Before(now) {
// Expired entry, address was not seen in last pass(es)
delete(dev.nextDial, address)
continue
}
if cur := next.Sub(now); cur < sleep {
sleep = cur
}
}
if dev.attempts > 0 {
interval := dialCoolDownInterval
if dev.attempts >= dialCoolDownMaxAttempts {
interval = dialCoolDownDelay
}
if now.After(dev.coolDownIntervalStart.Add(interval)) {
dev.attempts = 0
}
}
if len(dev.nextDial) == 0 && dev.attempts == 0 {
delete(r, id)
}
}
return sleep
}
func (s *service) desiredConnectionsToDevice(deviceID protocol.DeviceID) int {
cfg, ok := s.cfg.Device(deviceID)
if !ok {
// We want no connections to an unknown device.
return 0
}
otherSide := s.wantConnectionsForDevice(deviceID)
thisSide := cfg.NumConnections()
switch {
case otherSide <= 0:
// The other side doesn't support multiple connections, or we
// haven't yet connected to them so we don't know what they support
// or not. Use a single connection until we know better.
return 1
case otherSide == 1:
// The other side supports multiple connections, but only wants
// one. We should honour that.
return 1
case thisSide == 1:
// We want only one connection, so we should honour that.
return 1
// Finally, we allow negotiation and use the higher of the two values,
// while keeping at or below the max allowed value.
default:
return min(max(thisSide, otherSide), maxNumConnections)
}
}
// The deviceConnectionTracker keeps track of how many devices we are
// connected to and how many connections we have to each device. It also
// tracks how many connections they are willing to use.
type deviceConnectionTracker struct {
connectionsMut stdsync.Mutex
connections map[protocol.DeviceID][]protocol.Connection // current connections
wantConnections map[protocol.DeviceID]int // number of connections they want
}
func (c *deviceConnectionTracker) accountAddedConnection(conn protocol.Connection, h protocol.Hello, upgradeThreshold int) {
c.connectionsMut.Lock()
defer c.connectionsMut.Unlock()
// Lazily initialize the maps
if c.connections == nil {
c.connections = make(map[protocol.DeviceID][]protocol.Connection)
c.wantConnections = make(map[protocol.DeviceID]int)
}
// Add the connection to the list of current connections and remember
// how many total connections they want
d := conn.DeviceID()
c.connections[d] = append(c.connections[d], conn)
c.wantConnections[d] = int(h.NumConnections)
l.Debugf("Added connection for %s (now %d), they want %d connections", d.Short(), len(c.connections[d]), h.NumConnections)
// Update active connections metric
metricDeviceActiveConnections.WithLabelValues(d.String()).Inc()
// Close any connections we no longer want to retain.
c.closeWorsePriorityConnectionsLocked(d, conn.Priority()-upgradeThreshold)
}
func (c *deviceConnectionTracker) accountRemovedConnection(conn protocol.Connection) {
c.connectionsMut.Lock()
defer c.connectionsMut.Unlock()
d := conn.DeviceID()
cid := conn.ConnectionID()
// Remove the connection from the list of current connections
for i, conn := range c.connections[d] {
if conn.ConnectionID() == cid {
c.connections[d] = sliceutil.RemoveAndZero(c.connections[d], i)
break
}
}
// Clean up if required
if len(c.connections[d]) == 0 {
delete(c.connections, d)
delete(c.wantConnections, d)
}
// Update active connections metric
metricDeviceActiveConnections.WithLabelValues(d.String()).Dec()
l.Debugf("Removed connection for %s (now %d)", d.Short(), c.connections[d])
}
func (c *deviceConnectionTracker) numConnectionsForDevice(d protocol.DeviceID) int {
c.connectionsMut.Lock()
defer c.connectionsMut.Unlock()
return len(c.connections[d])
}
func (c *deviceConnectionTracker) wantConnectionsForDevice(d protocol.DeviceID) int {
c.connectionsMut.Lock()
defer c.connectionsMut.Unlock()
return c.wantConnections[d]
}
func (c *deviceConnectionTracker) numConnectedDevices() int {
c.connectionsMut.Lock()
defer c.connectionsMut.Unlock()
return len(c.connections)
}
func (c *deviceConnectionTracker) worstConnectionPriority(d protocol.DeviceID) int {
c.connectionsMut.Lock()
defer c.connectionsMut.Unlock()
if len(c.connections[d]) == 0 {
return math.MaxInt // worst possible priority
}
worstPriority := c.connections[d][0].Priority()
for _, conn := range c.connections[d][1:] {
if p := conn.Priority(); p > worstPriority {
worstPriority = p
}
}
return worstPriority
}
// closeWorsePriorityConnectionsLocked closes all connections to the given
// device that are worse than the cutoff priority. Must be called with the
// lock held.
func (c *deviceConnectionTracker) closeWorsePriorityConnectionsLocked(d protocol.DeviceID, cutoff int) {
for _, conn := range c.connections[d] {
if p := conn.Priority(); p > cutoff {
l.Debugf("Closing connection %s to %s with priority %d (cutoff %d)", conn, d.Short(), p, cutoff)
go conn.Close(errReplacingConnection)
}
}
}
// newConnectionID generates a connection ID. The connection ID is designed
// to be unique for each connection and chronologically sortable. It is
// based on the sum of two timestamps: when we think the connection was
// started, and when the other side thinks the connection was started. We
// then add some random data for good measure. This way, even if the other
// side does some funny business with the timestamp, we will get no worse
// than random connection IDs.
func newConnectionID(t0, t1 int64) string {
var buf [16]byte // 8 bytes timestamp, 8 bytes random
binary.BigEndian.PutUint64(buf[:], uint64(t0+t1))
_, _ = io.ReadFull(rand.Reader, buf[8:])
enc := base32.HexEncoding.WithPadding(base32.NoPadding)
// We encode the two parts separately and concatenate the results. The
// reason for this is that the timestamp (64 bits) doesn't precisely
// align to the base32 encoding (5 bits per character), so we'd get a
// character in the middle that is a mix of bits from the timestamp and
// from the random. We want the timestamp part deterministic.
return enc.EncodeToString(buf[:8]) + enc.EncodeToString(buf[8:])
}