Handle relay connect timeout (fixes #8749) (#8755)

This makes sure the service manager doesn't interpret timeout errors, or any other error, as a signal to stop the service instead of restarting it.

I added it directly to our service utility function, as it may help catch other instances of the same problem. We would typically want timeouts etc. to be retryable errors, unless it is the top-level context that has timed out, and we check for that specifically.
This commit is contained in:
Jakob Borg 2023-01-19 11:15:18 +01:00 committed by GitHub
parent 5f1e27bb7f
commit abdac2caa2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -122,7 +122,12 @@ func (s *service) Serve(ctx context.Context) error {
s.err = nil
s.mut.Unlock()
err := s.serve(ctx)
// The error returned by serve() may well be a network timeout, which as
// of Go 1.19 is a context.DeadlineExceeded, which Suture interprets as
// a signal to stop the service instead of restarting it. This typically
// isn't what we want, so we make sure to remove the context specific
// error types unless *our* context is actually cancelled.
err := asNonContextError(ctx, s.serve(ctx))
s.mut.Lock()
s.err = err
@ -139,7 +144,6 @@ func (s *service) Error() error {
// String returns a human-readable description of the service, built from its
// pointer address and its creator.
func (s *service) String() string {
	const layout = "Service@%p created by %v"
	return fmt.Sprintf(layout, s, s.creator)
}
// doneService is a function type that takes no arguments and returns nothing.
// NOTE(review): its methods and call sites are outside this excerpt — confirm
// how it is used before relying on this description.
type doneService func()
@ -203,3 +207,19 @@ func infoEventHook(l logger.Logger) suture.EventHook {
}
}
}
// asNonContextError strips the context-specific identity from err so that
// callers matching on context.Canceled or context.DeadlineExceeded no longer
// see it as such.
//
// If ctx itself is done, ctx.Err() is returned and err is ignored. Otherwise,
// if err is (or wraps) context.Canceled or context.DeadlineExceeded, a new
// plain error carrying the same message with a " (non-context)" suffix is
// returned. Any other err, including nil, is passed through untouched.
func asNonContextError(ctx context.Context, err error) error {
	// Cancellation of our own context always takes precedence.
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}

	// Replace context sentinel errors with a plain error that only keeps the
	// message, so errors.Is no longer matches the sentinel.
	for _, sentinel := range []error{context.Canceled, context.DeadlineExceeded} {
		if errors.Is(err, sentinel) {
			return fmt.Errorf("%s (non-context)", err.Error())
		}
	}

	return err
}