2
2
mirror of https://github.com/octoleo/restic.git synced 2024-06-01 08:30:49 +00:00
restic/src/restic/backend/rest/rest.go

285 lines
5.8 KiB
Go
Raw Normal View History

package rest
import (
"bytes"
"encoding/json"
"fmt"
"io"
Fix REST backend HTTP keepalive This is subtle. A combination of a fast client disk (read: SSD) with lots of files and a fast network connection to restic-server would suddenly start getting lots of "dial tcp: connect: cannot assign requested address" errors during the backup stage. Further inspection revealed that the client machine was plagued with TCP sockets in the TIME_WAIT state. When the ephemeral port range was finally exhausted, no more sockets could be opened, so restic would freak out. To understand the magnitude of this problem, with ~18k ports and a default timeout of 60 seconds, it means more than 300 HTTP connections per second were created and torn down. Yeah, restic-server is that fast. :) As it turns out, this behavior was the product of 2 subtle issues: 1) The body of the HTTP response wasn't read completely with io.ReadFull() at the end of the Load() function. This deactivated HTTP keepalive, so already open connections were not reused, but closed instead, and new ones opened for every new request. io.Copy(ioutil.Discard, resp.Body) before resp.Body.Close() remedies this. 2) Even with the above fix, somehow having MaxIdleConnsPerHost at its default value of 2 wasn't enough to stop reconnecting. It is hard to understand why this would be so detrimental, it could even be some subtle Go runtime bug. Anyhow, setting this value to match the connection limit, as set by the connLimit global variable, finally nails this ugly bug. I fixed several other places where the response body wasn't read in full (or at all). For example, json.NewDecoder() is also known not to read the whole body of the response. Unfortunately, this is not over yet. :( The check command is firing up to 40 simultaneous connections to the restic-server. Then, once again, MaxIdleConnsPerHost is too low to support keepalive, and sockets in the TIME_WAIT state pile up. But, as this kind of concurrency absolutely kills the poor disk on the server side, this is a completely different bug then.
2016-11-09 21:37:20 +00:00
"io/ioutil"
"net/http"
"net/url"
"path"
2016-08-31 20:39:36 +00:00
"restic"
2016-02-21 15:35:25 +00:00
"strings"
2016-09-01 20:17:37 +00:00
"restic/errors"
"restic/backend"
)
// connLimit is used by Open in three places: as the capacity of the backend's
// token channel, as the number of tokens put into it, and as the transport's
// MaxIdleConnsPerHost.
const connLimit = 10
// restPath returns the path to the given resource.
2016-08-31 20:39:36 +00:00
func restPath(url *url.URL, h restic.Handle) string {
2016-02-21 15:35:25 +00:00
u := *url
var dir string
2016-09-01 19:19:30 +00:00
switch h.Type {
2016-08-31 20:39:36 +00:00
case restic.ConfigFile:
dir = ""
2016-02-21 15:35:25 +00:00
h.Name = "config"
2016-08-31 20:39:36 +00:00
case restic.DataFile:
dir = backend.Paths.Data
2016-08-31 20:39:36 +00:00
case restic.SnapshotFile:
dir = backend.Paths.Snapshots
2016-08-31 20:39:36 +00:00
case restic.IndexFile:
dir = backend.Paths.Index
2016-08-31 20:39:36 +00:00
case restic.LockFile:
dir = backend.Paths.Locks
2016-08-31 20:39:36 +00:00
case restic.KeyFile:
dir = backend.Paths.Keys
default:
2016-09-01 19:19:30 +00:00
dir = string(h.Type)
}
2016-02-21 15:35:25 +00:00
u.Path = path.Join(url.Path, dir, h.Name)
return u.String()
}
// restBackend is the backend implementation that talks to a REST server over
// HTTP.
type restBackend struct {
// url is the base URL of the server; resource paths are joined onto it.
url *url.URL
// connChan holds the request tokens. A method receives one token before
// it sends a request and sends it back afterwards.
connChan chan struct{}
// client is the HTTP client used for every request of this backend.
client http.Client
}
// Open opens the REST backend with the given config.
2016-08-31 20:51:35 +00:00
func Open(cfg Config) (restic.Backend, error) {
connChan := make(chan struct{}, connLimit)
for i := 0; i < connLimit; i++ {
connChan <- struct{}{}
}
Fix REST backend HTTP keepalive This is subtle. A combination od fast client disk (read: SSD) with lots of files and fast network connection to restic-server would suddenly start getting lots of "dial tcp: connect: cannot assign requested address" errors during backup stage. Further inspection revealed that client machine was plagued with TCP sockets in TIME_WAIT state. When ephemeral port range was finally exhausted, no more sockets could be opened, so restic would freak out. To understand the magnitude of this problem, with ~18k ports and default timeout of 60 seconds, it means more than 300 HTTP connections per seconds were created and teared down. Yeah, restic-server is that fast. :) As it turns out, this behavior was product of 2 subtle issues: 1) The body of HTTP response wasn't read completely with io.ReadFull() at the end of the Load() function. This deactivated HTTP keepalive, so already open connections were not reused, but closed instead, and new ones opened for every new request. io.Copy(ioutil.Discard, resp.Body) before resp.Body.Close() remedies this. 2) Even with the above fix, somehow having MaxIdleConnsPerHost at its default value of 2 wasn't enough to stop reconnecting. It is hard to understand why this would be so detrimental, it could even be some subtle Go runtime bug. Anyhow, setting this value to match the connection limit, as set by connLimit global variable, finally nails this ugly bug. I fixed several other places where the response body wasn't read in full (or at all). For example, json.NewDecoder() is also known not to read the whole body of response. Unfortunately, this is not over yet. :( The check command is firing up to 40 simultaneous connections to the restic-server. Then, once again, MaxIdleConnsPerHost is too low to support keepalive, and sockets in the TIME_WAIT state pile up. But, as this kind of concurrency absolutely kill the poor disk on the server side, this is a completely different bug then.
2016-11-09 21:37:20 +00:00
tr := &http.Transport{MaxIdleConnsPerHost: connLimit}
client := http.Client{Transport: tr}
return &restBackend{url: cfg.URL, connChan: connChan, client: client}, nil
}
// Location reports where this backend stores its data, which is the string
// form of the server's URL.
func (b *restBackend) Location() string {
	location := b.url.String()
	return location
}
// Load returns the data stored in the backend for h at the given offset
// and saves it in p. Load has the same semantics as io.ReaderAt.
2016-08-31 20:39:36 +00:00
func (b *restBackend) Load(h restic.Handle, p []byte, off int64) (n int, err error) {
if err := h.Valid(); err != nil {
return 0, err
}
2016-08-07 12:50:24 +00:00
// invert offset
if off < 0 {
info, err := b.Stat(h)
if err != nil {
2016-08-29 19:54:50 +00:00
return 0, errors.Wrap(err, "Stat")
2016-08-07 12:50:24 +00:00
}
if -off > info.Size {
off = 0
} else {
off = info.Size + off
2016-08-07 12:50:24 +00:00
}
}
req, err := http.NewRequest("GET", restPath(b.url, h), nil)
if err != nil {
2016-08-29 19:54:50 +00:00
return 0, errors.Wrap(err, "http.NewRequest")
}
req.Header.Add("Range", fmt.Sprintf("bytes=%d-%d", off, off+int64(len(p))))
<-b.connChan
resp, err := b.client.Do(req)
b.connChan <- struct{}{}
if resp != nil {
defer func() {
Fix REST backend HTTP keepalive This is subtle. A combination od fast client disk (read: SSD) with lots of files and fast network connection to restic-server would suddenly start getting lots of "dial tcp: connect: cannot assign requested address" errors during backup stage. Further inspection revealed that client machine was plagued with TCP sockets in TIME_WAIT state. When ephemeral port range was finally exhausted, no more sockets could be opened, so restic would freak out. To understand the magnitude of this problem, with ~18k ports and default timeout of 60 seconds, it means more than 300 HTTP connections per seconds were created and teared down. Yeah, restic-server is that fast. :) As it turns out, this behavior was product of 2 subtle issues: 1) The body of HTTP response wasn't read completely with io.ReadFull() at the end of the Load() function. This deactivated HTTP keepalive, so already open connections were not reused, but closed instead, and new ones opened for every new request. io.Copy(ioutil.Discard, resp.Body) before resp.Body.Close() remedies this. 2) Even with the above fix, somehow having MaxIdleConnsPerHost at its default value of 2 wasn't enough to stop reconnecting. It is hard to understand why this would be so detrimental, it could even be some subtle Go runtime bug. Anyhow, setting this value to match the connection limit, as set by connLimit global variable, finally nails this ugly bug. I fixed several other places where the response body wasn't read in full (or at all). For example, json.NewDecoder() is also known not to read the whole body of response. Unfortunately, this is not over yet. :( The check command is firing up to 40 simultaneous connections to the restic-server. Then, once again, MaxIdleConnsPerHost is too low to support keepalive, and sockets in the TIME_WAIT state pile up. But, as this kind of concurrency absolutely kill the poor disk on the server side, this is a completely different bug then.
2016-11-09 21:37:20 +00:00
io.Copy(ioutil.Discard, resp.Body)
e := resp.Body.Close()
if err == nil {
2016-08-29 19:54:50 +00:00
err = errors.Wrap(e, "Close")
}
}()
}
if err != nil {
2016-08-29 19:54:50 +00:00
return 0, errors.Wrap(err, "client.Do")
}
2016-02-21 15:35:25 +00:00
if resp.StatusCode != 200 && resp.StatusCode != 206 {
return 0, errors.Errorf("unexpected HTTP response code %v", resp.StatusCode)
}
return io.ReadFull(resp.Body, p)
}
// Save stores data in the backend at the handle.
2016-08-31 20:39:36 +00:00
func (b *restBackend) Save(h restic.Handle, p []byte) (err error) {
if err := h.Valid(); err != nil {
return err
}
<-b.connChan
resp, err := b.client.Post(restPath(b.url, h), "binary/octet-stream", bytes.NewReader(p))
b.connChan <- struct{}{}
if resp != nil {
defer func() {
Fix REST backend HTTP keepalive This is subtle. A combination od fast client disk (read: SSD) with lots of files and fast network connection to restic-server would suddenly start getting lots of "dial tcp: connect: cannot assign requested address" errors during backup stage. Further inspection revealed that client machine was plagued with TCP sockets in TIME_WAIT state. When ephemeral port range was finally exhausted, no more sockets could be opened, so restic would freak out. To understand the magnitude of this problem, with ~18k ports and default timeout of 60 seconds, it means more than 300 HTTP connections per seconds were created and teared down. Yeah, restic-server is that fast. :) As it turns out, this behavior was product of 2 subtle issues: 1) The body of HTTP response wasn't read completely with io.ReadFull() at the end of the Load() function. This deactivated HTTP keepalive, so already open connections were not reused, but closed instead, and new ones opened for every new request. io.Copy(ioutil.Discard, resp.Body) before resp.Body.Close() remedies this. 2) Even with the above fix, somehow having MaxIdleConnsPerHost at its default value of 2 wasn't enough to stop reconnecting. It is hard to understand why this would be so detrimental, it could even be some subtle Go runtime bug. Anyhow, setting this value to match the connection limit, as set by connLimit global variable, finally nails this ugly bug. I fixed several other places where the response body wasn't read in full (or at all). For example, json.NewDecoder() is also known not to read the whole body of response. Unfortunately, this is not over yet. :( The check command is firing up to 40 simultaneous connections to the restic-server. Then, once again, MaxIdleConnsPerHost is too low to support keepalive, and sockets in the TIME_WAIT state pile up. But, as this kind of concurrency absolutely kill the poor disk on the server side, this is a completely different bug then.
2016-11-09 21:37:20 +00:00
io.Copy(ioutil.Discard, resp.Body)
e := resp.Body.Close()
if err == nil {
2016-08-29 19:54:50 +00:00
err = errors.Wrap(e, "Close")
}
}()
}
if err != nil {
2016-08-29 19:54:50 +00:00
return errors.Wrap(err, "client.Post")
}
if resp.StatusCode != 200 {
return errors.Errorf("unexpected HTTP response code %v", resp.StatusCode)
}
return nil
}
// Stat returns information about a blob.
2016-08-31 20:51:35 +00:00
func (b *restBackend) Stat(h restic.Handle) (restic.FileInfo, error) {
if err := h.Valid(); err != nil {
2016-08-31 20:51:35 +00:00
return restic.FileInfo{}, err
}
<-b.connChan
resp, err := b.client.Head(restPath(b.url, h))
b.connChan <- struct{}{}
if err != nil {
2016-08-31 20:51:35 +00:00
return restic.FileInfo{}, errors.Wrap(err, "client.Head")
}
Fix REST backend HTTP keepalive This is subtle. A combination od fast client disk (read: SSD) with lots of files and fast network connection to restic-server would suddenly start getting lots of "dial tcp: connect: cannot assign requested address" errors during backup stage. Further inspection revealed that client machine was plagued with TCP sockets in TIME_WAIT state. When ephemeral port range was finally exhausted, no more sockets could be opened, so restic would freak out. To understand the magnitude of this problem, with ~18k ports and default timeout of 60 seconds, it means more than 300 HTTP connections per seconds were created and teared down. Yeah, restic-server is that fast. :) As it turns out, this behavior was product of 2 subtle issues: 1) The body of HTTP response wasn't read completely with io.ReadFull() at the end of the Load() function. This deactivated HTTP keepalive, so already open connections were not reused, but closed instead, and new ones opened for every new request. io.Copy(ioutil.Discard, resp.Body) before resp.Body.Close() remedies this. 2) Even with the above fix, somehow having MaxIdleConnsPerHost at its default value of 2 wasn't enough to stop reconnecting. It is hard to understand why this would be so detrimental, it could even be some subtle Go runtime bug. Anyhow, setting this value to match the connection limit, as set by connLimit global variable, finally nails this ugly bug. I fixed several other places where the response body wasn't read in full (or at all). For example, json.NewDecoder() is also known not to read the whole body of response. Unfortunately, this is not over yet. :( The check command is firing up to 40 simultaneous connections to the restic-server. Then, once again, MaxIdleConnsPerHost is too low to support keepalive, and sockets in the TIME_WAIT state pile up. But, as this kind of concurrency absolutely kill the poor disk on the server side, this is a completely different bug then.
2016-11-09 21:37:20 +00:00
io.Copy(ioutil.Discard, resp.Body)
if err = resp.Body.Close(); err != nil {
2016-08-31 20:51:35 +00:00
return restic.FileInfo{}, errors.Wrap(err, "Close")
}
if resp.StatusCode != 200 {
2016-08-31 20:51:35 +00:00
return restic.FileInfo{}, errors.Errorf("unexpected HTTP response code %v", resp.StatusCode)
}
if resp.ContentLength < 0 {
2016-08-31 20:51:35 +00:00
return restic.FileInfo{}, errors.New("negative content length")
}
2016-08-31 20:51:35 +00:00
bi := restic.FileInfo{
Size: resp.ContentLength,
}
return bi, nil
}
// Test returns true if a blob of the given type and name exists in the backend.
2016-08-31 20:39:36 +00:00
func (b *restBackend) Test(t restic.FileType, name string) (bool, error) {
2016-09-01 19:19:30 +00:00
_, err := b.Stat(restic.Handle{Type: t, Name: name})
if err != nil {
return false, nil
}
return true, nil
}
// Remove removes the blob with the given name and type.
2016-08-31 20:39:36 +00:00
func (b *restBackend) Remove(t restic.FileType, name string) error {
2016-09-01 19:19:30 +00:00
h := restic.Handle{Type: t, Name: name}
if err := h.Valid(); err != nil {
return err
}
req, err := http.NewRequest("DELETE", restPath(b.url, h), nil)
if err != nil {
2016-08-29 19:54:50 +00:00
return errors.Wrap(err, "http.NewRequest")
}
<-b.connChan
resp, err := b.client.Do(req)
b.connChan <- struct{}{}
if err != nil {
2016-08-29 19:54:50 +00:00
return errors.Wrap(err, "client.Do")
}
if resp.StatusCode != 200 {
return errors.New("blob not removed")
}
Fix REST backend HTTP keepalive This is subtle. A combination od fast client disk (read: SSD) with lots of files and fast network connection to restic-server would suddenly start getting lots of "dial tcp: connect: cannot assign requested address" errors during backup stage. Further inspection revealed that client machine was plagued with TCP sockets in TIME_WAIT state. When ephemeral port range was finally exhausted, no more sockets could be opened, so restic would freak out. To understand the magnitude of this problem, with ~18k ports and default timeout of 60 seconds, it means more than 300 HTTP connections per seconds were created and teared down. Yeah, restic-server is that fast. :) As it turns out, this behavior was product of 2 subtle issues: 1) The body of HTTP response wasn't read completely with io.ReadFull() at the end of the Load() function. This deactivated HTTP keepalive, so already open connections were not reused, but closed instead, and new ones opened for every new request. io.Copy(ioutil.Discard, resp.Body) before resp.Body.Close() remedies this. 2) Even with the above fix, somehow having MaxIdleConnsPerHost at its default value of 2 wasn't enough to stop reconnecting. It is hard to understand why this would be so detrimental, it could even be some subtle Go runtime bug. Anyhow, setting this value to match the connection limit, as set by connLimit global variable, finally nails this ugly bug. I fixed several other places where the response body wasn't read in full (or at all). For example, json.NewDecoder() is also known not to read the whole body of response. Unfortunately, this is not over yet. :( The check command is firing up to 40 simultaneous connections to the restic-server. Then, once again, MaxIdleConnsPerHost is too low to support keepalive, and sockets in the TIME_WAIT state pile up. But, as this kind of concurrency absolutely kill the poor disk on the server side, this is a completely different bug then.
2016-11-09 21:37:20 +00:00
io.Copy(ioutil.Discard, resp.Body)
return resp.Body.Close()
}
// List returns a channel that yields all names of blobs of type t. A
// goroutine is started for this. If the channel done is closed, sending
// stops.
2016-08-31 20:39:36 +00:00
func (b *restBackend) List(t restic.FileType, done <-chan struct{}) <-chan string {
ch := make(chan string)
2016-09-01 19:19:30 +00:00
url := restPath(b.url, restic.Handle{Type: t})
2016-02-21 15:35:25 +00:00
if !strings.HasSuffix(url, "/") {
url += "/"
}
<-b.connChan
resp, err := b.client.Get(url)
b.connChan <- struct{}{}
if resp != nil {
Fix REST backend HTTP keepalive This is subtle. A combination od fast client disk (read: SSD) with lots of files and fast network connection to restic-server would suddenly start getting lots of "dial tcp: connect: cannot assign requested address" errors during backup stage. Further inspection revealed that client machine was plagued with TCP sockets in TIME_WAIT state. When ephemeral port range was finally exhausted, no more sockets could be opened, so restic would freak out. To understand the magnitude of this problem, with ~18k ports and default timeout of 60 seconds, it means more than 300 HTTP connections per seconds were created and teared down. Yeah, restic-server is that fast. :) As it turns out, this behavior was product of 2 subtle issues: 1) The body of HTTP response wasn't read completely with io.ReadFull() at the end of the Load() function. This deactivated HTTP keepalive, so already open connections were not reused, but closed instead, and new ones opened for every new request. io.Copy(ioutil.Discard, resp.Body) before resp.Body.Close() remedies this. 2) Even with the above fix, somehow having MaxIdleConnsPerHost at its default value of 2 wasn't enough to stop reconnecting. It is hard to understand why this would be so detrimental, it could even be some subtle Go runtime bug. Anyhow, setting this value to match the connection limit, as set by connLimit global variable, finally nails this ugly bug. I fixed several other places where the response body wasn't read in full (or at all). For example, json.NewDecoder() is also known not to read the whole body of response. Unfortunately, this is not over yet. :( The check command is firing up to 40 simultaneous connections to the restic-server. Then, once again, MaxIdleConnsPerHost is too low to support keepalive, and sockets in the TIME_WAIT state pile up. But, as this kind of concurrency absolutely kill the poor disk on the server side, this is a completely different bug then.
2016-11-09 21:37:20 +00:00
defer func() {
io.Copy(ioutil.Discard, resp.Body)
e := resp.Body.Close()
if err == nil {
err = errors.Wrap(e, "Close")
}
}()
}
if err != nil {
close(ch)
return ch
}
dec := json.NewDecoder(resp.Body)
var list []string
if err = dec.Decode(&list); err != nil {
close(ch)
return ch
}
go func() {
defer close(ch)
for _, m := range list {
select {
case ch <- m:
case <-done:
return
}
}
}()
return ch
}
// Close does nothing and always returns nil.
func (b *restBackend) Close() error {
// Nothing to release here: the methods that open a response body are
// meant to close it themselves before they return.
return nil
}