Update goleveldb

This commit is contained in:
Jakob Borg 2014-11-18 16:24:42 +04:00
parent aa637fd942
commit 68399601ce
43 changed files with 2364 additions and 1181 deletions

11
Godeps/Godeps.json generated
View File

@ -25,11 +25,6 @@
"Comment": "null-90", "Comment": "null-90",
"Rev": "d65bffbc88a153d23a6d2a864531e6e7c2cde59b" "Rev": "d65bffbc88a153d23a6d2a864531e6e7c2cde59b"
}, },
{
"ImportPath": "code.google.com/p/snappy-go/snappy",
"Comment": "null-15",
"Rev": "12e4b4183793ac4b061921e7980845e750679fd0"
},
{ {
"ImportPath": "github.com/AudriusButkevicius/lfu-go", "ImportPath": "github.com/AudriusButkevicius/lfu-go",
"Rev": "164bcecceb92fd6037f4d18a8d97b495ec6ef669" "Rev": "164bcecceb92fd6037f4d18a8d97b495ec6ef669"
@ -56,7 +51,11 @@
}, },
{ {
"ImportPath": "github.com/syndtr/goleveldb/leveldb", "ImportPath": "github.com/syndtr/goleveldb/leveldb",
"Rev": "cd2b8f743192883ab9fbc5f070ebda1dc90f3732" "Rev": "d8d1d2a5cc2d34c950dffa2f554525415d59f737"
},
{
"ImportPath": "github.com/syndtr/gosnappy/snappy",
"Rev": "ce8acff4829e0c2458a67ead32390ac0a381c862"
}, },
{ {
"ImportPath": "github.com/vitrun/qart/coding", "ImportPath": "github.com/vitrun/qart/coding",

View File

@ -8,65 +8,84 @@ package leveldb
import ( import (
"encoding/binary" "encoding/binary"
"errors" "fmt"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/memdb" "github.com/syndtr/goleveldb/leveldb/memdb"
) )
var ( type ErrBatchCorrupted struct {
errBatchTooShort = errors.New("leveldb: batch is too short") Reason string
errBatchBadRecord = errors.New("leveldb: bad record in batch") }
func (e *ErrBatchCorrupted) Error() string {
return fmt.Sprintf("leveldb: batch corrupted: %s", e.Reason)
}
func newErrBatchCorrupted(reason string) error {
return errors.NewErrCorrupted(nil, &ErrBatchCorrupted{reason})
}
const (
batchHdrLen = 8 + 4
batchGrowRec = 3000
) )
const kBatchHdrLen = 8 + 4 type BatchReplay interface {
Put(key, value []byte)
type batchReplay interface { Delete(key []byte)
put(key, value []byte, seq uint64)
delete(key []byte, seq uint64)
} }
// Batch is a write batch. // Batch is a write batch.
type Batch struct { type Batch struct {
buf []byte data []byte
rLen, bLen int rLen, bLen int
seq uint64 seq uint64
sync bool sync bool
} }
func (b *Batch) grow(n int) { func (b *Batch) grow(n int) {
off := len(b.buf) off := len(b.data)
if off == 0 { if off == 0 {
// include headers off = batchHdrLen
off = kBatchHdrLen if b.data != nil {
n += off b.data = b.data[:off]
}
}
if cap(b.data)-off < n {
if b.data == nil {
b.data = make([]byte, off, off+n)
} else {
odata := b.data
div := 1
if b.rLen > batchGrowRec {
div = b.rLen / batchGrowRec
}
b.data = make([]byte, off, off+n+(off-batchHdrLen)/div)
copy(b.data, odata)
} }
if cap(b.buf)-off >= n {
return
} }
buf := make([]byte, 2*cap(b.buf)+n)
copy(buf, b.buf)
b.buf = buf[:off]
} }
func (b *Batch) appendRec(t vType, key, value []byte) { func (b *Batch) appendRec(kt kType, key, value []byte) {
n := 1 + binary.MaxVarintLen32 + len(key) n := 1 + binary.MaxVarintLen32 + len(key)
if t == tVal { if kt == ktVal {
n += binary.MaxVarintLen32 + len(value) n += binary.MaxVarintLen32 + len(value)
} }
b.grow(n) b.grow(n)
off := len(b.buf) off := len(b.data)
buf := b.buf[:off+n] data := b.data[:off+n]
buf[off] = byte(t) data[off] = byte(kt)
off += 1 off += 1
off += binary.PutUvarint(buf[off:], uint64(len(key))) off += binary.PutUvarint(data[off:], uint64(len(key)))
copy(buf[off:], key) copy(data[off:], key)
off += len(key) off += len(key)
if t == tVal { if kt == ktVal {
off += binary.PutUvarint(buf[off:], uint64(len(value))) off += binary.PutUvarint(data[off:], uint64(len(value)))
copy(buf[off:], value) copy(data[off:], value)
off += len(value) off += len(value)
} }
b.buf = buf[:off] b.data = data[:off]
b.rLen++ b.rLen++
// Include 8-byte ikey header // Include 8-byte ikey header
b.bLen += len(key) + len(value) + 8 b.bLen += len(key) + len(value) + 8
@ -75,18 +94,51 @@ func (b *Batch) appendRec(t vType, key, value []byte) {
// Put appends 'put operation' of the given key/value pair to the batch. // Put appends 'put operation' of the given key/value pair to the batch.
// It is safe to modify the contents of the argument after Put returns. // It is safe to modify the contents of the argument after Put returns.
func (b *Batch) Put(key, value []byte) { func (b *Batch) Put(key, value []byte) {
b.appendRec(tVal, key, value) b.appendRec(ktVal, key, value)
} }
// Delete appends 'delete operation' of the given key to the batch. // Delete appends 'delete operation' of the given key to the batch.
// It is safe to modify the contents of the argument after Delete returns. // It is safe to modify the contents of the argument after Delete returns.
func (b *Batch) Delete(key []byte) { func (b *Batch) Delete(key []byte) {
b.appendRec(tDel, key, nil) b.appendRec(ktDel, key, nil)
}
// Dump dumps batch contents. The returned slice can be loaded into the
// batch using Load method.
// The returned slice is not its own copy, so the contents should not be
// modified.
func (b *Batch) Dump() []byte {
return b.encode()
}
// Load loads given slice into the batch. Previous contents of the batch
// will be discarded.
// The given slice will not be copied and will be used as batch buffer, so
// it is not safe to modify the contents of the slice.
func (b *Batch) Load(data []byte) error {
return b.decode(0, data)
}
// Replay replays batch contents.
func (b *Batch) Replay(r BatchReplay) error {
return b.decodeRec(func(i int, kt kType, key, value []byte) {
switch kt {
case ktVal:
r.Put(key, value)
case ktDel:
r.Delete(key)
}
})
}
// Len returns number of records in the batch.
func (b *Batch) Len() int {
return b.rLen
} }
// Reset resets the batch. // Reset resets the batch.
func (b *Batch) Reset() { func (b *Batch) Reset() {
b.buf = nil b.data = b.data[:0]
b.seq = 0 b.seq = 0
b.rLen = 0 b.rLen = 0
b.bLen = 0 b.bLen = 0
@ -97,24 +149,10 @@ func (b *Batch) init(sync bool) {
b.sync = sync b.sync = sync
} }
func (b *Batch) put(key, value []byte, seq uint64) {
if b.rLen == 0 {
b.seq = seq
}
b.Put(key, value)
}
func (b *Batch) delete(key []byte, seq uint64) {
if b.rLen == 0 {
b.seq = seq
}
b.Delete(key)
}
func (b *Batch) append(p *Batch) { func (b *Batch) append(p *Batch) {
if p.rLen > 0 { if p.rLen > 0 {
b.grow(len(p.buf) - kBatchHdrLen) b.grow(len(p.data) - batchHdrLen)
b.buf = append(b.buf, p.buf[kBatchHdrLen:]...) b.data = append(b.data, p.data[batchHdrLen:]...)
b.rLen += p.rLen b.rLen += p.rLen
} }
if p.sync { if p.sync {
@ -122,95 +160,93 @@ func (b *Batch) append(p *Batch) {
} }
} }
func (b *Batch) len() int { // size returns sums of key/value pair length plus 8-bytes ikey.
return b.rLen
}
func (b *Batch) size() int { func (b *Batch) size() int {
return b.bLen return b.bLen
} }
func (b *Batch) encode() []byte { func (b *Batch) encode() []byte {
b.grow(0) b.grow(0)
binary.LittleEndian.PutUint64(b.buf, b.seq) binary.LittleEndian.PutUint64(b.data, b.seq)
binary.LittleEndian.PutUint32(b.buf[8:], uint32(b.rLen)) binary.LittleEndian.PutUint32(b.data[8:], uint32(b.rLen))
return b.buf return b.data
} }
func (b *Batch) decode(buf []byte) error { func (b *Batch) decode(prevSeq uint64, data []byte) error {
if len(buf) < kBatchHdrLen { if len(data) < batchHdrLen {
return errBatchTooShort return newErrBatchCorrupted("too short")
} }
b.seq = binary.LittleEndian.Uint64(buf) b.seq = binary.LittleEndian.Uint64(data)
b.rLen = int(binary.LittleEndian.Uint32(buf[8:])) if b.seq < prevSeq {
return newErrBatchCorrupted("invalid sequence number")
}
b.rLen = int(binary.LittleEndian.Uint32(data[8:]))
if b.rLen < 0 {
return newErrBatchCorrupted("invalid records length")
}
// No need to be precise at this point, it won't be used anyway // No need to be precise at this point, it won't be used anyway
b.bLen = len(buf) - kBatchHdrLen b.bLen = len(data) - batchHdrLen
b.buf = buf b.data = data
return nil return nil
} }
func (b *Batch) decodeRec(f func(i int, t vType, key, value []byte)) error { func (b *Batch) decodeRec(f func(i int, kt kType, key, value []byte)) (err error) {
off := kBatchHdrLen off := batchHdrLen
for i := 0; i < b.rLen; i++ { for i := 0; i < b.rLen; i++ {
if off >= len(b.buf) { if off >= len(b.data) {
return errors.New("leveldb: invalid batch record length") return newErrBatchCorrupted("invalid records length")
} }
t := vType(b.buf[off]) kt := kType(b.data[off])
if t > tVal { if kt > ktVal {
return errors.New("leveldb: invalid batch record type in batch") return newErrBatchCorrupted("bad record: invalid type")
} }
off += 1 off += 1
x, n := binary.Uvarint(b.buf[off:]) x, n := binary.Uvarint(b.data[off:])
off += n off += n
if n <= 0 || off+int(x) > len(b.buf) { if n <= 0 || off+int(x) > len(b.data) {
return errBatchBadRecord return newErrBatchCorrupted("bad record: invalid key length")
} }
key := b.buf[off : off+int(x)] key := b.data[off : off+int(x)]
off += int(x) off += int(x)
var value []byte var value []byte
if t == tVal { if kt == ktVal {
x, n := binary.Uvarint(b.buf[off:]) x, n := binary.Uvarint(b.data[off:])
off += n off += n
if n <= 0 || off+int(x) > len(b.buf) { if n <= 0 || off+int(x) > len(b.data) {
return errBatchBadRecord return newErrBatchCorrupted("bad record: invalid value length")
} }
value = b.buf[off : off+int(x)] value = b.data[off : off+int(x)]
off += int(x) off += int(x)
} }
f(i, t, key, value) f(i, kt, key, value)
} }
return nil return nil
} }
func (b *Batch) replay(to batchReplay) error {
return b.decodeRec(func(i int, t vType, key, value []byte) {
switch t {
case tVal:
to.put(key, value, b.seq+uint64(i))
case tDel:
to.delete(key, b.seq+uint64(i))
}
})
}
func (b *Batch) memReplay(to *memdb.DB) error { func (b *Batch) memReplay(to *memdb.DB) error {
return b.decodeRec(func(i int, t vType, key, value []byte) { return b.decodeRec(func(i int, kt kType, key, value []byte) {
ikey := newIKey(key, b.seq+uint64(i), t) ikey := newIkey(key, b.seq+uint64(i), kt)
to.Put(ikey, value) to.Put(ikey, value)
}) })
} }
func (b *Batch) memDecodeAndReplay(prevSeq uint64, data []byte, to *memdb.DB) error {
if err := b.decode(prevSeq, data); err != nil {
return err
}
return b.memReplay(to)
}
func (b *Batch) revertMemReplay(to *memdb.DB) error { func (b *Batch) revertMemReplay(to *memdb.DB) error {
return b.decodeRec(func(i int, t vType, key, value []byte) { return b.decodeRec(func(i int, kt kType, key, value []byte) {
ikey := newIKey(key, b.seq+uint64(i), t) ikey := newIkey(key, b.seq+uint64(i), kt)
to.Delete(ikey) to.Delete(ikey)
}) })
} }

View File

@ -15,7 +15,7 @@ import (
) )
type tbRec struct { type tbRec struct {
t vType kt kType
key, value []byte key, value []byte
} }
@ -23,39 +23,39 @@ type testBatch struct {
rec []*tbRec rec []*tbRec
} }
func (p *testBatch) put(key, value []byte, seq uint64) { func (p *testBatch) Put(key, value []byte) {
p.rec = append(p.rec, &tbRec{tVal, key, value}) p.rec = append(p.rec, &tbRec{ktVal, key, value})
} }
func (p *testBatch) delete(key []byte, seq uint64) { func (p *testBatch) Delete(key []byte) {
p.rec = append(p.rec, &tbRec{tDel, key, nil}) p.rec = append(p.rec, &tbRec{ktDel, key, nil})
} }
func compareBatch(t *testing.T, b1, b2 *Batch) { func compareBatch(t *testing.T, b1, b2 *Batch) {
if b1.seq != b2.seq { if b1.seq != b2.seq {
t.Errorf("invalid seq number want %d, got %d", b1.seq, b2.seq) t.Errorf("invalid seq number want %d, got %d", b1.seq, b2.seq)
} }
if b1.len() != b2.len() { if b1.Len() != b2.Len() {
t.Fatalf("invalid record length want %d, got %d", b1.len(), b2.len()) t.Fatalf("invalid record length want %d, got %d", b1.Len(), b2.Len())
} }
p1, p2 := new(testBatch), new(testBatch) p1, p2 := new(testBatch), new(testBatch)
err := b1.replay(p1) err := b1.Replay(p1)
if err != nil { if err != nil {
t.Fatal("error when replaying batch 1: ", err) t.Fatal("error when replaying batch 1: ", err)
} }
err = b2.replay(p2) err = b2.Replay(p2)
if err != nil { if err != nil {
t.Fatal("error when replaying batch 2: ", err) t.Fatal("error when replaying batch 2: ", err)
} }
for i := range p1.rec { for i := range p1.rec {
r1, r2 := p1.rec[i], p2.rec[i] r1, r2 := p1.rec[i], p2.rec[i]
if r1.t != r2.t { if r1.kt != r2.kt {
t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.t, r2.t) t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.kt, r2.kt)
} }
if !bytes.Equal(r1.key, r2.key) { if !bytes.Equal(r1.key, r2.key) {
t.Errorf("invalid key on record '%d' want %s, got %s", i, string(r1.key), string(r2.key)) t.Errorf("invalid key on record '%d' want %s, got %s", i, string(r1.key), string(r2.key))
} }
if r1.t == tVal { if r1.kt == ktVal {
if !bytes.Equal(r1.value, r2.value) { if !bytes.Equal(r1.value, r2.value) {
t.Errorf("invalid value on record '%d' want %s, got %s", i, string(r1.value), string(r2.value)) t.Errorf("invalid value on record '%d' want %s, got %s", i, string(r1.value), string(r2.value))
} }
@ -75,7 +75,7 @@ func TestBatch_EncodeDecode(t *testing.T) {
b1.Delete([]byte("k")) b1.Delete([]byte("k"))
buf := b1.encode() buf := b1.encode()
b2 := new(Batch) b2 := new(Batch)
err := b2.decode(buf) err := b2.decode(0, buf)
if err != nil { if err != nil {
t.Error("error when decoding batch: ", err) t.Error("error when decoding batch: ", err)
} }

View File

@ -249,7 +249,7 @@ func (x *testingCacheObject) Release() {
x.releaseCalled = true x.releaseCalled = true
x.cnt.releaseOne() x.cnt.releaseOne()
} else { } else {
x.t.Errorf("duplicate setfin NS#%d KEY#%s", x.ns, x.key) x.t.Errorf("duplicate setfin NS#%d KEY#%d", x.ns, x.key)
} }
} }
@ -489,7 +489,7 @@ func TestLRUCache_Finalizer(t *testing.T) {
return true return true
} else { } else {
if p.delfinCalled != keymax { if p.delfinCalled != keymax {
t.Errorf("(2) #%d not all delete fin called, diff=%d", p.ns, keymax-p.delfinCalled) t.Errorf("(2) NS#%d not all delete fin called, diff=%d", p.nsid, keymax-p.delfinCalled)
} }
return false return false
} }

View File

@ -1,40 +0,0 @@
// Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package leveldb
const (
kNumLevels = 7
// Level-0 compaction is started when we hit this many files.
kL0_CompactionTrigger float64 = 4
// Soft limit on number of level-0 files. We slow down writes at this point.
kL0_SlowdownWritesTrigger = 8
// Maximum number of level-0 files. We stop writes at this point.
kL0_StopWritesTrigger = 12
// Maximum level to which a new compacted memdb is pushed if it
// does not create overlap. We try to push to level 2 to avoid the
// relatively expensive level 0=>1 compactions and to avoid some
// expensive manifest file operations. We do not push all the way to
// the largest level since that can generate a lot of wasted disk
// space if the same key space is being repeatedly overwritten.
kMaxMemCompactLevel = 2
// Maximum size of a table.
kMaxTableSize = 2 * 1048576
// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
// stop building a single file in a level->level+1 compaction.
kMaxGrandParentOverlapBytes = 10 * kMaxTableSize
// Maximum number of bytes in all compacted files. We avoid expanding
// the lower level file set of a compaction if it would make the
// total compaction cover more than this many bytes.
kExpCompactionMaxBytes = 25 * kMaxTableSize
)

View File

@ -8,7 +8,6 @@ package leveldb
import ( import (
"container/list" "container/list"
"errors"
"fmt" "fmt"
"io" "io"
"os" "os"
@ -18,6 +17,7 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/journal" "github.com/syndtr/goleveldb/leveldb/journal"
"github.com/syndtr/goleveldb/leveldb/memdb" "github.com/syndtr/goleveldb/leveldb/memdb"
@ -57,18 +57,19 @@ type DB struct {
writeMergedC chan bool writeMergedC chan bool
writeLockC chan struct{} writeLockC chan struct{}
writeAckC chan error writeAckC chan error
writeDelay time.Duration
writeDelayN int
journalC chan *Batch journalC chan *Batch
journalAckC chan error journalAckC chan error
// Compaction. // Compaction.
tcompCmdC chan cCmd tcompCmdC chan cCmd
tcompPauseC chan chan<- struct{} tcompPauseC chan chan<- struct{}
tcompTriggerC chan struct{}
mcompCmdC chan cCmd mcompCmdC chan cCmd
mcompTriggerC chan struct{}
compErrC chan error compErrC chan error
compPerErrC chan error
compErrSetC chan error compErrSetC chan error
compStats [kNumLevels]cStats compStats []cStats
// Close. // Close.
closeW sync.WaitGroup closeW sync.WaitGroup
@ -83,7 +84,7 @@ func openDB(s *session) (*DB, error) {
db := &DB{ db := &DB{
s: s, s: s,
// Initial sequence // Initial sequence
seq: s.stSeq, seq: s.stSeqNum,
// MemDB // MemDB
memPool: make(chan *memdb.DB, 1), memPool: make(chan *memdb.DB, 1),
// Snapshot // Snapshot
@ -98,11 +99,11 @@ func openDB(s *session) (*DB, error) {
// Compaction // Compaction
tcompCmdC: make(chan cCmd), tcompCmdC: make(chan cCmd),
tcompPauseC: make(chan chan<- struct{}), tcompPauseC: make(chan chan<- struct{}),
tcompTriggerC: make(chan struct{}, 1),
mcompCmdC: make(chan cCmd), mcompCmdC: make(chan cCmd),
mcompTriggerC: make(chan struct{}, 1),
compErrC: make(chan error), compErrC: make(chan error),
compPerErrC: make(chan error),
compErrSetC: make(chan error), compErrSetC: make(chan error),
compStats: make([]cStats, s.o.GetNumLevel()),
// Close // Close
closeC: make(chan struct{}), closeC: make(chan struct{}),
} }
@ -121,14 +122,14 @@ func openDB(s *session) (*DB, error) {
return nil, err return nil, err
} }
// Don't include compaction error goroutine into wait group. // Doesn't need to be included in the wait group.
go db.compactionError() go db.compactionError()
go db.mpoolDrain()
db.closeW.Add(3) db.closeW.Add(3)
go db.tCompaction() go db.tCompaction()
go db.mCompaction() go db.mCompaction()
go db.jWriter() go db.jWriter()
go db.mpoolDrain()
s.logf("db@open done T·%v", time.Since(start)) s.logf("db@open done T·%v", time.Since(start))
@ -255,6 +256,10 @@ func RecoverFile(path string, o *opt.Options) (db *DB, err error) {
} }
func recoverTable(s *session, o *opt.Options) error { func recoverTable(s *session, o *opt.Options) error {
o = dupOptions(o)
// Mask StrictReader, lets StrictRecovery doing its job.
o.Strict &= ^opt.StrictReader
// Get all tables and sort it by file number. // Get all tables and sort it by file number.
tableFiles_, err := s.getFiles(storage.TypeTable) tableFiles_, err := s.getFiles(storage.TypeTable)
if err != nil { if err != nil {
@ -263,10 +268,16 @@ func recoverTable(s *session, o *opt.Options) error {
tableFiles := files(tableFiles_) tableFiles := files(tableFiles_)
tableFiles.sort() tableFiles.sort()
var mSeq uint64 var (
var good, corrupted int mSeq uint64
rec := new(sessionRecord) recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int
bpool := util.NewBufferPool(o.GetBlockSize() + 5)
// We will drop corrupted table.
strict = o.GetStrict(opt.StrictRecovery)
rec = &sessionRecord{numLevel: o.GetNumLevel()}
bpool = util.NewBufferPool(o.GetBlockSize() + 5)
)
buildTable := func(iter iterator.Iterator) (tmp storage.File, size int64, err error) { buildTable := func(iter iterator.Iterator) (tmp storage.File, size int64, err error) {
tmp = s.newTemp() tmp = s.newTemp()
writer, err := tmp.Create() writer, err := tmp.Create()
@ -321,25 +332,32 @@ func recoverTable(s *session, o *opt.Options) error {
return err return err
} }
var tSeq uint64 var (
var tgood, tcorrupted, blockerr int tSeq uint64
var imin, imax []byte tgoodKey, tcorruptedKey, tcorruptedBlock int
tr := table.NewReader(reader, size, nil, bpool, o) imin, imax []byte
)
tr, err := table.NewReader(reader, size, storage.NewFileInfo(file), nil, bpool, o)
if err != nil {
return err
}
iter := tr.NewIterator(nil, nil) iter := tr.NewIterator(nil, nil)
iter.(iterator.ErrorCallbackSetter).SetErrorCallback(func(err error) { iter.(iterator.ErrorCallbackSetter).SetErrorCallback(func(err error) {
s.logf("table@recovery found error @%d %q", file.Num(), err) if errors.IsCorrupted(err) {
blockerr++ s.logf("table@recovery block corruption @%d %q", file.Num(), err)
tcorruptedBlock++
}
}) })
// Scan the table. // Scan the table.
for iter.Next() { for iter.Next() {
key := iter.Key() key := iter.Key()
_, seq, _, ok := parseIkey(key) _, seq, _, kerr := parseIkey(key)
if !ok { if kerr != nil {
tcorrupted++ tcorruptedKey++
continue continue
} }
tgood++ tgoodKey++
if seq > tSeq { if seq > tSeq {
tSeq = seq tSeq = seq
} }
@ -354,8 +372,18 @@ func recoverTable(s *session, o *opt.Options) error {
} }
iter.Release() iter.Release()
if tgood > 0 { goodKey += tgoodKey
if tcorrupted > 0 || blockerr > 0 { corruptedKey += tcorruptedKey
corruptedBlock += tcorruptedBlock
if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) {
droppedTable++
s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
return nil
}
if tgoodKey > 0 {
if tcorruptedKey > 0 || tcorruptedBlock > 0 {
// Rebuild the table. // Rebuild the table.
s.logf("table@recovery rebuilding @%d", file.Num()) s.logf("table@recovery rebuilding @%d", file.Num())
iter := tr.NewIterator(nil, nil) iter := tr.NewIterator(nil, nil)
@ -373,16 +401,15 @@ func recoverTable(s *session, o *opt.Options) error {
if tSeq > mSeq { if tSeq > mSeq {
mSeq = tSeq mSeq = tSeq
} }
recoveredKey += tgoodKey
// Add table to level 0. // Add table to level 0.
rec.addTable(0, file.Num(), uint64(size), imin, imax) rec.addTable(0, file.Num(), uint64(size), imin, imax)
s.logf("table@recovery recovered @%d N·%d C·%d B·%d S·%d Q·%d", file.Num(), tgood, tcorrupted, blockerr, size, tSeq) s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
} else { } else {
s.logf("table@recovery unrecoverable @%d C·%d B·%d S·%d", file.Num(), tcorrupted, blockerr, size) droppedTable++
s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", file.Num(), tcorruptedKey, tcorruptedBlock, size)
} }
good += tgood
corrupted += tcorrupted
return nil return nil
} }
@ -399,11 +426,11 @@ func recoverTable(s *session, o *opt.Options) error {
} }
} }
s.logf("table@recovery recovered F·%d N·%d C·%d Q·%d", len(tableFiles), good, corrupted, mSeq) s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(tableFiles), recoveredKey, goodKey, corruptedKey, mSeq)
} }
// Set sequence number. // Set sequence number.
rec.setSeq(mSeq + 1) rec.setSeqNum(mSeq + 1)
// Create new manifest. // Create new manifest.
if err := s.create(); err != nil { if err := s.create(); err != nil {
@ -486,26 +513,30 @@ func (db *DB) recoverJournal() error {
if err == io.EOF { if err == io.EOF {
break break
} }
return err return errors.SetFile(err, file)
} }
buf.Reset() buf.Reset()
if _, err := buf.ReadFrom(r); err != nil { if _, err := buf.ReadFrom(r); err != nil {
if err == io.ErrUnexpectedEOF { if err == io.ErrUnexpectedEOF {
// This is error returned due to corruption, with strict == false.
continue continue
} else { } else {
return err return errors.SetFile(err, file)
} }
} }
if err := batch.decode(buf.Bytes()); err != nil { if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mem); err != nil {
return err if strict || !errors.IsCorrupted(err) {
return errors.SetFile(err, file)
} else {
db.s.logf("journal error: %v (skipped)", err)
// We won't apply sequence number as it might be corrupted.
continue
} }
if err := batch.memReplay(mem); err != nil {
return err
} }
// Save sequence number. // Save sequence number.
db.seq = batch.seq + uint64(batch.len()) db.seq = batch.seq + uint64(batch.Len())
// Flush it if large enough. // Flush it if large enough.
if mem.Size() >= writeBuffer { if mem.Size() >= writeBuffer {
@ -566,7 +597,7 @@ func (db *DB) recoverJournal() error {
} }
func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) { func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) {
ikey := newIKey(key, seq, tSeek) ikey := newIkey(key, seq, ktSeek)
em, fm := db.getMems() em, fm := db.getMems()
for _, m := range [...]*memDB{em, fm} { for _, m := range [...]*memDB{em, fm} {
@ -577,9 +608,13 @@ func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, er
mk, mv, me := m.mdb.Find(ikey) mk, mv, me := m.mdb.Find(ikey)
if me == nil { if me == nil {
ukey, _, t, ok := parseIkey(mk) ukey, _, kt, kerr := parseIkey(mk)
if ok && db.s.icmp.uCompare(ukey, key) == 0 { if kerr != nil {
if t == tDel { // Shouldn't have had happen.
panic(kerr)
}
if db.s.icmp.uCompare(ukey, key) == 0 {
if kt == ktDel {
return nil, ErrNotFound return nil, ErrNotFound
} }
return append([]byte{}, mv...), nil return append([]byte{}, mv...), nil
@ -594,7 +629,7 @@ func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, er
v.release() v.release()
if cSched { if cSched {
// Trigger table compaction. // Trigger table compaction.
db.compTrigger(db.tcompTriggerC) db.compSendTrigger(db.tcompCmdC)
} }
return return
} }
@ -697,7 +732,7 @@ func (db *DB) GetProperty(name string) (value string, err error) {
var level uint var level uint
var rest string var rest string
n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest) n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest)
if n != 1 || level >= kNumLevels { if n != 1 || int(level) >= db.s.o.GetNumLevel() {
err = errors.New("leveldb: GetProperty: invalid property: " + name) err = errors.New("leveldb: GetProperty: invalid property: " + name)
} else { } else {
value = fmt.Sprint(v.tLen(int(level))) value = fmt.Sprint(v.tLen(int(level)))
@ -759,8 +794,8 @@ func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) {
sizes := make(Sizes, 0, len(ranges)) sizes := make(Sizes, 0, len(ranges))
for _, r := range ranges { for _, r := range ranges {
imin := newIKey(r.Start, kMaxSeq, tSeek) imin := newIkey(r.Start, kMaxSeq, ktSeek)
imax := newIKey(r.Limit, kMaxSeq, tSeek) imax := newIkey(r.Limit, kMaxSeq, ktSeek)
start, err := v.offsetOf(imin) start, err := v.offsetOf(imin)
if err != nil { if err != nil {
return nil, err return nil, err
@ -816,6 +851,10 @@ func (db *DB) Close() error {
db.journalWriter.Close() db.journalWriter.Close()
} }
if db.writeDelayN > 0 {
db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
}
// Close session. // Close session.
db.s.close() db.s.close()
db.logf("db@close done T·%v", time.Since(start)) db.logf("db@close done T·%v", time.Since(start))

View File

@ -7,11 +7,12 @@
package leveldb package leveldb
import ( import (
"errors"
"sync" "sync"
"time" "time"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/memdb" "github.com/syndtr/goleveldb/leveldb/memdb"
"github.com/syndtr/goleveldb/leveldb/opt"
) )
var ( var (
@ -68,7 +69,7 @@ type cMem struct {
} }
func newCMem(s *session) *cMem { func newCMem(s *session) *cMem {
return &cMem{s: s, rec: new(sessionRecord)} return &cMem{s: s, rec: &sessionRecord{numLevel: s.o.GetNumLevel()}}
} }
func (c *cMem) flush(mem *memdb.DB, level int) error { func (c *cMem) flush(mem *memdb.DB, level int) error {
@ -84,7 +85,9 @@ func (c *cMem) flush(mem *memdb.DB, level int) error {
// Pick level. // Pick level.
if level < 0 { if level < 0 {
level = s.version_NB().pickLevel(t.imin.ukey(), t.imax.ukey()) v := s.version()
level = v.pickLevel(t.imin.ukey(), t.imax.ukey())
v.release()
} }
c.rec.addTableFile(level, t) c.rec.addTableFile(level, t)
@ -95,24 +98,32 @@ func (c *cMem) flush(mem *memdb.DB, level int) error {
} }
func (c *cMem) reset() { func (c *cMem) reset() {
c.rec = new(sessionRecord) c.rec = &sessionRecord{numLevel: c.s.o.GetNumLevel()}
} }
func (c *cMem) commit(journal, seq uint64) error { func (c *cMem) commit(journal, seq uint64) error {
c.rec.setJournalNum(journal) c.rec.setJournalNum(journal)
c.rec.setSeq(seq) c.rec.setSeqNum(seq)
// Commit changes. // Commit changes.
return c.s.commit(c.rec) return c.s.commit(c.rec)
} }
func (db *DB) compactionError() { func (db *DB) compactionError() {
var err error var (
err error
wlocked bool
)
noerr: noerr:
// No error.
for { for {
select { select {
case err = <-db.compErrSetC: case err = <-db.compErrSetC:
if err != nil { switch {
case err == nil:
case errors.IsCorrupted(err):
goto hasperr
default:
goto haserr goto haserr
} }
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
@ -120,17 +131,39 @@ noerr:
} }
} }
haserr: haserr:
// Transient error.
for { for {
select { select {
case db.compErrC <- err: case db.compErrC <- err:
case err = <-db.compErrSetC: case err = <-db.compErrSetC:
if err == nil { switch {
case err == nil:
goto noerr goto noerr
case errors.IsCorrupted(err):
goto hasperr
default:
} }
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
return return
} }
} }
hasperr:
// Persistent error.
for {
select {
case db.compErrC <- err:
case db.compPerErrC <- err:
case db.writeLockC <- struct{}{}:
// Hold write lock, so that write won't pass-through.
wlocked = true
case _, _ = <-db.closeC:
if wlocked {
// We should release the lock or Close will hang.
<-db.writeLockC
}
return
}
}
} }
type compactionTransactCounter int type compactionTransactCounter int
@ -139,12 +172,17 @@ func (cnt *compactionTransactCounter) incr() {
*cnt++ *cnt++
} }
func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactCounter) error, rollback func() error) { type compactionTransactInterface interface {
run(cnt *compactionTransactCounter) error
revert() error
}
func (db *DB) compactionTransact(name string, t compactionTransactInterface) {
defer func() { defer func() {
if x := recover(); x != nil { if x := recover(); x != nil {
if x == errCompactionTransactExiting && rollback != nil { if x == errCompactionTransactExiting {
if err := rollback(); err != nil { if err := t.revert(); err != nil {
db.logf("%s rollback error %q", name, err) db.logf("%s revert error %q", name, err)
} }
} }
panic(x) panic(x)
@ -156,9 +194,13 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
backoffMax = 8 * time.Second backoffMax = 8 * time.Second
backoffMul = 2 * time.Second backoffMul = 2 * time.Second
) )
backoff := backoffMin var (
backoffT := time.NewTimer(backoff) backoff = backoffMin
lastCnt := compactionTransactCounter(0) backoffT = time.NewTimer(backoff)
lastCnt = compactionTransactCounter(0)
disableBackoff = db.s.o.GetDisableCompactionBackoff()
)
for n := 0; ; n++ { for n := 0; ; n++ {
// Check wether the DB is closed. // Check wether the DB is closed.
if db.isClosed() { if db.isClosed() {
@ -170,11 +212,19 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
// Execute. // Execute.
cnt := compactionTransactCounter(0) cnt := compactionTransactCounter(0)
err := exec(&cnt) err := t.run(&cnt)
if err != nil {
db.logf("%s error I·%d %q", name, cnt, err)
}
// Set compaction error status. // Set compaction error status.
select { select {
case db.compErrSetC <- err: case db.compErrSetC <- err:
case perr := <-db.compPerErrC:
if err != nil {
db.logf("%s exiting (persistent error %q)", name, perr)
db.compactionExitTransact()
}
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
db.logf("%s exiting", name) db.logf("%s exiting", name)
db.compactionExitTransact() db.compactionExitTransact()
@ -182,8 +232,12 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
if err == nil { if err == nil {
return return
} }
db.logf("%s error I·%d %q", name, cnt, err) if errors.IsCorrupted(err) {
db.logf("%s exiting (corruption detected)", name)
db.compactionExitTransact()
}
if !disableBackoff {
// Reset backoff duration if counter is advancing. // Reset backoff duration if counter is advancing.
if cnt > lastCnt { if cnt > lastCnt {
backoff = backoffMin backoff = backoffMin
@ -205,6 +259,27 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
db.compactionExitTransact() db.compactionExitTransact()
} }
} }
}
}
// compactionTransactFunc bundles a pair of plain callbacks so that they can be
// driven through the run/revert method pair defined below.
type compactionTransactFunc struct {
	// runFunc performs the transaction body; it is always invoked by run and
	// therefore must not be nil.
	runFunc func(cnt *compactionTransactCounter) error
	// revertFunc undoes a partially executed body; nil means there is
	// nothing to undo.
	revertFunc func() error
}

// run forwards to runFunc and returns its result unchanged.
func (t *compactionTransactFunc) run(cnt *compactionTransactCounter) error {
	return t.runFunc(cnt)
}

// revert forwards to revertFunc when one was supplied; without one it is a
// no-op that reports success.
func (t *compactionTransactFunc) revert() error {
	if t.revertFunc == nil {
		return nil
	}
	return t.revertFunc()
}
func (db *DB) compactionTransactFunc(name string, run func(cnt *compactionTransactCounter) error, revert func() error) {
db.compactionTransact(name, &compactionTransactFunc{run, revert})
} }
func (db *DB) compactionExitTransact() { func (db *DB) compactionExitTransact() {
@ -232,20 +307,23 @@ func (db *DB) memCompaction() {
} }
// Pause table compaction. // Pause table compaction.
ch := make(chan struct{}) resumeC := make(chan struct{})
select { select {
case db.tcompPauseC <- (chan<- struct{})(ch): case db.tcompPauseC <- (chan<- struct{})(resumeC):
case <-db.compPerErrC:
close(resumeC)
resumeC = nil
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
return return
} }
db.compactionTransact("mem@flush", func(cnt *compactionTransactCounter) (err error) { db.compactionTransactFunc("mem@flush", func(cnt *compactionTransactCounter) (err error) {
stats.startTimer() stats.startTimer()
defer stats.stopTimer() defer stats.stopTimer()
return c.flush(mem.mdb, -1) return c.flush(mem.mdb, -1)
}, func() error { }, func() error {
for _, r := range c.rec.addedTables { for _, r := range c.rec.addedTables {
db.logf("mem@flush rollback @%d", r.num) db.logf("mem@flush revert @%d", r.num)
f := db.s.getTableFile(r.num) f := db.s.getTableFile(r.num)
if err := f.Remove(); err != nil { if err := f.Remove(); err != nil {
return err return err
@ -254,7 +332,7 @@ func (db *DB) memCompaction() {
return nil return nil
}) })
db.compactionTransact("mem@commit", func(cnt *compactionTransactCounter) (err error) { db.compactionTransactFunc("mem@commit", func(cnt *compactionTransactCounter) (err error) {
stats.startTimer() stats.startTimer()
defer stats.stopTimer() defer stats.stopTimer()
return c.commit(db.journalFile.Num(), db.frozenSeq) return c.commit(db.journalFile.Num(), db.frozenSeq)
@ -271,26 +349,223 @@ func (db *DB) memCompaction() {
db.dropFrozenMem() db.dropFrozenMem()
// Resume table compaction. // Resume table compaction.
if resumeC != nil {
select { select {
case <-ch: case <-resumeC:
close(resumeC)
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
return return
} }
}
// Trigger table compaction. // Trigger table compaction.
db.compTrigger(db.mcompTriggerC) db.compSendTrigger(db.tcompCmdC)
}
// tableCompactionBuilder holds the state of one table-building compaction
// pass. Its run and revert methods let it be passed to
// DB.compactionTransact, and the snap* fields allow run to resume from the
// last flushed table instead of starting over.
type tableCompactionBuilder struct {
// db is consulted for pause/close events before a new table is created;
// appendKV skips that check when db is nil.
db *DB
// s provides table creation, logging, key comparison and file lookup.
s *session
// c is the compaction being executed; output tables go to c.level+1.
c *compaction
// rec collects the tables produced by flush.
rec *sessionRecord
// stat1 is timed by run and accumulates written bytes in flush.
// NOTE(review): stat0 is not touched by any method visible here.
stat0, stat1 *cStatsStaging
// Snapshot of the loop state in run, taken right after a table is flushed
// and restored at the start of the next run call.
snapHasLastUkey bool
snapLastUkey []byte
snapLastSeq uint64
snapIter int
snapKerrCnt int
snapDropCnt int
// kerrCnt counts entries whose internal key failed to parse; dropCnt counts
// entries that were dropped instead of being written.
kerrCnt int
dropCnt int
// minSeq is the sequence threshold used by the drop rules in run.
minSeq uint64
// strict makes run return a key parse error instead of keeping the entry.
strict bool
// tableSize is the byte length at which needFlush reports true.
tableSize int
// tw is the table currently being written, or nil when none is open.
tw *tWriter
}
// appendKV writes one key/value pair to the current output table, opening a
// new table first when none is open. Just before opening a table it polls,
// without blocking, for a pending pause request or DB close (skipped when
// b.db is nil). It returns the error from table creation or from the append.
func (b *tableCompactionBuilder) appendKV(key, value []byte) error {
	// Fast path: a table is already open.
	if b.tw != nil {
		return b.tw.append(key, value)
	}

	// Table boundary: honor pause/close events before starting a new table.
	if db := b.db; db != nil {
		select {
		case resumeC := <-db.tcompPauseC:
			db.pauseCompaction(resumeC)
		case <-db.closeC:
			db.compactionExitTransact()
		default:
		}
	}

	// Open the next output table.
	var err error
	if b.tw, err = b.s.tops.create(); err != nil {
		return err
	}
	return b.tw.append(key, value)
}
// needFlush reports whether the open table has reached the configured
// tableSize. It dereferences b.tw, so callers must check that a table is open.
func (b *tableCompactionBuilder) needFlush() bool {
	written := b.tw.tw.BytesLen()
	return written >= b.tableSize
}
// flush finishes the open table, records it in b.rec at the level below the
// compaction's source level (c.level+1), adds its size to the write
// statistics, logs it, and clears b.tw. If finishing fails the error is
// returned and b.tw is left untouched.
func (b *tableCompactionBuilder) flush() error {
	built, err := b.tw.finish()
	if err != nil {
		return err
	}

	outLevel := b.c.level + 1
	b.rec.addTableFile(outLevel, built)
	b.stat1.write += built.size
	b.s.logf(
		"table@build created L%d@%d N·%d S·%s %q:%q",
		outLevel,
		built.file.Num(),
		b.tw.tw.EntriesLen(),
		shortenb(int(built.size)),
		built.imin,
		built.imax,
	)

	b.tw = nil
	return nil
}
// cleanup drops the table that is still open, if any, and clears b.tw.
// It does nothing when no table is open.
func (b *tableCompactionBuilder) cleanup() {
	if b.tw == nil {
		return
	}
	b.tw.drop()
	b.tw = nil
}
// run performs one attempt at building the output tables: it walks the
// entries produced by b.c.newIterator(), drops the ones the rules below
// declare obsolete, and appends the rest to output tables via appendKV.
//
// State is snapshotted into the snap* fields each time a table is flushed,
// and every call starts from the latest snapshot, so a repeated call skips
// the entries that were already processed before that snapshot.
//
// cnt is incremented once per iterated entry, including skipped ones.
// The returned error is the first one from key parsing (strict mode only),
// flush, appendKV or the iterator; nil means all entries were processed.
func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error {
// Load the working state from the last snapshot (zero values on first run).
snapResumed := b.snapIter > 0
hasLastUkey := b.snapHasLastUkey // The key may have zero length, so an explicit flag is necessary.
lastUkey := append([]byte{}, b.snapLastUkey...)
lastSeq := b.snapLastSeq
b.kerrCnt = b.snapKerrCnt
b.dropCnt = b.snapDropCnt
// Restore compaction state (counterpart of b.c.save() below).
b.c.restore()
// Drop any half-written table when leaving, whatever the outcome.
defer b.cleanup()
b.stat1.startTimer()
defer b.stat1.stopTimer()
iter := b.c.newIterator()
defer iter.Release()
for i := 0; iter.Next(); i++ {
// Incr transact counter.
cnt.incr()
// Skip entries that were already handled before the snapshot.
if i < b.snapIter {
continue
}
// resumed is true only for the first entry processed after a resume.
resumed := false
if snapResumed {
resumed = true
snapResumed = false
}
ikey := iter.Key()
ukey, seq, kt, kerr := parseIkey(ikey)
if kerr == nil {
// The stop check is not consulted for the first entry after a resume.
// NOTE(review): presumably because it was already evaluated for this
// entry before the snapshot was taken — confirm against shouldStopBefore.
shouldStop := !resumed && b.c.shouldStopBefore(ikey)
if !hasLastUkey || b.s.icmp.uCompare(lastUkey, ukey) != 0 {
// First occurrence of this user key.
// Only rotate tables if ukey doesn't hop across.
if b.tw != nil && (shouldStop || b.needFlush()) {
if err := b.flush(); err != nil {
return err
}
// Snapshot the state as it was before this entry, so a retry
// restarts exactly at entry i.
b.c.save()
b.snapHasLastUkey = hasLastUkey
b.snapLastUkey = append(b.snapLastUkey[:0], lastUkey...)
b.snapLastSeq = lastSeq
b.snapIter = i
b.snapKerrCnt = b.kerrCnt
b.snapDropCnt = b.dropCnt
}
hasLastUkey = true
lastUkey = append(lastUkey[:0], ukey...)
// kMaxSeq marks "no entry seen yet for this user key", so rule (A)
// below cannot fire on the first occurrence.
lastSeq = kMaxSeq
}
switch {
case lastSeq <= b.minSeq:
// Dropped because a newer entry for the same user key exists.
fallthrough // (A)
case kt == ktDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey):
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger seq numbers
// (3) data in layers that are being compacted here and have
// smaller seq numbers will be dropped in the next
// few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
lastSeq = seq
b.dropCnt++
continue
default:
lastSeq = seq
}
} else {
// The internal key could not be parsed.
if b.strict {
return kerr
}
// Don't drop corrupted keys: reset the per-key state and write the
// entry through unchanged.
hasLastUkey = false
lastUkey = lastUkey[:0]
lastSeq = kMaxSeq
b.kerrCnt++
}
if err := b.appendKV(ikey, iter.Value()); err != nil {
return err
}
}
if err := iter.Error(); err != nil {
return err
}
// Finish last table; an open but empty table is left for cleanup to drop.
if b.tw != nil && !b.tw.empty() {
return b.flush()
}
return nil
}
func (b *tableCompactionBuilder) revert() error {
for _, at := range b.rec.addedTables {
b.s.logf("table@build revert @%d", at.num)
f := b.s.getTableFile(at.num)
if err := f.Remove(); err != nil {
return err
}
}
return nil
} }
func (db *DB) tableCompaction(c *compaction, noTrivial bool) { func (db *DB) tableCompaction(c *compaction, noTrivial bool) {
rec := new(sessionRecord) defer c.release()
rec.addCompactionPointer(c.level, c.imax)
rec := &sessionRecord{numLevel: db.s.o.GetNumLevel()}
rec.addCompPtr(c.level, c.imax)
if !noTrivial && c.trivial() { if !noTrivial && c.trivial() {
t := c.tables[0][0] t := c.tables[0][0]
db.logf("table@move L%d@%d -> L%d", c.level, t.file.Num(), c.level+1) db.logf("table@move L%d@%d -> L%d", c.level, t.file.Num(), c.level+1)
rec.deleteTable(c.level, t.file.Num()) rec.delTable(c.level, t.file.Num())
rec.addTableFile(c.level+1, t) rec.addTableFile(c.level+1, t)
db.compactionTransact("table@move", func(cnt *compactionTransactCounter) (err error) { db.compactionTransactFunc("table@move", func(cnt *compactionTransactCounter) (err error) {
return db.s.commit(rec) return db.s.commit(rec)
}, nil) }, nil)
return return
@ -301,184 +576,34 @@ func (db *DB) tableCompaction(c *compaction, noTrivial bool) {
for _, t := range tables { for _, t := range tables {
stats[i].read += t.size stats[i].read += t.size
// Insert deleted tables into record // Insert deleted tables into record
rec.deleteTable(c.level+i, t.file.Num()) rec.delTable(c.level+i, t.file.Num())
} }
} }
sourceSize := int(stats[0].read + stats[1].read) sourceSize := int(stats[0].read + stats[1].read)
minSeq := db.minSeq() minSeq := db.minSeq()
db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.level, len(c.tables[0]), c.level+1, len(c.tables[1]), shortenb(sourceSize), minSeq) db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.level, len(c.tables[0]), c.level+1, len(c.tables[1]), shortenb(sourceSize), minSeq)
var snapUkey []byte b := &tableCompactionBuilder{
var snapHasUkey bool db: db,
var snapSeq uint64 s: db.s,
var snapIter int c: c,
var snapDropCnt int rec: rec,
var dropCnt int stat1: &stats[1],
db.compactionTransact("table@build", func(cnt *compactionTransactCounter) (err error) { minSeq: minSeq,
ukey := append([]byte{}, snapUkey...) strict: db.s.o.GetStrict(opt.StrictCompaction),
hasUkey := snapHasUkey tableSize: db.s.o.GetCompactionTableSize(c.level + 1),
lseq := snapSeq
dropCnt = snapDropCnt
snapSched := snapIter == 0
var tw *tWriter
finish := func() error {
t, err := tw.finish()
if err != nil {
return err
} }
rec.addTableFile(c.level+1, t) db.compactionTransact("table@build", b)
stats[1].write += t.size
db.logf("table@build created L%d@%d N·%d S·%s %q:%q", c.level+1, t.file.Num(), tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax)
return nil
}
defer func() {
stats[1].stopTimer()
if tw != nil {
tw.drop()
tw = nil
}
}()
stats[1].startTimer()
iter := c.newIterator()
defer iter.Release()
for i := 0; iter.Next(); i++ {
// Incr transact counter.
cnt.incr()
// Skip until last state.
if i < snapIter {
continue
}
ikey := iKey(iter.Key())
if c.shouldStopBefore(ikey) && tw != nil {
err = finish()
if err != nil {
return
}
snapSched = true
tw = nil
}
// Scheduled for snapshot, snapshot will used to retry compaction
// if error occured.
if snapSched {
snapUkey = append(snapUkey[:0], ukey...)
snapHasUkey = hasUkey
snapSeq = lseq
snapIter = i
snapDropCnt = dropCnt
snapSched = false
}
if seq, vt, ok := ikey.parseNum(); !ok {
// Don't drop error keys
ukey = ukey[:0]
hasUkey = false
lseq = kMaxSeq
} else {
if !hasUkey || db.s.icmp.uCompare(ikey.ukey(), ukey) != 0 {
// First occurrence of this user key
ukey = append(ukey[:0], ikey.ukey()...)
hasUkey = true
lseq = kMaxSeq
}
drop := false
if lseq <= minSeq {
// Dropped because newer entry for same user key exist
drop = true // (A)
} else if vt == tDel && seq <= minSeq && c.baseLevelForKey(ukey) {
// For this user key:
// (1) there is no data in higher levels
// (2) data in lower levels will have larger seq numbers
// (3) data in layers that are being compacted here and have
// smaller seq numbers will be dropped in the next
// few iterations of this loop (by rule (A) above).
// Therefore this deletion marker is obsolete and can be dropped.
drop = true
}
lseq = seq
if drop {
dropCnt++
continue
}
}
// Create new table if not already
if tw == nil {
// Check for pause event.
select {
case ch := <-db.tcompPauseC:
db.pauseCompaction(ch)
case _, _ = <-db.closeC:
db.compactionExitTransact()
default:
}
// Create new table.
tw, err = db.s.tops.create()
if err != nil {
return
}
}
// Write key/value into table
err = tw.append(ikey, iter.Value())
if err != nil {
return
}
// Finish table if it is big enough
if tw.tw.BytesLen() >= kMaxTableSize {
err = finish()
if err != nil {
return
}
snapSched = true
tw = nil
}
}
err = iter.Error()
if err != nil {
return
}
// Finish last table
if tw != nil && !tw.empty() {
err = finish()
if err != nil {
return
}
tw = nil
}
return
}, func() error {
for _, r := range rec.addedTables {
db.logf("table@build rollback @%d", r.num)
f := db.s.getTableFile(r.num)
if err := f.Remove(); err != nil {
return err
}
}
return nil
})
// Commit changes // Commit changes
db.compactionTransact("table@commit", func(cnt *compactionTransactCounter) (err error) { db.compactionTransactFunc("table@commit", func(cnt *compactionTransactCounter) (err error) {
stats[1].startTimer() stats[1].startTimer()
defer stats[1].stopTimer() defer stats[1].stopTimer()
return db.s.commit(rec) return db.s.commit(rec)
}, nil) }, nil)
resultSize := int(stats[1].write) resultSize := int(stats[1].write)
db.logf("table@compaction committed F%s S%s D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), dropCnt, stats[1].duration) db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration)
// Save compaction stats // Save compaction stats
for i := range stats { for i := range stats {
@ -494,14 +619,14 @@ func (db *DB) tableRangeCompaction(level int, umin, umax []byte) {
db.tableCompaction(c, true) db.tableCompaction(c, true)
} }
} else { } else {
v := db.s.version_NB() v := db.s.version()
m := 1 m := 1
for i, t := range v.tables[1:] { for i, t := range v.tables[1:] {
if t.overlaps(db.s.icmp, umin, umax, false) { if t.overlaps(db.s.icmp, umin, umax, false) {
m = i + 1 m = i + 1
} }
} }
v.release()
for level := 0; level < m; level++ { for level := 0; level < m; level++ {
if c := db.s.getCompactionRange(level, umin, umax); c != nil { if c := db.s.getCompactionRange(level, umin, umax); c != nil {
@ -518,7 +643,9 @@ func (db *DB) tableAutoCompaction() {
} }
func (db *DB) tableNeedCompaction() bool { func (db *DB) tableNeedCompaction() bool {
return db.s.version_NB().needCompaction() v := db.s.version()
defer v.release()
return v.needCompaction()
} }
func (db *DB) pauseCompaction(ch chan<- struct{}) { func (db *DB) pauseCompaction(ch chan<- struct{}) {
@ -538,10 +665,12 @@ type cIdle struct {
} }
func (r cIdle) ack(err error) { func (r cIdle) ack(err error) {
if r.ackC != nil {
defer func() { defer func() {
recover() recover()
}() }()
r.ackC <- err r.ackC <- err
}
} }
type cRange struct { type cRange struct {
@ -559,6 +688,7 @@ func (r cRange) ack(err error) {
} }
} }
// This will trigger auto compaction and/or wait for all compaction to be done.
func (db *DB) compSendIdle(compC chan<- cCmd) (err error) { func (db *DB) compSendIdle(compC chan<- cCmd) (err error) {
ch := make(chan error) ch := make(chan error)
defer close(ch) defer close(ch)
@ -580,6 +710,15 @@ func (db *DB) compSendIdle(compC chan<- cCmd) (err error) {
return err return err
} }
// compSendTrigger asks the compaction goroutine listening on compC to run an
// auto compaction, without waiting for it. The request is a cIdle command with
// no ack channel; if it cannot be sent immediately it is dropped rather than
// blocking the caller.
func (db *DB) compSendTrigger(compC chan<- cCmd) {
	var trigger cCmd = cIdle{}
	select {
	case compC <- trigger:
		// Delivered.
	default:
		// Nobody ready to receive; skip this trigger.
	}
}
// Send range compaction request.
func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err error) { func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err error) {
ch := make(chan error) ch := make(chan error)
defer close(ch) defer close(ch)
@ -601,13 +740,6 @@ func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err
return err return err
} }
func (db *DB) compTrigger(compTriggerC chan struct{}) {
select {
case compTriggerC <- struct{}{}:
default:
}
}
func (db *DB) mCompaction() { func (db *DB) mCompaction() {
var x cCmd var x cCmd
@ -626,11 +758,14 @@ func (db *DB) mCompaction() {
for { for {
select { select {
case x = <-db.mcompCmdC: case x = <-db.mcompCmdC:
switch x.(type) {
case cIdle:
db.memCompaction() db.memCompaction()
x.ack(nil) x.ack(nil)
x = nil x = nil
case <-db.mcompTriggerC: default:
db.memCompaction() panic("leveldb: unknown command")
}
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
return return
} }
@ -661,7 +796,6 @@ func (db *DB) tCompaction() {
if db.tableNeedCompaction() { if db.tableNeedCompaction() {
select { select {
case x = <-db.tcompCmdC: case x = <-db.tcompCmdC:
case <-db.tcompTriggerC:
case ch := <-db.tcompPauseC: case ch := <-db.tcompPauseC:
db.pauseCompaction(ch) db.pauseCompaction(ch)
continue continue
@ -677,7 +811,6 @@ func (db *DB) tCompaction() {
ackQ = ackQ[:0] ackQ = ackQ[:0]
select { select {
case x = <-db.tcompCmdC: case x = <-db.tcompCmdC:
case <-db.tcompTriggerC:
case ch := <-db.tcompPauseC: case ch := <-db.tcompPauseC:
db.pauseCompaction(ch) db.pauseCompaction(ch)
continue continue
@ -692,6 +825,8 @@ func (db *DB) tCompaction() {
case cRange: case cRange:
db.tableRangeCompaction(cmd.level, cmd.min, cmd.max) db.tableRangeCompaction(cmd.level, cmd.min, cmd.max)
x.ack(nil) x.ack(nil)
default:
panic("leveldb: unknown command")
} }
x = nil x = nil
} }

View File

@ -48,7 +48,8 @@ func (db *DB) newRawIterator(slice *util.Range, ro *opt.ReadOptions) iterator.It
i = append(i, fmi) i = append(i, fmi)
} }
i = append(i, ti...) i = append(i, ti...)
mi := iterator.NewMergedIterator(i, db.s.icmp, true) strict := opt.GetStrict(db.s.o.Options, ro, opt.StrictReader)
mi := iterator.NewMergedIterator(i, db.s.icmp, strict)
mi.SetReleaser(&versionReleaser{v: v}) mi.SetReleaser(&versionReleaser{v: v})
return mi return mi
} }
@ -58,10 +59,10 @@ func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *d
if slice != nil { if slice != nil {
islice = &util.Range{} islice = &util.Range{}
if slice.Start != nil { if slice.Start != nil {
islice.Start = newIKey(slice.Start, kMaxSeq, tSeek) islice.Start = newIkey(slice.Start, kMaxSeq, ktSeek)
} }
if slice.Limit != nil { if slice.Limit != nil {
islice.Limit = newIKey(slice.Limit, kMaxSeq, tSeek) islice.Limit = newIkey(slice.Limit, kMaxSeq, ktSeek)
} }
} }
rawIter := db.newRawIterator(islice, ro) rawIter := db.newRawIterator(islice, ro)
@ -70,7 +71,7 @@ func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *d
icmp: db.s.icmp, icmp: db.s.icmp,
iter: rawIter, iter: rawIter,
seq: seq, seq: seq,
strict: db.s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator), strict: opt.GetStrict(db.s.o.Options, ro, opt.StrictReader),
key: make([]byte, 0), key: make([]byte, 0),
value: make([]byte, 0), value: make([]byte, 0),
} }
@ -161,7 +162,7 @@ func (i *dbIter) Seek(key []byte) bool {
return false return false
} }
ikey := newIKey(key, i.seq, tSeek) ikey := newIkey(key, i.seq, ktSeek)
if i.iter.Seek(ikey) { if i.iter.Seek(ikey) {
i.dir = dirSOI i.dir = dirSOI
return i.next() return i.next()
@ -173,15 +174,14 @@ func (i *dbIter) Seek(key []byte) bool {
func (i *dbIter) next() bool { func (i *dbIter) next() bool {
for { for {
ukey, seq, t, ok := parseIkey(i.iter.Key()) if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil {
if ok {
if seq <= i.seq { if seq <= i.seq {
switch t { switch kt {
case tDel: case ktDel:
// Skip deleted key. // Skip deleted key.
i.key = append(i.key[:0], ukey...) i.key = append(i.key[:0], ukey...)
i.dir = dirForward i.dir = dirForward
case tVal: case ktVal:
if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 { if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 {
i.key = append(i.key[:0], ukey...) i.key = append(i.key[:0], ukey...)
i.value = append(i.value[:0], i.iter.Value()...) i.value = append(i.value[:0], i.iter.Value()...)
@ -191,7 +191,7 @@ func (i *dbIter) next() bool {
} }
} }
} else if i.strict { } else if i.strict {
i.setErr(errInvalidIkey) i.setErr(kerr)
break break
} }
if !i.iter.Next() { if !i.iter.Next() {
@ -224,20 +224,19 @@ func (i *dbIter) prev() bool {
del := true del := true
if i.iter.Valid() { if i.iter.Valid() {
for { for {
ukey, seq, t, ok := parseIkey(i.iter.Key()) if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil {
if ok {
if seq <= i.seq { if seq <= i.seq {
if !del && i.icmp.uCompare(ukey, i.key) < 0 { if !del && i.icmp.uCompare(ukey, i.key) < 0 {
return true return true
} }
del = (t == tDel) del = (kt == ktDel)
if !del { if !del {
i.key = append(i.key[:0], ukey...) i.key = append(i.key[:0], ukey...)
i.value = append(i.value[:0], i.iter.Value()...) i.value = append(i.value[:0], i.iter.Value()...)
} }
} }
} else if i.strict { } else if i.strict {
i.setErr(errInvalidIkey) i.setErr(kerr)
return false return false
} }
if !i.iter.Prev() { if !i.iter.Prev() {
@ -266,13 +265,12 @@ func (i *dbIter) Prev() bool {
return i.Last() return i.Last()
case dirForward: case dirForward:
for i.iter.Prev() { for i.iter.Prev() {
ukey, _, _, ok := parseIkey(i.iter.Key()) if ukey, _, _, kerr := parseIkey(i.iter.Key()); kerr == nil {
if ok {
if i.icmp.uCompare(ukey, i.key) < 0 { if i.icmp.uCompare(ukey, i.key) < 0 {
goto cont goto cont
} }
} else if i.strict { } else if i.strict {
i.setErr(errInvalidIkey) i.setErr(kerr)
return false return false
} }
} }

View File

@ -7,6 +7,7 @@
package leveldb package leveldb
import ( import (
"bytes"
"container/list" "container/list"
crand "crypto/rand" crand "crypto/rand"
"encoding/binary" "encoding/binary"
@ -23,6 +24,7 @@ import (
"unsafe" "unsafe"
"github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/filter"
"github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/opt"
@ -151,7 +153,10 @@ func (h *dbHarness) maxNextLevelOverlappingBytes(want uint64) {
t := h.t t := h.t
db := h.db db := h.db
var res uint64 var (
maxOverlaps uint64
maxLevel int
)
v := db.s.version() v := db.s.version()
for i, tt := range v.tables[1 : len(v.tables)-1] { for i, tt := range v.tables[1 : len(v.tables)-1] {
level := i + 1 level := i + 1
@ -159,15 +164,18 @@ func (h *dbHarness) maxNextLevelOverlappingBytes(want uint64) {
for _, t := range tt { for _, t := range tt {
r := next.getOverlaps(nil, db.s.icmp, t.imin.ukey(), t.imax.ukey(), false) r := next.getOverlaps(nil, db.s.icmp, t.imin.ukey(), t.imax.ukey(), false)
sum := r.size() sum := r.size()
if sum > res { if sum > maxOverlaps {
res = sum maxOverlaps = sum
maxLevel = level
} }
} }
} }
v.release() v.release()
if res > want { if maxOverlaps > want {
t.Errorf("next level overlapping bytes is more than %d, got=%d", want, res) t.Errorf("next level most overlapping bytes is more than %d, got=%d level=%d", want, maxOverlaps, maxLevel)
} else {
t.Logf("next level most overlapping bytes is %d, level=%d want=%d", maxOverlaps, maxLevel, want)
} }
} }
@ -240,7 +248,7 @@ func (h *dbHarness) allEntriesFor(key, want string) {
db := h.db db := h.db
s := db.s s := db.s
ikey := newIKey([]byte(key), kMaxSeq, tVal) ikey := newIkey([]byte(key), kMaxSeq, ktVal)
iter := db.newRawIterator(nil, nil) iter := db.newRawIterator(nil, nil)
if !iter.Seek(ikey) && iter.Error() != nil { if !iter.Seek(ikey) && iter.Error() != nil {
t.Error("AllEntries: error during seek, err: ", iter.Error()) t.Error("AllEntries: error during seek, err: ", iter.Error())
@ -249,19 +257,18 @@ func (h *dbHarness) allEntriesFor(key, want string) {
res := "[ " res := "[ "
first := true first := true
for iter.Valid() { for iter.Valid() {
rkey := iKey(iter.Key()) if ukey, _, kt, kerr := parseIkey(iter.Key()); kerr == nil {
if _, t, ok := rkey.parseNum(); ok { if s.icmp.uCompare(ikey.ukey(), ukey) != 0 {
if s.icmp.uCompare(ikey.ukey(), rkey.ukey()) != 0 {
break break
} }
if !first { if !first {
res += ", " res += ", "
} }
first = false first = false
switch t { switch kt {
case tVal: case ktVal:
res += string(iter.Value()) res += string(iter.Value())
case tDel: case ktDel:
res += "DEL" res += "DEL"
} }
} else { } else {
@ -326,6 +333,8 @@ func (h *dbHarness) compactMem() {
t := h.t t := h.t
db := h.db db := h.db
t.Log("starting memdb compaction")
db.writeLockC <- struct{}{} db.writeLockC <- struct{}{}
defer func() { defer func() {
<-db.writeLockC <-db.writeLockC
@ -341,6 +350,8 @@ func (h *dbHarness) compactMem() {
if h.totalTables() == 0 { if h.totalTables() == 0 {
t.Error("zero tables after mem compaction") t.Error("zero tables after mem compaction")
} }
t.Log("memdb compaction done")
} }
func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) { func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) {
@ -355,6 +366,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool)
_max = []byte(max) _max = []byte(max)
} }
t.Logf("starting table range compaction: level=%d, min=%q, max=%q", level, min, max)
if err := db.compSendRange(db.tcompCmdC, level, _min, _max); err != nil { if err := db.compSendRange(db.tcompCmdC, level, _min, _max); err != nil {
if wanterr { if wanterr {
t.Log("CompactRangeAt: got error (expected): ", err) t.Log("CompactRangeAt: got error (expected): ", err)
@ -364,6 +377,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool)
} else if wanterr { } else if wanterr {
t.Error("CompactRangeAt: expect error") t.Error("CompactRangeAt: expect error")
} }
t.Log("table range compaction done")
} }
func (h *dbHarness) compactRangeAt(level int, min, max string) { func (h *dbHarness) compactRangeAt(level int, min, max string) {
@ -374,6 +389,8 @@ func (h *dbHarness) compactRange(min, max string) {
t := h.t t := h.t
db := h.db db := h.db
t.Logf("starting DB range compaction: min=%q, max=%q", min, max)
var r util.Range var r util.Range
if min != "" { if min != "" {
r.Start = []byte(min) r.Start = []byte(min)
@ -384,6 +401,8 @@ func (h *dbHarness) compactRange(min, max string) {
if err := db.CompactRange(r); err != nil { if err := db.CompactRange(r); err != nil {
t.Error("CompactRange: got error: ", err) t.Error("CompactRange: got error: ", err)
} }
t.Log("DB range compaction done")
} }
func (h *dbHarness) sizeAssert(start, limit string, low, hi uint64) { func (h *dbHarness) sizeAssert(start, limit string, low, hi uint64) {
@ -505,10 +524,10 @@ func Test_FieldsAligned(t *testing.T) {
p1 := new(DB) p1 := new(DB)
testAligned(t, "DB.seq", unsafe.Offsetof(p1.seq)) testAligned(t, "DB.seq", unsafe.Offsetof(p1.seq))
p2 := new(session) p2 := new(session)
testAligned(t, "session.stFileNum", unsafe.Offsetof(p2.stFileNum)) testAligned(t, "session.stNextFileNum", unsafe.Offsetof(p2.stNextFileNum))
testAligned(t, "session.stJournalNum", unsafe.Offsetof(p2.stJournalNum)) testAligned(t, "session.stJournalNum", unsafe.Offsetof(p2.stJournalNum))
testAligned(t, "session.stPrevJournalNum", unsafe.Offsetof(p2.stPrevJournalNum)) testAligned(t, "session.stPrevJournalNum", unsafe.Offsetof(p2.stPrevJournalNum))
testAligned(t, "session.stSeq", unsafe.Offsetof(p2.stSeq)) testAligned(t, "session.stSeqNum", unsafe.Offsetof(p2.stSeqNum))
} }
func TestDb_Locking(t *testing.T) { func TestDb_Locking(t *testing.T) {
@ -944,7 +963,7 @@ func TestDb_RepeatedWritesToSameKey(t *testing.T) {
h := newDbHarnessWopt(t, &opt.Options{WriteBuffer: 100000}) h := newDbHarnessWopt(t, &opt.Options{WriteBuffer: 100000})
defer h.close() defer h.close()
maxTables := kNumLevels + kL0_StopWritesTrigger maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger()
value := strings.Repeat("v", 2*h.o.GetWriteBuffer()) value := strings.Repeat("v", 2*h.o.GetWriteBuffer())
for i := 0; i < 5*maxTables; i++ { for i := 0; i < 5*maxTables; i++ {
@ -962,7 +981,7 @@ func TestDb_RepeatedWritesToSameKeyAfterReopen(t *testing.T) {
h.reopenDB() h.reopenDB()
maxTables := kNumLevels + kL0_StopWritesTrigger maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger()
value := strings.Repeat("v", 2*h.o.GetWriteBuffer()) value := strings.Repeat("v", 2*h.o.GetWriteBuffer())
for i := 0; i < 5*maxTables; i++ { for i := 0; i < 5*maxTables; i++ {
@ -978,7 +997,7 @@ func TestDb_SparseMerge(t *testing.T) {
h := newDbHarnessWopt(t, &opt.Options{Compression: opt.NoCompression}) h := newDbHarnessWopt(t, &opt.Options{Compression: opt.NoCompression})
defer h.close() defer h.close()
h.putMulti(kNumLevels, "A", "Z") h.putMulti(h.o.GetNumLevel(), "A", "Z")
// Suppose there is: // Suppose there is:
// small amount of data with prefix A // small amount of data with prefix A
@ -1002,6 +1021,7 @@ func TestDb_SparseMerge(t *testing.T) {
h.put("C", "vc2") h.put("C", "vc2")
h.compactMem() h.compactMem()
h.waitCompaction()
h.maxNextLevelOverlappingBytes(20 * 1048576) h.maxNextLevelOverlappingBytes(20 * 1048576)
h.compactRangeAt(0, "", "") h.compactRangeAt(0, "", "")
h.waitCompaction() h.waitCompaction()
@ -1172,7 +1192,7 @@ func TestDb_HiddenValuesAreRemoved(t *testing.T) {
h.put("foo", "v1") h.put("foo", "v1")
h.compactMem() h.compactMem()
m := kMaxMemCompactLevel m := h.o.GetMaxMemCompationLevel()
v := s.version() v := s.version()
num := v.tLen(m) num := v.tLen(m)
v.release() v.release()
@ -1216,7 +1236,7 @@ func TestDb_DeletionMarkers2(t *testing.T) {
h.put("foo", "v1") h.put("foo", "v1")
h.compactMem() h.compactMem()
m := kMaxMemCompactLevel m := h.o.GetMaxMemCompationLevel()
v := s.version() v := s.version()
num := v.tLen(m) num := v.tLen(m)
v.release() v.release()
@ -1269,14 +1289,14 @@ func TestDb_CompactionTableOpenError(t *testing.T) {
t.Errorf("total tables is %d, want %d", n, im) t.Errorf("total tables is %d, want %d", n, im)
} }
h.stor.SetOpenErr(storage.TypeTable) h.stor.SetEmuErr(storage.TypeTable, tsOpOpen)
go h.db.CompactRange(util.Range{}) go h.db.CompactRange(util.Range{})
if err := h.db.compSendIdle(h.db.tcompCmdC); err != nil { if err := h.db.compSendIdle(h.db.tcompCmdC); err != nil {
t.Log("compaction error: ", err) t.Log("compaction error: ", err)
} }
h.closeDB0() h.closeDB0()
h.openDB() h.openDB()
h.stor.SetOpenErr(0) h.stor.SetEmuErr(0, tsOpOpen)
for i := 0; i < im; i++ { for i := 0; i < im; i++ {
for j := 0; j < jm; j++ { for j := 0; j < jm; j++ {
@ -1287,7 +1307,7 @@ func TestDb_CompactionTableOpenError(t *testing.T) {
func TestDb_OverlapInLevel0(t *testing.T) { func TestDb_OverlapInLevel0(t *testing.T) {
trun(t, func(h *dbHarness) { trun(t, func(h *dbHarness) {
if kMaxMemCompactLevel != 2 { if h.o.GetMaxMemCompationLevel() != 2 {
t.Fatal("fix test to reflect the config") t.Fatal("fix test to reflect the config")
} }
@ -1407,23 +1427,23 @@ func TestDb_ManifestWriteError(t *testing.T) {
h.compactMem() h.compactMem()
h.getVal("foo", "bar") h.getVal("foo", "bar")
v := h.db.s.version() v := h.db.s.version()
if n := v.tLen(kMaxMemCompactLevel); n != 1 { if n := v.tLen(h.o.GetMaxMemCompationLevel()); n != 1 {
t.Errorf("invalid total tables, want=1 got=%d", n) t.Errorf("invalid total tables, want=1 got=%d", n)
} }
v.release() v.release()
if i == 0 { if i == 0 {
h.stor.SetWriteErr(storage.TypeManifest) h.stor.SetEmuErr(storage.TypeManifest, tsOpWrite)
} else { } else {
h.stor.SetSyncErr(storage.TypeManifest) h.stor.SetEmuErr(storage.TypeManifest, tsOpSync)
} }
// Merging compaction (will fail) // Merging compaction (will fail)
h.compactRangeAtErr(kMaxMemCompactLevel, "", "", true) h.compactRangeAtErr(h.o.GetMaxMemCompationLevel(), "", "", true)
h.db.Close() h.db.Close()
h.stor.SetWriteErr(0) h.stor.SetEmuErr(0, tsOpWrite)
h.stor.SetSyncErr(0) h.stor.SetEmuErr(0, tsOpSync)
// Should not lose data // Should not lose data
h.openDB() h.openDB()
@ -1573,7 +1593,7 @@ func TestDb_ManualCompaction(t *testing.T) {
h := newDbHarness(t) h := newDbHarness(t)
defer h.close() defer h.close()
if kMaxMemCompactLevel != 2 { if h.o.GetMaxMemCompationLevel() != 2 {
t.Fatal("fix test to reflect the config") t.Fatal("fix test to reflect the config")
} }
@ -1857,7 +1877,7 @@ func TestDb_DeletionMarkersOnMemdb(t *testing.T) {
} }
func TestDb_LeveldbIssue178(t *testing.T) { func TestDb_LeveldbIssue178(t *testing.T) {
nKeys := (kMaxTableSize / 30) * 5 nKeys := (opt.DefaultCompactionTableSize / 30) * 5
key1 := func(i int) string { key1 := func(i int) string {
return fmt.Sprintf("my_key_%d", i) return fmt.Sprintf("my_key_%d", i)
} }
@ -2125,7 +2145,7 @@ func TestDb_GoleveldbIssue72and83(t *testing.T) {
} }
} }
if err := iter.Error(); err != nil { if err := iter.Error(); err != nil {
t.Fatalf("READER0 #%d.%d W#%d snap.Iterator: %v", i, k, err) t.Fatalf("READER0 #%d.%d W#%d snap.Iterator: %v", i, k, writei, err)
} }
iter.Release() iter.Release()
snap.Release() snap.Release()
@ -2164,5 +2184,385 @@ func TestDb_GoleveldbIssue72and83(t *testing.T) {
}() }()
wg.Wait() wg.Wait()
}
// TestDb_TransientError checks that snapshots stay readable and consistent
// when the storage layer reports transient (non-corruption) errors.
//
// For each of nSnap rounds it writes nKey keys, takes a snapshot, then
// deletes the keys again, all while random errors are emulated on table
// open/read operations. A failed write is retried until it succeeds; a
// corruption error during the retry fails the test. Afterwards every
// snapshot is verified twice in parallel: once through random-order Get
// calls and once through a full iterator scan.
func TestDb_TransientError(t *testing.T) {
	h := newDbHarnessWopt(t, &opt.Options{
		WriteBuffer:              128 * opt.KiB,
		CachedOpenFiles:          3,
		DisableCompactionBackoff: true,
	})
	defer h.close()

	const (
		nSnap = 20
		nKey  = 10000
	)

	var (
		snaps [nSnap]*Snapshot
		b     = &Batch{}
	)
	for i := range snaps {
		vtail := fmt.Sprintf("VAL%030d", i)

		// Write round: put every key with a value unique to this round.
		b.Reset()
		for k := 0; k < nKey; k++ {
			key := fmt.Sprintf("KEY%8d", k)
			b.Put([]byte(key), []byte(key+vtail))
		}
		h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt)
		if err := h.db.Write(b, nil); err != nil {
			t.Logf("WRITE #%d error: %v", i, err)
			h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt, tsOpWrite)
			// Retry until the write goes through; only corruption is fatal.
			for {
				if err := h.db.Write(b, nil); err == nil {
					break
				} else if errors.IsCorrupted(err) {
					t.Fatalf("WRITE #%d corrupted: %v", i, err)
				}
			}
		}

		// Capture the state right after the write, before the deletes below.
		snaps[i] = h.db.newSnapshot()

		// Delete round: remove every key again.
		b.Reset()
		for k := 0; k < nKey; k++ {
			key := fmt.Sprintf("KEY%8d", k)
			b.Delete([]byte(key))
		}
		h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt)
		if err := h.db.Write(b, nil); err != nil {
			t.Logf("WRITE #%d error: %v", i, err)
			h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt)
			for {
				if err := h.db.Write(b, nil); err == nil {
					break
				} else if errors.IsCorrupted(err) {
					t.Fatalf("WRITE #%d corrupted: %v", i, err)
				}
			}
		}
	}
	h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt)

	runtime.GOMAXPROCS(runtime.NumCPU())

	rnd := rand.New(rand.NewSource(0xecafdaed))
	wg := &sync.WaitGroup{}
	for i, snap := range snaps {
		wg.Add(2)

		// Reader 1: point lookups in a random key order.
		//
		// t.Fatalf may only be called from the goroutine running the test,
		// so the readers report failures with t.Errorf and return instead.
		go func(i int, snap *Snapshot, sk []int) {
			defer wg.Done()

			vtail := fmt.Sprintf("VAL%030d", i)
			for _, k := range sk {
				key := fmt.Sprintf("KEY%8d", k)
				xvalue, err := snap.Get([]byte(key), nil)
				if err != nil {
					t.Errorf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err)
					return
				}
				value := key + vtail
				if !bytes.Equal([]byte(value), xvalue) {
					t.Errorf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue)
					return
				}
			}
		}(i, snap, rnd.Perm(nKey))

		// Reader 2: sequential scan, expecting exactly nKey entries in order.
		go func(i int, snap *Snapshot) {
			defer wg.Done()

			vtail := fmt.Sprintf("VAL%030d", i)
			iter := snap.NewIterator(nil, nil)
			defer iter.Release()
			for k := 0; k < nKey; k++ {
				if !iter.Next() {
					if err := iter.Error(); err != nil {
						t.Errorf("READER_ITER #%d K%d error: %v", i, k, err)
					} else {
						t.Errorf("READER_ITER #%d K%d eoi", i, k)
					}
					return
				}
				key := fmt.Sprintf("KEY%8d", k)
				xkey := iter.Key()
				if !bytes.Equal([]byte(key), xkey) {
					t.Errorf("READER_ITER #%d K%d invalid key: want %q, got %q", i, k, key, xkey)
					return
				}
				value := key + vtail
				xvalue := iter.Value()
				if !bytes.Equal([]byte(value), xvalue) {
					t.Errorf("READER_ITER #%d K%d invalid value: want %q, got %q", i, k, value, xvalue)
					return
				}
			}
		}(i, snap)
	}

	wg.Wait()
}
// TestDb_UkeyShouldntHopAcrossTable writes and then deletes the same nKey
// keys nSnap times, taking a snapshot after every write. It then compacts
// the memdb followed by levels 0 and 1 (logging the table layout after each
// step) and finally verifies, concurrently, that every snapshot still
// returns the values that were written for it.
func TestDb_UkeyShouldntHopAcrossTable(t *testing.T) {
	h := newDbHarnessWopt(t, &opt.Options{
		WriteBuffer:                 112 * opt.KiB,
		CompactionTableSize:         90 * opt.KiB,
		CompactionExpandLimitFactor: 1,
	})
	defer h.close()

	const (
		nSnap = 190
		nKey  = 140
	)

	var (
		snaps [nSnap]*Snapshot
		b     = &Batch{}
	)
	for i := range snaps {
		vtail := fmt.Sprintf("VAL%030d", i)

		b.Reset()
		for k := 0; k < nKey; k++ {
			key := fmt.Sprintf("KEY%08d", k)
			b.Put([]byte(key), []byte(key+vtail))
		}
		if err := h.db.Write(b, nil); err != nil {
			t.Fatalf("WRITE #%d error: %v", i, err)
		}

		snaps[i] = h.db.newSnapshot()

		b.Reset()
		for k := 0; k < nKey; k++ {
			key := fmt.Sprintf("KEY%08d", k)
			b.Delete([]byte(key))
		}
		if err := h.db.Write(b, nil); err != nil {
			t.Fatalf("WRITE #%d error: %v", i, err)
		}
	}

	// logTables dumps the key range of every table in the current version.
	logTables := func() {
		for level, tables := range h.db.s.stVersion.tables {
			for _, table := range tables {
				t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax)
			}
		}
	}

	h.compactMem()
	h.waitCompaction()
	logTables()

	h.compactRangeAt(0, "", "")
	h.waitCompaction()
	logTables()

	h.compactRangeAt(1, "", "")
	h.waitCompaction()
	logTables()

	runtime.GOMAXPROCS(runtime.NumCPU())

	wg := &sync.WaitGroup{}
	for i, snap := range snaps {
		wg.Add(1)

		// t.Fatalf may only be called from the goroutine running the test,
		// so the readers report failures with t.Errorf and return instead.
		go func(i int, snap *Snapshot) {
			defer wg.Done()

			vtail := fmt.Sprintf("VAL%030d", i)
			for k := 0; k < nKey; k++ {
				key := fmt.Sprintf("KEY%08d", k)
				xvalue, err := snap.Get([]byte(key), nil)
				if err != nil {
					t.Errorf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err)
					return
				}
				value := key + vtail
				if !bytes.Equal([]byte(value), xvalue) {
					t.Errorf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue)
					return
				}
			}
		}(i, snap)
	}

	wg.Wait()
}
// TestDb_TableCompactionBuilder drives tableCompactionBuilder directly
// against a session created on test storage. It writes two tables, runs
// builder passes over them, checks that no user key sits on the boundary of
// two adjacent level-1 tables, then repeats a compaction while storage
// errors are emulated and compares the result key-by-key with its input.
func TestDb_TableCompactionBuilder(t *testing.T) {
stor := newTestStorage(t)
defer stor.Close()
// nSeq is the number of entries written per user key.
const nSeq = 99
o := &opt.Options{
WriteBuffer: 112 * opt.KiB,
CompactionTableSize: 43 * opt.KiB,
CompactionExpandLimitFactor: 1,
CompactionGPOverlapsFactor: 1,
BlockCache: opt.NoCache,
}
s, err := newSession(stor, o)
if err != nil {
t.Fatal(err)
}
if err := s.create(); err != nil {
t.Fatal(err)
}
defer s.close()
var (
seq uint64
targetSize = 5 * o.CompactionTableSize
value = bytes.Repeat([]byte{'0'}, 100)
)
// Write two tables, each grown until its byte length reaches targetSize;
// table i is committed to level i.
for i := 0; i < 2; i++ {
tw, err := s.tops.create()
if err != nil {
t.Fatal(err)
}
for k := 0; tw.tw.BytesLen() < targetSize; k++ {
key := []byte(fmt.Sprintf("%09d", k))
seq += nSeq - 1
// Each user key gets nSeq entries with descending sequence numbers.
for x := uint64(0); x < nSeq; x++ {
if err := tw.append(newIkey(key, seq-x, ktVal), value); err != nil {
t.Fatal(err)
}
}
}
tf, err := tw.finish()
if err != nil {
t.Fatal(err)
}
rec := &sessionRecord{numLevel: s.o.GetNumLevel()}
rec.addTableFile(i, tf)
if err := s.commit(rec); err != nil {
t.Fatal(err)
}
}
// Build grandparent.
// Runs the builder over the level-1 tables with a smaller table size and
// then removes the compaction inputs from the record before committing.
v := s.version()
c := newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...))
rec := &sessionRecord{numLevel: s.o.GetNumLevel()}
b := &tableCompactionBuilder{
s: s,
c: c,
rec: rec,
stat1: new(cStatsStaging),
minSeq: 0,
strict: true,
tableSize: o.CompactionTableSize/3 + 961,
}
if err := b.run(new(compactionTransactCounter)); err != nil {
t.Fatal(err)
}
// NOTE: the loop variable t shadows the *testing.T parameter inside the loop.
for _, t := range c.tables[0] {
rec.delTable(c.level, t.file.Num())
}
if err := s.commit(rec); err != nil {
t.Fatal(err)
}
c.release()
// Build level-1.
// Same procedure for the level-0 tables, using the configured table size.
v = s.version()
c = newCompaction(s, v, 0, append(tFiles{}, v.tables[0]...))
rec = &sessionRecord{numLevel: s.o.GetNumLevel()}
b = &tableCompactionBuilder{
s: s,
c: c,
rec: rec,
stat1: new(cStatsStaging),
minSeq: 0,
strict: true,
tableSize: o.CompactionTableSize,
}
if err := b.run(new(compactionTransactCounter)); err != nil {
t.Fatal(err)
}
for _, t := range c.tables[0] {
rec.delTable(c.level, t.file.Num())
}
// Move grandparent to level-3
for _, t := range v.tables[2] {
rec.delTable(2, t.file.Num())
rec.addTableFile(3, t)
}
if err := s.commit(rec); err != nil {
t.Fatal(err)
}
c.release()
// Tables are now expected at levels 1 and 3 only.
v = s.version()
for level, want := range []bool{false, true, false, true, false} {
got := len(v.tables[level]) > 0
if want != got {
t.Fatalf("invalid level-%d tables len: want %v, got %v", level, want, got)
}
}
// The largest user key of a level-1 table must differ from the smallest
// user key of the table that follows it.
for i, f := range v.tables[1][:len(v.tables[1])-1] {
nf := v.tables[1][i+1]
if bytes.Equal(f.imax.ukey(), nf.imin.ukey()) {
t.Fatalf("KEY %q hop across table %d .. %d", f.imax.ukey(), f.file.Num(), nf.file.Num())
}
}
v.release()
// Compaction with transient error.
v = s.version()
c = newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...))
rec = &sessionRecord{numLevel: s.o.GetNumLevel()}
b = &tableCompactionBuilder{
s: s,
c: c,
rec: rec,
stat1: new(cStatsStaging),
minSeq: 0,
strict: true,
tableSize: o.CompactionTableSize,
}
// Enable emulated storage errors on table files (presumably a one-shot
// sync error plus random read/write errors — confirm against the test
// storage implementation), then rerun the builder until it succeeds.
stor.SetEmuErrOnce(storage.TypeTable, tsOpSync)
stor.SetEmuRandErr(storage.TypeTable, tsOpRead, tsOpReadAt, tsOpWrite)
stor.SetEmuRandErrProb(0xf0)
for {
if err := b.run(new(compactionTransactCounter)); err != nil {
t.Logf("(expected) b.run: %v", err)
} else {
break
}
}
if err := s.commit(rec); err != nil {
t.Fatal(err)
}
c.release()
// Presumably clears the emulated errors again (file type 0) — confirm.
stor.SetEmuErrOnce(0, tsOpSync)
stor.SetEmuRandErr(0, tsOpRead, tsOpReadAt, tsOpWrite)
// The inputs of the last compaction were not deleted from rec, so level 1
// is compared table-by-table and key-by-key against level 2 (presumably
// where the compaction output landed — confirm).
v = s.version()
if len(v.tables[1]) != len(v.tables[2]) {
t.Fatalf("invalid tables length, want %d, got %d", len(v.tables[1]), len(v.tables[2]))
}
for i, f0 := range v.tables[1] {
f1 := v.tables[2][i]
iter0 := s.tops.newIterator(f0, nil, nil)
iter1 := s.tops.newIterator(f1, nil, nil)
// Advance both iterators in lockstep; they must end together and yield
// identical keys at every step.
for j := 0; true; j++ {
next0 := iter0.Next()
next1 := iter1.Next()
if next0 != next1 {
t.Fatalf("#%d.%d invalid eoi: want %v, got %v", i, j, next0, next1)
}
key0 := iter0.Key()
key1 := iter1.Key()
if !bytes.Equal(key0, key1) {
t.Fatalf("#%d.%d invalid key: want %q, got %q", i, j, key0, key1)
}
if next0 == false {
break
}
}
iter0.Release()
iter1.Release()
}
v.release()
} }

View File

@ -7,8 +7,7 @@
package leveldb package leveldb
import ( import (
"errors" "github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/opt"
"github.com/syndtr/goleveldb/leveldb/storage" "github.com/syndtr/goleveldb/leveldb/storage"
@ -38,7 +37,9 @@ func (db *DB) logf(format string, v ...interface{}) { db.s.logf(format, v...) }
// Check and clean files. // Check and clean files.
func (db *DB) checkAndCleanFiles() error { func (db *DB) checkAndCleanFiles() error {
v := db.s.version_NB() v := db.s.version()
defer v.release()
tablesMap := make(map[uint64]bool) tablesMap := make(map[uint64]bool)
for _, tables := range v.tables { for _, tables := range v.tables {
for _, t := range tables { for _, t := range tables {
@ -78,12 +79,14 @@ func (db *DB) checkAndCleanFiles() error {
} }
if nTables != len(tablesMap) { if nTables != len(tablesMap) {
var missing []*storage.FileInfo
for num, present := range tablesMap { for num, present := range tablesMap {
if !present { if !present {
missing = append(missing, &storage.FileInfo{Type: storage.TypeTable, Num: num})
db.logf("db@janitor table missing @%d", num) db.logf("db@janitor table missing @%d", num)
} }
} }
return ErrCorrupted{Type: MissingFiles, Err: errors.New("leveldb: table files missing")} return errors.NewErrCorrupted(nil, &errors.ErrMissingFiles{Files: missing})
} }
db.logf("db@janitor F·%d G·%d", len(files), len(rem)) db.logf("db@janitor F·%d G·%d", len(files), len(rem))

View File

@ -59,7 +59,7 @@ func (db *DB) rotateMem(n int) (mem *memDB, err error) {
} }
// Schedule memdb compaction. // Schedule memdb compaction.
db.compTrigger(db.mcompTriggerC) db.compSendTrigger(db.mcompCmdC)
return return
} }
@ -77,12 +77,12 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) {
}() }()
nn = mem.mdb.Free() nn = mem.mdb.Free()
switch { switch {
case v.tLen(0) >= kL0_SlowdownWritesTrigger && !delayed: case v.tLen(0) >= db.s.o.GetWriteL0SlowdownTrigger() && !delayed:
delayed = true delayed = true
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
case nn >= n: case nn >= n:
return false return false
case v.tLen(0) >= kL0_StopWritesTrigger: case v.tLen(0) >= db.s.o.GetWriteL0PauseTrigger():
delayed = true delayed = true
err = db.compSendIdle(db.tcompCmdC) err = db.compSendIdle(db.tcompCmdC)
if err != nil { if err != nil {
@ -109,7 +109,12 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) {
for flush() { for flush() {
} }
if delayed { if delayed {
db.logf("db@write delayed T·%v", time.Since(start)) db.writeDelay += time.Since(start)
db.writeDelayN++
} else if db.writeDelayN > 0 {
// Report the accumulated delay before clearing the counters; logging
// after the reset would always print N·0 T·0s.
db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
db.writeDelay = 0
db.writeDelayN = 0
} }
return return
} }
@ -120,7 +125,7 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) {
// It is safe to modify the contents of the arguments after Write returns. // It is safe to modify the contents of the arguments after Write returns.
func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) { func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) {
err = db.ok() err = db.ok()
if err != nil || b == nil || b.len() == 0 { if err != nil || b == nil || b.Len() == 0 {
return return
} }
@ -133,6 +138,8 @@ func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) {
return <-db.writeAckC return <-db.writeAckC
} }
case db.writeLockC <- struct{}{}: case db.writeLockC <- struct{}{}:
case err = <-db.compPerErrC:
return
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
return ErrClosed return ErrClosed
} }
@ -188,35 +195,43 @@ drain:
if b.size() >= (128 << 10) { if b.size() >= (128 << 10) {
// Push the write batch to the journal writer // Push the write batch to the journal writer
select { select {
case db.journalC <- b:
// Write into memdb
if berr := b.memReplay(mem.mdb); berr != nil {
panic(berr)
}
case err = <-db.compPerErrC:
return
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
err = ErrClosed err = ErrClosed
return return
case db.journalC <- b:
// Write into memdb
b.memReplay(mem.mdb)
} }
// Wait for journal writer // Wait for journal writer
select { select {
case _, _ = <-db.closeC:
err = ErrClosed
return
case err = <-db.journalAckC: case err = <-db.journalAckC:
if err != nil { if err != nil {
// Revert memdb if error detected // Revert memdb if error detected
b.revertMemReplay(mem.mdb) if berr := b.revertMemReplay(mem.mdb); berr != nil {
panic(berr)
}
return return
} }
case _, _ = <-db.closeC:
err = ErrClosed
return
} }
} else { } else {
err = db.writeJournal(b) err = db.writeJournal(b)
if err != nil { if err != nil {
return return
} }
b.memReplay(mem.mdb) if berr := b.memReplay(mem.mdb); berr != nil {
panic(berr)
}
} }
// Set last seq number. // Set last seq number.
db.addSeq(uint64(b.len())) db.addSeq(uint64(b.Len()))
if b.size() >= memFree { if b.size() >= memFree {
db.rotateMem(0) db.rotateMem(0)
@ -268,6 +283,8 @@ func (db *DB) CompactRange(r util.Range) error {
// Lock writer. // Lock writer.
select { select {
case db.writeLockC <- struct{}{}: case db.writeLockC <- struct{}{}:
case err := <-db.compPerErrC:
return err
case _, _ = <-db.closeC: case _, _ = <-db.closeC:
return ErrClosed return ErrClosed
} }

View File

@ -7,32 +7,12 @@
package leveldb package leveldb
import ( import (
"errors" "github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/util"
) )
var ( var (
ErrNotFound = util.ErrNotFound ErrNotFound = errors.ErrNotFound
ErrSnapshotReleased = errors.New("leveldb: snapshot released") ErrSnapshotReleased = errors.New("leveldb: snapshot released")
ErrIterReleased = errors.New("leveldb: iterator released") ErrIterReleased = errors.New("leveldb: iterator released")
ErrClosed = errors.New("leveldb: closed") ErrClosed = errors.New("leveldb: closed")
) )
type CorruptionType int
const (
CorruptedManifest CorruptionType = iota
MissingFiles
)
// ErrCorrupted is the type that wraps errors that indicate corruption in
// the database.
type ErrCorrupted struct {
Type CorruptionType
Err error
}
func (e ErrCorrupted) Error() string {
return e.Err.Error()
}

View File

@ -0,0 +1,76 @@
// Copyright (c) 2014, Suryandaru Triandana <syndtr@gmail.com>
// All rights reserved.
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Package errors provides common error types used throughout leveldb.
package errors
import (
"errors"
"fmt"
"github.com/syndtr/goleveldb/leveldb/storage"
"github.com/syndtr/goleveldb/leveldb/util"
)
// Common error values shared by the leveldb packages.
var (
// ErrNotFound is a sentinel error carrying the message "leveldb: not found".
ErrNotFound = New("leveldb: not found")
// ErrReleased and ErrHasReleaser re-export the values of the same name
// from package util.
ErrReleased = util.ErrReleased
ErrHasReleaser = util.ErrHasReleaser
)
// New creates an error whose Error method yields exactly the given text.
// It forwards to the standard library's errors.New, so every call returns a
// distinct error value.
func New(text string) error {
	err := errors.New(text)
	return err
}
// ErrCorrupted is the type that wraps errors that indicate corruption in
// the database.
type ErrCorrupted struct {
// File describes the file the corruption relates to. It may be nil, in
// which case Error reports only the wrapped error.
File *storage.FileInfo
// Err is the underlying error describing the corruption.
Err error
}
// Error implements the error interface. The message is that of the wrapped
// error, followed by the file info when one is attached.
func (e *ErrCorrupted) Error() string {
	if e.File == nil {
		return e.Err.Error()
	}
	return fmt.Sprintf("%v [file=%v]", e.Err, e.File)
}
// NewErrCorrupted wraps err in a new *ErrCorrupted whose File field is the
// file info obtained from f via storage.NewFileInfo.
func NewErrCorrupted(f storage.File, err error) error {
	corrupted := &ErrCorrupted{
		File: storage.NewFileInfo(f),
		Err:  err,
	}
	return corrupted
}
// IsCorrupted reports whether err indicates a corruption, i.e. whether its
// dynamic type is *ErrCorrupted. A nil err yields false.
func IsCorrupted(err error) bool {
	_, ok := err.(*ErrCorrupted)
	return ok
}
// ErrMissingFiles is the type that indicates a corruption due to missing
// files.
type ErrMissingFiles struct {
// Files lists the files that were found to be missing.
Files []*storage.FileInfo
}
// Error implements the error interface. The message is fixed and does not
// enumerate e.Files.
func (e *ErrMissingFiles) Error() string {
	const msg = "file missing"
	return msg
}
// SetFile attaches the 'file info' of f to err and returns err.
// Only *ErrCorrupted carries file info; any other error (including nil) is
// returned untouched.
func SetFile(err error, f storage.File) error {
	corrupted, ok := err.(*ErrCorrupted)
	if !ok {
		return err
	}
	corrupted.File = storage.NewFileInfo(f)
	return corrupted
}

View File

@ -7,6 +7,7 @@
package iterator package iterator
import ( import (
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
@ -24,7 +25,6 @@ type indexedIterator struct {
util.BasicReleaser util.BasicReleaser
index IteratorIndexer index IteratorIndexer
strict bool strict bool
strictGet bool
data Iterator data Iterator
err error err error
@ -37,11 +37,6 @@ func (i *indexedIterator) setData() {
i.data.Release() i.data.Release()
} }
i.data = i.index.Get() i.data = i.index.Get()
if i.strictGet {
if err := i.data.Error(); err != nil {
i.err = err
}
}
} }
func (i *indexedIterator) clearData() { func (i *indexedIterator) clearData() {
@ -61,13 +56,11 @@ func (i *indexedIterator) indexErr() {
} }
func (i *indexedIterator) dataErr() bool { func (i *indexedIterator) dataErr() bool {
if i.errf != nil {
if err := i.data.Error(); err != nil { if err := i.data.Error(); err != nil {
if i.errf != nil {
i.errf(err) i.errf(err)
} }
} if i.strict || !errors.IsCorrupted(err) {
if i.strict {
if err := i.data.Error(); err != nil {
i.err = err i.err = err
return true return true
} }
@ -236,16 +229,14 @@ func (i *indexedIterator) SetErrorCallback(f func(err error)) {
i.errf = f i.errf = f
} }
// NewIndexedIterator returns an indexed iterator. An index is iterator // NewIndexedIterator returns an 'indexed iterator'. An index is iterator
// that returns another iterator, a data iterator. A data iterator is the // that returns another iterator, a 'data iterator'. A 'data iterator' is the
// iterator that contains actual key/value pairs. // iterator that contains actual key/value pairs.
// //
// If strict is true then error yield by data iterator will halt the indexed // If strict is true then any 'corruption errors' (i.e. errors.IsCorrupted(err) == true)
// iterator, on contrary if strict is false then the indexed iterator will // won't be ignored and will halt 'indexed iterator', otherwise the iterator will
// ignore those error and move on to the next index. If strictGet is true and // continue to the next 'data iterator'. Corruption on 'index iterator' will not be
// index.Get() yield an 'error iterator' then the indexed iterator will be halted. // ignored and will halt the iterator.
// An 'error iterator' is iterator which its Error() method always return non-nil func NewIndexedIterator(index IteratorIndexer, strict bool) Iterator {
// even before any 'seeks method' is called. return &indexedIterator{index: index, strict: strict}
func NewIndexedIterator(index IteratorIndexer, strict, strictGet bool) Iterator {
return &indexedIterator{index: index, strict: strict, strictGet: strictGet}
} }

View File

@ -65,7 +65,7 @@ var _ = testutil.Defer(func() {
// Test the iterator. // Test the iterator.
t := testutil.IteratorTesting{ t := testutil.IteratorTesting{
KeyValue: kv.Clone(), KeyValue: kv.Clone(),
Iter: NewIndexedIterator(NewArrayIndexer(index), true, true), Iter: NewIndexedIterator(NewArrayIndexer(index), true),
} }
testutil.DoIteratorTesting(&t) testutil.DoIteratorTesting(&t)
done <- true done <- true

View File

@ -8,6 +8,7 @@ package iterator
import ( import (
"github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
@ -42,13 +43,11 @@ func assertKey(key []byte) []byte {
} }
func (i *mergedIterator) iterErr(iter Iterator) bool { func (i *mergedIterator) iterErr(iter Iterator) bool {
if i.errf != nil {
if err := iter.Error(); err != nil { if err := iter.Error(); err != nil {
if i.errf != nil {
i.errf(err) i.errf(err)
} }
} if i.strict || !errors.IsCorrupted(err) {
if i.strict {
if err := iter.Error(); err != nil {
i.err = err i.err = err
return true return true
} }
@ -292,9 +291,9 @@ func (i *mergedIterator) SetErrorCallback(f func(err error)) {
// keys: if iters[i] contains a key k then iters[j] will not contain that key k. // keys: if iters[i] contains a key k then iters[j] will not contain that key k.
// None of the iters may be nil. // None of the iters may be nil.
// //
// If strict is true then error yield by any iterators will halt the merged // If strict is true then any 'corruption errors' (i.e. errors.IsCorrupted(err) == true)
// iterator, on contrary if strict is false then the merged iterator will // won't be ignored and will halt 'merged iterator', otherwise the iterator will
// ignore those error and move on to the next iterator. // continue to the next 'input iterator'.
func NewMergedIterator(iters []Iterator, cmp comparer.Comparer, strict bool) Iterator { func NewMergedIterator(iters []Iterator, cmp comparer.Comparer, strict bool) Iterator {
return &mergedIterator{ return &mergedIterator{
iters: iters, iters: iters,

View File

@ -79,10 +79,10 @@ package journal
import ( import (
"encoding/binary" "encoding/binary"
"errors"
"fmt" "fmt"
"io" "io"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
@ -109,7 +109,7 @@ type ErrCorrupted struct {
Reason string Reason string
} }
func (e ErrCorrupted) Error() string { func (e *ErrCorrupted) Error() string {
return fmt.Sprintf("leveldb/journal: block/chunk corrupted: %s (%d bytes)", e.Reason, e.Size) return fmt.Sprintf("leveldb/journal: block/chunk corrupted: %s (%d bytes)", e.Reason, e.Size)
} }
@ -162,10 +162,10 @@ var errSkip = errors.New("leveldb/journal: skipped")
func (r *Reader) corrupt(n int, reason string, skip bool) error { func (r *Reader) corrupt(n int, reason string, skip bool) error {
if r.dropper != nil { if r.dropper != nil {
r.dropper.Drop(ErrCorrupted{n, reason}) r.dropper.Drop(&ErrCorrupted{n, reason})
} }
if r.strict && !skip { if r.strict && !skip {
r.err = ErrCorrupted{n, reason} r.err = errors.NewErrCorrupted(nil, &ErrCorrupted{n, reason})
return r.err return r.err
} }
return errSkip return errSkip

View File

@ -9,15 +9,30 @@ package leveldb
import ( import (
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"github.com/syndtr/goleveldb/leveldb/errors"
) )
type vType int type ErrIkeyCorrupted struct {
Ikey []byte
Reason string
}
func (t vType) String() string { func (e *ErrIkeyCorrupted) Error() string {
switch t { return fmt.Sprintf("leveldb: iKey %q corrupted: %s", e.Ikey, e.Reason)
case tDel: }
func newErrIkeyCorrupted(ikey []byte, reason string) error {
return errors.NewErrCorrupted(nil, &ErrIkeyCorrupted{append([]byte{}, ikey...), reason})
}
type kType int
func (kt kType) String() string {
switch kt {
case ktDel:
return "d" return "d"
case tVal: case ktVal:
return "v" return "v"
} }
return "x" return "x"
@ -26,16 +41,16 @@ func (t vType) String() string {
// Value types encoded as the last component of internal keys. // Value types encoded as the last component of internal keys.
// Don't modify; this value are saved to disk. // Don't modify; this value are saved to disk.
const ( const (
tDel vType = iota ktDel kType = iota
tVal ktVal
) )
// tSeek defines the vType that should be passed when constructing an // ktSeek defines the kType that should be passed when constructing an
// internal key for seeking to a particular sequence number (since we // internal key for seeking to a particular sequence number (since we
// sort sequence numbers in decreasing order and the value type is // sort sequence numbers in decreasing order and the value type is
// embedded as the low 8 bits in the sequence number in internal keys, // embedded as the low 8 bits in the sequence number in internal keys,
// we need to use the highest-numbered ValueType, not the lowest). // we need to use the highest-numbered ValueType, not the lowest).
const tSeek = tVal const ktSeek = ktVal
const ( const (
// Maximum value possible for sequence number; the 8-bits are // Maximum value possible for sequence number; the 8-bits are
@ -43,7 +58,7 @@ const (
// 64-bit integer. // 64-bit integer.
kMaxSeq uint64 = (uint64(1) << 56) - 1 kMaxSeq uint64 = (uint64(1) << 56) - 1
// Maximum value possible for packed sequence number and type. // Maximum value possible for packed sequence number and type.
kMaxNum uint64 = (kMaxSeq << 8) | uint64(tSeek) kMaxNum uint64 = (kMaxSeq << 8) | uint64(ktSeek)
) )
// Maximum number encoded in bytes. // Maximum number encoded in bytes.
@ -55,85 +70,73 @@ func init() {
type iKey []byte type iKey []byte
func newIKey(ukey []byte, seq uint64, t vType) iKey { func newIkey(ukey []byte, seq uint64, kt kType) iKey {
if seq > kMaxSeq || t > tVal { if seq > kMaxSeq {
panic("invalid seq number or value type") panic("leveldb: invalid sequence number")
} else if kt > ktVal {
panic("leveldb: invalid type")
} }
b := make(iKey, len(ukey)+8) ik := make(iKey, len(ukey)+8)
copy(b, ukey) copy(ik, ukey)
binary.LittleEndian.PutUint64(b[len(ukey):], (seq<<8)|uint64(t)) binary.LittleEndian.PutUint64(ik[len(ukey):], (seq<<8)|uint64(kt))
return b return ik
} }
func parseIkey(p []byte) (ukey []byte, seq uint64, t vType, ok bool) { func parseIkey(ik []byte) (ukey []byte, seq uint64, kt kType, err error) {
if len(p) < 8 { if len(ik) < 8 {
return return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid length")
} }
num := binary.LittleEndian.Uint64(p[len(p)-8:]) num := binary.LittleEndian.Uint64(ik[len(ik)-8:])
seq, t = uint64(num>>8), vType(num&0xff) seq, kt = uint64(num>>8), kType(num&0xff)
if t > tVal { if kt > ktVal {
return return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid type")
} }
ukey = p[:len(p)-8] ukey = ik[:len(ik)-8]
ok = true
return return
} }
func validIkey(p []byte) bool { func validIkey(ik []byte) bool {
_, _, _, ok := parseIkey(p) _, _, _, err := parseIkey(ik)
return ok return err == nil
} }
func (p iKey) assert() { func (ik iKey) assert() {
if p == nil { if ik == nil {
panic("nil iKey") panic("leveldb: nil iKey")
} }
if len(p) < 8 { if len(ik) < 8 {
panic(fmt.Sprintf("invalid iKey %q, len=%d", []byte(p), len(p))) panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid length", ik, len(ik)))
} }
} }
func (p iKey) ok() bool { func (ik iKey) ukey() []byte {
if len(p) < 8 { ik.assert()
return false return ik[:len(ik)-8]
}
_, _, ok := p.parseNum()
return ok
} }
func (p iKey) ukey() []byte { func (ik iKey) num() uint64 {
p.assert() ik.assert()
return p[:len(p)-8] return binary.LittleEndian.Uint64(ik[len(ik)-8:])
} }
func (p iKey) num() uint64 { func (ik iKey) parseNum() (seq uint64, kt kType) {
p.assert() num := ik.num()
return binary.LittleEndian.Uint64(p[len(p)-8:]) seq, kt = uint64(num>>8), kType(num&0xff)
} if kt > ktVal {
panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid type %#x", ik, len(ik), kt))
func (p iKey) parseNum() (seq uint64, t vType, ok bool) {
if p == nil {
panic("nil iKey")
} }
if len(p) < 8 {
return
}
num := p.num()
seq, t = uint64(num>>8), vType(num&0xff)
if t > tVal {
return 0, 0, false
}
ok = true
return return
} }
func (p iKey) String() string { func (ik iKey) String() string {
if len(p) == 0 { if ik == nil {
return "<nil>" return "<nil>"
} }
if seq, t, ok := p.parseNum(); ok {
return fmt.Sprintf("%s,%s%d", shorten(string(p.ukey())), t, seq) if ukey, seq, kt, err := parseIkey(ik); err == nil {
} return fmt.Sprintf("%s,%s%d", shorten(string(ukey)), kt, seq)
} else {
return "<invalid>" return "<invalid>"
}
} }

View File

@ -15,8 +15,8 @@ import (
var defaultIComparer = &iComparer{comparer.DefaultComparer} var defaultIComparer = &iComparer{comparer.DefaultComparer}
func ikey(key string, seq uint64, t vType) iKey { func ikey(key string, seq uint64, kt kType) iKey {
return newIKey([]byte(key), uint64(seq), t) return newIkey([]byte(key), uint64(seq), kt)
} }
func shortSep(a, b []byte) []byte { func shortSep(a, b []byte) []byte {
@ -37,27 +37,37 @@ func shortSuccessor(b []byte) []byte {
return dst return dst
} }
func testSingleKey(t *testing.T, key string, seq uint64, vt vType) { func testSingleKey(t *testing.T, key string, seq uint64, kt kType) {
ik := ikey(key, seq, vt) ik := ikey(key, seq, kt)
if !bytes.Equal(ik.ukey(), []byte(key)) { if !bytes.Equal(ik.ukey(), []byte(key)) {
t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key) t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key)
} }
if rseq, rt, ok := ik.parseNum(); ok { rseq, rt := ik.parseNum()
if rseq != seq { if rseq != seq {
t.Errorf("seq number does not equal, got %v, want %v", rseq, seq) t.Errorf("seq number does not equal, got %v, want %v", rseq, seq)
} }
if rt != kt {
t.Errorf("type does not equal, got %v, want %v", rt, kt)
}
if rt != vt { if rukey, rseq, rt, kerr := parseIkey(ik); kerr == nil {
t.Errorf("type does not equal, got %v, want %v", rt, vt) if !bytes.Equal(rukey, []byte(key)) {
t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key)
}
if rseq != seq {
t.Errorf("seq number does not equal, got %v, want %v", rseq, seq)
}
if rt != kt {
t.Errorf("type does not equal, got %v, want %v", rt, kt)
} }
} else { } else {
t.Error("cannot parse seq and type") t.Errorf("key error: %v", kerr)
} }
} }
func TestIKey_EncodeDecode(t *testing.T) { func TestIkey_EncodeDecode(t *testing.T) {
keys := []string{"", "k", "hello", "longggggggggggggggggggggg"} keys := []string{"", "k", "hello", "longggggggggggggggggggggg"}
seqs := []uint64{ seqs := []uint64{
1, 2, 3, 1, 2, 3,
@ -67,8 +77,8 @@ func TestIKey_EncodeDecode(t *testing.T) {
} }
for _, key := range keys { for _, key := range keys {
for _, seq := range seqs { for _, seq := range seqs {
testSingleKey(t, key, seq, tVal) testSingleKey(t, key, seq, ktVal)
testSingleKey(t, "hello", 1, tDel) testSingleKey(t, "hello", 1, ktDel)
} }
} }
} }
@ -79,45 +89,45 @@ func assertBytes(t *testing.T, want, got []byte) {
} }
} }
func TestIKeyShortSeparator(t *testing.T) { func TestIkeyShortSeparator(t *testing.T) {
// When user keys are same // When user keys are same
assertBytes(t, ikey("foo", 100, tVal), assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, tVal), shortSep(ikey("foo", 100, ktVal),
ikey("foo", 99, tVal))) ikey("foo", 99, ktVal)))
assertBytes(t, ikey("foo", 100, tVal), assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, tVal), shortSep(ikey("foo", 100, ktVal),
ikey("foo", 101, tVal))) ikey("foo", 101, ktVal)))
assertBytes(t, ikey("foo", 100, tVal), assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, tVal), shortSep(ikey("foo", 100, ktVal),
ikey("foo", 100, tVal))) ikey("foo", 100, ktVal)))
assertBytes(t, ikey("foo", 100, tVal), assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, tVal), shortSep(ikey("foo", 100, ktVal),
ikey("foo", 100, tDel))) ikey("foo", 100, ktDel)))
// When user keys are misordered // When user keys are misordered
assertBytes(t, ikey("foo", 100, tVal), assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, tVal), shortSep(ikey("foo", 100, ktVal),
ikey("bar", 99, tVal))) ikey("bar", 99, ktVal)))
// When user keys are different, but correctly ordered // When user keys are different, but correctly ordered
assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek), assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek),
shortSep(ikey("foo", 100, tVal), shortSep(ikey("foo", 100, ktVal),
ikey("hello", 200, tVal))) ikey("hello", 200, ktVal)))
// When start user key is prefix of limit user key // When start user key is prefix of limit user key
assertBytes(t, ikey("foo", 100, tVal), assertBytes(t, ikey("foo", 100, ktVal),
shortSep(ikey("foo", 100, tVal), shortSep(ikey("foo", 100, ktVal),
ikey("foobar", 200, tVal))) ikey("foobar", 200, ktVal)))
// When limit user key is prefix of start user key // When limit user key is prefix of start user key
assertBytes(t, ikey("foobar", 100, tVal), assertBytes(t, ikey("foobar", 100, ktVal),
shortSep(ikey("foobar", 100, tVal), shortSep(ikey("foobar", 100, ktVal),
ikey("foo", 200, tVal))) ikey("foo", 200, ktVal)))
} }
func TestIKeyShortestSuccessor(t *testing.T) { func TestIkeyShortestSuccessor(t *testing.T) {
assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek), assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek),
shortSuccessor(ikey("foo", 100, tVal))) shortSuccessor(ikey("foo", 100, ktVal)))
assertBytes(t, ikey("\xff\xff", 100, tVal), assertBytes(t, ikey("\xff\xff", 100, ktVal),
shortSuccessor(ikey("\xff\xff", 100, tVal))) shortSuccessor(ikey("\xff\xff", 100, ktVal)))
} }

View File

@ -8,17 +8,17 @@
package memdb package memdb
import ( import (
"errors"
"math/rand" "math/rand"
"sync" "sync"
"github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
var ( var (
ErrNotFound = util.ErrNotFound ErrNotFound = errors.ErrNotFound
ErrIterReleased = errors.New("leveldb/memdb: iterator released") ErrIterReleased = errors.New("leveldb/memdb: iterator released")
) )

View File

@ -11,6 +11,7 @@ import (
"github.com/syndtr/goleveldb/leveldb/cache" "github.com/syndtr/goleveldb/leveldb/cache"
"github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/filter"
"math"
) )
const ( const (
@ -23,9 +24,21 @@ const (
DefaultBlockCacheSize = 8 * MiB DefaultBlockCacheSize = 8 * MiB
DefaultBlockRestartInterval = 16 DefaultBlockRestartInterval = 16
DefaultBlockSize = 4 * KiB DefaultBlockSize = 4 * KiB
DefaultCompactionExpandLimitFactor = 25
DefaultCompactionGPOverlapsFactor = 10
DefaultCompactionL0Trigger = 4
DefaultCompactionSourceLimitFactor = 1
DefaultCompactionTableSize = 2 * MiB
DefaultCompactionTableSizeMultiplier = 1.0
DefaultCompactionTotalSize = 10 * MiB
DefaultCompactionTotalSizeMultiplier = 10.0
DefaultCompressionType = SnappyCompression DefaultCompressionType = SnappyCompression
DefaultCachedOpenFiles = 500 DefaultCachedOpenFiles = 500
DefaultMaxMemCompationLevel = 2
DefaultNumLevel = 7
DefaultWriteBuffer = 4 * MiB DefaultWriteBuffer = 4 * MiB
DefaultWriteL0PauseTrigger = 12
DefaultWriteL0SlowdownTrigger = 8
) )
type noCache struct{} type noCache struct{}
@ -65,34 +78,47 @@ const (
nCompression nCompression
) )
// Strict is the DB strict level. // Strict is the DB 'strict level'.
type Strict uint type Strict uint
const ( const (
// If present then a corrupted or invalid chunk or block in manifest // If present then a corrupted or invalid chunk or block in manifest
// journal will cause an error istead of being dropped. // journal will cause an error instead of being dropped.
// This will prevent database with corrupted manifest to be opened.
StrictManifest Strict = 1 << iota StrictManifest Strict = 1 << iota
// If present then a corrupted or invalid chunk or block in journal
// will cause an error istead of being dropped.
StrictJournal
// If present then journal chunk checksum will be verified. // If present then journal chunk checksum will be verified.
StrictJournalChecksum StrictJournalChecksum
// If present then an invalid key/value pair will cause an error // If present then a corrupted or invalid chunk or block in journal
// instead of being skipped. // will cause an error instead of being dropped.
StrictIterator // This will prevent database with corrupted journal to be opened.
StrictJournal
// If present then 'sorted table' block checksum will be verified. // If present then 'sorted table' block checksum will be verified.
// This has effect on both 'read operation' and compaction.
StrictBlockChecksum StrictBlockChecksum
// If present then a corrupted 'sorted table' will fail compaction.
// The database will enter read-only mode.
StrictCompaction
// If present then a corrupted 'sorted table' will halt 'read operation'.
StrictReader
// If present then leveldb.Recover will drop corrupted 'sorted table'.
StrictRecovery
// This is only applicable to ReadOptions; if present then this ReadOptions
// 'strict level' will override the global one.
StrictOverride
// StrictAll enables all strict flags. // StrictAll enables all strict flags.
StrictAll = StrictManifest | StrictJournal | StrictJournalChecksum | StrictIterator | StrictBlockChecksum StrictAll = StrictManifest | StrictJournalChecksum | StrictJournal | StrictBlockChecksum | StrictCompaction | StrictReader
// DefaultStrict is the default strict flags. Specify any strict flags // DefaultStrict is the default strict flags. Specify any strict flags
// will override default strict flags as whole (i.e. not OR'ed). // will override default strict flags as whole (i.e. not OR'ed).
DefaultStrict = StrictJournalChecksum | StrictIterator | StrictBlockChecksum DefaultStrict = StrictJournalChecksum | StrictBlockChecksum | StrictCompaction | StrictReader
// NoStrict disables all strict flags. Override default strict flags. // NoStrict disables all strict flags. Override default strict flags.
NoStrict = ^StrictAll NoStrict = ^StrictAll
@ -132,6 +158,73 @@ type Options struct {
// The default value is 500. // The default value is 500.
CachedOpenFiles int CachedOpenFiles int
// CompactionExpandLimitFactor limits compaction size after expanded.
// This will be multiplied by table size limit at compaction target level.
//
// The default value is 25.
CompactionExpandLimitFactor int
// CompactionGPOverlapsFactor limits overlaps in grandparent (Level + 2) that a
// single 'sorted table' generates.
// This will be multiplied by table size limit at grandparent level.
//
// The default value is 10.
CompactionGPOverlapsFactor int
// CompactionL0Trigger defines number of 'sorted table' at level-0 that will
// trigger compaction.
//
// The default value is 4.
CompactionL0Trigger int
// CompactionSourceLimitFactor limits compaction source size. This doesn't apply to
// level-0.
// This will be multiplied by table size limit at compaction target level.
//
// The default value is 1.
CompactionSourceLimitFactor int
// CompactionTableSize limits size of 'sorted table' that compaction generates.
// The limits for each level will be calculated as:
// CompactionTableSize * (CompactionTableSizeMultiplier ^ Level)
// The multiplier for each level can also be fine-tuned using CompactionTableSizeMultiplierPerLevel.
//
// The default value is 2MiB.
CompactionTableSize int
// CompactionTableSizeMultiplier defines multiplier for CompactionTableSize.
//
// The default value is 1.
CompactionTableSizeMultiplier float64
// CompactionTableSizeMultiplierPerLevel defines per-level multiplier for
// CompactionTableSize.
// Use zero to skip a level.
//
// The default value is nil.
CompactionTableSizeMultiplierPerLevel []float64
// CompactionTotalSize limits total size of 'sorted table' for each level.
// The limits for each level will be calculated as:
// CompactionTotalSize * (CompactionTotalSizeMultiplier ^ Level)
// The multiplier for each level can also be fine-tuned using
// CompactionTotalSizeMultiplierPerLevel.
//
// The default value is 10MiB.
CompactionTotalSize int
// CompactionTotalSizeMultiplier defines multiplier for CompactionTotalSize.
//
// The default value is 10.
CompactionTotalSizeMultiplier float64
// CompactionTotalSizeMultiplierPerLevel defines per-level multiplier for
// CompactionTotalSize.
// Use zero to skip a level.
//
// The default value is nil.
CompactionTotalSizeMultiplierPerLevel []float64
// Comparer defines a total ordering over the space of []byte keys: a 'less // Comparer defines a total ordering over the space of []byte keys: a 'less
// than' relationship. The same comparison algorithm must be used for reads // than' relationship. The same comparison algorithm must be used for reads
// and writes over the lifetime of the DB. // and writes over the lifetime of the DB.
@ -144,6 +237,11 @@ type Options struct {
// The default value (DefaultCompression) uses snappy compression. // The default value (DefaultCompression) uses snappy compression.
Compression Compression Compression Compression
// DisableCompactionBackoff allows disabling the compaction retry backoff.
//
// The default value is false.
DisableCompactionBackoff bool
// ErrorIfExist defines whether an error should returned if the DB already // ErrorIfExist defines whether an error should returned if the DB already
// exist. // exist.
// //
@ -172,6 +270,19 @@ type Options struct {
// The default value is nil. // The default value is nil.
Filter filter.Filter Filter filter.Filter
// MaxMemCompationLevel defines the maximum level a newly compacted 'memdb'
// will be pushed into if it doesn't create overlap. This should be less than
// NumLevel. Use -1 for level-0.
//
// The default is 2.
MaxMemCompationLevel int
// NumLevel defines the number of database levels. The number of levels
// shouldn't be changed between opens, or the database will panic.
//
// The default is 7.
NumLevel int
// Strict defines the DB strict level. // Strict defines the DB strict level.
Strict Strict Strict Strict
@ -183,6 +294,18 @@ type Options struct {
// //
// The default value is 4MiB. // The default value is 4MiB.
WriteBuffer int WriteBuffer int
// WriteL0PauseTrigger defines number of 'sorted table' at level-0 that will
// pause write.
//
// The default value is 12.
WriteL0PauseTrigger int
// WriteL0SlowdownTrigger defines number of 'sorted table' at level-0 that
// will trigger write slowdown.
//
// The default value is 8.
WriteL0SlowdownTrigger int
} }
func (o *Options) GetAltFilters() []filter.Filter { func (o *Options) GetAltFilters() []filter.Filter {
@ -222,6 +345,79 @@ func (o *Options) GetCachedOpenFiles() int {
return o.CachedOpenFiles return o.CachedOpenFiles
} }
// GetCompactionExpandLimit returns the size limit of an expanded compaction
// at the given level: the table size limit of level+1 multiplied by
// CompactionExpandLimitFactor. DefaultCompactionExpandLimitFactor is used
// when o is nil or the configured factor is not positive.
func (o *Options) GetCompactionExpandLimit(level int) int {
	tableSize := o.GetCompactionTableSize(level + 1)
	if o == nil || o.CompactionExpandLimitFactor <= 0 {
		return tableSize * DefaultCompactionExpandLimitFactor
	}
	return tableSize * o.CompactionExpandLimitFactor
}
// GetCompactionGPOverlaps returns the grandparent overlap limit for the
// given level: the table size limit of level+2 multiplied by
// CompactionGPOverlapsFactor. DefaultCompactionGPOverlapsFactor is used when
// o is nil or the configured factor is not positive.
func (o *Options) GetCompactionGPOverlaps(level int) int {
	tableSize := o.GetCompactionTableSize(level + 2)
	if o == nil || o.CompactionGPOverlapsFactor <= 0 {
		return tableSize * DefaultCompactionGPOverlapsFactor
	}
	return tableSize * o.CompactionGPOverlapsFactor
}
// GetCompactionL0Trigger returns the configured CompactionL0Trigger, or
// DefaultCompactionL0Trigger when o is nil or the field is left at zero.
func (o *Options) GetCompactionL0Trigger() int {
	if o != nil && o.CompactionL0Trigger != 0 {
		return o.CompactionL0Trigger
	}
	return DefaultCompactionL0Trigger
}
// GetCompactionSourceLimit returns the compaction source size limit for the
// given level: the table size limit of level+1 multiplied by
// CompactionSourceLimitFactor. DefaultCompactionSourceLimitFactor is used
// when o is nil or the configured factor is not positive.
func (o *Options) GetCompactionSourceLimit(level int) int {
	tableSize := o.GetCompactionTableSize(level + 1)
	if o == nil || o.CompactionSourceLimitFactor <= 0 {
		return tableSize * DefaultCompactionSourceLimitFactor
	}
	return tableSize * o.CompactionSourceLimitFactor
}
// GetCompactionTableSize returns the 'sorted table' size limit for the given
// level, computed as base * multiplier.
//
// The base is CompactionTableSize when positive, DefaultCompactionTableSize
// otherwise. The multiplier is, in order of preference: the positive entry
// for this level in CompactionTableSizeMultiplierPerLevel (used as-is),
// CompactionTableSizeMultiplier raised to the power of level, or
// DefaultCompactionTableSizeMultiplier raised to the power of level.
func (o *Options) GetCompactionTableSize(level int) int {
	base := DefaultCompactionTableSize
	mult := 0.0
	if o != nil {
		if o.CompactionTableSize > 0 {
			base = o.CompactionTableSize
		}
		perLevel := o.CompactionTableSizeMultiplierPerLevel
		switch {
		case len(perLevel) > level && perLevel[level] > 0:
			mult = perLevel[level]
		case o.CompactionTableSizeMultiplier > 0:
			mult = math.Pow(o.CompactionTableSizeMultiplier, float64(level))
		}
	}
	// A zero multiplier means nothing usable was configured; use the default.
	if mult == 0 {
		mult = math.Pow(DefaultCompactionTableSizeMultiplier, float64(level))
	}
	return int(float64(base) * mult)
}
// GetCompactionTotalSize returns the total 'sorted table' size limit for the
// given level, computed as base * multiplier.
//
// The base is CompactionTotalSize when positive, DefaultCompactionTotalSize
// otherwise. The multiplier is, in order of preference: the positive entry
// for this level in CompactionTotalSizeMultiplierPerLevel (used as-is),
// CompactionTotalSizeMultiplier raised to the power of level, or
// DefaultCompactionTotalSizeMultiplier raised to the power of level.
func (o *Options) GetCompactionTotalSize(level int) int64 {
	base := DefaultCompactionTotalSize
	mult := 0.0
	if o != nil {
		if o.CompactionTotalSize > 0 {
			base = o.CompactionTotalSize
		}
		perLevel := o.CompactionTotalSizeMultiplierPerLevel
		switch {
		case len(perLevel) > level && perLevel[level] > 0:
			mult = perLevel[level]
		case o.CompactionTotalSizeMultiplier > 0:
			mult = math.Pow(o.CompactionTotalSizeMultiplier, float64(level))
		}
	}
	// A zero multiplier means nothing usable was configured; use the default.
	if mult == 0 {
		mult = math.Pow(DefaultCompactionTotalSizeMultiplier, float64(level))
	}
	return int64(float64(base) * mult)
}
func (o *Options) GetComparer() comparer.Comparer { func (o *Options) GetComparer() comparer.Comparer {
if o == nil || o.Comparer == nil { if o == nil || o.Comparer == nil {
return comparer.DefaultComparer return comparer.DefaultComparer
@ -236,6 +432,13 @@ func (o *Options) GetCompression() Compression {
return o.Compression return o.Compression
} }
// GetDisableCompactionBackoff reports whether DisableCompactionBackoff is
// set; a nil receiver yields false.
func (o *Options) GetDisableCompactionBackoff() bool {
	return o != nil && o.DisableCompactionBackoff
}
func (o *Options) GetErrorIfExist() bool { func (o *Options) GetErrorIfExist() bool {
if o == nil { if o == nil {
return false return false
@ -257,6 +460,28 @@ func (o *Options) GetFilter() filter.Filter {
return o.Filter return o.Filter
} }
// GetMaxMemCompationLevel returns the effective MaxMemCompationLevel.
//
// A positive configured value is used as-is, -1 selects level 0, and any
// other value (or a nil receiver) falls back to DefaultMaxMemCompationLevel.
// The result is capped at GetNumLevel() - 1.
func (o *Options) GetMaxMemCompationLevel() int {
	level := DefaultMaxMemCompationLevel
	if o != nil {
		switch configured := o.MaxMemCompationLevel; {
		case configured > 0:
			level = configured
		case configured == -1:
			level = 0
		}
	}
	if numLevel := o.GetNumLevel(); level >= numLevel {
		return numLevel - 1
	}
	return level
}
// GetNumLevel returns the configured NumLevel, or DefaultNumLevel when o is
// nil or the field is not positive.
func (o *Options) GetNumLevel() int {
	if o != nil && o.NumLevel > 0 {
		return o.NumLevel
	}
	return DefaultNumLevel
}
func (o *Options) GetStrict(strict Strict) bool { func (o *Options) GetStrict(strict Strict) bool {
if o == nil || o.Strict == 0 { if o == nil || o.Strict == 0 {
return DefaultStrict&strict != 0 return DefaultStrict&strict != 0
@ -271,6 +496,20 @@ func (o *Options) GetWriteBuffer() int {
return o.WriteBuffer return o.WriteBuffer
} }
// GetWriteL0PauseTrigger returns the configured WriteL0PauseTrigger, or
// DefaultWriteL0PauseTrigger when o is nil or the field is left at zero.
func (o *Options) GetWriteL0PauseTrigger() int {
	if o != nil && o.WriteL0PauseTrigger != 0 {
		return o.WriteL0PauseTrigger
	}
	return DefaultWriteL0PauseTrigger
}
// GetWriteL0SlowdownTrigger returns the configured WriteL0SlowdownTrigger, or
// DefaultWriteL0SlowdownTrigger when o is nil or the field is left at zero.
func (o *Options) GetWriteL0SlowdownTrigger() int {
	if o != nil && o.WriteL0SlowdownTrigger != 0 {
		return o.WriteL0SlowdownTrigger
	}
	return DefaultWriteL0SlowdownTrigger
}
// ReadOptions holds the optional parameters for 'read operation'. The // ReadOptions holds the optional parameters for 'read operation'. The
// 'read operation' includes Get, Find and NewIterator. // 'read operation' includes Get, Find and NewIterator.
type ReadOptions struct { type ReadOptions struct {
@ -281,8 +520,8 @@ type ReadOptions struct {
// The default value is false. // The default value is false.
DontFillCache bool DontFillCache bool
// Strict overrides global DB strict level. Only StrictIterator and // Strict will be OR'ed with global DB 'strict level' unless StrictOverride
// StrictBlockChecksum that does have effects here. // is present. Currently only StrictReader that has effect here.
Strict Strict Strict Strict
} }
@ -324,3 +563,11 @@ func (wo *WriteOptions) GetSync() bool {
} }
return wo.Sync return wo.Sync
} }
// GetStrict reports whether the given strict flag is in effect for a read
// that combines the DB-wide options o with the per-read options ro.
//
// When ro carries StrictOverride only ro is consulted; otherwise the flag is
// in effect if either o or ro reports it.
func GetStrict(o *Options, ro *ReadOptions, strict Strict) bool {
	override := ro.GetStrict(StrictOverride)
	if override {
		return ro.GetStrict(strict)
	}
	return o.GetStrict(strict) || ro.GetStrict(strict)
}

View File

@ -12,30 +12,86 @@ import (
"github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/opt"
) )
func (s *session) setOptions(o *opt.Options) { func dupOptions(o *opt.Options) *opt.Options {
s.o = &opt.Options{} newo := &opt.Options{}
if o != nil { if o != nil {
*s.o = *o *newo = *o
} }
return newo
}
func (s *session) setOptions(o *opt.Options) {
no := dupOptions(o)
// Alternative filters. // Alternative filters.
if filters := o.GetAltFilters(); len(filters) > 0 { if filters := o.GetAltFilters(); len(filters) > 0 {
s.o.AltFilters = make([]filter.Filter, len(filters)) no.AltFilters = make([]filter.Filter, len(filters))
for i, filter := range filters { for i, filter := range filters {
s.o.AltFilters[i] = &iFilter{filter} no.AltFilters[i] = &iFilter{filter}
} }
} }
// Block cache. // Block cache.
switch o.GetBlockCache() { switch o.GetBlockCache() {
case nil: case nil:
s.o.BlockCache = cache.NewLRUCache(opt.DefaultBlockCacheSize) no.BlockCache = cache.NewLRUCache(opt.DefaultBlockCacheSize)
case opt.NoCache: case opt.NoCache:
s.o.BlockCache = nil no.BlockCache = nil
} }
// Comparer. // Comparer.
s.icmp = &iComparer{o.GetComparer()} s.icmp = &iComparer{o.GetComparer()}
s.o.Comparer = s.icmp no.Comparer = s.icmp
// Filter. // Filter.
if filter := o.GetFilter(); filter != nil { if filter := o.GetFilter(); filter != nil {
s.o.Filter = &iFilter{filter} no.Filter = &iFilter{filter}
}
s.o = &cachedOptions{Options: no}
s.o.cache()
}
// cachedOptions wraps *opt.Options together with per-level compaction limits
// that are stored in slices indexed by level, so that the same-named getters
// defined on cachedOptions can return them by index.
type cachedOptions struct {
*opt.Options
// Per-level values, indexed by level.
compactionExpandLimit []int
compactionGPOverlaps []int
compactionSourceLimit []int
compactionTableSize []int
compactionTotalSize []int64
}
func (co *cachedOptions) cache() {
numLevel := co.Options.GetNumLevel()
co.compactionExpandLimit = make([]int, numLevel)
co.compactionGPOverlaps = make([]int, numLevel)
co.compactionSourceLimit = make([]int, numLevel)
co.compactionTableSize = make([]int, numLevel)
co.compactionTotalSize = make([]int64, numLevel)
for level := 0; level < numLevel; level++ {
co.compactionExpandLimit[level] = co.Options.GetCompactionExpandLimit(level)
co.compactionGPOverlaps[level] = co.Options.GetCompactionGPOverlaps(level)
co.compactionSourceLimit[level] = co.Options.GetCompactionSourceLimit(level)
co.compactionTableSize[level] = co.Options.GetCompactionTableSize(level)
co.compactionTotalSize[level] = co.Options.GetCompactionTotalSize(level)
} }
} }
// GetCompactionExpandLimit returns the cached compaction expand limit for
// level. The slice access panics if level is out of range.
func (co *cachedOptions) GetCompactionExpandLimit(level int) int {
return co.compactionExpandLimit[level]
}
// GetCompactionGPOverlaps returns the cached grandparent overlap limit for
// level. The slice access panics if level is out of range.
func (co *cachedOptions) GetCompactionGPOverlaps(level int) int {
return co.compactionGPOverlaps[level]
}
// GetCompactionSourceLimit returns the cached compaction source limit for
// level. The slice access panics if level is out of range.
func (co *cachedOptions) GetCompactionSourceLimit(level int) int {
return co.compactionSourceLimit[level]
}
// GetCompactionTableSize returns the cached table size limit for level.
// The slice access panics if level is out of range.
func (co *cachedOptions) GetCompactionTableSize(level int) int {
return co.compactionTableSize[level]
}
// GetCompactionTotalSize returns the cached total size limit for level.
// The slice access panics if level is out of range.
func (co *cachedOptions) GetCompactionTotalSize(level int) int64 {
return co.compactionTotalSize[level]
}

View File

@ -7,12 +7,13 @@
package leveldb package leveldb
import ( import (
"errors" "fmt"
"io" "io"
"os" "os"
"sync" "sync"
"sync/atomic" "sync/atomic"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/journal" "github.com/syndtr/goleveldb/leveldb/journal"
"github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/opt"
@ -20,18 +21,31 @@ import (
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
// ErrManifestCorrupted describes a corrupted manifest: Field names the
// manifest field concerned and Reason says what is wrong with it.
type ErrManifestCorrupted struct {
	Field  string
	Reason string
}

// Error implements the error interface, rendering the field name and the
// reason in a single message.
func (e *ErrManifestCorrupted) Error() string {
	field, reason := e.Field, e.Reason
	return fmt.Sprintf("leveldb: manifest corrupted (field '%s'): %s", field, reason)
}
// newErrManifestCorrupted builds an *ErrManifestCorrupted from field and
// reason and passes it, together with file f, to errors.NewErrCorrupted.
func newErrManifestCorrupted(f storage.File, field, reason string) error {
	cause := &ErrManifestCorrupted{Field: field, Reason: reason}
	return errors.NewErrCorrupted(f, cause)
}
// session represent a persistent database session. // session represent a persistent database session.
type session struct { type session struct {
// Need 64-bit alignment. // Need 64-bit alignment.
stFileNum uint64 // current unused file number stNextFileNum uint64 // current unused file number
stJournalNum uint64 // current journal file number; need external synchronization stJournalNum uint64 // current journal file number; need external synchronization
stPrevJournalNum uint64 // prev journal file number; no longer used; for compatibility with older version of leveldb stPrevJournalNum uint64 // prev journal file number; no longer used; for compatibility with older version of leveldb
stSeq uint64 // last mem compacted seq; need external synchronization stSeqNum uint64 // last mem compacted seq; need external synchronization
stTempFileNum uint64 stTempFileNum uint64
stor storage.Storage stor storage.Storage
storLock util.Releaser storLock util.Releaser
o *opt.Options o *cachedOptions
icmp *iComparer icmp *iComparer
tops *tOps tops *tOps
@ -39,7 +53,7 @@ type session struct {
manifestWriter storage.Writer manifestWriter storage.Writer
manifestFile storage.File manifestFile storage.File
stCptrs [kNumLevels]iKey // compact pointers; need external synchronization stCompPtrs []iKey // compaction pointers; need external synchronization
stVersion *version // current version stVersion *version // current version
vmu sync.Mutex vmu sync.Mutex
} }
@ -56,11 +70,12 @@ func newSession(stor storage.Storage, o *opt.Options) (s *session, err error) {
s = &session{ s = &session{
stor: stor, stor: stor,
storLock: storLock, storLock: storLock,
stCompPtrs: make([]iKey, o.GetNumLevel()),
} }
s.setOptions(o) s.setOptions(o)
s.tops = newTableOps(s, s.o.GetCachedOpenFiles()) s.tops = newTableOps(s, s.o.GetCachedOpenFiles())
s.setVersion(&version{s: s}) s.setVersion(newVersion(s))
s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock D·DeletedEntry L·Level Q·SeqNum T·TimeElapsed") s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock Ke·KeyError D·DroppedEntry L·Level Q·SeqNum T·TimeElapsed")
return return
} }
@ -100,26 +115,26 @@ func (s *session) recover() (err error) {
// Don't return os.ErrNotExist if the underlying storage contains // Don't return os.ErrNotExist if the underlying storage contains
// other files that belong to LevelDB. So the DB won't get trashed. // other files that belong to LevelDB. So the DB won't get trashed.
if files, _ := s.stor.GetFiles(storage.TypeAll); len(files) > 0 { if files, _ := s.stor.GetFiles(storage.TypeAll); len(files) > 0 {
err = ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest file missing")} err = &errors.ErrCorrupted{File: &storage.FileInfo{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}}
} }
} }
}() }()
file, err := s.stor.GetManifest() m, err := s.stor.GetManifest()
if err != nil { if err != nil {
return return
} }
reader, err := file.Open() reader, err := m.Open()
if err != nil { if err != nil {
return return
} }
defer reader.Close() defer reader.Close()
strict := s.o.GetStrict(opt.StrictManifest) strict := s.o.GetStrict(opt.StrictManifest)
jr := journal.NewReader(reader, dropper{s, file}, strict, true) jr := journal.NewReader(reader, dropper{s, m}, strict, true)
staging := s.version_NB().newStaging() staging := s.stVersion.newStaging()
rec := &sessionRecord{} rec := &sessionRecord{numLevel: s.o.GetNumLevel()}
for { for {
var r io.Reader var r io.Reader
r, err = jr.Next() r, err = jr.Next()
@ -128,51 +143,57 @@ func (s *session) recover() (err error) {
err = nil err = nil
break break
} }
return return errors.SetFile(err, m)
} }
err = rec.decode(r) err = rec.decode(r)
if err == nil { if err == nil {
// save compact pointers // save compact pointers
for _, r := range rec.compactionPointers { for _, r := range rec.compPtrs {
s.stCptrs[r.level] = iKey(r.ikey) s.stCompPtrs[r.level] = iKey(r.ikey)
} }
// commit record to version staging // commit record to version staging
staging.commit(rec) staging.commit(rec)
} else if strict {
return ErrCorrupted{Type: CorruptedManifest, Err: err}
} else { } else {
s.logf("manifest error: %v (skipped)", err) err = errors.SetFile(err, m)
if strict || !errors.IsCorrupted(err) {
return
} else {
s.logf("manifest error: %v (skipped)", errors.SetFile(err, m))
} }
rec.resetCompactionPointers() }
rec.resetCompPtrs()
rec.resetAddedTables() rec.resetAddedTables()
rec.resetDeletedTables() rec.resetDeletedTables()
} }
switch { switch {
case !rec.has(recComparer): case !rec.has(recComparer):
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing comparer name")} return newErrManifestCorrupted(m, "comparer", "missing")
case rec.comparer != s.icmp.uName(): case rec.comparer != s.icmp.uName():
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: comparer mismatch, " + "want '" + s.icmp.uName() + "', " + "got '" + rec.comparer + "'")} return newErrManifestCorrupted(m, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer))
case !rec.has(recNextNum): case !rec.has(recNextFileNum):
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing next file number")} return newErrManifestCorrupted(m, "next-file-num", "missing")
case !rec.has(recJournalNum): case !rec.has(recJournalNum):
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing journal file number")} return newErrManifestCorrupted(m, "journal-file-num", "missing")
case !rec.has(recSeq): case !rec.has(recSeqNum):
return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing seq number")} return newErrManifestCorrupted(m, "seq-num", "missing")
} }
s.manifestFile = file s.manifestFile = m
s.setVersion(staging.finish()) s.setVersion(staging.finish())
s.setFileNum(rec.nextNum) s.setNextFileNum(rec.nextFileNum)
s.recordCommited(rec) s.recordCommited(rec)
return nil return nil
} }
// Commit session; need external synchronization. // Commit session; need external synchronization.
func (s *session) commit(r *sessionRecord) (err error) { func (s *session) commit(r *sessionRecord) (err error) {
v := s.version()
defer v.release()
// spawn new version based on current version // spawn new version based on current version
nv := s.version_NB().spawn(r) nv := v.spawn(r)
if s.manifest == nil { if s.manifest == nil {
// manifest journal writer not yet created, create one // manifest journal writer not yet created, create one
@ -191,13 +212,13 @@ func (s *session) commit(r *sessionRecord) (err error) {
// Pick a compaction based on current state; need external synchronization. // Pick a compaction based on current state; need external synchronization.
func (s *session) pickCompaction() *compaction { func (s *session) pickCompaction() *compaction {
v := s.version_NB() v := s.version()
var level int var level int
var t0 tFiles var t0 tFiles
if v.cScore >= 1 { if v.cScore >= 1 {
level = v.cLevel level = v.cLevel
cptr := s.stCptrs[level] cptr := s.stCompPtrs[level]
tables := v.tables[level] tables := v.tables[level]
for _, t := range tables { for _, t := range tables {
if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 { if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 {
@ -214,27 +235,21 @@ func (s *session) pickCompaction() *compaction {
level = ts.level level = ts.level
t0 = append(t0, ts.table) t0 = append(t0, ts.table)
} else { } else {
v.release()
return nil return nil
} }
} }
c := &compaction{s: s, v: v, level: level} return newCompaction(s, v, level, t0)
if level == 0 {
imin, imax := t0.getRange(s.icmp)
t0 = v.tables[0].getOverlaps(t0[:0], s.icmp, imin.ukey(), imax.ukey(), true)
}
c.tables[0] = t0
c.expand()
return c
} }
// Create compaction from given level and range; need external synchronization. // Create compaction from given level and range; need external synchronization.
func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction { func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
v := s.version_NB() v := s.version()
t0 := v.tables[level].getOverlaps(nil, s.icmp, umin, umax, level == 0) t0 := v.tables[level].getOverlaps(nil, s.icmp, umin, umax, level == 0)
if len(t0) == 0 { if len(t0) == 0 {
v.release()
return nil return nil
} }
@ -243,7 +258,7 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
// and we must not pick one file and drop another older file if the // and we must not pick one file and drop another older file if the
// two files overlap. // two files overlap.
if level > 0 { if level > 0 {
limit := uint64(kMaxTableSize) limit := uint64(v.s.o.GetCompactionSourceLimit(level))
total := uint64(0) total := uint64(0)
for i, t := range t0 { for i, t := range t0 {
total += t.size total += t.size
@ -255,9 +270,20 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
} }
} }
c := &compaction{s: s, v: v, level: level} return newCompaction(s, v, level, t0)
c.tables[0] = t0 }
func newCompaction(s *session, v *version, level int, t0 tFiles) *compaction {
c := &compaction{
s: s,
v: v,
level: level,
tables: [2]tFiles{t0, nil},
maxGPOverlaps: uint64(s.o.GetCompactionGPOverlaps(level)),
tPtrs: make([]int, s.o.GetNumLevel()),
}
c.expand() c.expand()
c.save()
return c return c
} }
@ -268,23 +294,55 @@ type compaction struct {
level int level int
tables [2]tFiles tables [2]tFiles
maxGPOverlaps uint64
gp tFiles gp tFiles
gpidx int gpi int
seenKey bool seenKey bool
overlappedBytes uint64 gpOverlappedBytes uint64
imin, imax iKey imin, imax iKey
tPtrs []int
released bool
tPtrs [kNumLevels]int snapGPI int
snapSeenKey bool
snapGPOverlappedBytes uint64
snapTPtrs []int
}
// save snapshots the grandparent index, seen-key flag, grandparent
// overlapped byte count and table pointers so restore can bring them back.
func (c *compaction) save() {
	c.snapGPI, c.snapSeenKey, c.snapGPOverlappedBytes = c.gpi, c.seenKey, c.gpOverlappedBytes
	// Copy into the existing snapshot slice so its storage is reused.
	c.snapTPtrs = append(c.snapTPtrs[:0], c.tPtrs...)
}
// restore resets the grandparent index, seen-key flag, grandparent
// overlapped byte count and table pointers to the values recorded by save.
func (c *compaction) restore() {
	c.gpi, c.seenKey, c.gpOverlappedBytes = c.snapGPI, c.snapSeenKey, c.snapGPOverlappedBytes
	// Copy into the existing slice so its storage is reused.
	c.tPtrs = append(c.tPtrs[:0], c.snapTPtrs...)
}
func (c *compaction) release() {
if !c.released {
c.released = true
c.v.release()
}
} }
// Expand compacted tables; need external synchronization. // Expand compacted tables; need external synchronization.
func (c *compaction) expand() { func (c *compaction) expand() {
level := c.level limit := uint64(c.s.o.GetCompactionExpandLimit(c.level))
vt0, vt1 := c.v.tables[level], c.v.tables[level+1] vt0, vt1 := c.v.tables[c.level], c.v.tables[c.level+1]
t0, t1 := c.tables[0], c.tables[1] t0, t1 := c.tables[0], c.tables[1]
imin, imax := t0.getRange(c.s.icmp) imin, imax := t0.getRange(c.s.icmp)
// We expand t0 here just in case ukey hops across tables.
t0 = vt0.getOverlaps(t0, c.s.icmp, imin.ukey(), imax.ukey(), c.level == 0)
if len(t0) != len(c.tables[0]) {
imin, imax = t0.getRange(c.s.icmp)
}
t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false) t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false)
// Get entire range covered by compaction. // Get entire range covered by compaction.
amin, amax := append(t0, t1...).getRange(c.s.icmp) amin, amax := append(t0, t1...).getRange(c.s.icmp)
@ -292,13 +350,13 @@ func (c *compaction) expand() {
// See if we can grow the number of inputs in "level" without // See if we can grow the number of inputs in "level" without
// changing the number of "level+1" files we pick up. // changing the number of "level+1" files we pick up.
if len(t1) > 0 { if len(t1) > 0 {
exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), level == 0) exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), c.level == 0)
if len(exp0) > len(t0) && t1.size()+exp0.size() < kExpCompactionMaxBytes { if len(exp0) > len(t0) && t1.size()+exp0.size() < limit {
xmin, xmax := exp0.getRange(c.s.icmp) xmin, xmax := exp0.getRange(c.s.icmp)
exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false) exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false)
if len(exp1) == len(t1) { if len(exp1) == len(t1) {
c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)", c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)",
level, level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())), c.level, c.level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())),
len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size()))) len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size())))
imin, imax = xmin, xmax imin, imax = xmin, xmax
t0, t1 = exp0, exp1 t0, t1 = exp0, exp1
@ -309,8 +367,8 @@ func (c *compaction) expand() {
// Compute the set of grandparent files that overlap this compaction // Compute the set of grandparent files that overlap this compaction
// (parent == level+1; grandparent == level+2) // (parent == level+1; grandparent == level+2)
if level+2 < kNumLevels { if c.level+2 < c.s.o.GetNumLevel() {
c.gp = c.v.tables[level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false) c.gp = c.v.tables[c.level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false)
} }
c.tables[0], c.tables[1] = t0, t1 c.tables[0], c.tables[1] = t0, t1
@ -319,7 +377,7 @@ func (c *compaction) expand() {
// Check whether compaction is trivial. // Check whether compaction is trivial.
func (c *compaction) trivial() bool { func (c *compaction) trivial() bool {
return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= kMaxGrandParentOverlapBytes return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= c.maxGPOverlaps
} }
func (c *compaction) baseLevelForKey(ukey []byte) bool { func (c *compaction) baseLevelForKey(ukey []byte) bool {
@ -341,20 +399,20 @@ func (c *compaction) baseLevelForKey(ukey []byte) bool {
} }
func (c *compaction) shouldStopBefore(ikey iKey) bool { func (c *compaction) shouldStopBefore(ikey iKey) bool {
for ; c.gpidx < len(c.gp); c.gpidx++ { for ; c.gpi < len(c.gp); c.gpi++ {
gp := c.gp[c.gpidx] gp := c.gp[c.gpi]
if c.s.icmp.Compare(ikey, gp.imax) <= 0 { if c.s.icmp.Compare(ikey, gp.imax) <= 0 {
break break
} }
if c.seenKey { if c.seenKey {
c.overlappedBytes += gp.size c.gpOverlappedBytes += gp.size
} }
} }
c.seenKey = true c.seenKey = true
if c.overlappedBytes > kMaxGrandParentOverlapBytes { if c.gpOverlappedBytes > c.maxGPOverlaps {
// Too much overlap for current output; start new output. // Too much overlap for current output; start new output.
c.overlappedBytes = 0 c.gpOverlappedBytes = 0
return true return true
} }
return false return false
@ -373,8 +431,12 @@ func (c *compaction) newIterator() iterator.Iterator {
// Options. // Options.
ro := &opt.ReadOptions{ ro := &opt.ReadOptions{
DontFillCache: true, DontFillCache: true,
Strict: opt.StrictOverride,
}
strict := c.s.o.GetStrict(opt.StrictCompaction)
if strict {
ro.Strict |= opt.StrictReader
} }
strict := c.s.o.GetStrict(opt.StrictIterator)
for i, tables := range c.tables { for i, tables := range c.tables {
if len(tables) == 0 { if len(tables) == 0 {
@ -387,10 +449,10 @@ func (c *compaction) newIterator() iterator.Iterator {
its = append(its, c.s.tops.newIterator(t, nil, ro)) its = append(its, c.s.tops.newIterator(t, nil, ro))
} }
} else { } else {
it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict, true) it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict)
its = append(its, it) its = append(its, it)
} }
} }
return iterator.NewMergedIterator(its, c.s.icmp, true) return iterator.NewMergedIterator(its, c.s.icmp, strict)
} }

View File

@ -9,11 +9,11 @@ package leveldb
import ( import (
"bufio" "bufio"
"encoding/binary" "encoding/binary"
"errors"
"io" "io"
) "strings"
var errCorruptManifest = errors.New("leveldb: corrupt manifest") "github.com/syndtr/goleveldb/leveldb/errors"
)
type byteReader interface { type byteReader interface {
io.Reader io.Reader
@ -24,11 +24,11 @@ type byteReader interface {
const ( const (
recComparer = 1 recComparer = 1
recJournalNum = 2 recJournalNum = 2
recNextNum = 3 recNextFileNum = 3
recSeq = 4 recSeqNum = 4
recCompactionPointer = 5 recCompPtr = 5
recDeletedTable = 6 recDelTable = 6
recNewTable = 7 recAddTable = 7
// 8 was used for large value refs // 8 was used for large value refs
recPrevJournalNum = 9 recPrevJournalNum = 9
) )
@ -38,7 +38,7 @@ type cpRecord struct {
ikey iKey ikey iKey
} }
type ntRecord struct { type atRecord struct {
level int level int
num uint64 num uint64
size uint64 size uint64
@ -46,25 +46,24 @@ type ntRecord struct {
imax iKey imax iKey
} }
func (r ntRecord) makeFile(s *session) *tFile {
return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax)
}
type dtRecord struct { type dtRecord struct {
level int level int
num uint64 num uint64
} }
type sessionRecord struct { type sessionRecord struct {
numLevel int
hasRec int hasRec int
comparer string comparer string
journalNum uint64 journalNum uint64
prevJournalNum uint64 prevJournalNum uint64
nextNum uint64 nextFileNum uint64
seq uint64 seqNum uint64
compactionPointers []cpRecord compPtrs []cpRecord
addedTables []ntRecord addedTables []atRecord
deletedTables []dtRecord deletedTables []dtRecord
scratch [binary.MaxVarintLen64]byte scratch [binary.MaxVarintLen64]byte
err error err error
} }
@ -88,29 +87,29 @@ func (p *sessionRecord) setPrevJournalNum(num uint64) {
p.prevJournalNum = num p.prevJournalNum = num
} }
func (p *sessionRecord) setNextNum(num uint64) { func (p *sessionRecord) setNextFileNum(num uint64) {
p.hasRec |= 1 << recNextNum p.hasRec |= 1 << recNextFileNum
p.nextNum = num p.nextFileNum = num
} }
func (p *sessionRecord) setSeq(seq uint64) { func (p *sessionRecord) setSeqNum(num uint64) {
p.hasRec |= 1 << recSeq p.hasRec |= 1 << recSeqNum
p.seq = seq p.seqNum = num
} }
func (p *sessionRecord) addCompactionPointer(level int, ikey iKey) { func (p *sessionRecord) addCompPtr(level int, ikey iKey) {
p.hasRec |= 1 << recCompactionPointer p.hasRec |= 1 << recCompPtr
p.compactionPointers = append(p.compactionPointers, cpRecord{level, ikey}) p.compPtrs = append(p.compPtrs, cpRecord{level, ikey})
} }
func (p *sessionRecord) resetCompactionPointers() { func (p *sessionRecord) resetCompPtrs() {
p.hasRec &= ^(1 << recCompactionPointer) p.hasRec &= ^(1 << recCompPtr)
p.compactionPointers = p.compactionPointers[:0] p.compPtrs = p.compPtrs[:0]
} }
func (p *sessionRecord) addTable(level int, num, size uint64, imin, imax iKey) { func (p *sessionRecord) addTable(level int, num, size uint64, imin, imax iKey) {
p.hasRec |= 1 << recNewTable p.hasRec |= 1 << recAddTable
p.addedTables = append(p.addedTables, ntRecord{level, num, size, imin, imax}) p.addedTables = append(p.addedTables, atRecord{level, num, size, imin, imax})
} }
func (p *sessionRecord) addTableFile(level int, t *tFile) { func (p *sessionRecord) addTableFile(level int, t *tFile) {
@ -118,17 +117,17 @@ func (p *sessionRecord) addTableFile(level int, t *tFile) {
} }
func (p *sessionRecord) resetAddedTables() { func (p *sessionRecord) resetAddedTables() {
p.hasRec &= ^(1 << recNewTable) p.hasRec &= ^(1 << recAddTable)
p.addedTables = p.addedTables[:0] p.addedTables = p.addedTables[:0]
} }
func (p *sessionRecord) deleteTable(level int, num uint64) { func (p *sessionRecord) delTable(level int, num uint64) {
p.hasRec |= 1 << recDeletedTable p.hasRec |= 1 << recDelTable
p.deletedTables = append(p.deletedTables, dtRecord{level, num}) p.deletedTables = append(p.deletedTables, dtRecord{level, num})
} }
func (p *sessionRecord) resetDeletedTables() { func (p *sessionRecord) resetDeletedTables() {
p.hasRec &= ^(1 << recDeletedTable) p.hasRec &= ^(1 << recDelTable)
p.deletedTables = p.deletedTables[:0] p.deletedTables = p.deletedTables[:0]
} }
@ -161,26 +160,26 @@ func (p *sessionRecord) encode(w io.Writer) error {
p.putUvarint(w, recJournalNum) p.putUvarint(w, recJournalNum)
p.putUvarint(w, p.journalNum) p.putUvarint(w, p.journalNum)
} }
if p.has(recNextNum) { if p.has(recNextFileNum) {
p.putUvarint(w, recNextNum) p.putUvarint(w, recNextFileNum)
p.putUvarint(w, p.nextNum) p.putUvarint(w, p.nextFileNum)
} }
if p.has(recSeq) { if p.has(recSeqNum) {
p.putUvarint(w, recSeq) p.putUvarint(w, recSeqNum)
p.putUvarint(w, p.seq) p.putUvarint(w, p.seqNum)
} }
for _, r := range p.compactionPointers { for _, r := range p.compPtrs {
p.putUvarint(w, recCompactionPointer) p.putUvarint(w, recCompPtr)
p.putUvarint(w, uint64(r.level)) p.putUvarint(w, uint64(r.level))
p.putBytes(w, r.ikey) p.putBytes(w, r.ikey)
} }
for _, r := range p.deletedTables { for _, r := range p.deletedTables {
p.putUvarint(w, recDeletedTable) p.putUvarint(w, recDelTable)
p.putUvarint(w, uint64(r.level)) p.putUvarint(w, uint64(r.level))
p.putUvarint(w, r.num) p.putUvarint(w, r.num)
} }
for _, r := range p.addedTables { for _, r := range p.addedTables {
p.putUvarint(w, recNewTable) p.putUvarint(w, recAddTable)
p.putUvarint(w, uint64(r.level)) p.putUvarint(w, uint64(r.level))
p.putUvarint(w, r.num) p.putUvarint(w, r.num)
p.putUvarint(w, r.size) p.putUvarint(w, r.size)
@ -190,14 +189,16 @@ func (p *sessionRecord) encode(w io.Writer) error {
return p.err return p.err
} }
func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 { func (p *sessionRecord) readUvarintMayEOF(field string, r io.ByteReader, mayEOF bool) uint64 {
if p.err != nil { if p.err != nil {
return 0 return 0
} }
x, err := binary.ReadUvarint(r) x, err := binary.ReadUvarint(r)
if err != nil { if err != nil {
if err == io.EOF { if err == io.ErrUnexpectedEOF || (mayEOF == false && err == io.EOF) {
p.err = errCorruptManifest p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"})
} else if strings.HasPrefix(err.Error(), "binary:") {
p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, err.Error()})
} else { } else {
p.err = err p.err = err
} }
@ -206,35 +207,39 @@ func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 {
return x return x
} }
func (p *sessionRecord) readBytes(r byteReader) []byte { func (p *sessionRecord) readUvarint(field string, r io.ByteReader) uint64 {
return p.readUvarintMayEOF(field, r, false)
}
func (p *sessionRecord) readBytes(field string, r byteReader) []byte {
if p.err != nil { if p.err != nil {
return nil return nil
} }
n := p.readUvarint(r) n := p.readUvarint(field, r)
if p.err != nil { if p.err != nil {
return nil return nil
} }
x := make([]byte, n) x := make([]byte, n)
_, p.err = io.ReadFull(r, x) _, p.err = io.ReadFull(r, x)
if p.err != nil { if p.err != nil {
if p.err == io.EOF { if p.err == io.ErrUnexpectedEOF {
p.err = errCorruptManifest p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"})
} }
return nil return nil
} }
return x return x
} }
func (p *sessionRecord) readLevel(r io.ByteReader) int { func (p *sessionRecord) readLevel(field string, r io.ByteReader) int {
if p.err != nil { if p.err != nil {
return 0 return 0
} }
x := p.readUvarint(r) x := p.readUvarint(field, r)
if p.err != nil { if p.err != nil {
return 0 return 0
} }
if x >= kNumLevels { if x >= uint64(p.numLevel) {
p.err = errCorruptManifest p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "invalid level number"})
return 0 return 0
} }
return int(x) return int(x)
@ -247,59 +252,59 @@ func (p *sessionRecord) decode(r io.Reader) error {
} }
p.err = nil p.err = nil
for p.err == nil { for p.err == nil {
rec, err := binary.ReadUvarint(br) rec := p.readUvarintMayEOF("field-header", br, true)
if err != nil { if p.err != nil {
if err == io.EOF { if p.err == io.EOF {
err = nil return nil
} }
return err return p.err
} }
switch rec { switch rec {
case recComparer: case recComparer:
x := p.readBytes(br) x := p.readBytes("comparer", br)
if p.err == nil { if p.err == nil {
p.setComparer(string(x)) p.setComparer(string(x))
} }
case recJournalNum: case recJournalNum:
x := p.readUvarint(br) x := p.readUvarint("journal-num", br)
if p.err == nil { if p.err == nil {
p.setJournalNum(x) p.setJournalNum(x)
} }
case recPrevJournalNum: case recPrevJournalNum:
x := p.readUvarint(br) x := p.readUvarint("prev-journal-num", br)
if p.err == nil { if p.err == nil {
p.setPrevJournalNum(x) p.setPrevJournalNum(x)
} }
case recNextNum: case recNextFileNum:
x := p.readUvarint(br) x := p.readUvarint("next-file-num", br)
if p.err == nil { if p.err == nil {
p.setNextNum(x) p.setNextFileNum(x)
} }
case recSeq: case recSeqNum:
x := p.readUvarint(br) x := p.readUvarint("seq-num", br)
if p.err == nil { if p.err == nil {
p.setSeq(x) p.setSeqNum(x)
} }
case recCompactionPointer: case recCompPtr:
level := p.readLevel(br) level := p.readLevel("comp-ptr.level", br)
ikey := p.readBytes(br) ikey := p.readBytes("comp-ptr.ikey", br)
if p.err == nil { if p.err == nil {
p.addCompactionPointer(level, iKey(ikey)) p.addCompPtr(level, iKey(ikey))
} }
case recNewTable: case recAddTable:
level := p.readLevel(br) level := p.readLevel("add-table.level", br)
num := p.readUvarint(br) num := p.readUvarint("add-table.num", br)
size := p.readUvarint(br) size := p.readUvarint("add-table.size", br)
imin := p.readBytes(br) imin := p.readBytes("add-table.imin", br)
imax := p.readBytes(br) imax := p.readBytes("add-table.imax", br)
if p.err == nil { if p.err == nil {
p.addTable(level, num, size, imin, imax) p.addTable(level, num, size, imin, imax)
} }
case recDeletedTable: case recDelTable:
level := p.readLevel(br) level := p.readLevel("del-table.level", br)
num := p.readUvarint(br) num := p.readUvarint("del-table.num", br)
if p.err == nil { if p.err == nil {
p.deleteTable(level, num) p.delTable(level, num)
} }
} }
} }

View File

@ -9,6 +9,8 @@ package leveldb
import ( import (
"bytes" "bytes"
"testing" "testing"
"github.com/syndtr/goleveldb/leveldb/opt"
) )
func decodeEncode(v *sessionRecord) (res bool, err error) { func decodeEncode(v *sessionRecord) (res bool, err error) {
@ -17,7 +19,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) {
if err != nil { if err != nil {
return return
} }
v2 := new(sessionRecord) v2 := &sessionRecord{numLevel: opt.DefaultNumLevel}
err = v.decode(b) err = v.decode(b)
if err != nil { if err != nil {
return return
@ -32,7 +34,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) {
func TestSessionRecord_EncodeDecode(t *testing.T) { func TestSessionRecord_EncodeDecode(t *testing.T) {
big := uint64(1) << 50 big := uint64(1) << 50
v := new(sessionRecord) v := &sessionRecord{numLevel: opt.DefaultNumLevel}
i := uint64(0) i := uint64(0)
test := func() { test := func() {
res, err := decodeEncode(v) res, err := decodeEncode(v)
@ -47,16 +49,16 @@ func TestSessionRecord_EncodeDecode(t *testing.T) {
for ; i < 4; i++ { for ; i < 4; i++ {
test() test()
v.addTable(3, big+300+i, big+400+i, v.addTable(3, big+300+i, big+400+i,
newIKey([]byte("foo"), big+500+1, tVal), newIkey([]byte("foo"), big+500+1, ktVal),
newIKey([]byte("zoo"), big+600+1, tDel)) newIkey([]byte("zoo"), big+600+1, ktDel))
v.deleteTable(4, big+700+i) v.delTable(4, big+700+i)
v.addCompactionPointer(int(i), newIKey([]byte("x"), big+900+1, tVal)) v.addCompPtr(int(i), newIkey([]byte("x"), big+900+1, ktVal))
} }
v.setComparer("foo") v.setComparer("foo")
v.setJournalNum(big + 100) v.setJournalNum(big + 100)
v.setPrevJournalNum(big + 99) v.setPrevJournalNum(big + 99)
v.setNextNum(big + 200) v.setNextFileNum(big + 200)
v.setSeq(big + 1000) v.setSeqNum(big + 1000)
test() test()
} }

View File

@ -22,7 +22,7 @@ type dropper struct {
} }
func (d dropper) Drop(err error) { func (d dropper) Drop(err error) {
if e, ok := err.(journal.ErrCorrupted); ok { if e, ok := err.(*journal.ErrCorrupted); ok {
d.s.logf("journal@drop %s-%d S·%s %q", d.file.Type(), d.file.Num(), shortenb(e.Size), e.Reason) d.s.logf("journal@drop %s-%d S·%s %q", d.file.Type(), d.file.Num(), shortenb(e.Size), e.Reason)
} else { } else {
d.s.logf("journal@drop %s-%d %q", d.file.Type(), d.file.Num(), err) d.s.logf("journal@drop %s-%d %q", d.file.Type(), d.file.Num(), err)
@ -51,9 +51,14 @@ func (s *session) newTemp() storage.File {
return s.stor.GetFile(num, storage.TypeTemp) return s.stor.GetFile(num, storage.TypeTemp)
} }
func (s *session) tableFileFromRecord(r atRecord) *tFile {
return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax)
}
// Session state. // Session state.
// Get current version. // Get current version. This will incr version ref, must call
// version.release (exactly once) after use.
func (s *session) version() *version { func (s *session) version() *version {
s.vmu.Lock() s.vmu.Lock()
defer s.vmu.Unlock() defer s.vmu.Unlock()
@ -61,61 +66,56 @@ func (s *session) version() *version {
return s.stVersion return s.stVersion
} }
// Get current version; no barrier.
func (s *session) version_NB() *version {
return s.stVersion
}
// Set current version to v. // Set current version to v.
func (s *session) setVersion(v *version) { func (s *session) setVersion(v *version) {
s.vmu.Lock() s.vmu.Lock()
v.ref = 1 v.ref = 1 // Holds by session.
if old := s.stVersion; old != nil { if old := s.stVersion; old != nil {
v.ref++ v.ref++ // Holds by old version.
old.next = v old.next = v
old.release_NB() old.releaseNB()
} }
s.stVersion = v s.stVersion = v
s.vmu.Unlock() s.vmu.Unlock()
} }
// Get current unused file number. // Get current unused file number.
func (s *session) fileNum() uint64 { func (s *session) nextFileNum() uint64 {
return atomic.LoadUint64(&s.stFileNum) return atomic.LoadUint64(&s.stNextFileNum)
} }
// Get current unused file number to num. // Set current unused file number to num.
func (s *session) setFileNum(num uint64) { func (s *session) setNextFileNum(num uint64) {
atomic.StoreUint64(&s.stFileNum, num) atomic.StoreUint64(&s.stNextFileNum, num)
} }
// Mark file number as used. // Mark file number as used.
func (s *session) markFileNum(num uint64) { func (s *session) markFileNum(num uint64) {
num += 1 nextFileNum := num + 1
for { for {
old, x := s.stFileNum, num old, x := s.stNextFileNum, nextFileNum
if old > x { if old > x {
x = old x = old
} }
if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) { if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) {
break break
} }
} }
} }
// Allocate a file number. // Allocate a file number.
func (s *session) allocFileNum() (num uint64) { func (s *session) allocFileNum() uint64 {
return atomic.AddUint64(&s.stFileNum, 1) - 1 return atomic.AddUint64(&s.stNextFileNum, 1) - 1
} }
// Reuse given file number. // Reuse given file number.
func (s *session) reuseFileNum(num uint64) { func (s *session) reuseFileNum(num uint64) {
for { for {
old, x := s.stFileNum, num old, x := s.stNextFileNum, num
if old != x+1 { if old != x+1 {
x = old x = old
} }
if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) { if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) {
break break
} }
} }
@ -126,20 +126,20 @@ func (s *session) reuseFileNum(num uint64) {
// Fill given session record obj with current states; need external // Fill given session record obj with current states; need external
// synchronization. // synchronization.
func (s *session) fillRecord(r *sessionRecord, snapshot bool) { func (s *session) fillRecord(r *sessionRecord, snapshot bool) {
r.setNextNum(s.fileNum()) r.setNextFileNum(s.nextFileNum())
if snapshot { if snapshot {
if !r.has(recJournalNum) { if !r.has(recJournalNum) {
r.setJournalNum(s.stJournalNum) r.setJournalNum(s.stJournalNum)
} }
if !r.has(recSeq) { if !r.has(recSeqNum) {
r.setSeq(s.stSeq) r.setSeqNum(s.stSeqNum)
} }
for level, ik := range s.stCptrs { for level, ik := range s.stCompPtrs {
if ik != nil { if ik != nil {
r.addCompactionPointer(level, ik) r.addCompPtr(level, ik)
} }
} }
@ -158,12 +158,12 @@ func (s *session) recordCommited(r *sessionRecord) {
s.stPrevJournalNum = r.prevJournalNum s.stPrevJournalNum = r.prevJournalNum
} }
if r.has(recSeq) { if r.has(recSeqNum) {
s.stSeq = r.seq s.stSeqNum = r.seqNum
} }
for _, p := range r.compactionPointers { for _, p := range r.compPtrs {
s.stCptrs[p.level] = iKey(p.ikey) s.stCompPtrs[p.level] = iKey(p.ikey)
} }
} }
@ -178,10 +178,11 @@ func (s *session) newManifest(rec *sessionRecord, v *version) (err error) {
jw := journal.NewWriter(writer) jw := journal.NewWriter(writer)
if v == nil { if v == nil {
v = s.version_NB() v = s.version()
defer v.release()
} }
if rec == nil { if rec == nil {
rec = new(sessionRecord) rec = &sessionRecord{numLevel: s.o.GetNumLevel()}
} }
s.fillRecord(rec, true) s.fillRecord(rec, true)
v.fillRecord(rec) v.fillRecord(rec)

View File

@ -125,3 +125,33 @@ type Storage interface {
// Other methods should not be called after the storage has been closed. // Other methods should not be called after the storage has been closed.
Close() error Close() error
} }
// FileInfo wraps basic file info.
type FileInfo struct {
Type FileType
Num uint64
}
func (fi FileInfo) String() string {
switch fi.Type {
case TypeManifest:
return fmt.Sprintf("MANIFEST-%06d", fi.Num)
case TypeJournal:
return fmt.Sprintf("%06d.log", fi.Num)
case TypeTable:
return fmt.Sprintf("%06d.ldb", fi.Num)
case TypeTemp:
return fmt.Sprintf("%06d.tmp", fi.Num)
default:
return fmt.Sprintf("%#x-%d", fi.Type, fi.Num)
}
}
// NewFileInfo creates new FileInfo from the given File. It will returns nil
// if File is nil.
func NewFileInfo(f File) *FileInfo {
if f == nil {
return nil
}
return &FileInfo{f.Type(), f.Num()}
}

View File

@ -11,6 +11,7 @@ import (
"fmt" "fmt"
"io" "io"
"io/ioutil" "io/ioutil"
"math/rand"
"os" "os"
"path/filepath" "path/filepath"
"sync" "sync"
@ -36,6 +37,19 @@ var (
tsNum = 0 tsNum = 0
) )
type tsOp uint
const (
tsOpOpen tsOp = iota
tsOpCreate
tsOpRead
tsOpReadAt
tsOpWrite
tsOpSync
tsOpNum
)
type tsLock struct { type tsLock struct {
ts *testStorage ts *testStorage
r util.Releaser r util.Releaser
@ -54,6 +68,9 @@ type tsReader struct {
func (tr tsReader) Read(b []byte) (n int, err error) { func (tr tsReader) Read(b []byte) (n int, err error) {
ts := tr.tf.ts ts := tr.tf.ts
ts.countRead(tr.tf.Type()) ts.countRead(tr.tf.Type())
if tr.tf.shouldErrLocked(tsOpRead) {
return 0, errors.New("leveldb.testStorage: emulated read error")
}
n, err = tr.Reader.Read(b) n, err = tr.Reader.Read(b)
if err != nil && err != io.EOF { if err != nil && err != io.EOF {
ts.t.Errorf("E: read error, num=%d type=%v n=%d: %v", tr.tf.Num(), tr.tf.Type(), n, err) ts.t.Errorf("E: read error, num=%d type=%v n=%d: %v", tr.tf.Num(), tr.tf.Type(), n, err)
@ -64,6 +81,9 @@ func (tr tsReader) Read(b []byte) (n int, err error) {
func (tr tsReader) ReadAt(b []byte, off int64) (n int, err error) { func (tr tsReader) ReadAt(b []byte, off int64) (n int, err error) {
ts := tr.tf.ts ts := tr.tf.ts
ts.countRead(tr.tf.Type()) ts.countRead(tr.tf.Type())
if tr.tf.shouldErrLocked(tsOpReadAt) {
return 0, errors.New("leveldb.testStorage: emulated readAt error")
}
n, err = tr.Reader.ReadAt(b, off) n, err = tr.Reader.ReadAt(b, off)
if err != nil && err != io.EOF { if err != nil && err != io.EOF {
ts.t.Errorf("E: readAt error, num=%d type=%v off=%d n=%d: %v", tr.tf.Num(), tr.tf.Type(), off, n, err) ts.t.Errorf("E: readAt error, num=%d type=%v off=%d n=%d: %v", tr.tf.Num(), tr.tf.Type(), off, n, err)
@ -83,15 +103,12 @@ type tsWriter struct {
} }
func (tw tsWriter) Write(b []byte) (n int, err error) { func (tw tsWriter) Write(b []byte) (n int, err error) {
ts := tw.tf.ts if tw.tf.shouldErrLocked(tsOpWrite) {
ts.mu.Lock()
defer ts.mu.Unlock()
if ts.emuWriteErr&tw.tf.Type() != 0 {
return 0, errors.New("leveldb.testStorage: emulated write error") return 0, errors.New("leveldb.testStorage: emulated write error")
} }
n, err = tw.Writer.Write(b) n, err = tw.Writer.Write(b)
if err != nil { if err != nil {
ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err) tw.tf.ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err)
} }
return return
} }
@ -99,23 +116,23 @@ func (tw tsWriter) Write(b []byte) (n int, err error) {
func (tw tsWriter) Sync() (err error) { func (tw tsWriter) Sync() (err error) {
ts := tw.tf.ts ts := tw.tf.ts
ts.mu.Lock() ts.mu.Lock()
defer ts.mu.Unlock()
for ts.emuDelaySync&tw.tf.Type() != 0 { for ts.emuDelaySync&tw.tf.Type() != 0 {
ts.cond.Wait() ts.cond.Wait()
} }
if ts.emuSyncErr&tw.tf.Type() != 0 { ts.mu.Unlock()
if tw.tf.shouldErrLocked(tsOpSync) {
return errors.New("leveldb.testStorage: emulated sync error") return errors.New("leveldb.testStorage: emulated sync error")
} }
err = tw.Writer.Sync() err = tw.Writer.Sync()
if err != nil { if err != nil {
ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err) tw.tf.ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err)
} }
return return
} }
func (tw tsWriter) Close() (err error) { func (tw tsWriter) Close() (err error) {
err = tw.Writer.Close() err = tw.Writer.Close()
tw.tf.close("reader", err) tw.tf.close("writer", err)
return return
} }
@ -128,6 +145,16 @@ func (tf tsFile) x() uint64 {
return tf.Num()<<typeShift | uint64(tf.Type()) return tf.Num()<<typeShift | uint64(tf.Type())
} }
func (tf tsFile) shouldErr(op tsOp) bool {
return tf.ts.shouldErr(tf, op)
}
func (tf tsFile) shouldErrLocked(op tsOp) bool {
tf.ts.mu.Lock()
defer tf.ts.mu.Unlock()
return tf.shouldErr(op)
}
func (tf tsFile) checkOpen(m string) error { func (tf tsFile) checkOpen(m string) error {
ts := tf.ts ts := tf.ts
if writer, ok := ts.opens[tf.x()]; ok { if writer, ok := ts.opens[tf.x()]; ok {
@ -164,7 +191,7 @@ func (tf tsFile) Open() (r storage.Reader, err error) {
if err != nil { if err != nil {
return return
} }
if ts.emuOpenErr&tf.Type() != 0 { if tf.shouldErr(tsOpOpen) {
err = errors.New("leveldb.testStorage: emulated open error") err = errors.New("leveldb.testStorage: emulated open error")
return return
} }
@ -191,7 +218,7 @@ func (tf tsFile) Create() (w storage.Writer, err error) {
if err != nil { if err != nil {
return return
} }
if ts.emuCreateErr&tf.Type() != 0 { if tf.shouldErr(tsOpCreate) {
err = errors.New("leveldb.testStorage: emulated create error") err = errors.New("leveldb.testStorage: emulated create error")
return return
} }
@ -232,25 +259,61 @@ type testStorage struct {
cond sync.Cond cond sync.Cond
// Open files, true=writer, false=reader // Open files, true=writer, false=reader
opens map[uint64]bool opens map[uint64]bool
emuOpenErr storage.FileType
emuCreateErr storage.FileType
emuDelaySync storage.FileType emuDelaySync storage.FileType
emuWriteErr storage.FileType
emuSyncErr storage.FileType
ignoreOpenErr storage.FileType ignoreOpenErr storage.FileType
readCnt uint64 readCnt uint64
readCntEn storage.FileType readCntEn storage.FileType
emuErr [tsOpNum]storage.FileType
emuErrOnce [tsOpNum]storage.FileType
emuRandErr [tsOpNum]storage.FileType
emuRandErrProb int
emuErrOnceMap map[uint64]uint
emuRandRand *rand.Rand
} }
func (ts *testStorage) SetOpenErr(t storage.FileType) { func (ts *testStorage) shouldErr(tf tsFile, op tsOp) bool {
if ts.emuErr[op]&tf.Type() != 0 {
return true
} else if ts.emuRandErr[op]&tf.Type() != 0 || ts.emuErrOnce[op]&tf.Type() != 0 {
sop := uint(1) << op
eop := ts.emuErrOnceMap[tf.x()]
if eop&sop == 0 && (ts.emuRandRand.Int()%ts.emuRandErrProb == 0 || ts.emuErrOnce[op]&tf.Type() != 0) {
ts.emuErrOnceMap[tf.x()] = eop | sop
ts.t.Logf("I: emulated error: file=%d type=%v op=%v", tf.Num(), tf.Type(), op)
return true
}
}
return false
}
func (ts *testStorage) SetEmuErr(t storage.FileType, ops ...tsOp) {
ts.mu.Lock() ts.mu.Lock()
ts.emuOpenErr = t for _, op := range ops {
ts.emuErr[op] = t
}
ts.mu.Unlock() ts.mu.Unlock()
} }
func (ts *testStorage) SetCreateErr(t storage.FileType) { func (ts *testStorage) SetEmuErrOnce(t storage.FileType, ops ...tsOp) {
ts.mu.Lock() ts.mu.Lock()
ts.emuCreateErr = t for _, op := range ops {
ts.emuErrOnce[op] = t
}
ts.mu.Unlock()
}
func (ts *testStorage) SetEmuRandErr(t storage.FileType, ops ...tsOp) {
ts.mu.Lock()
for _, op := range ops {
ts.emuRandErr[op] = t
}
ts.mu.Unlock()
}
func (ts *testStorage) SetEmuRandErrProb(prob int) {
ts.mu.Lock()
ts.emuRandErrProb = prob
ts.mu.Unlock() ts.mu.Unlock()
} }
@ -268,18 +331,6 @@ func (ts *testStorage) ReleaseSync(t storage.FileType) {
ts.mu.Unlock() ts.mu.Unlock()
} }
func (ts *testStorage) SetWriteErr(t storage.FileType) {
ts.mu.Lock()
ts.emuWriteErr = t
ts.mu.Unlock()
}
func (ts *testStorage) SetSyncErr(t storage.FileType) {
ts.mu.Lock()
ts.emuSyncErr = t
ts.mu.Unlock()
}
func (ts *testStorage) ReadCounter() uint64 { func (ts *testStorage) ReadCounter() uint64 {
ts.mu.Lock() ts.mu.Lock()
defer ts.mu.Unlock() defer ts.mu.Unlock()
@ -458,6 +509,9 @@ func newTestStorage(t *testing.T) *testStorage {
Storage: stor, Storage: stor,
closeFn: closeFn, closeFn: closeFn,
opens: make(map[uint64]bool), opens: make(map[uint64]bool),
emuErrOnceMap: make(map[uint64]uint),
emuRandErrProb: 0x999,
emuRandRand: rand.New(rand.NewSource(0xfacedead)),
} }
ts.cond.L = &ts.mu ts.cond.L = &ts.mu
return ts return ts

View File

@ -7,6 +7,7 @@
package leveldb package leveldb
import ( import (
"fmt"
"sort" "sort"
"sync/atomic" "sync/atomic"
@ -82,6 +83,18 @@ type tFiles []*tFile
func (tf tFiles) Len() int { return len(tf) } func (tf tFiles) Len() int { return len(tf) }
func (tf tFiles) Swap(i, j int) { tf[i], tf[j] = tf[j], tf[i] } func (tf tFiles) Swap(i, j int) { tf[i], tf[j] = tf[j], tf[i] }
func (tf tFiles) nums() string {
x := "[ "
for i, f := range tf {
if i != 0 {
x += ", "
}
x += fmt.Sprint(f.file.Num())
}
x += " ]"
return x
}
// Returns true if i smallest key is less than j. // Returns true if i smallest key is less than j.
// This used for sort by key in ascending order. // This used for sort by key in ascending order.
func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool { func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool {
@ -149,7 +162,7 @@ func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) boo
i := 0 i := 0
if len(umin) > 0 { if len(umin) > 0 {
// Find the earliest possible internal key for min. // Find the earliest possible internal key for min.
i = tf.searchMax(icmp, newIKey(umin, kMaxSeq, tSeek)) i = tf.searchMax(icmp, newIkey(umin, kMaxSeq, ktSeek))
} }
if i >= len(tf) { if i >= len(tf) {
// Beginning of range is after all files, so no overlap. // Beginning of range is after all files, so no overlap.
@ -159,24 +172,25 @@ func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) boo
} }
// Returns tables whose its key range overlaps with given key range. // Returns tables whose its key range overlaps with given key range.
// If overlapped is true then the search will be expanded to tables that // Range will be expanded if ukey found hop across tables.
// overlaps with each other. // If overlapped is true then the search will be restarted if umax
// expanded.
// The dst content will be overwritten.
func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, overlapped bool) tFiles { func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, overlapped bool) tFiles {
x := len(dst) dst = dst[:0]
for i := 0; i < len(tf); { for i := 0; i < len(tf); {
t := tf[i] t := tf[i]
if t.overlaps(icmp, umin, umax) { if t.overlaps(icmp, umin, umax) {
if overlapped {
// For overlapped files, check if the newly added file has
// expanded the range. If so, restart search.
if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 { if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 {
umin = t.imin.ukey() umin = t.imin.ukey()
dst = dst[:x] dst = dst[:0]
i = 0 i = 0
continue continue
} else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 { } else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 {
umax = t.imax.ukey() umax = t.imax.ukey()
dst = dst[:x] // Restart search if it is overlapped.
if overlapped {
dst = dst[:0]
i = 0 i = 0
continue continue
} }
@ -289,7 +303,7 @@ func (t *tOps) create() (*tWriter, error) {
t: t, t: t,
file: file, file: file,
w: fw, w: fw,
tw: table.NewWriter(fw, t.s.o), tw: table.NewWriter(fw, t.s.o.Options),
}, nil }, nil
} }
@ -337,7 +351,13 @@ func (t *tOps) open(f *tFile) (ch cache.Handle, err error) {
if bc := t.s.o.GetBlockCache(); bc != nil { if bc := t.s.o.GetBlockCache(); bc != nil {
bcacheNS = bc.GetNamespace(num) bcacheNS = bc.GetNamespace(num)
} }
return 1, table.NewReader(r, int64(f.size), bcacheNS, t.bpool, t.s.o) var tr *table.Reader
tr, err = table.NewReader(r, int64(f.size), storage.NewFileInfo(f.file), bcacheNS, t.bpool, t.s.o.Options)
if err != nil {
r.Close()
return 0, nil
}
return 1, tr
}) })
if ch == nil && err == nil { if ch == nil && err == nil {
err = ErrClosed err = ErrClosed
@ -440,28 +460,34 @@ func (w *tWriter) empty() bool {
return w.first == nil return w.first == nil
} }
// Closes the storage.Writer.
func (w *tWriter) close() {
if w.w != nil {
w.w.Close()
w.w = nil
}
}
// Finalizes the table and returns table file. // Finalizes the table and returns table file.
func (w *tWriter) finish() (f *tFile, err error) { func (w *tWriter) finish() (f *tFile, err error) {
defer w.close()
err = w.tw.Close() err = w.tw.Close()
if err != nil { if err != nil {
return return
} }
err = w.w.Sync() err = w.w.Sync()
if err != nil { if err != nil {
w.w.Close()
return return
} }
w.w.Close()
f = newTableFile(w.file, uint64(w.tw.BytesLen()), iKey(w.first), iKey(w.last)) f = newTableFile(w.file, uint64(w.tw.BytesLen()), iKey(w.first), iKey(w.last))
return return
} }
// Drops the table. // Drops the table.
func (w *tWriter) drop() { func (w *tWriter) drop() {
w.w.Close() w.close()
w.file.Remove() w.file.Remove()
w.t.s.reuseFileNum(w.file.Num()) w.t.s.reuseFileNum(w.file.Num())
w.w = nil
w.file = nil w.file = nil
w.tw = nil w.tw = nil
w.first = nil w.first = nil

View File

@ -8,29 +8,41 @@ package table
import ( import (
"encoding/binary" "encoding/binary"
"errors"
"fmt" "fmt"
"io" "io"
"sort" "sort"
"strings" "strings"
"sync" "sync"
"code.google.com/p/snappy-go/snappy" "github.com/syndtr/gosnappy/snappy"
"github.com/syndtr/goleveldb/leveldb/cache" "github.com/syndtr/goleveldb/leveldb/cache"
"github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/filter"
"github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/opt"
"github.com/syndtr/goleveldb/leveldb/storage"
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
var ( var (
ErrNotFound = util.ErrNotFound ErrNotFound = errors.ErrNotFound
ErrReaderReleased = errors.New("leveldb/table: reader released") ErrReaderReleased = errors.New("leveldb/table: reader released")
ErrIterReleased = errors.New("leveldb/table: iterator released") ErrIterReleased = errors.New("leveldb/table: iterator released")
) )
// ErrCorrupted describes a corruption found while reading a table.
type ErrCorrupted struct {
	// Pos is the position (byte offset) of the corrupted region.
	Pos int64
	// Size is the size of the corrupted region.
	Size int64
	// Kind names what is corrupted, e.g. a block kind or the table footer.
	Kind string
	// Reason is a short description of what is wrong.
	Reason string
}

// Error implements the error interface. The message reports Kind, Pos and
// Reason; Size is not part of the message.
func (e *ErrCorrupted) Error() string {
	const layout = "leveldb/table: corruption on %s (pos=%d): %s"
	return fmt.Sprintf(layout, e.Kind, e.Pos, e.Reason)
}
func max(x, y int) int { func max(x, y int) int {
if x > y { if x > y {
return x return x
@ -38,13 +50,19 @@ func max(x, y int) int {
return y return y
} }
// verifyBlockChecksum reports whether the little-endian 32-bit checksum
// stored in the last four bytes of data equals the CRC value computed over
// the bytes that precede it.
//
// Data shorter than four bytes cannot hold a checksum; it is reported as
// invalid rather than triggering a slice-bounds panic.
func verifyBlockChecksum(data []byte) bool {
	n := len(data) - 4
	if n < 0 {
		return false
	}
	stored := binary.LittleEndian.Uint32(data[n:])
	computed := util.NewCRC(data[:n]).Value()
	return stored == computed
}
type block struct { type block struct {
bpool *util.BufferPool bpool *util.BufferPool
bh blockHandle
data []byte data []byte
restartsLen int restartsLen int
restartsOffset int restartsOffset int
// Whether checksum is verified and valid.
checksum bool
} }
func (b *block) seek(cmp comparer.Comparer, rstart, rlimit int, key []byte) (index, offset int, err error) { func (b *block) seek(cmp comparer.Comparer, rstart, rlimit int, key []byte) (index, offset int, err error) {
@ -77,7 +95,7 @@ func (b *block) restartOffset(index int) int {
func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) { func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) {
if offset >= b.restartsOffset { if offset >= b.restartsOffset {
if offset != b.restartsOffset { if offset != b.restartsOffset {
err = errors.New("leveldb/table: Reader: BlockEntry: invalid block (block entries offset not aligned)") err = &ErrCorrupted{Reason: "entries offset not aligned"}
} }
return return
} }
@ -87,7 +105,7 @@ func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error)
m := n0 + n1 + n2 m := n0 + n1 + n2
n = m + int(v1) + int(v2) n = m + int(v1) + int(v2)
if n0 <= 0 || n1 <= 0 || n2 <= 0 || offset+n > b.restartsOffset { if n0 <= 0 || n1 <= 0 || n2 <= 0 || offset+n > b.restartsOffset {
err = errors.New("leveldb/table: Reader: invalid block (block entries corrupted)") err = &ErrCorrupted{Reason: "entries corrupted"}
return return
} }
key = b.data[offset+m : offset+m+int(v1)] key = b.data[offset+m : offset+m+int(v1)]
@ -251,7 +269,7 @@ func (i *blockIter) Next() bool {
for i.offset < i.offsetRealStart { for i.offset < i.offsetRealStart {
key, value, nShared, n, err := i.block.entry(i.offset) key, value, nShared, n, err := i.block.entry(i.offset)
if err != nil { if err != nil {
i.sErr(err) i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err))
return false return false
} }
if n == 0 { if n == 0 {
@ -265,13 +283,13 @@ func (i *blockIter) Next() bool {
if i.offset >= i.offsetLimit { if i.offset >= i.offsetLimit {
i.dir = dirEOI i.dir = dirEOI
if i.offset != i.offsetLimit { if i.offset != i.offsetLimit {
i.sErr(errors.New("leveldb/table: Reader: Next: invalid block (block entries offset not aligned)")) i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned"))
} }
return false return false
} }
key, value, nShared, n, err := i.block.entry(i.offset) key, value, nShared, n, err := i.block.entry(i.offset)
if err != nil { if err != nil {
i.sErr(err) i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err))
return false return false
} }
if n == 0 { if n == 0 {
@ -356,7 +374,7 @@ func (i *blockIter) Prev() bool {
for { for {
key, value, nShared, n, err := i.block.entry(offset) key, value, nShared, n, err := i.block.entry(offset)
if err != nil { if err != nil {
i.sErr(err) i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err))
return false return false
} }
if offset >= i.offsetRealStart { if offset >= i.offsetRealStart {
@ -375,7 +393,7 @@ func (i *blockIter) Prev() bool {
// Stop if target offset reached. // Stop if target offset reached.
if offset >= i.offset { if offset >= i.offset {
if offset != i.offset { if offset != i.offset {
i.sErr(errors.New("leveldb/table: Reader: Prev: invalid block (block entries offset not aligned)")) i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned"))
return false return false
} }
@ -473,7 +491,6 @@ type indexIter struct {
tr *Reader tr *Reader
slice *util.Range slice *util.Range
// Options // Options
checksum bool
fillCache bool fillCache bool
} }
@ -484,28 +501,29 @@ func (i *indexIter) Get() iterator.Iterator {
} }
dataBH, n := decodeBlockHandle(value) dataBH, n := decodeBlockHandle(value)
if n == 0 { if n == 0 {
return iterator.NewEmptyIterator(errors.New("leveldb/table: Reader: invalid table (bad data block handle)")) return iterator.NewEmptyIterator(i.tr.newErrCorruptedBH(i.tr.indexBH, "bad data block handle"))
} }
var slice *util.Range var slice *util.Range
if i.slice != nil && (i.blockIter.isFirst() || i.blockIter.isLast()) { if i.slice != nil && (i.blockIter.isFirst() || i.blockIter.isLast()) {
slice = i.slice slice = i.slice
} }
return i.tr.getDataIterErr(dataBH, slice, i.checksum, i.fillCache) return i.tr.getDataIterErr(dataBH, slice, i.tr.verifyChecksum, i.fillCache)
} }
// Reader is a table reader. // Reader is a table reader.
type Reader struct { type Reader struct {
mu sync.RWMutex mu sync.RWMutex
fi *storage.FileInfo
reader io.ReaderAt reader io.ReaderAt
cache cache.Namespace cache cache.Namespace
err error err error
bpool *util.BufferPool bpool *util.BufferPool
// Options // Options
o *opt.Options
cmp comparer.Comparer cmp comparer.Comparer
filter filter.Filter filter filter.Filter
checksum bool verifyChecksum bool
strictIter bool
dataEnd int64 dataEnd int64
indexBH, filterBH blockHandle indexBH, filterBH blockHandle
@ -513,23 +531,43 @@ type Reader struct {
filterBlock *filterBlock filterBlock *filterBlock
} }
func verifyChecksum(data []byte) bool { func (r *Reader) blockKind(bh blockHandle) string {
n := len(data) - 4 switch bh.offset {
checksum0 := binary.LittleEndian.Uint32(data[n:]) case r.indexBH.offset:
checksum1 := util.NewCRC(data[:n]).Value() return "index-block"
return checksum0 == checksum1 case r.filterBH.offset:
return "filter-block"
default:
return "data-block"
}
} }
func (r *Reader) readRawBlock(bh blockHandle, checksum bool) ([]byte, error) { func (r *Reader) newErrCorrupted(pos, size int64, kind, reason string) error {
return &errors.ErrCorrupted{File: r.fi, Err: &ErrCorrupted{Pos: pos, Size: size, Kind: kind, Reason: reason}}
}
// newErrCorruptedBH builds a corruption error for the block referenced by bh,
// taking position and size from the handle and the kind from r.blockKind.
func (r *Reader) newErrCorruptedBH(bh blockHandle, reason string) error {
	pos, size := int64(bh.offset), int64(bh.length)
	return r.newErrCorrupted(pos, size, r.blockKind(bh), reason)
}
// fixErrCorruptedBH completes a table-level corruption error with the
// location of the block it belongs to.
//
// When err is a *ErrCorrupted its Pos, Size and Kind are filled in from bh
// (the value is modified in place) and it is wrapped in an
// errors.ErrCorrupted that records the reader's file info. Any other error is
// returned unchanged.
func (r *Reader) fixErrCorruptedBH(bh blockHandle, err error) error {
	cerr, ok := err.(*ErrCorrupted)
	if !ok {
		return err
	}
	cerr.Pos = int64(bh.offset)
	cerr.Size = int64(bh.length)
	cerr.Kind = r.blockKind(bh)
	return &errors.ErrCorrupted{File: r.fi, Err: cerr}
}
func (r *Reader) readRawBlock(bh blockHandle, verifyChecksum bool) ([]byte, error) {
data := r.bpool.Get(int(bh.length + blockTrailerLen)) data := r.bpool.Get(int(bh.length + blockTrailerLen))
if _, err := r.reader.ReadAt(data, int64(bh.offset)); err != nil && err != io.EOF { if _, err := r.reader.ReadAt(data, int64(bh.offset)); err != nil && err != io.EOF {
return nil, err return nil, err
} }
if checksum || r.checksum { if verifyChecksum && !verifyBlockChecksum(data) {
if !verifyChecksum(data) {
r.bpool.Put(data) r.bpool.Put(data)
return nil, errors.New("leveldb/table: Reader: invalid block (checksum mismatch)") return nil, r.newErrCorruptedBH(bh, "checksum mismatch")
}
} }
switch data[bh.length] { switch data[bh.length] {
case blockTypeNoCompression: case blockTypeNoCompression:
@ -537,38 +575,40 @@ func (r *Reader) readRawBlock(bh blockHandle, checksum bool) ([]byte, error) {
case blockTypeSnappyCompression: case blockTypeSnappyCompression:
decLen, err := snappy.DecodedLen(data[:bh.length]) decLen, err := snappy.DecodedLen(data[:bh.length])
if err != nil { if err != nil {
return nil, err return nil, r.newErrCorruptedBH(bh, err.Error())
} }
tmp := data decData := r.bpool.Get(decLen)
data, err = snappy.Decode(r.bpool.Get(decLen), tmp[:bh.length]) decData, err = snappy.Decode(decData, data[:bh.length])
r.bpool.Put(tmp) r.bpool.Put(data)
if err != nil { if err != nil {
return nil, err r.bpool.Put(decData)
return nil, r.newErrCorruptedBH(bh, err.Error())
} }
data = decData
default: default:
r.bpool.Put(data) r.bpool.Put(data)
return nil, fmt.Errorf("leveldb/table: Reader: unknown block compression type: %d", data[bh.length]) return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("unknown compression type %#x", data[bh.length]))
} }
return data, nil return data, nil
} }
func (r *Reader) readBlock(bh blockHandle, checksum bool) (*block, error) { func (r *Reader) readBlock(bh blockHandle, verifyChecksum bool) (*block, error) {
data, err := r.readRawBlock(bh, checksum) data, err := r.readRawBlock(bh, verifyChecksum)
if err != nil { if err != nil {
return nil, err return nil, err
} }
restartsLen := int(binary.LittleEndian.Uint32(data[len(data)-4:])) restartsLen := int(binary.LittleEndian.Uint32(data[len(data)-4:]))
b := &block{ b := &block{
bpool: r.bpool, bpool: r.bpool,
bh: bh,
data: data, data: data,
restartsLen: restartsLen, restartsLen: restartsLen,
restartsOffset: len(data) - (restartsLen+1)*4, restartsOffset: len(data) - (restartsLen+1)*4,
checksum: checksum || r.checksum,
} }
return b, nil return b, nil
} }
func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*block, util.Releaser, error) { func (r *Reader) readBlockCached(bh blockHandle, verifyChecksum, fillCache bool) (*block, util.Releaser, error) {
if r.cache != nil { if r.cache != nil {
var err error var err error
ch := r.cache.Get(bh.offset, func() (charge int, value interface{}) { ch := r.cache.Get(bh.offset, func() (charge int, value interface{}) {
@ -576,7 +616,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo
return 0, nil return 0, nil
} }
var b *block var b *block
b, err = r.readBlock(bh, checksum) b, err = r.readBlock(bh, verifyChecksum)
if err != nil { if err != nil {
return 0, nil return 0, nil
} }
@ -586,14 +626,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo
b, ok := ch.Value().(*block) b, ok := ch.Value().(*block)
if !ok { if !ok {
ch.Release() ch.Release()
return nil, nil, errors.New("leveldb/table: Reader: inconsistent block type") return nil, nil, errors.New("leveldb/table: inconsistent block type")
}
if !b.checksum && (r.checksum || checksum) {
if !verifyChecksum(b.data) {
ch.Release()
return nil, nil, errors.New("leveldb/table: Reader: invalid block (checksum mismatch)")
}
b.checksum = true
} }
return b, ch, err return b, ch, err
} else if err != nil { } else if err != nil {
@ -601,7 +634,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo
} }
} }
b, err := r.readBlock(bh, checksum) b, err := r.readBlock(bh, verifyChecksum)
return b, b, err return b, b, err
} }
@ -612,12 +645,12 @@ func (r *Reader) readFilterBlock(bh blockHandle) (*filterBlock, error) {
} }
n := len(data) n := len(data)
if n < 5 { if n < 5 {
return nil, errors.New("leveldb/table: Reader: invalid filter block (too short)") return nil, r.newErrCorruptedBH(bh, "too short")
} }
m := n - 5 m := n - 5
oOffset := int(binary.LittleEndian.Uint32(data[m:])) oOffset := int(binary.LittleEndian.Uint32(data[m:]))
if oOffset > m { if oOffset > m {
return nil, errors.New("leveldb/table: Reader: invalid filter block (invalid offset)") return nil, r.newErrCorruptedBH(bh, "invalid data-offsets offset")
} }
b := &filterBlock{ b := &filterBlock{
bpool: r.bpool, bpool: r.bpool,
@ -647,7 +680,7 @@ func (r *Reader) readFilterBlockCached(bh blockHandle, fillCache bool) (*filterB
b, ok := ch.Value().(*filterBlock) b, ok := ch.Value().(*filterBlock)
if !ok { if !ok {
ch.Release() ch.Release()
return nil, nil, errors.New("leveldb/table: Reader: inconsistent block type") return nil, nil, errors.New("leveldb/table: inconsistent block type")
} }
return b, ch, err return b, ch, err
} else if err != nil { } else if err != nil {
@ -673,25 +706,6 @@ func (r *Reader) getFilterBlock(fillCache bool) (*filterBlock, util.Releaser, er
return r.filterBlock, util.NoopReleaser{}, nil return r.filterBlock, util.NoopReleaser{}, nil
} }
func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, checksum, fillCache bool) iterator.Iterator {
b, rel, err := r.readBlockCached(dataBH, checksum, fillCache)
if err != nil {
return iterator.NewEmptyIterator(err)
}
return r.newBlockIter(b, rel, slice, false)
}
func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, checksum, fillCache bool) iterator.Iterator {
r.mu.RLock()
defer r.mu.RUnlock()
if r.err != nil {
return iterator.NewEmptyIterator(r.err)
}
return r.getDataIter(dataBH, slice, checksum, fillCache)
}
func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Range, inclLimit bool) *blockIter { func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Range, inclLimit bool) *blockIter {
bi := &blockIter{ bi := &blockIter{
tr: r, tr: r,
@ -726,12 +740,31 @@ func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Ran
} }
bi.reset() bi.reset()
if bi.offsetStart > bi.offsetLimit { if bi.offsetStart > bi.offsetLimit {
bi.sErr(errors.New("leveldb/table: Reader: invalid slice range")) bi.sErr(errors.New("leveldb/table: invalid slice range"))
} }
} }
return bi return bi
} }
// getDataIter returns an iterator over the data block referenced by dataBH,
// optionally restricted to slice. The block is obtained through
// readBlockCached; if that fails, an empty iterator carrying the error is
// returned instead.
func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator {
	blk, releaser, err := r.readBlockCached(dataBH, verifyChecksum, fillCache)
	if err == nil {
		return r.newBlockIter(blk, releaser, slice, false)
	}
	return iterator.NewEmptyIterator(err)
}
// getDataIterErr is getDataIter guarded by the reader's state: it holds the
// read lock on r.mu for the duration of the call and, if r.err is set,
// returns an empty iterator carrying that error without reading the block.
func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator {
	r.mu.RLock()
	defer r.mu.RUnlock()

	if err := r.err; err != nil {
		return iterator.NewEmptyIterator(err)
	}
	return r.getDataIter(dataBH, slice, verifyChecksum, fillCache)
}
// NewIterator creates an iterator from the table. // NewIterator creates an iterator from the table.
// //
// Slice allows slicing the iterator to only contains keys in the given // Slice allows slicing the iterator to only contains keys in the given
@ -760,10 +793,9 @@ func (r *Reader) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.It
blockIter: r.newBlockIter(indexBlock, rel, slice, true), blockIter: r.newBlockIter(indexBlock, rel, slice, true),
tr: r, tr: r,
slice: slice, slice: slice,
checksum: ro.GetStrict(opt.StrictBlockChecksum),
fillCache: !ro.GetDontFillCache(), fillCache: !ro.GetDontFillCache(),
} }
return iterator.NewIndexedIterator(index, r.strictIter || ro.GetStrict(opt.StrictIterator), true) return iterator.NewIndexedIterator(index, opt.GetStrict(r.o, ro, opt.StrictReader))
} }
// Find finds key/value pair whose key is greater than or equal to the // Find finds key/value pair whose key is greater than or equal to the
@ -798,7 +830,7 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err
} }
dataBH, n := decodeBlockHandle(index.Value()) dataBH, n := decodeBlockHandle(index.Value())
if n == 0 {
	// Record the corruption on the reader and also assign the named result:
	// setting only r.err leaves this call's err untouched, so the caller
	// would not learn about the bad handle from this call.
	r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle")
	err = r.err
	return
}
if r.filter != nil { if r.filter != nil {
@ -811,7 +843,7 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err
rel.Release() rel.Release()
} }
} }
data := r.getDataIter(dataBH, nil, ro.GetStrict(opt.StrictBlockChecksum), !ro.GetDontFillCache()) data := r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache())
defer data.Release() defer data.Release()
if !data.Seek(key) { if !data.Seek(key) {
err = data.Error() err = data.Error()
@ -877,7 +909,7 @@ func (r *Reader) OffsetOf(key []byte) (offset int64, err error) {
if index.Seek(key) { if index.Seek(key) {
dataBH, n := decodeBlockHandle(index.Value()) dataBH, n := decodeBlockHandle(index.Value())
if n == 0 {
	// Record the corruption on the reader and also assign the named result:
	// setting only r.err leaves this call's err untouched, so the caller
	// would receive an offset without any indication of the bad handle.
	r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle")
	err = r.err
	return
}
offset = int64(dataBH.offset) offset = int64(dataBH.offset)
@ -914,51 +946,56 @@ func (r *Reader) Release() {
} }
// NewReader creates a new initialized table reader for the file. // NewReader creates a new initialized table reader for the file.
// The cache and bpool is optional and can be nil. // The fi, cache and bpool is optional and can be nil.
// //
// The returned table reader instance is goroutine-safe. // The returned table reader instance is goroutine-safe.
func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.BufferPool, o *opt.Options) *Reader { func NewReader(f io.ReaderAt, size int64, fi *storage.FileInfo, cache cache.Namespace, bpool *util.BufferPool, o *opt.Options) (*Reader, error) {
r := &Reader{ r := &Reader{
fi: fi,
reader: f, reader: f,
cache: cache, cache: cache,
bpool: bpool, bpool: bpool,
o: o,
cmp: o.GetComparer(), cmp: o.GetComparer(),
checksum: o.GetStrict(opt.StrictBlockChecksum), verifyChecksum: o.GetStrict(opt.StrictBlockChecksum),
strictIter: o.GetStrict(opt.StrictIterator),
} }
if f == nil { if f == nil {
r.err = errors.New("leveldb/table: Reader: nil file") return nil, errors.New("leveldb/table: nil file")
return r
} }
if size < footerLen { if size < footerLen {
r.err = errors.New("leveldb/table: Reader: invalid table (file size is too small)") r.err = r.newErrCorrupted(0, size, "table", "too small")
return r return r, nil
} }
footerPos := size - footerLen
var footer [footerLen]byte var footer [footerLen]byte
if _, err := r.reader.ReadAt(footer[:], size-footerLen); err != nil && err != io.EOF { if _, err := r.reader.ReadAt(footer[:], footerPos); err != nil && err != io.EOF {
r.err = fmt.Errorf("leveldb/table: Reader: invalid table (could not read footer): %v", err) return nil, err
} }
if string(footer[footerLen-len(magic):footerLen]) != magic { if string(footer[footerLen-len(magic):footerLen]) != magic {
r.err = errors.New("leveldb/table: Reader: invalid table (bad magic number)") r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad magic number")
return r return r, nil
} }
// Decode the metaindex block handle. // Decode the metaindex block handle.
metaBH, n := decodeBlockHandle(footer[:]) metaBH, n := decodeBlockHandle(footer[:])
if n == 0 { if n == 0 {
r.err = errors.New("leveldb/table: Reader: invalid table (bad metaindex block handle)") r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad metaindex block handle")
return r return r, nil
} }
// Decode the index block handle. // Decode the index block handle.
r.indexBH, n = decodeBlockHandle(footer[n:]) r.indexBH, n = decodeBlockHandle(footer[n:])
if n == 0 { if n == 0 {
r.err = errors.New("leveldb/table: Reader: invalid table (bad index block handle)") r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad index block handle")
return r return r, nil
} }
// Read metaindex block. // Read metaindex block.
metaBlock, err := r.readBlock(metaBH, true) metaBlock, err := r.readBlock(metaBH, true)
if err != nil { if err != nil {
if errors.IsCorrupted(err) {
r.err = err r.err = err
return r return r, nil
} else {
return nil, err
}
} }
// Set data end. // Set data end.
r.dataEnd = int64(metaBH.offset) r.dataEnd = int64(metaBH.offset)
@ -995,13 +1032,22 @@ func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.Buf
// Cache index and filter block locally, since we don't have global cache. // Cache index and filter block locally, since we don't have global cache.
if cache == nil { if cache == nil {
r.indexBlock, r.err = r.readBlock(r.indexBH, true) r.indexBlock, err = r.readBlock(r.indexBH, true)
if r.err != nil { if err != nil {
return r if errors.IsCorrupted(err) {
r.err = err
return r, nil
} else {
return nil, err
}
} }
if r.filter != nil { if r.filter != nil {
r.filterBlock, err = r.readFilterBlock(r.filterBH) r.filterBlock, err = r.readFilterBlock(r.filterBH)
if err != nil { if err != nil {
// Only a corrupted filter block is tolerated (the reader then carries on
// without a filter); any other read error aborts. The check has to look
// at the error just returned by readFilterBlock, not at r.err.
if !errors.IsCorrupted(err) {
	return nil, err
}
// Don't use filter then. // Don't use filter then.
r.filter = nil r.filter = nil
r.filterBH = blockHandle{} r.filterBH = blockHandle{}
@ -1009,5 +1055,5 @@ func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.Buf
} }
} }
return r return r, nil
} }

View File

@ -133,9 +133,9 @@ Filter block trailer:
+- 4-bytes -+ +- 4-bytes -+
/ \ / \
+---------------+---------------+---------------+-------------------------+------------------+ +---------------+---------------+---------------+-------------------------------+------------------+
| offset 1 | .... | offset n | filter offset (4-bytes) | base Lg (1-byte) | | data 1 offset | .... | data n offset | data-offsets offset (4-bytes) | base Lg (1-byte) |
+-------------- +---------------+---------------+-------------------------+------------------+ +-------------- +---------------+---------------+-------------------------------+------------------+
NOTE: All fixed-length integer are little-endian. NOTE: All fixed-length integer are little-endian.

View File

@ -59,7 +59,8 @@ var _ = testutil.Defer(func() {
It("Should be able to approximate offset of a key correctly", func() { It("Should be able to approximate offset of a key correctly", func() {
Expect(err).ShouldNot(HaveOccurred()) Expect(err).ShouldNot(HaveOccurred())
tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, o) tr, err := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o)
Expect(err).ShouldNot(HaveOccurred())
CheckOffset := func(key string, expect, threshold int) { CheckOffset := func(key string, expect, threshold int) {
offset, err := tr.OffsetOf([]byte(key)) offset, err := tr.OffsetOf([]byte(key))
Expect(err).ShouldNot(HaveOccurred()) Expect(err).ShouldNot(HaveOccurred())
@ -95,7 +96,7 @@ var _ = testutil.Defer(func() {
tw.Close() tw.Close()
// Opening the table. // Opening the table.
tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, o) tr, _ := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o)
return tableWrapper{tr} return tableWrapper{tr}
} }
Test := func(kv *testutil.KeyValue, body func(r *Reader)) func() { Test := func(kv *testutil.KeyValue, body func(r *Reader)) func() {

View File

@ -12,7 +12,7 @@ import (
"fmt" "fmt"
"io" "io"
"code.google.com/p/snappy-go/snappy" "github.com/syndtr/gosnappy/snappy"
"github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/comparer"
"github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/filter"

View File

@ -12,6 +12,7 @@ import (
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/iterator"
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
@ -110,7 +111,7 @@ func (t *DBTesting) TestAllPresent() {
func (t *DBTesting) TestDeletedKey(key []byte) { func (t *DBTesting) TestDeletedKey(key []byte) {
_, err := t.DB.TestGet(key) _, err := t.DB.TestGet(key)
Expect(err).Should(Equal(util.ErrNotFound), "Get on deleted key %q, %s", key, t.text()) Expect(err).Should(Equal(errors.ErrNotFound), "Get on deleted key %q, %s", key, t.text())
} }
func (t *DBTesting) TestAllDeleted() { func (t *DBTesting) TestAllDeleted() {

View File

@ -13,6 +13,7 @@ import (
. "github.com/onsi/ginkgo" . "github.com/onsi/ginkgo"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
"github.com/syndtr/goleveldb/leveldb/errors"
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
@ -59,7 +60,7 @@ func KeyValueTesting(rnd *rand.Rand, kv KeyValue, p DB, setup func(KeyValue) DB,
} }
rkey, _, err := db.TestFind(key) rkey, _, err := db.TestFind(key)
Expect(err).Should(HaveOccurred(), "Find for key %q yield key %q", key, rkey) Expect(err).Should(HaveOccurred(), "Find for key %q yield key %q", key, rkey)
Expect(err).Should(Equal(util.ErrNotFound)) Expect(err).Should(Equal(errors.ErrNotFound))
} }
}) })
@ -77,7 +78,7 @@ func KeyValueTesting(rnd *rand.Rand, kv KeyValue, p DB, setup func(KeyValue) DB,
if len(key_) > 0 { if len(key_) > 0 {
_, err = db.TestGet(key_) _, err = db.TestGet(key_)
Expect(err).Should(HaveOccurred(), "Error for key %q", key_) Expect(err).Should(HaveOccurred(), "Error for key %q", key_)
Expect(err).Should(Equal(util.ErrNotFound)) Expect(err).Should(Equal(errors.ErrNotFound))
} }
}) })
} }

View File

@ -14,10 +14,10 @@ import (
) )
// shorten abbreviates str for display. Strings of up to eight bytes are
// returned unchanged; longer ones are reduced to their first three and last
// three bytes joined by "..".
func shorten(str string) string {
	const keep = 3
	if n := len(str); n > 8 {
		return str[:keep] + ".." + str[n-keep:]
	}
	return str
}
var bunits = [...]string{"", "Ki", "Mi", "Gi"} var bunits = [...]string{"", "Ki", "Mi", "Gi"}

View File

@ -8,6 +8,7 @@ package util
import ( import (
"fmt" "fmt"
"sync"
"sync/atomic" "sync/atomic"
"time" "time"
) )
@ -24,12 +25,11 @@ type BufferPool struct {
sizeMiss [5]uint32 sizeMiss [5]uint32
sizeHalf [5]uint32 sizeHalf [5]uint32
baseline [4]int baseline [4]int
baselinex0 int
baselinex1 int
baseline0 int baseline0 int
baseline1 int
baseline2 int mu sync.RWMutex
close chan struct{} closed bool
closeC chan struct{}
get uint32 get uint32
put uint32 put uint32
@ -58,6 +58,13 @@ func (p *BufferPool) Get(n int) []byte {
return make([]byte, n) return make([]byte, n)
} }
p.mu.RLock()
defer p.mu.RUnlock()
if p.closed {
return make([]byte, n)
}
atomic.AddUint32(&p.get, 1) atomic.AddUint32(&p.get, 1)
poolNum := p.poolNum(n) poolNum := p.poolNum(n)
@ -153,12 +160,16 @@ func (p *BufferPool) Put(b []byte) {
return return
} }
p.mu.RLock()
defer p.mu.RUnlock()
if p.closed {
return
}
atomic.AddUint32(&p.put, 1) atomic.AddUint32(&p.put, 1)
pool := p.pool[p.poolNum(cap(b))] pool := p.pool[p.poolNum(cap(b))]
defer func() {
recover()
}()
select { select {
case pool <- b: case pool <- b:
default: default:
@ -171,10 +182,12 @@ func (p *BufferPool) Close() {
return return
} }
select { p.mu.Lock()
case p.close <- struct{}{}: if !p.closed {
default: p.closed = true
p.closeC <- struct{}{}
} }
p.mu.Unlock()
} }
func (p *BufferPool) String() string { func (p *BufferPool) String() string {
@ -197,7 +210,8 @@ func (p *BufferPool) drain() {
default: default:
} }
} }
case <-p.close: case <-p.closeC:
close(p.closeC)
for _, ch := range p.pool { for _, ch := range p.pool {
close(ch) close(ch)
} }
@ -214,7 +228,7 @@ func NewBufferPool(baseline int) *BufferPool {
p := &BufferPool{ p := &BufferPool{
baseline0: baseline, baseline0: baseline,
baseline: [...]int{baseline / 4, baseline / 2, baseline * 2, baseline * 4}, baseline: [...]int{baseline / 4, baseline / 2, baseline * 2, baseline * 4},
close: make(chan struct{}, 1), closeC: make(chan struct{}, 1),
} }
for i, cap := range []int{2, 2, 4, 4, 2, 1} { for i, cap := range []int{2, 2, 4, 4, 2, 1} {
p.pool[i] = make(chan []byte, cap) p.pool[i] = make(chan []byte, cap)

View File

@ -12,7 +12,6 @@ import (
) )
var (
	// ErrReleased reports that a resource has already been released.
	ErrReleased = errors.New("leveldb: resource already released")
	// ErrHasReleaser reports that a releaser has already been defined.
	ErrHasReleaser = errors.New("leveldb: releaser already defined")
)

View File

@ -7,7 +7,6 @@
package leveldb package leveldb
import ( import (
"errors"
"sync/atomic" "sync/atomic"
"unsafe" "unsafe"
@ -16,19 +15,6 @@ import (
"github.com/syndtr/goleveldb/leveldb/util" "github.com/syndtr/goleveldb/leveldb/util"
) )
var levelMaxSize [kNumLevels]float64
func init() {
// Precompute max size of each level
for level := range levelMaxSize {
res := float64(10 * 1048576)
for n := level; n > 1; n-- {
res *= 10
}
levelMaxSize[level] = res
}
}
type tSet struct { type tSet struct {
level int level int
table *tFile table *tFile
@ -37,7 +23,7 @@ type tSet struct {
type version struct { type version struct {
s *session s *session
tables [kNumLevels]tFiles tables []tFiles
// Level that should be compacted next and its compaction score. // Level that should be compacted next and its compaction score.
// Score < 1 means compaction is not strictly needed. These fields // Score < 1 means compaction is not strictly needed. These fields
@ -48,10 +34,15 @@ type version struct {
cSeek unsafe.Pointer cSeek unsafe.Pointer
ref int ref int
// Succeeding version.
next *version next *version
} }
func (v *version) release_NB() { func newVersion(s *session) *version {
return &version{s: s, tables: make([]tFiles, s.o.GetNumLevel())}
}
func (v *version) releaseNB() {
v.ref-- v.ref--
if v.ref > 0 { if v.ref > 0 {
return return
@ -77,13 +68,13 @@ func (v *version) release_NB() {
} }
} }
v.next.release_NB() v.next.releaseNB()
v.next = nil v.next = nil
} }
func (v *version) release() { func (v *version) release() {
v.s.vmu.Lock() v.s.vmu.Lock()
v.release_NB() v.releaseNB()
v.s.vmu.Unlock() v.s.vmu.Unlock()
} }
@ -130,10 +121,11 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions) (value []byte, tcomp bool,
tset *tSet tset *tSet
tseek bool tseek bool
l0found bool // Level-0.
l0seq uint64 zfound bool
l0vt vType zseq uint64
l0val []byte zkt kType
zval []byte
) )
err = ErrNotFound err = ErrNotFound
@ -150,55 +142,52 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions) (value []byte, tcomp bool,
} }
} }
ikey__, val_, err_ := v.s.tops.find(t, ikey, ro) fikey, fval, ferr := v.s.tops.find(t, ikey, ro)
switch err_ { switch ferr {
case nil: case nil:
case ErrNotFound: case ErrNotFound:
return true return true
default: default:
err = err_ err = ferr
return false return false
} }
ikey_ := iKey(ikey__) if fukey, fseq, fkt, fkerr := parseIkey(fikey); fkerr == nil {
if seq, vt, ok := ikey_.parseNum(); ok { if v.s.icmp.uCompare(ukey, fukey) == 0 {
if v.s.icmp.uCompare(ukey, ikey_.ukey()) != 0 {
return true
}
if level == 0 { if level == 0 {
if seq >= l0seq { if fseq >= zseq {
l0found = true zfound = true
l0seq = seq zseq = fseq
l0vt = vt zkt = fkt
l0val = val_ zval = fval
} }
} else { } else {
switch vt { switch fkt {
case tVal: case ktVal:
value = val_ value = fval
err = nil err = nil
case tDel: case ktDel:
default: default:
panic("leveldb: invalid internal key type") panic("leveldb: invalid iKey type")
} }
return false return false
} }
}
} else { } else {
err = errors.New("leveldb: internal key corrupted") err = fkerr
return false return false
} }
return true return true
}, func(level int) bool { }, func(level int) bool {
if l0found { if zfound {
switch l0vt { switch zkt {
case tVal: case ktVal:
value = l0val value = zval
err = nil err = nil
case tDel: case ktDel:
default: default:
panic("leveldb: invalid internal key type") panic("leveldb: invalid iKey type")
} }
return false return false
} }
@ -216,13 +205,13 @@ func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []it
its = append(its, it) its = append(its, it)
} }
strict := v.s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator) strict := opt.GetStrict(v.s.o.Options, ro, opt.StrictReader)
for _, tables := range v.tables[1:] { for _, tables := range v.tables[1:] {
if len(tables) == 0 { if len(tables) == 0 {
continue continue
} }
it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict, true) it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict)
its = append(its, it) its = append(its, it)
} }
@ -230,7 +219,7 @@ func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []it
} }
func (v *version) newStaging() *versionStaging { func (v *version) newStaging() *versionStaging {
return &versionStaging{base: v} return &versionStaging{base: v, tables: make([]tablesScratch, v.s.o.GetNumLevel())}
} }
// Spawn a new version based on this version. // Spawn a new version based on this version.
@ -285,12 +274,13 @@ func (v *version) offsetOf(ikey iKey) (n uint64, err error) {
func (v *version) pickLevel(umin, umax []byte) (level int) { func (v *version) pickLevel(umin, umax []byte) (level int) {
if !v.tables[0].overlaps(v.s.icmp, umin, umax, true) { if !v.tables[0].overlaps(v.s.icmp, umin, umax, true) {
var overlaps tFiles var overlaps tFiles
for ; level < kMaxMemCompactLevel; level++ { maxLevel := v.s.o.GetMaxMemCompationLevel()
for ; level < maxLevel; level++ {
if v.tables[level+1].overlaps(v.s.icmp, umin, umax, false) { if v.tables[level+1].overlaps(v.s.icmp, umin, umax, false) {
break break
} }
overlaps = v.tables[level+2].getOverlaps(overlaps, v.s.icmp, umin, umax, false) overlaps = v.tables[level+2].getOverlaps(overlaps, v.s.icmp, umin, umax, false)
if overlaps.size() > kMaxGrandParentOverlapBytes { if overlaps.size() > uint64(v.s.o.GetCompactionGPOverlaps(level)) {
break break
} }
} }
@ -318,9 +308,9 @@ func (v *version) computeCompaction() {
// file size is small (perhaps because of a small write-buffer // file size is small (perhaps because of a small write-buffer
// setting, or very high compression ratios, or lots of // setting, or very high compression ratios, or lots of
// overwrites/deletions). // overwrites/deletions).
score = float64(len(tables)) / kL0_CompactionTrigger score = float64(len(tables)) / float64(v.s.o.GetCompactionL0Trigger())
} else { } else {
score = float64(tables.size()) / levelMaxSize[level] score = float64(tables.size()) / float64(v.s.o.GetCompactionTotalSize(level))
} }
if score > bestScore { if score > bestScore {
@ -337,12 +327,14 @@ func (v *version) needCompaction() bool {
return v.cScore >= 1 || atomic.LoadPointer(&v.cSeek) != nil return v.cScore >= 1 || atomic.LoadPointer(&v.cSeek) != nil
} }
type tablesScratch struct {
added map[uint64]atRecord
deleted map[uint64]struct{}
}
type versionStaging struct { type versionStaging struct {
base *version base *version
tables [kNumLevels]struct { tables []tablesScratch
added map[uint64]ntRecord
deleted map[uint64]struct{}
}
} }
func (p *versionStaging) commit(r *sessionRecord) { func (p *versionStaging) commit(r *sessionRecord) {
@ -367,7 +359,7 @@ func (p *versionStaging) commit(r *sessionRecord) {
tm := &(p.tables[r.level]) tm := &(p.tables[r.level])
if tm.added == nil { if tm.added == nil {
tm.added = make(map[uint64]ntRecord) tm.added = make(map[uint64]atRecord)
} }
tm.added[r.num] = r tm.added[r.num] = r
@ -379,7 +371,7 @@ func (p *versionStaging) commit(r *sessionRecord) {
func (p *versionStaging) finish() *version { func (p *versionStaging) finish() *version {
// Build new version. // Build new version.
nv := &version{s: p.base.s} nv := newVersion(p.base.s)
for level, tm := range p.tables { for level, tm := range p.tables {
btables := p.base.tables[level] btables := p.base.tables[level]
@ -402,7 +394,7 @@ func (p *versionStaging) finish() *version {
// New tables. // New tables.
for _, r := range tm.added { for _, r := range tm.added {
nt = append(nt, r.makeFile(p.base.s)) nt = append(nt, p.base.s.tableFileFromRecord(r))
} }
// Sort tables. // Sort tables.
@ -429,7 +421,7 @@ func (vr *versionReleaser) Release() {
v := vr.v v := vr.v
v.s.vmu.Lock() v.s.vmu.Lock()
if !vr.once { if !vr.once {
v.release_NB() v.releaseNB()
vr.once = true vr.once = true
} }
v.s.vmu.Unlock() v.s.vmu.Unlock()