Fast parallel file hasher (fixes #293)

This commit is contained in:
Jakob Borg 2014-07-30 20:10:46 +02:00
parent c47aebdd2a
commit 2be1218aa3
3 changed files with 91 additions and 32 deletions

65
scanner/blockqueue.go Normal file
View File

@ -0,0 +1,65 @@
// Copyright (C) 2014 Jakob Borg and Contributors (see the CONTRIBUTORS file).
// All rights reserved. Use of this source code is governed by an MIT-style
// license that can be found in the LICENSE file.
package scanner
import (
"os"
"path/filepath"
"sync"
"github.com/calmh/syncthing/protocol"
)
// newParallelHasher starts a parallel hasher: worker goroutines read FileInfo
// structures from the inbox, hash each file to populate the Blocks element
// and send the result to the outbox. The outbox is closed once the inbox has
// been closed and all items have been handled. The call itself returns
// immediately; all work happens in the goroutines it starts.
//
// A worker count below one is treated as one. Without that guard, zero
// workers would close the outbox at once while nothing drains the inbox, and
// a negative count would make the WaitGroup panic.
func newParallelHasher(dir string, blockSize, workers int, outbox, inbox chan protocol.FileInfo) {
	if workers < 1 {
		workers = 1
	}

	var wg sync.WaitGroup
	wg.Add(workers)
	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()
			hashFile(dir, blockSize, outbox, inbox)
		}()
	}

	// Close the outbox only after every worker has returned, so that no
	// worker can ever send on a closed channel.
	go func() {
		wg.Wait()
		close(outbox)
	}()
}
// hashFile is the worker loop of the parallel hasher. It receives FileInfo
// values from inbox until that channel is closed, and forwards each one to
// outbox. Directories and deleted entries are forwarded as they are; for
// everything else the file at dir/Name is opened and hashed in blocks of
// blockSize, and the resulting block list is stored in Blocks before
// forwarding. An entry whose file cannot be opened or hashed is dropped and
// never reaches the outbox; the error is only logged when debug is enabled.
func hashFile(dir string, blockSize int, outbox, inbox chan protocol.FileInfo) {
	for info := range inbox {
		needsHash := !protocol.IsDirectory(info.Flags) && !protocol.IsDeleted(info.Flags)
		if needsHash {
			fullPath := filepath.Join(dir, info.Name)
			file, openErr := os.Open(fullPath)
			if openErr != nil {
				if debug {
					l.Debugln("open:", openErr)
				}
				continue
			}

			// Close right after hashing rather than deferring: a defer
			// inside this loop would keep every file open until the
			// worker exits.
			hashes, hashErr := Blocks(file, blockSize)
			file.Close()
			if hashErr != nil {
				if debug {
					l.Debugln("hash error:", info.Name, hashErr)
				}
				continue
			}

			info.Blocks = hashes
		}

		outbox <- info
	}
}

View File

@ -13,7 +13,6 @@ import (
"path/filepath" "path/filepath"
"runtime" "runtime"
"strings" "strings"
"time"
"code.google.com/p/go.text/unicode/norm" "code.google.com/p/go.text/unicode/norm"
"github.com/calmh/syncthing/lamport" "github.com/calmh/syncthing/lamport"
@ -60,18 +59,20 @@ type CurrentFiler interface {
// Walk returns the list of files found in the local repository by scanning the // Walk returns the list of files found in the local repository by scanning the
// file system. Files are blockwise hashed. // file system. Files are blockwise hashed.
func (w *Walker) Walk() (files chan protocol.FileInfo, ignore map[string][]string, err error) { func (w *Walker) Walk() (chan protocol.FileInfo, map[string][]string, error) {
if debug { if debug {
l.Debugln("Walk", w.Dir, w.BlockSize, w.IgnoreFile) l.Debugln("Walk", w.Dir, w.BlockSize, w.IgnoreFile)
} }
err = checkDir(w.Dir) err := checkDir(w.Dir)
if err != nil { if err != nil {
return return nil, nil, err
} }
ignore = make(map[string][]string) ignore := make(map[string][]string)
files = make(chan protocol.FileInfo) files := make(chan protocol.FileInfo)
hashedFiles := make(chan protocol.FileInfo)
newParallelHasher(w.Dir, w.BlockSize, runtime.NumCPU(), hashedFiles, files)
hashFiles := w.walkAndHashFiles(files, ignore) hashFiles := w.walkAndHashFiles(files, ignore)
go func() { go func() {
@ -80,7 +81,7 @@ func (w *Walker) Walk() (files chan protocol.FileInfo, ignore map[string][]strin
close(files) close(files)
}() }()
return return hashedFiles, ignore, nil
} }
// CleanTempFiles removes all files that match the temporary filename pattern. // CleanTempFiles removes all files that match the temporary filename pattern.
@ -219,40 +220,17 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo, ign map[string][
} }
} }
fd, err := os.Open(p)
if err != nil {
if debug {
l.Debugln("open:", p, err)
}
return nil
}
defer fd.Close()
t0 := time.Now()
blocks, err := Blocks(fd, w.BlockSize)
if err != nil {
if debug {
l.Debugln("hash error:", rn, err)
}
return nil
}
if debug {
t1 := time.Now()
l.Debugln("hashed:", rn, ";", len(blocks), "blocks;", info.Size(), "bytes;", int(float64(info.Size())/1024/t1.Sub(t0).Seconds()), "KB/s")
}
var flags = uint32(info.Mode() & os.ModePerm) var flags = uint32(info.Mode() & os.ModePerm)
if w.IgnorePerms { if w.IgnorePerms {
flags = protocol.FlagNoPermBits | 0666 flags = protocol.FlagNoPermBits | 0666
} }
f := protocol.FileInfo{
fchan <- protocol.FileInfo{
Name: rn, Name: rn,
Version: lamport.Default.Tick(0), Version: lamport.Default.Tick(0),
Flags: flags, Flags: flags,
Modified: info.ModTime().Unix(), Modified: info.ModTime().Unix(),
Blocks: blocks,
} }
fchan <- f
} }
return nil return nil

View File

@ -7,6 +7,7 @@ package scanner
import ( import (
"fmt" "fmt"
"reflect" "reflect"
"sort"
"testing" "testing"
"time" "time"
@ -39,6 +40,7 @@ func TestWalk(t *testing.T) {
for f := range fchan { for f := range fchan {
files = append(files, f) files = append(files, f)
} }
sort.Sort(fileList(files))
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
@ -133,3 +135,17 @@ func TestIgnore(t *testing.T) {
} }
} }
} }
// fileList is a slice of FileInfo that implements sort.Interface, ordering
// entries by ascending Name.
type fileList []protocol.FileInfo

// Len reports the number of entries in the list.
func (fl fileList) Len() int { return len(fl) }

// Less reports whether the entry at index i has a name that sorts before the
// name of the entry at index j.
func (fl fileList) Less(i, j int) bool {
	left, right := fl[i].Name, fl[j].Name
	return left < right
}

// Swap exchanges the entries at indexes i and j.
func (fl fileList) Swap(i, j int) {
	tmp := fl[i]
	fl[i] = fl[j]
	fl[j] = tmp
}