Automatically fix file name normalization errors (fixes #430)

This commit is contained in:
Jakob Borg 2015-03-18 23:54:50 +01:00
parent 75523556e8
commit 8311162be3
4 changed files with 168 additions and 11 deletions

View File

@ -50,6 +50,7 @@ type FolderConfiguration struct {
ReadOnly bool `xml:"ro,attr" json:"readOnly"`
RescanIntervalS int `xml:"rescanIntervalS,attr" json:"rescanIntervalS" default:"60"`
IgnorePerms bool `xml:"ignorePerms,attr" json:"ignorePerms"`
AutoNormalize bool `xml:"autoNormalize,attr" json:"autoNormalize" default:"true"`
Versioning VersioningConfiguration `xml:"versioning" json:"versioning"`
LenientMtimes bool `xml:"lenientMtimes" json:"lenientMTimes"`
Copiers int `xml:"copiers" json:"copiers" default:"1"` // This defines how many files are handled concurrently.

View File

@ -1139,15 +1139,16 @@ func (m *Model) ScanFolderSub(folder, sub string) error {
}
w := &scanner.Walker{
Dir: folderCfg.Path,
Sub: sub,
Matcher: ignores,
BlockSize: protocol.BlockSize,
TempNamer: defTempNamer,
TempLifetime: time.Duration(m.cfg.Options().KeepTemporariesH) * time.Hour,
CurrentFiler: cFiler{m, folder},
IgnorePerms: folderCfg.IgnorePerms,
Hashers: folderCfg.Hashers,
Dir: folderCfg.Path,
Sub: sub,
Matcher: ignores,
BlockSize: protocol.BlockSize,
TempNamer: defTempNamer,
TempLifetime: time.Duration(m.cfg.Options().KeepTemporariesH) * time.Hour,
CurrentFiler: cFiler{m, folder},
IgnorePerms: folderCfg.IgnorePerms,
AutoNormalize: folderCfg.AutoNormalize,
Hashers: folderCfg.Hashers,
}
runner.setState(FolderScanning)

View File

@ -13,6 +13,7 @@ import (
"runtime"
"strings"
"time"
"unicode/utf8"
"github.com/syncthing/protocol"
"github.com/syncthing/syncthing/internal/ignore"
@ -55,6 +56,9 @@ type Walker struct {
// detected. Scanned files will get zero permission bits and the
// NoPermissionBits flag set.
IgnorePerms bool
// When AutoNormalize is set, file names that are in UTF8 but incorrect
// normalization form will be corrected.
AutoNormalize bool
// Number of routines to use for hashing
Hashers int
}
@ -149,11 +153,63 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
return nil
}
if (runtime.GOOS == "linux" || runtime.GOOS == "windows") && !norm.NFC.IsNormalString(rn) {
l.Warnf("File %q contains non-NFC UTF-8 sequences and cannot be synced. Consider renaming.", rn)
if !utf8.ValidString(rn) {
l.Warnf("File name %q is not in UTF8 encoding; skipping.", rn)
if info.IsDir() {
return filepath.SkipDir
}
return nil
}
var normalizedRn string
if runtime.GOOS == "darwin" {
// Mac OS X file names should always be NFD normalized.
normalizedRn = norm.NFD.String(rn)
} else {
// Every other OS in the known universe uses NFC or just plain
// doesn't bother to define an encoding. In our case *we* do care,
// so we enforce NFC regardless.
normalizedRn = norm.NFC.String(rn)
}
if rn != normalizedRn {
// The file name was not normalized.
if !w.AutoNormalize {
// We're not authorized to do anything about it, so complain and skip.
l.Warnf("File name %q is not in the correct UTF8 normalization form; skipping.", rn)
if info.IsDir() {
return filepath.SkipDir
}
return nil
}
// We will attempt to normalize it.
normalizedPath := filepath.Join(w.Dir, normalizedRn)
if _, err := os.Lstat(normalizedPath); os.IsNotExist(err) {
// Nothing exists with the normalized filename. Good.
if err = os.Rename(p, normalizedPath); err != nil {
l.Infof(`Error normalizing UTF8 encoding of file "%s": %v`, rn, err)
if info.IsDir() {
return filepath.SkipDir
}
return nil
}
l.Infof(`Normalized UTF8 encoding of file name "%s".`, rn)
} else {
// There is something already in the way at the normalized
// file name.
l.Infof(`File "%s" has UTF8 encoding conflict with another file; ignoring.`, rn)
if info.IsDir() {
return filepath.SkipDir
}
return nil
}
rn = normalizedRn
}
// Index wise symlinks are always files, regardless of what the target
// is, because symlinks carry their target path as their content.
if info.Mode()&os.ModeSymlink == os.ModeSymlink {

View File

@ -9,14 +9,17 @@ package scanner
import (
"bytes"
"fmt"
"os"
"path/filepath"
"reflect"
"runtime"
rdebug "runtime/debug"
"sort"
"testing"
"github.com/syncthing/protocol"
"github.com/syncthing/syncthing/internal/ignore"
"golang.org/x/text/unicode/norm"
)
type testfile struct {
@ -181,6 +184,102 @@ func TestVerify(t *testing.T) {
}
}
func TestNormalization(t *testing.T) {
if runtime.GOOS == "darwin" {
t.Skip("Normalization test not possible on darwin")
return
}
os.RemoveAll("testdata/normalization")
defer os.RemoveAll("testdata/normalization")
tests := []string{
"0-A", // ASCII A -- accepted
"1-\xC3\x84", // NFC 'Ä' -- conflicts with the entry below, accepted
"1-\x41\xCC\x88", // NFD 'Ä' -- conflicts with the entry above, ignored
"2-\xC3\x85", // NFC 'Å' -- accepted
"3-\x41\xCC\x83", // NFD 'Ã' -- converted to NFC
"4-\xE2\x98\x95", // U+2615 HOT BEVERAGE (☕) -- accepted
"5-\xCD\xE2", // EUC-CN "wài" (外) -- ignored (not UTF8)
}
numInvalid := 2
numValid := len(tests) - numInvalid
for _, s1 := range tests {
// Create a directory for each of the interesting strings above
if err := os.MkdirAll(filepath.Join("testdata/normalization", s1), 0755); err != nil {
t.Fatal(err)
}
for _, s2 := range tests {
// Within each dir, create a file with each of the interesting
// file names. Ensure that the file doesn't exist when it's
// created. This detects and fails if there's file name
// normalization stuff at the filesystem level.
if fd, err := os.OpenFile(filepath.Join("testdata/normalization", s1, s2), os.O_CREATE|os.O_EXCL, 0644); err != nil {
t.Fatal(err)
} else {
fd.WriteString("test")
fd.Close()
}
}
}
// We can normalize a directory name, but we can't descend into it in the
// same pass due to how filepath.Walk works. So we run the scan twice to
// make sure it all gets done. In production, things will be correct
// eventually...
_, err := walkDir("testdata/normalization")
if err != nil {
t.Fatal(err)
}
tmp, err := walkDir("testdata/normalization")
if err != nil {
t.Fatal(err)
}
files := fileList(tmp).testfiles()
// We should have one file per combination, plus the directories
// themselves
expectedNum := numValid*numValid + numValid
if len(files) != expectedNum {
t.Errorf("Expected %d files, got %d", expectedNum, len(files))
}
// The file names should all be in NFC form.
for _, f := range files {
t.Logf("%q (% x) %v", f.name, f.name, norm.NFC.IsNormalString(f.name))
if !norm.NFC.IsNormalString(f.name) {
t.Errorf("File name %q is not NFC normalized", f.name)
}
}
}
func walkDir(dir string) ([]protocol.FileInfo, error) {
w := Walker{
Dir: dir,
BlockSize: 128 * 1024,
AutoNormalize: true,
}
fchan, err := w.Walk()
if err != nil {
return nil, err
}
var tmp []protocol.FileInfo
for f := range fchan {
tmp = append(tmp, f)
}
sort.Sort(fileList(tmp))
return tmp, nil
}
type fileList []protocol.FileInfo
func (l fileList) Len() int {