Merge pull request #1488 from calmh/utf8

Automatically fix file name normalization errors (fixes #430)
2025-01-24 15:38:27 +00:00 · 2015-03-19 08:33:08 +00:00 · 2015-03-19 08:33:08 +00:00 · f568e76fd4
commit f568e76fd4
parent 75523556e8 e947223aaa
4 changed files with 173 additions and 32 deletions
--- a/internal/config/config.go
+++ b/internal/config/config.go
@ -50,6 +50,7 @@ type FolderConfiguration struct {
 	ReadOnly        bool                        `xml:"ro,attr" json:"readOnly"`
 	RescanIntervalS int                         `xml:"rescanIntervalS,attr" json:"rescanIntervalS" default:"60"`
 	IgnorePerms     bool                        `xml:"ignorePerms,attr" json:"ignorePerms"`
 	AutoNormalize   bool                        `xml:"autoNormalize,attr" json:"autoNormalize" default:"true"`
 	Versioning      VersioningConfiguration     `xml:"versioning" json:"versioning"`
 	LenientMtimes   bool                        `xml:"lenientMtimes" json:"lenientMTimes"`
 	Copiers         int                         `xml:"copiers" json:"copiers" default:"1"`  // This defines how many files are handled concurrently.
--- a/internal/model/model.go
+++ b/internal/model/model.go
@ -1147,6 +1147,7 @@ func (m *Model) ScanFolderSub(folder, sub string) error {
 		TempLifetime:  time.Duration(m.cfg.Options().KeepTemporariesH) * time.Hour,
 		CurrentFiler:  cFiler{m, folder},
 		IgnorePerms:   folderCfg.IgnorePerms,
 		AutoNormalize: folderCfg.AutoNormalize,
 		Hashers:       folderCfg.Hashers,
 	}
--- a/internal/scanner/walk.go
+++ b/internal/scanner/walk.go
@ -13,6 +13,7 @@ import (
 	"runtime"
 	"strings"
 	"time"
 	"unicode/utf8"
 	"github.com/syncthing/protocol"
 	"github.com/syncthing/syncthing/internal/ignore"
@ -55,6 +56,9 @@ type Walker struct {
 	// detected. Scanned files will get zero permission bits and the
 	// NoPermissionBits flag set.
 	IgnorePerms bool
 	// When AutoNormalize is set, file names that are in UTF8 but incorrect
 	// normalization form will be corrected.
 	AutoNormalize bool
 	// Number of routines to use for hashing
 	Hashers int
 }
@ -104,11 +108,18 @@ func (w *Walker) Walk() (chan protocol.FileInfo, error) {
 func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFunc {
 	now := time.Now()
 	return func(p string, info os.FileInfo, err error) error {
 		// Return value used when we are returning early and don't want to
 		// process the item. For directories, this means do-not-descend.
 		var skip error // nil
 		if info.IsDir() {
 			skip = filepath.SkipDir
 		}
 		if err != nil {
 			if debug {
 				l.Debugln("error:", p, info, err)
 			}
-			return nil
+			return skip
 		}
 		rn, err := filepath.Rel(w.Dir, p)
@ -116,7 +127,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 			if debug {
 				l.Debugln("rel error:", p, err)
 			}
-			return nil
+			return skip
 		}
 		if rn == "." {
@ -143,33 +154,62 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 			if debug {
 				l.Debugln("ignored:", rn)
 			}
-			if info.IsDir() {
+			return skip
 				return filepath.SkipDir
 			}
 			return nil
 		}
-		if (runtime.GOOS == "linux" || runtime.GOOS == "windows") && !norm.NFC.IsNormalString(rn) {
+		if !utf8.ValidString(rn) {
-			l.Warnf("File %q contains non-NFC UTF-8 sequences and cannot be synced. Consider renaming.", rn)
+			l.Warnf("File name %q is not in UTF8 encoding; skipping.", rn)
-			return nil
+			return skip
 		}
 		var normalizedRn string
 		if runtime.GOOS == "darwin" {
 			// Mac OS X file names should always be NFD normalized.
 			normalizedRn = norm.NFD.String(rn)
 		} else {
 			// Every other OS in the known universe uses NFC or just plain
 			// doesn't bother to define an encoding. In our case *we* do care,
 			// so we enforce NFC regardless.
 			normalizedRn = norm.NFC.String(rn)
 		}
 		if rn != normalizedRn {
 			// The file name was not normalized.
 			if !w.AutoNormalize {
 				// We're not authorized to do anything about it, so complain and skip.
 				l.Warnf("File name %q is not in the correct UTF8 normalization form; skipping.", rn)
 				return skip
 			}
 			// We will attempt to normalize it.
 			normalizedPath := filepath.Join(w.Dir, normalizedRn)
 			if _, err := os.Lstat(normalizedPath); os.IsNotExist(err) {
 				// Nothing exists with the normalized filename. Good.
 				if err = os.Rename(p, normalizedPath); err != nil {
 					l.Infof(`Error normalizing UTF8 encoding of file "%s": %v`, rn, err)
 					return skip
 				}
 				l.Infof(`Normalized UTF8 encoding of file name "%s".`, rn)
 			} else {
 				// There is something already in the way at the normalized
 				// file name.
 				l.Infof(`File "%s" has UTF8 encoding conflict with another file; ignoring.`, rn)
 				return skip
 			}
 			rn = normalizedRn
 		}
 		// Index wise symlinks are always files, regardless of what the target
 		// is, because symlinks carry their target path as their content.
 		if info.Mode()&os.ModeSymlink == os.ModeSymlink {
 			var rval error
 			// If the target is a directory, do NOT descend down there. This
 			// will cause files to get tracked, and removing the symlink will
-			// as a result remove files in their real location. But do not
+			// as a result remove files in their real location.
 			// SkipDir if the target is not a directory, as it will stop
 			// scanning the current directory.
 			if info.IsDir() {
 				rval = filepath.SkipDir
 			}
 			// If we don't support symlinks, skip.
 			if !symlinks.Supported {
-				return rval
+				return skip
 			}
 			// We always rehash symlinks as they have no modtime or
@ -183,7 +223,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 				if debug {
 					l.Debugln("readlink error:", p, err)
 				}
-				return rval
+				return skip
 			}
 			blocks, err := Blocks(strings.NewReader(target), w.BlockSize, 0)
@ -191,7 +231,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 				if debug {
 					l.Debugln("hash link error:", p, err)
 				}
-				return rval
+				return skip
 			}
 			if w.CurrentFiler != nil {
@ -204,7 +244,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 				//  - the block list (i.e. hash of target) was the same
 				cf, ok := w.CurrentFiler.CurrentFile(rn)
 				if ok && !cf.IsDeleted() && cf.IsSymlink() && !cf.IsInvalid() && SymlinkTypeEqual(flags, cf.Flags) && BlocksEqual(cf.Blocks, blocks) {
-					return rval
+					return skip
 				}
 			}
@ -222,7 +262,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 			fchan <- f
-			return rval
+			return skip
 		}
 		if info.Mode().IsDir() {
--- a/internal/scanner/walk_test.go
+++ b/internal/scanner/walk_test.go
@ -9,14 +9,17 @@ package scanner
 import (
 	"bytes"
 	"fmt"
 	"os"
 	"path/filepath"
 	"reflect"
 	"runtime"
 	rdebug "runtime/debug"
 	"sort"
 	"testing"
 	"github.com/syncthing/protocol"
 	"github.com/syncthing/syncthing/internal/ignore"
 	"golang.org/x/text/unicode/norm"
 )
 type testfile struct {
@ -181,6 +184,102 @@ func TestVerify(t *testing.T) {
 	}
 }
 func TestNormalization(t *testing.T) {
 	if runtime.GOOS == "darwin" {
 		t.Skip("Normalization test not possible on darwin")
 		return
 	}
 	os.RemoveAll("testdata/normalization")
 	defer os.RemoveAll("testdata/normalization")
 	tests := []string{
 		"0-A",            // ASCII A -- accepted
 		"1-\xC3\x84",     // NFC 'Ä' -- conflicts with the entry below, accepted
 		"1-\x41\xCC\x88", // NFD 'Ä' -- conflicts with the entry above, ignored
 		"2-\xC3\x85",     // NFC 'Å' -- accepted
 		"3-\x41\xCC\x83", // NFD 'Ã' -- converted to NFC
 		"4-\xE2\x98\x95", // U+2615 HOT BEVERAGE (☕) -- accepted
 		"5-\xCD\xE2",     // EUC-CN "wài" (外) -- ignored (not UTF8)
 	}
 	numInvalid := 2
 	numValid := len(tests) - numInvalid
 	for _, s1 := range tests {
 		// Create a directory for each of the interesting strings above
 		if err := os.MkdirAll(filepath.Join("testdata/normalization", s1), 0755); err != nil {
 			t.Fatal(err)
 		}
 		for _, s2 := range tests {
 			// Within each dir, create a file with each of the interesting
 			// file names. Ensure that the file doesn't exist when it's
 			// created. This detects and fails if there's file name
 			// normalization stuff at the filesystem level.
 			if fd, err := os.OpenFile(filepath.Join("testdata/normalization", s1, s2), os.O_CREATE|os.O_EXCL, 0644); err != nil {
 				t.Fatal(err)
 			} else {
 				fd.WriteString("test")
 				fd.Close()
 			}
 		}
 	}
 	// We can normalize a directory name, but we can't descend into it in the
 	// same pass due to how filepath.Walk works. So we run the scan twice to
 	// make sure it all gets done. In production, things will be correct
 	// eventually...
 	_, err := walkDir("testdata/normalization")
 	if err != nil {
 		t.Fatal(err)
 	}
 	tmp, err := walkDir("testdata/normalization")
 	if err != nil {
 		t.Fatal(err)
 	}
 	files := fileList(tmp).testfiles()
 	// We should have one file per combination, plus the directories
 	// themselves
 	expectedNum := numValid*numValid + numValid
 	if len(files) != expectedNum {
 		t.Errorf("Expected %d files, got %d", expectedNum, len(files))
 	}
 	// The file names should all be in NFC form.
 	for _, f := range files {
 		t.Logf("%q (% x) %v", f.name, f.name, norm.NFC.IsNormalString(f.name))
 		if !norm.NFC.IsNormalString(f.name) {
 			t.Errorf("File name %q is not NFC normalized", f.name)
 		}
 	}
 }
 func walkDir(dir string) ([]protocol.FileInfo, error) {
 	w := Walker{
 		Dir:           dir,
 		BlockSize:     128 * 1024,
 		AutoNormalize: true,
 	}
 	fchan, err := w.Walk()
 	if err != nil {
 		return nil, err
 	}
 	var tmp []protocol.FileInfo
 	for f := range fchan {
 		tmp = append(tmp, f)
 	}
 	sort.Sort(fileList(tmp))
 	return tmp, nil
 }
 type fileList []protocol.FileInfo
 func (l fileList) Len() int {