From db03562d43f8a2a55e829dff754af47f08d3eb6d Mon Sep 17 00:00:00 2001
From: Antony Male <antony.male@gmail.com>
Date: Fri, 5 Jan 2018 18:11:09 +0000
Subject: [PATCH] lib/scanner: Fix UTF-8 normalization on ZFS (fixes #4649)

It turns out that ZFS doesn't do any normalization when storing files,
but does do normalization "as part of any comparison process".

In practice, this seems to mean that if you LStat a normalized filename,
ZFS will return the FileInfo for the un-normalized version of that
filename.

This meant that our test to see whether a separate file with a
normalized version of the filename already exists was failing, as we
were detecting the same file.

The fix is to use os.SameFile, to see whether we're getting the same
FileInfo from the normalized and un-normalized versions of the same
filename.

One complication is that ZFS also seems to apply its magic to os.Rename,
meaning that we can't use it to rename an un-normalized file to its
normalized filename. Instead we have to move via a temporary object. If
the move to the temporary object fails, that's OK, we can skip it and
move on. If the move from the temporary object fails however, I'm not
sure of the best approach: the current one is to leave the temporary
file name as-is, and get Syncthing to syncronize it, so at least we
don't lose the file. I'm not sure if there are any implications of this
however.

As part of reworking normalizePath, I spotted that it appeared to be
returning the wrong thing: the doc and the surrounding code expecting it
to return the normalized filename, but it was returning the
un-normalized one. I fixed this, but it seems suspicious that, if the
previous behaviour was incorrect, noone ever ran afoul of it. Maybe all
filesystems will do some searching and give you a normalized filename if
you request an unnormalized one.

As part of this, I found that TestNormalization was broken: it was
passing, when in fact one of the files it should have verified was
present was missing. Maybe this was related to the above issue with
normalizePath's return value, I'm not sure. Fixed en route.

Kindly tested by @khinsen on the forum, and it appears to work.

GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/4646
---
 lib/fs/basicfs.go        | 12 +++++++
 lib/fs/errorfs.go        |  1 +
 lib/fs/filesystem.go     |  1 +
 lib/fs/tempname.go       |  8 +++--
 lib/scanner/walk.go      | 75 +++++++++++++++++++++++++---------------
 lib/scanner/walk_test.go |  4 +--
 6 files changed, 70 insertions(+), 31 deletions(-)

diff --git a/lib/fs/basicfs.go b/lib/fs/basicfs.go
index 8736d621b..133d032d7 100644
--- a/lib/fs/basicfs.go
+++ b/lib/fs/basicfs.go
@@ -299,6 +299,18 @@ func (f *BasicFilesystem) URI() string {
 	return strings.TrimPrefix(f.root, `\\?\`)
 }
 
+func (f *BasicFilesystem) SameFile(fi1, fi2 FileInfo) bool {
+	// Like os.SameFile, we always return false unless fi1 and fi2 were created
+	// by this package's Stat/Lstat method.
+	f1, ok1 := fi1.(fsFileInfo)
+	f2, ok2 := fi2.(fsFileInfo)
+	if !ok1 || !ok2 {
+		return false
+	}
+
+	return os.SameFile(f1.FileInfo, f2.FileInfo)
+}
+
 // fsFile implements the fs.File interface on top of an os.File
 type fsFile struct {
 	*os.File
diff --git a/lib/fs/errorfs.go b/lib/fs/errorfs.go
index 49ec8ad65..035e6a4fa 100644
--- a/lib/fs/errorfs.go
+++ b/lib/fs/errorfs.go
@@ -42,6 +42,7 @@ func (fs *errorFilesystem) Roots() ([]string, error)
 func (fs *errorFilesystem) Usage(name string) (Usage, error)                            { return Usage{}, fs.err }
 func (fs *errorFilesystem) Type() FilesystemType                                        { return fs.fsType }
 func (fs *errorFilesystem) URI() string                                                 { return fs.uri }
+func (fs *errorFilesystem) SameFile(fi1, fi2 FileInfo) bool                             { return false }
 func (fs *errorFilesystem) Watch(path string, ignore Matcher, ctx context.Context, ignorePerms bool) (<-chan Event, error) {
 	return nil, fs.err
 }
diff --git a/lib/fs/filesystem.go b/lib/fs/filesystem.go
index 346dd2548..15ba5add8 100644
--- a/lib/fs/filesystem.go
+++ b/lib/fs/filesystem.go
@@ -43,6 +43,7 @@ type Filesystem interface {
 	Usage(name string) (Usage, error)
 	Type() FilesystemType
 	URI() string
+	SameFile(fi1, fi2 FileInfo) bool
 }
 
 // The File interface abstracts access to a regular file, being a somewhat
diff --git a/lib/fs/tempname.go b/lib/fs/tempname.go
index c0afdeec1..45ea8b8b9 100644
--- a/lib/fs/tempname.go
+++ b/lib/fs/tempname.go
@@ -46,7 +46,7 @@ func IsTemporary(name string) bool {
 	return false
 }
 
-func TempName(name string) string {
+func TempNameWithPrefix(name, prefix string) string {
 	tdir := filepath.Dir(name)
 	tbase := filepath.Base(name)
 	if len(tbase) > maxFilenameLength {
@@ -54,6 +54,10 @@ func TempName(name string) string {
 		hash.Write([]byte(name))
 		tbase = fmt.Sprintf("%x", hash.Sum(nil))
 	}
-	tname := fmt.Sprintf("%s%s.tmp", TempPrefix, tbase)
+	tname := fmt.Sprintf("%s%s.tmp", prefix, tbase)
 	return filepath.Join(tdir, tname)
 }
+
+func TempName(name string) string {
+	return TempNameWithPrefix(name, TempPrefix)
+}
diff --git a/lib/scanner/walk.go b/lib/scanner/walk.go
index c4831db51..a0bf470a4 100644
--- a/lib/scanner/walk.go
+++ b/lib/scanner/walk.go
@@ -257,7 +257,7 @@ func (w *walker) walkAndHashFiles(ctx context.Context, fchan, dchan chan protoco
 			return skip
 		}
 
-		path, shouldSkip := w.normalizePath(path)
+		path, shouldSkip := w.normalizePath(path, info)
 		if shouldSkip {
 			return skip
 		}
@@ -419,7 +419,7 @@ func (w *walker) walkSymlink(ctx context.Context, relPath string, dchan chan pro
 
 // normalizePath returns the normalized relative path (possibly after fixing
 // it on disk), or skip is true.
-func (w *walker) normalizePath(path string) (normPath string, skip bool) {
+func (w *walker) normalizePath(path string, info fs.FileInfo) (normPath string, skip bool) {
 	if runtime.GOOS == "darwin" {
 		// Mac OS X file names should always be NFD normalized.
 		normPath = norm.NFD.String(path)
@@ -430,33 +430,54 @@ func (w *walker) normalizePath(path string) (normPath string, skip bool) {
 		normPath = norm.NFC.String(path)
 	}
 
-	if path != normPath {
-		// The file name was not normalized.
-
-		if !w.AutoNormalize {
-			// We're not authorized to do anything about it, so complain and skip.
-
-			l.Warnf("File name %q is not in the correct UTF8 normalization form; skipping.", path)
-			return "", true
-		}
-
-		// We will attempt to normalize it.
-		if _, err := w.Filesystem.Lstat(normPath); fs.IsNotExist(err) {
-			// Nothing exists with the normalized filename. Good.
-			if err = w.Filesystem.Rename(path, normPath); err != nil {
-				l.Infof(`Error normalizing UTF8 encoding of file "%s": %v`, path, err)
-				return "", true
-			}
-			l.Infof(`Normalized UTF8 encoding of file name "%s".`, path)
-		} else {
-			// There is something already in the way at the normalized
-			// file name.
-			l.Infof(`File "%s" path has UTF8 encoding conflict with another file; ignoring.`, path)
-			return "", true
-		}
+	if path == normPath {
+		// The file name is already normalized: nothing to do
+		return path, false
 	}
 
-	return path, false
+	if !w.AutoNormalize {
+		// We're not authorized to do anything about it, so complain and skip.
+
+		l.Warnf("File name %q is not in the correct UTF8 normalization form; skipping.", path)
+		return "", true
+	}
+
+	// We will attempt to normalize it.
+	normInfo, err := w.Filesystem.Lstat(normPath)
+	if fs.IsNotExist(err) {
+		// Nothing exists with the normalized filename. Good.
+		if err = w.Filesystem.Rename(path, normPath); err != nil {
+			l.Infof(`Error normalizing UTF8 encoding of file "%s": %v`, path, err)
+			return "", true
+		}
+		l.Infof(`Normalized UTF8 encoding of file name "%s".`, path)
+	} else if w.Filesystem.SameFile(info, normInfo) {
+		// With some filesystems (ZFS), if there is an un-normalized path and you ask whether the normalized
+		// version exists, it responds with true. Therefore we need to check fs.SameFile as well.
+		// In this case, a call to Rename won't do anything, so we have to rename via a temp file.
+
+		// We don't want to use the standard syncthing prefix here, as that will result in the file being ignored
+		// and eventually deleted by Syncthing if the rename back fails.
+
+		tempPath := fs.TempNameWithPrefix(normPath, "")
+		if err = w.Filesystem.Rename(path, tempPath); err != nil {
+			l.Infof(`Error during normalizing UTF8 encoding of file "%s" (renamed to "%s"): %v`, path, tempPath, err)
+			return "", true
+		}
+		if err = w.Filesystem.Rename(tempPath, normPath); err != nil {
+			// I don't ever expect this to happen, but if it does, we should probably tell our caller that the normalized
+			// path is the temp path: that way at least the user's data still gets synced.
+			l.Warnf(`Error renaming "%s" to "%s" while normalizating UTF8 encoding: %v. You will want to rename this file back manually`, tempPath, normPath, err)
+			return tempPath, false
+		}
+	} else {
+		// There is something already in the way at the normalized
+		// file name.
+		l.Infof(`File "%s" path has UTF8 encoding conflict with another file; ignoring.`, path)
+		return "", true
+	}
+
+	return normPath, false
 }
 
 func (w *walker) checkDir() error {
diff --git a/lib/scanner/walk_test.go b/lib/scanner/walk_test.go
index af4906ce7..1ca9d1ce0 100644
--- a/lib/scanner/walk_test.go
+++ b/lib/scanner/walk_test.go
@@ -259,9 +259,9 @@ func TestNormalization(t *testing.T) {
 	files := fileList(tmp).testfiles()
 
 	// We should have one file per combination, plus the directories
-	// themselves
+	// themselves, plus the "testdata/normalization" directory
 
-	expectedNum := numValid*numValid + numValid
+	expectedNum := numValid*numValid + numValid + 1
 	if len(files) != expectedNum {
 		t.Errorf("Expected %d files, got %d", expectedNum, len(files))
 	}