lib/fs: Optimize UnicodeLowercase (#6979)

Most notably, it now detects all-lowercase files and returns these as-is. The tests have been expanded with two cases and are now used as a benchmark (admittedly a rather trivial one).

name                            old time/op  new time/op  delta
UnicodeLowercaseMaybeChange-8   4.59µs ± 2%  4.57µs ± 1%    ~     (p=0.197 n=10+10)
UnicodeLowercaseNoChange-8      3.26µs ± 1%  3.09µs ± 1%  -5.27%  (p=0.000 n=9+10)
2025-01-22 14:48:30 +00:00 · 2020-09-11 09:16:10 +02:00 · 2020-09-11 09:16:10 +02:00 · 3e24d82513
commit 3e24d82513
parent 08bebbe59b
2 changed files with 90 additions and 38 deletions
--- a/lib/fs/folding.go
+++ b/lib/fs/folding.go
@@ -7,13 +7,39 @@
 package fs

 import (
+	"strings"
 	"unicode"
+	"unicode/utf8"
 )

 func UnicodeLowercase(s string) string {
-	rs := []rune(s)
-	for i, r := range rs {
-		rs[i] = unicode.ToLower(unicode.ToUpper(r))
+	i := firstCaseChange(s) // byte index of the first rune the fold would change, or -1
+	if i == -1 {
+		return s // nothing to fold: hand back the input itself, no copy is built
 	}
-	return string(rs)
+
+	var rs strings.Builder
+	// WriteRune always reserves utf8.UTFMax bytes for non-ASCII runes,
+	// even if it doesn't need all that space. Overallocate now so that
+	// it need not reallocate as long as the result is no longer than s.
+	rs.Grow(utf8.UTFMax - 1 + len(s))
+	rs.WriteString(s[:i]) // everything before i is unchanged by the fold; copy it verbatim
+
+	for _, r := range s[i:] {
+		rs.WriteRune(unicode.ToLower(unicode.ToUpper(r))) // upper first, so variant lower-case forms (e.g. final sigma) fold to one rune
+	}
+	return rs.String()
+}
+
+// firstCaseChange returns the byte index of the first rune r in s such that lower(upper(r)) != r, or -1 if there is none.
+func firstCaseChange(s string) int {
+	for i, r := range s {
+		if r <= unicode.MaxASCII && (r < 'A' || r > 'Z') {
+			continue // fast path: ASCII outside 'A'-'Z' is treated as unchanged, skipping the unicode calls
+		}
+		if unicode.ToLower(unicode.ToUpper(r)) != r {
+			return i
+		}
+	}
+	return -1 // no rune in s changes under the fold
 }
--- a/lib/fs/folding_test.go
+++ b/lib/fs/folding_test.go
@@ -6,45 +6,71 @@

 package fs

-import "testing"
+import (
+	"testing"
+)
+
+var caseCases = [][2]string{ // each pair is {input, expected UnicodeLowercase(input)}
+	{"", ""},
+	{"hej", "hej"},
+	{"HeJ!@#", "hej!@#"},
+	// Western Europe diacritical stuff is trivial.
+	{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
+	// As are ligatures.
+	{"Æglefinus", "æglefinus"},
+	{"Ĳssel", "ĳssel"},
+	// Cyrillic seems regular as well.
+	{"Привет", "привет"},
+	// Greek has multiple lower case characters for things depending on
+	// context; we should always choose the same one.
+	{"Ὀδυσσεύς", "ὀδυσσεύσ"},
+	{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
+	// German ß doesn't really have an upper case variant, and we
+	// shouldn't mess things up when lower casing it either. We don't
+	// attempt to make ß equivalent to "ss".
+	{"Reichwaldstraße", "reichwaldstraße"},
+	// The Turks do their thing with the Is.... Like the Greek example
+	// we pick just the one canonicalized "i" although you can argue
+	// with this... From what I understand most operating systems don't
+	// get this right anyway.
+	{"İI", "ii"},
+	// Arabic doesn't do case folding.
+	{"العَرَبِيَّة", "العَرَبِيَّة"},
+	// Neither does Hebrew.
+	{"עברית", "עברית"},
+	// Nor Chinese, in any variant.
+	{"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
+	// Nor katakana, as far as I can tell.
+	{"チャーハン", "チャーハン"},
+	// Some special Unicode characters, however, are folded by OSes.
+	{"\u212A", "k"},
+}

 func TestUnicodeLowercase(t *testing.T) {
-	cases := [][2]string{
-		{"", ""},
-		{"hej", "hej"},
-		{"HeJ!@#", "hej!@#"},
-		// Western Europe diacritical stuff is trivial
-		{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
-		// Cyrillic seems regular as well
-		{"Привет", "привет"},
-		// Greek has multiple lower case characters for things depending on
-		// context; we should always choose the right one.
-		{"Ὀδυσσεύς", "ὀδυσσεύσ"},
-		{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
-		// German ß doesn't really have an upper case variant, and we
-		// shouldn't mess things up when lower casing it either. We don't
-		// attempt to make ß equivalent to "ss".
-		{"Reichwaldstraße", "reichwaldstraße"},
-		// The Turks do their thing with the Is.... Like the Greek example
-		// we pick just the one canonicalized "i" although you can argue
-		// with this... From what I understand most operating systems don't
-		// get this right anyway.
-		{"İI", "ii"},
-		// Arabic doesn't do case folding.
-		{"العَرَبِيَّة", "العَرَبِيَّة"},
-		// Neither does Hebrew.
-		{"עברית", "עברית"},
-		// Nor Chinese, in any variant.
-		{"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
-		// Niether katakana as far as I can tell.
-		{"チャーハン", "チャーハン"},
-		// Some special unicode characters, however, are folded by OSes
-		{"\u212A", "k"},
-	}
-	for _, tc := range cases {
+	for _, tc := range caseCases { // tc[0] is the input, tc[1] the expected result
 		res := UnicodeLowercase(tc[0])
 		if res != tc[1] {
 			t.Errorf("UnicodeLowercase(%q) => %q, expected %q", tc[0], res, tc[1])
 		}
 	}
 }
+
+func BenchmarkUnicodeLowercaseMaybeChange(b *testing.B) {
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		for _, s := range caseCases {
+			UnicodeLowercase(s[0]) // s[0] is the raw test input, which may or may not need folding
+		}
+	}
+}
+
+func BenchmarkUnicodeLowercaseNoChange(b *testing.B) {
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		for _, s := range caseCases {
+			UnicodeLowercase(s[1]) // s[1] is the expected fold result, presumably already folded so the as-is path is exercised
+		}
+	}
+}