diff --git a/lib/fs/folding.go b/lib/fs/folding.go index e220855af..9f95f3d36 100644 --- a/lib/fs/folding.go +++ b/lib/fs/folding.go @@ -7,13 +7,39 @@ package fs import ( + "strings" "unicode" + "unicode/utf8" ) func UnicodeLowercase(s string) string { - rs := []rune(s) - for i, r := range rs { - rs[i] = unicode.ToLower(unicode.ToUpper(r)) + i := firstCaseChange(s) + if i == -1 { + return s } - return string(rs) + + var rs strings.Builder + // WriteRune always reserves utf8.UTFMax bytes for non-ASCII runes, + // even if it doesn't need all that space. Overallocate now to prevent + // it from ever triggering a reallocation. + rs.Grow(utf8.UTFMax - 1 + len(s)) + rs.WriteString(s[:i]) + + for _, r := range s[i:] { + rs.WriteRune(unicode.ToLower(unicode.ToUpper(r))) + } + return rs.String() +} + +// Byte index of the first rune r s.t. lower(upper(r)) != r. +func firstCaseChange(s string) int { + for i, r := range s { + if r <= unicode.MaxASCII && (r < 'A' || r > 'Z') { + continue + } + if unicode.ToLower(unicode.ToUpper(r)) != r { + return i + } + } + return -1 } diff --git a/lib/fs/folding_test.go b/lib/fs/folding_test.go index f8f838207..d00fef26c 100644 --- a/lib/fs/folding_test.go +++ b/lib/fs/folding_test.go @@ -6,45 +6,71 @@ package fs -import "testing" +import ( + "testing" +) + +var caseCases = [][2]string{ + {"", ""}, + {"hej", "hej"}, + {"HeJ!@#", "hej!@#"}, + // Western Europe diacritical stuff is trivial. + {"ÜBERRÄKSMÖRGÅS", "überräksmörgås"}, + // As are ligatures. + {"Æglefinus", "æglefinus"}, + {"IJssel", "ijssel"}, + // Cyrillic seems regular as well. + {"Привет", "привет"}, + // Greek has multiple lower case characters for things depending on + // context; we should always choose the same one. + {"Ὀδυσσεύς", "ὀδυσσεύσ"}, + {"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"}, + // German ß doesn't really have an upper case variant, and we + // shouldn't mess things up when lower casing it either. We don't + // attempt to make ß equivalent to "ss". + {"Reichwaldstraße", "reichwaldstraße"}, + // The Turks do their thing with the Is.... Like the Greek example + // we pick just the one canonicalized "i" although you can argue + // with this... From what I understand most operating systems don't + // get this right anyway. + {"İI", "ii"}, + // Arabic doesn't do case folding. + {"العَرَبِيَّة", "العَرَبِيَّة"}, + // Neither does Hebrew. + {"עברית", "עברית"}, + // Nor Chinese, in any variant. + {"汉语/漢語 or 中文", "汉语/漢語 or 中文"}, + // Nor katakana, as far as I can tell. + {"チャーハン", "チャーハン"}, + // Some special Unicode characters, however, are folded by OSes. + {"\u212A", "k"}, +} func TestUnicodeLowercase(t *testing.T) { - cases := [][2]string{ - {"", ""}, - {"hej", "hej"}, - {"HeJ!@#", "hej!@#"}, - // Western Europe diacritical stuff is trivial - {"ÜBERRÄKSMÖRGÅS", "überräksmörgås"}, - // Cyrillic seems regular as well - {"Привет", "привет"}, - // Greek has multiple lower case characters for things depending on - // context; we should always choose the right one. - {"Ὀδυσσεύς", "ὀδυσσεύσ"}, - {"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"}, - // German ß doesn't really have an upper case variant, and we - // shouldn't mess things up when lower casing it either. We don't - // attempt to make ß equivalent to "ss". - {"Reichwaldstraße", "reichwaldstraße"}, - // The Turks do their thing with the Is.... Like the Greek example - // we pick just the one canonicalized "i" although you can argue - // with this... From what I understand most operating systems don't - // get this right anyway. - {"İI", "ii"}, - // Arabic doesn't do case folding. - {"العَرَبِيَّة", "العَرَبِيَّة"}, - // Neither does Hebrew. - {"עברית", "עברית"}, - // Nor Chinese, in any variant. - {"汉语/漢語 or 中文", "汉语/漢語 or 中文"}, - // Niether katakana as far as I can tell. - {"チャーハン", "チャーハン"}, - // Some special unicode characters, however, are folded by OSes - {"\u212A", "k"}, - } - for _, tc := range cases { + for _, tc := range caseCases { res := UnicodeLowercase(tc[0]) if res != tc[1] { t.Errorf("UnicodeLowercase(%q) => %q, expected %q", tc[0], res, tc[1]) } } } + +func BenchmarkUnicodeLowercaseMaybeChange(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + for _, s := range caseCases { + UnicodeLowercase(s[0]) + } + } +} + +func BenchmarkUnicodeLowercaseNoChange(b *testing.B) { + b.ReportAllocs() + + for i := 0; i < b.N; i++ { + for _, s := range caseCases { + UnicodeLowercase(s[1]) + } + } +}