lib/fs: Optimize UnicodeLowercase (#6979)

Most notably, it now detects all-lowercase files and returns these
as-is. The tests have been expanded with two cases and are now used
as a benchmark (admittedly a rather trivial one).

name                           old time/op    new time/op    delta
UnicodeLowercaseMaybeChange-8    4.59µs ± 2%    4.57µs ± 1%    ~     (p=0.197 n=10+10)
UnicodeLowercaseNoChange-8       3.26µs ± 1%    3.09µs ± 1%  -5.27%  (p=0.000 n=9+10)
This commit is contained in:
greatroar 2020-09-11 09:16:10 +02:00 committed by GitHub
parent 08bebbe59b
commit 3e24d82513
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 90 additions and 38 deletions

View File

@ -7,13 +7,39 @@
package fs package fs
import ( import (
"strings"
"unicode" "unicode"
"unicode/utf8"
) )
// UnicodeLowercase returns s with every rune r replaced by
// unicode.ToLower(unicode.ToUpper(r)). Going through upper case first folds
// runes that have more than one lower case form (for example the Greek final
// sigma) onto a single representative.
//
// If no rune would change, s itself is returned and nothing is allocated.
// Bytes that are not valid UTF-8 are copied to the result unchanged, so the
// result does not depend on whether such a byte sits before or after the
// first rune that needs folding.
func UnicodeLowercase(s string) string {
	i := firstCaseChange(s)
	if i == -1 {
		return s
	}

	var rs strings.Builder
	// Reserve a little more than len(s): some strings.Builder versions
	// reserve utf8.UTFMax bytes before writing a non-ASCII rune, even if
	// the rune needs fewer. The folded form is usually no longer than the
	// input, so this normally avoids a reallocation; the few runes whose
	// folded form is longer simply make the builder grow.
	rs.Grow(utf8.UTFMax - 1 + len(s))
	rs.WriteString(s[:i])
	for i < len(s) {
		r, size := utf8.DecodeRuneInString(s[i:])
		if r == utf8.RuneError && size == 1 {
			// Invalid encoding: keep the raw byte, exactly like the
			// unchanged prefix and the no-change fast path do.
			rs.WriteByte(s[i])
		} else {
			rs.WriteRune(unicode.ToLower(unicode.ToUpper(r)))
		}
		i += size
	}
	return rs.String()
}

// firstCaseChange returns the byte index of the first rune r in s for which
// unicode.ToLower(unicode.ToUpper(r)) != r, or -1 if there is no such rune.
func firstCaseChange(s string) int {
	for i, r := range s {
		// Fast path: ASCII other than 'A'-'Z' is never changed.
		if r <= unicode.MaxASCII && (r < 'A' || r > 'Z') {
			continue
		}
		if unicode.ToLower(unicode.ToUpper(r)) != r {
			return i
		}
	}
	return -1
}

View File

@ -6,19 +6,23 @@
package fs package fs
import "testing" import (
"testing"
)
func TestUnicodeLowercase(t *testing.T) { var caseCases = [][2]string{
cases := [][2]string{
{"", ""}, {"", ""},
{"hej", "hej"}, {"hej", "hej"},
{"HeJ!@#", "hej!@#"}, {"HeJ!@#", "hej!@#"},
// Western Europe diacritical stuff is trivial // Western Europe diacritical stuff is trivial.
{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"}, {"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
// Cyrillic seems regular as well // As are ligatures.
{"Æglefinus", "æglefinus"},
{"IJssel", "ijssel"},
// Cyrillic seems regular as well.
{"Привет", "привет"}, {"Привет", "привет"},
// Greek has multiple lower case characters for things depending on // Greek has multiple lower case characters for things depending on
// context; we should always choose the right one. // context; we should always choose the same one.
{"Ὀδυσσεύς", "ὀδυσσεύσ"}, {"Ὀδυσσεύς", "ὀδυσσεύσ"},
{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"}, {"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
// German ß doesn't really have an upper case variant, and we // German ß doesn't really have an upper case variant, and we
@ -36,15 +40,37 @@ func TestUnicodeLowercase(t *testing.T) {
{"עברית", "עברית"}, {"עברית", "עברית"},
// Nor Chinese, in any variant. // Nor Chinese, in any variant.
{"汉语/漢語 or 中文", "汉语/漢語 or 中文"}, {"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
// Niether katakana as far as I can tell. // Nor katakana, as far as I can tell.
{"チャーハン", "チャーハン"}, {"チャーハン", "チャーハン"},
// Some special unicode characters, however, are folded by OSes // Some special Unicode characters, however, are folded by OSes.
{"\u212A", "k"}, {"\u212A", "k"},
} }
for _, tc := range cases {
func TestUnicodeLowercase(t *testing.T) {
for _, tc := range caseCases {
res := UnicodeLowercase(tc[0]) res := UnicodeLowercase(tc[0])
if res != tc[1] { if res != tc[1] {
t.Errorf("UnicodeLowercase(%q) => %q, expected %q", tc[0], res, tc[1]) t.Errorf("UnicodeLowercase(%q) => %q, expected %q", tc[0], res, tc[1])
} }
} }
} }
// lowercaseMaybeChangeSink receives the last result computed by
// BenchmarkUnicodeLowercaseMaybeChange so that the measured calls have an
// observable effect and cannot be discarded by the compiler.
var lowercaseMaybeChangeSink string

// BenchmarkUnicodeLowercaseMaybeChange measures UnicodeLowercase on the first
// string of every caseCases entry, i.e. the inputs that TestUnicodeLowercase
// feeds to the function; these may or may not need folding.
func BenchmarkUnicodeLowercaseMaybeChange(b *testing.B) {
	b.ReportAllocs()
	var res string
	for i := 0; i < b.N; i++ {
		for _, tc := range caseCases {
			res = UnicodeLowercase(tc[0])
		}
	}
	lowercaseMaybeChangeSink = res
}
// lowercaseNoChangeSink receives the last result computed by
// BenchmarkUnicodeLowercaseNoChange so that the measured calls have an
// observable effect and cannot be discarded by the compiler.
var lowercaseNoChangeSink string

// BenchmarkUnicodeLowercaseNoChange measures UnicodeLowercase on the second
// string of every caseCases entry, i.e. the expected outputs of
// TestUnicodeLowercase. These are presumably already folded, so this is
// meant to exercise the path that returns the input unchanged.
func BenchmarkUnicodeLowercaseNoChange(b *testing.B) {
	b.ReportAllocs()
	var res string
	for i := 0; i < b.N; i++ {
		for _, tc := range caseCases {
			res = UnicodeLowercase(tc[1])
		}
	}
	lowercaseNoChangeSink = res
}