mirror of
https://github.com/octoleo/syncthing.git
synced 2025-01-22 14:48:30 +00:00
lib/fs: Optimize UnicodeLowercase (#6979)
Most notably, it now detects all-lowercase files and returns these as-is. The tests have been expanded with two cases and are now used as a benchmark (admittedly a rather trivial one).

name                           old time/op  new time/op  delta
UnicodeLowercaseMaybeChange-8  4.59µs ± 2%  4.57µs ± 1%    ~     (p=0.197 n=10+10)
UnicodeLowercaseNoChange-8     3.26µs ± 1%  3.09µs ± 1%  -5.27%  (p=0.000 n=9+10)
This commit is contained in:
parent
08bebbe59b
commit
3e24d82513
@ -7,13 +7,39 @@
|
||||
package fs
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// UnicodeLowercase returns s with every rune r replaced by
// unicode.ToLower(unicode.ToUpper(r)). Going through upper case first
// maps runes that share an upper case form (for example the Greek final
// sigma and the regular sigma) onto one and the same lower case rune.
//
// A string that is already in this form is returned as-is, without
// allocating. Bytes that are not valid UTF-8 are never case mapped and are
// copied to the result verbatim, so two strings that differ only in case
// yield the same result whether or not they contain invalid bytes.
func UnicodeLowercase(s string) string {
	i := firstCaseChange(s)
	if i == -1 {
		return s
	}

	var rs strings.Builder
	// WriteRune may want utf8.UTFMax bytes of spare capacity for a
	// non-ASCII rune even when the rune encodes shorter, so reserve a
	// little more than len(s). This avoids a reallocation in the common
	// case; the few runes whose lower case form is longer than the
	// original (e.g. U+023A -> U+2C65) simply make the builder grow.
	rs.Grow(utf8.UTFMax - 1 + len(s))
	// Everything before the first change is already in its final form.
	rs.WriteString(s[:i])

	for i < len(s) {
		r, size := utf8.DecodeRuneInString(s[i:])
		if r == utf8.RuneError && size == 1 {
			// Invalid UTF-8: keep the raw byte. Writing the decoded
			// rune instead would replace it with U+FFFD, but only in
			// strings that also happen to need a case change.
			rs.WriteByte(s[i])
		} else {
			rs.WriteRune(unicode.ToLower(unicode.ToUpper(r)))
		}
		i += size
	}
	return rs.String()
}

// firstCaseChange returns the byte index of the first rune r in s such
// that unicode.ToLower(unicode.ToUpper(r)) != r, or -1 if there is no such
// rune. Invalid UTF-8 bytes decode to utf8.RuneError, which maps to itself,
// so they never count as a change.
func firstCaseChange(s string) int {
	for i, r := range s {
		// Fast path: the only ASCII runes that change are 'A'-'Z'.
		if r <= unicode.MaxASCII && (r < 'A' || r > 'Z') {
			continue
		}
		if unicode.ToLower(unicode.ToUpper(r)) != r {
			return i
		}
	}
	return -1
}
|
||||
|
@ -6,45 +6,71 @@
|
||||
|
||||
package fs
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// caseCases holds {input, expected} pairs for UnicodeLowercase. The test
// checks that the first column lowercases to the second; the benchmarks
// feed the first column (may change) or the second column (no change) to
// UnicodeLowercase.
var caseCases = [][2]string{
	{"", ""},
	{"hej", "hej"},
	{"HeJ!@#", "hej!@#"},
	// Western Europe diacritical stuff is trivial.
	{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
	// As are ligatures.
	{"Æglefinus", "æglefinus"},
	{"IJssel", "ijssel"},
	// Cyrillic seems regular as well.
	{"Привет", "привет"},
	// Greek has multiple lower case characters for things depending on
	// context; we should always choose the same one.
	{"Ὀδυσσεύς", "ὀδυσσεύσ"},
	{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
	// German ß doesn't really have an upper case variant, and we
	// shouldn't mess things up when lower casing it either. We don't
	// attempt to make ß equivalent to "ss".
	{"Reichwaldstraße", "reichwaldstraße"},
	// The Turks do their thing with the Is.... Like the Greek example
	// we pick just the one canonicalized "i" although you can argue
	// with this... From what I understand most operating systems don't
	// get this right anyway.
	{"İI", "ii"},
	// Arabic doesn't do case folding.
	{"العَرَبِيَّة", "العَرَبِيَّة"},
	// Neither does Hebrew.
	{"עברית", "עברית"},
	// Nor Chinese, in any variant.
	{"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
	// Nor katakana, as far as I can tell.
	{"チャーハン", "チャーハン"},
	// Some special Unicode characters, however, are folded by OSes.
	{"\u212A", "k"},
	// Lower casing can change the encoded length: U+023A takes two bytes
	// in UTF-8, its lower case form U+2C65 takes three.
	{"\u023A", "\u2C65"},
}
|
||||
|
||||
func TestUnicodeLowercase(t *testing.T) {
|
||||
cases := [][2]string{
|
||||
{"", ""},
|
||||
{"hej", "hej"},
|
||||
{"HeJ!@#", "hej!@#"},
|
||||
// Western Europe diacritical stuff is trivial
|
||||
{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
|
||||
// Cyrillic seems regular as well
|
||||
{"Привет", "привет"},
|
||||
// Greek has multiple lower case characters for things depending on
|
||||
// context; we should always choose the right one.
|
||||
{"Ὀδυσσεύς", "ὀδυσσεύσ"},
|
||||
{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
|
||||
// German ß doesn't really have an upper case variant, and we
|
||||
// shouldn't mess things up when lower casing it either. We don't
|
||||
// attempt to make ß equivalent to "ss".
|
||||
{"Reichwaldstraße", "reichwaldstraße"},
|
||||
// The Turks do their thing with the Is.... Like the Greek example
|
||||
// we pick just the one canonicalized "i" although you can argue
|
||||
// with this... From what I understand most operating systems don't
|
||||
// get this right anyway.
|
||||
{"İI", "ii"},
|
||||
// Arabic doesn't do case folding.
|
||||
{"العَرَبِيَّة", "العَرَبِيَّة"},
|
||||
// Neither does Hebrew.
|
||||
{"עברית", "עברית"},
|
||||
// Nor Chinese, in any variant.
|
||||
{"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
|
||||
// Niether katakana as far as I can tell.
|
||||
{"チャーハン", "チャーハン"},
|
||||
// Some special unicode characters, however, are folded by OSes
|
||||
{"\u212A", "k"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
for _, tc := range caseCases {
|
||||
res := UnicodeLowercase(tc[0])
|
||||
if res != tc[1] {
|
||||
t.Errorf("UnicodeLowercase(%q) => %q, expected %q", tc[0], res, tc[1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkUnicodeLowercaseMaybeChange(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, s := range caseCases {
|
||||
UnicodeLowercase(s[0])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkUnicodeLowercaseNoChange(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, s := range caseCases {
|
||||
UnicodeLowercase(s[1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user