Speed up ANSI code processing (#2368)

This commit speeds up the parsing/processing of ANSI escape codes by
roughly 7.5x. The speedup is mostly accomplished by replacing the regex
with dedicated parsing logic (nextAnsiEscapeSequence()) and reducing the
number of allocations in extractColor().

#### Benchmarks
```
name             old time/op    new time/op     delta
ExtractColor-16    4.89µs ± 5%     0.64µs ± 2%   -86.87%  (p=0.000 n=9+9)

name             old speed      new speed       delta
ExtractColor-16  25.6MB/s ± 5%  194.6MB/s ± 2%  +661.43%  (p=0.000 n=9+9)

name             old alloc/op   new alloc/op    delta
ExtractColor-16    1.37kB ± 0%     0.31kB ± 0%   -77.31%  (p=0.000 n=10+10)

name             old allocs/op  new allocs/op   delta
ExtractColor-16      48.0 ± 0%        4.0 ± 0%   -91.67%  (p=0.000 n=10+10)
```
This commit is contained in:
Charlie Vieth 2021-03-11 05:34:50 -05:00 committed by GitHub
parent f4e1ed25f2
commit 5a874ae241
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 431 additions and 76 deletions

View File

@ -73,6 +73,9 @@ test: $(SOURCES)
github.com/junegunn/fzf/src/tui \
github.com/junegunn/fzf/src/util
bench:
cd src && SHELL=/bin/sh GOOS= $(GO) test -v -tags "$(TAGS)" -run=Bench -bench=. -benchmem
install: bin/fzf
build:
@ -153,4 +156,4 @@ update:
$(GO) get -u
$(GO) mod tidy
.PHONY: all build release test install clean docker docker-test update
.PHONY: all build release test bench install clean docker docker-test update

View File

@ -1,8 +1,6 @@
package fzf
import (
"bytes"
"regexp"
"strconv"
"strings"
"unicode/utf8"
@ -82,73 +80,154 @@ func toAnsiString(color tui.Color, offset int) string {
return ret + ";"
}
var ansiRegex *regexp.Regexp
func init() {
/*
References:
- https://github.com/gnachman/iTerm2
- http://ascii-table.com/ansi-escape-sequences.php
- http://ascii-table.com/ansi-escape-sequences-vt-100.php
- http://tldp.org/HOWTO/Bash-Prompt-HOWTO/x405.html
- https://invisible-island.net/xterm/ctlseqs/ctlseqs.html
*/
// The following regular expression will include not all but most of the
// frequently used ANSI sequences
ansiRegex = regexp.MustCompile("(?:\x1b[\\[()][0-9;]*[a-zA-Z@]|\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)|\x1b.|[\x0e\x0f]|.\x08)")
func isPrint(c uint8) bool {
return '\x20' <= c && c <= '\x7e'
}
func findAnsiStart(str string) int {
idx := 0
for ; idx < len(str); idx++ {
b := str[idx]
if b == 0x1b || b == 0x0e || b == 0x0f {
return idx
func matchOperatingSystemCommand(s string) int {
// `\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)`
// ^ match starting here
//
i := 5 // prefix matched in nextAnsiEscapeSequence()
for ; i < len(s) && isPrint(s[i]); i++ {
}
if i < len(s) {
if s[i] == '\x07' {
return i + 1
}
if b == 0x08 && idx > 0 {
return idx - 1
if s[i] == '\x1b' && i < len(s)-1 && s[i+1] == '\\' {
return i + 2
}
}
return idx
return -1
}
func matchControlSequence(s string) int {
// `\x1b[\\[()][0-9;]*[a-zA-Z@]`
// ^ match starting here
//
i := 2 // prefix matched in nextAnsiEscapeSequence()
for ; i < len(s) && (isNumeric(s[i]) || s[i] == ';'); i++ {
}
if i < len(s) {
c := s[i]
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '@' {
return i + 1
}
}
return -1
}
func isCtrlSeqStart(c uint8) bool {
return c == '\\' || c == '[' || c == '(' || c == ')'
}
// nextAnsiEscapeSequence returns the ANSI escape sequence and is equivalent to
// calling FindStringIndex() on the below regex (which was originally used):
//
// "(?:\x1b[\\[()][0-9;]*[a-zA-Z@]|\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)|\x1b.|[\x0e\x0f]|.\x08)"
//
func nextAnsiEscapeSequence(s string) (int, int) {
// fast check for ANSI escape sequences
i := 0
for ; i < len(s); i++ {
switch s[i] {
case '\x0e', '\x0f', '\x1b', '\x08':
// We ignore the fact that '\x08' cannot be the first char
// in the string and be an escape sequence for the sake of
// speed and simplicity.
goto Loop
}
}
return -1, -1
Loop:
for ; i < len(s); i++ {
switch s[i] {
case '\x08':
// backtrack to match: `.\x08`
if i > 0 && s[i-1] != '\n' {
if s[i-1] < utf8.RuneSelf {
return i - 1, i + 1
}
_, n := utf8.DecodeLastRuneInString(s[:i])
return i - n, i + 1
}
case '\x1b':
// match: `\x1b[\\[()][0-9;]*[a-zA-Z@]`
if i+2 < len(s) && isCtrlSeqStart(s[i+1]) {
if j := matchControlSequence(s[i:]); j != -1 {
return i, i + j
}
}
// match: `\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)`
if i+5 < len(s) && s[i+1] == ']' && isNumeric(s[i+2]) &&
s[i+3] == ';' && isPrint(s[i+4]) {
if j := matchOperatingSystemCommand(s[i:]); j != -1 {
return i, i + j
}
}
// match: `\x1b.`
if i+1 < len(s) && s[i+1] != '\n' {
if s[i+1] < utf8.RuneSelf {
return i, i + 2
}
_, n := utf8.DecodeRuneInString(s[i+1:])
return i, i + n + 1
}
case '\x0e', '\x0f':
// match: `[\x0e\x0f]`
return i, i + 1
}
}
return -1, -1
}
func extractColor(str string, state *ansiState, proc func(string, *ansiState) bool) (string, *[]ansiOffset, *ansiState) {
var offsets []ansiOffset
var output bytes.Buffer
// We append to a stack allocated variable that we'll
// later copy and return, to save on allocations.
offsets := make([]ansiOffset, 0, 32)
if state != nil {
offsets = append(offsets, ansiOffset{[2]int32{0, 0}, *state})
}
prevIdx := 0
runeCount := 0
var (
pstate *ansiState // lazily allocated
output strings.Builder
prevIdx int
runeCount int
)
for idx := 0; idx < len(str); {
idx += findAnsiStart(str[idx:])
if idx == len(str) {
// Make sure that we found an ANSI code
start, end := nextAnsiEscapeSequence(str[idx:])
if start == -1 {
break
}
// Make sure that we found an ANSI code
offset := ansiRegex.FindStringIndex(str[idx:])
if len(offset) < 2 {
idx++
continue
}
offset[0] += idx
offset[1] += idx
idx = offset[1]
start += idx
idx += end
// Check if we should continue
prev := str[prevIdx:offset[0]]
prev := str[prevIdx:start]
if proc != nil && !proc(prev, state) {
return "", nil, nil
}
prevIdx = idx
prevIdx = offset[1]
runeCount += utf8.RuneCountInString(prev)
output.WriteString(prev)
if len(prev) != 0 {
runeCount += utf8.RuneCountInString(prev)
// Grow the buffer size to the maximum possible length (string length
// containing ansi codes) to avoid repetitive allocation
if output.Cap() == 0 {
output.Grow(len(str))
}
output.WriteString(prev)
}
newState := interpretCode(str[offset[0]:offset[1]], state)
newState := interpretCode(str[start:idx], state)
if !newState.equals(state) {
if state != nil {
// Update last offset
@ -157,8 +236,15 @@ func extractColor(str string, state *ansiState, proc func(string, *ansiState) bo
if newState.colored() {
// Append new offset
state = newState
offsets = append(offsets, ansiOffset{[2]int32{int32(runeCount), int32(runeCount)}, *state})
if pstate == nil {
pstate = &ansiState{}
}
*pstate = newState
state = pstate
offsets = append(offsets, ansiOffset{
[2]int32{int32(runeCount), int32(runeCount)},
newState,
})
} else {
// Discard state
state = nil
@ -168,7 +254,6 @@ func extractColor(str string, state *ansiState, proc func(string, *ansiState) bo
var rest string
var trimmed string
if prevIdx == 0 {
// No ANSI code found
rest = str
@ -178,51 +263,75 @@ func extractColor(str string, state *ansiState, proc func(string, *ansiState) bo
output.WriteString(rest)
trimmed = output.String()
}
if len(rest) > 0 && state != nil {
// Update last offset
runeCount += utf8.RuneCountInString(rest)
(&offsets[len(offsets)-1]).offset[1] = int32(runeCount)
}
if proc != nil {
proc(rest, state)
}
if len(offsets) == 0 {
return trimmed, nil, state
if len(offsets) > 0 {
if len(rest) > 0 && state != nil {
// Update last offset
runeCount += utf8.RuneCountInString(rest)
(&offsets[len(offsets)-1]).offset[1] = int32(runeCount)
}
// Return a copy of the offsets slice
a := make([]ansiOffset, len(offsets))
copy(a, offsets)
return trimmed, &a, state
}
return trimmed, &offsets, state
return trimmed, nil, state
}
func interpretCode(ansiCode string, prevState *ansiState) *ansiState {
// State
var state *ansiState
func parseAnsiCode(s string) (int, string) {
var remaining string
if i := strings.IndexByte(s, ';'); i >= 0 {
remaining = s[i+1:]
s = s[:i]
}
if len(s) > 0 {
// Inlined version of strconv.Atoi() that only handles positive
// integers and does not allocate on error.
code := 0
for _, ch := range []byte(s) {
ch -= '0'
if ch > 9 {
return -1, remaining
}
code = code*10 + int(ch)
}
return code, remaining
}
return -1, remaining
}
func interpretCode(ansiCode string, prevState *ansiState) ansiState {
var state ansiState
if prevState == nil {
state = &ansiState{-1, -1, 0, -1}
state = ansiState{-1, -1, 0, -1}
} else {
state = &ansiState{prevState.fg, prevState.bg, prevState.attr, prevState.lbg}
state = ansiState{prevState.fg, prevState.bg, prevState.attr, prevState.lbg}
}
if ansiCode[0] != '\x1b' || ansiCode[1] != '[' || ansiCode[len(ansiCode)-1] != 'm' {
if strings.HasSuffix(ansiCode, "0K") && prevState != nil {
if prevState != nil && strings.HasSuffix(ansiCode, "0K") {
state.lbg = prevState.bg
}
return state
}
ptr := &state.fg
state256 := 0
init := func() {
if len(ansiCode) <= 3 {
state.fg = -1
state.bg = -1
state.attr = 0
state256 = 0
return state
}
ansiCode = ansiCode[2 : len(ansiCode)-1]
if len(ansiCode) == 0 {
init()
}
for _, code := range strings.Split(ansiCode, ";") {
if num, err := strconv.Atoi(code); err == nil {
state256 := 0
ptr := &state.fg
for len(ansiCode) != 0 {
var num int
if num, ansiCode = parseAnsiCode(ansiCode); num != -1 {
switch state256 {
case 0:
switch num {
@ -253,7 +362,10 @@ func interpretCode(ansiCode string, prevState *ansiState) *ansiState {
case 24: // tput rmul
state.attr = state.attr &^ tui.Underline
case 0:
init()
state.fg = -1
state.bg = -1
state.attr = 0
state256 = 0
default:
if num >= 30 && num <= 37 {
state.fg = tui.Color(num - 30)
@ -289,6 +401,7 @@ func interpretCode(ansiCode string, prevState *ansiState) *ansiState {
}
}
}
if state256 > 0 {
*ptr = -1
}

View File

@ -2,12 +2,190 @@ package fzf
import (
"fmt"
"math/rand"
"regexp"
"strings"
"testing"
"unicode/utf8"
"github.com/junegunn/fzf/src/tui"
)
// The following regular expression will include not all but most of the
// frequently used ANSI sequences. This regex is used as a reference for
// testing nextAnsiEscapeSequence().
//
// References:
// - https://github.com/gnachman/iTerm2
// - http://ascii-table.com/ansi-escape-sequences.php
// - http://ascii-table.com/ansi-escape-sequences-vt-100.php
// - http://tldp.org/HOWTO/Bash-Prompt-HOWTO/x405.html
// - https://invisible-island.net/xterm/ctlseqs/ctlseqs.html
var ansiRegexRefence = regexp.MustCompile("(?:\x1b[\\[()][0-9;]*[a-zA-Z@]|\x1b][0-9];[[:print:]]+(?:\x1b\\\\|\x07)|\x1b.|[\x0e\x0f]|.\x08)")
func testParserReference(t testing.TB, str string) {
t.Helper()
toSlice := func(start, end int) []int {
if start == -1 {
return nil
}
return []int{start, end}
}
s := str
for i := 0; ; i++ {
got := toSlice(nextAnsiEscapeSequence(s))
exp := ansiRegexRefence.FindStringIndex(s)
equal := len(got) == len(exp)
if equal {
for i := 0; i < len(got); i++ {
if got[i] != exp[i] {
equal = false
break
}
}
}
if !equal {
var exps, gots []rune
if len(got) == 2 {
gots = []rune(s[got[0]:got[1]])
}
if len(exp) == 2 {
exps = []rune(s[exp[0]:exp[1]])
}
t.Errorf("%d: %q: got: %v (%q) want: %v (%q)", i, s, got, gots, exp, exps)
return
}
if len(exp) == 0 {
return
}
s = s[exp[1]:]
}
}
func TestNextAnsiEscapeSequence(t *testing.T) {
testStrs := []string{
"\x1b[0mhello world",
"\x1b[1mhello world",
"椙\x1b[1m椙",
"椙\x1b[1椙m椙",
"\x1b[1mhello \x1b[mw\x1b7o\x1b8r\x1b(Bl\x1b[2@d",
"\x1b[1mhello \x1b[Kworld",
"hello \x1b[34;45;1mworld",
"hello \x1b[34;45;1mwor\x1b[34;45;1mld",
"hello \x1b[34;45;1mwor\x1b[0mld",
"hello \x1b[34;48;5;233;1mwo\x1b[38;5;161mr\x1b[0ml\x1b[38;5;161md",
"hello \x1b[38;5;38;48;5;48;1mwor\x1b[38;5;48;48;5;38ml\x1b[0md",
"hello \x1b[32;1mworld",
"hello world",
"hello \x1b[0;38;5;200;48;5;100mworld",
"\x1b椙",
"椙\x08",
"\n\x08",
"X\x08",
"",
"\x1b]4;3;rgb:aa/bb/cc\x07 ",
"\x1b]4;3;rgb:aa/bb/cc\x1b\\ ",
ansiBenchmarkString,
}
for _, s := range testStrs {
testParserReference(t, s)
}
}
func TestNextAnsiEscapeSequence_Fuzz_Modified(t *testing.T) {
t.Parallel()
if testing.Short() {
t.Skip("short test")
}
testStrs := []string{
"\x1b[0mhello world",
"\x1b[1mhello world",
"椙\x1b[1m椙",
"椙\x1b[1椙m椙",
"\x1b[1mhello \x1b[mw\x1b7o\x1b8r\x1b(Bl\x1b[2@d",
"\x1b[1mhello \x1b[Kworld",
"hello \x1b[34;45;1mworld",
"hello \x1b[34;45;1mwor\x1b[34;45;1mld",
"hello \x1b[34;45;1mwor\x1b[0mld",
"hello \x1b[34;48;5;233;1mwo\x1b[38;5;161mr\x1b[0ml\x1b[38;5;161md",
"hello \x1b[38;5;38;48;5;48;1mwor\x1b[38;5;48;48;5;38ml\x1b[0md",
"hello \x1b[32;1mworld",
"hello world",
"hello \x1b[0;38;5;200;48;5;100mworld",
ansiBenchmarkString,
}
replacementBytes := [...]rune{'\x0e', '\x0f', '\x1b', '\x08'}
modifyString := func(s string, rr *rand.Rand) string {
n := rr.Intn(len(s))
b := []rune(s)
for ; n >= 0 && len(b) != 0; n-- {
i := rr.Intn(len(b))
switch x := rr.Intn(4); x {
case 0:
b = append(b[:i], b[i+1:]...)
case 1:
j := rr.Intn(len(replacementBytes) - 1)
b[i] = replacementBytes[j]
case 2:
x := rune(rr.Intn(utf8.MaxRune))
for !utf8.ValidRune(x) {
x = rune(rr.Intn(utf8.MaxRune))
}
b[i] = x
case 3:
b[i] = rune(rr.Intn(utf8.MaxRune)) // potentially invalid
default:
t.Fatalf("unsupported value: %d", x)
}
}
return string(b)
}
rr := rand.New(rand.NewSource(1))
for _, s := range testStrs {
for i := 1_000; i >= 0; i-- {
testParserReference(t, modifyString(s, rr))
}
}
}
func TestNextAnsiEscapeSequence_Fuzz_Random(t *testing.T) {
t.Parallel()
if testing.Short() {
t.Skip("short test")
}
randomString := func(rr *rand.Rand) string {
numChars := rand.Intn(50)
codePoints := make([]rune, numChars)
for i := 0; i < len(codePoints); i++ {
var r rune
for n := 0; n < 1000; n++ {
r = rune(rr.Intn(utf8.MaxRune))
// Allow 10% of runes to be invalid
if utf8.ValidRune(r) || rr.Float64() < 0.10 {
break
}
}
codePoints[i] = r
}
return string(codePoints)
}
rr := rand.New(rand.NewSource(1))
for i := 0; i < 100_000; i++ {
testParserReference(t, randomString(rr))
}
}
func TestExtractColor(t *testing.T) {
assert := func(offset ansiOffset, b int32, e int32, fg tui.Color, bg tui.Color, bold bool) {
var attr tui.Attr
@ -185,3 +363,64 @@ func TestAnsiCodeStringConversion(t *testing.T) {
&ansiState{attr: tui.Dim | tui.Italic, fg: 1, bg: 1},
"\x1b[2;3;7;38;2;10;20;30;48;5;100m")
}
func TestParseAnsiCode(t *testing.T) {
tests := []struct {
In, Exp string
N int
}{
{"123", "", 123},
{"1a", "", -1},
{"1a;12", "12", -1},
{"12;a", "a", 12},
{"-2", "", -1},
}
for _, x := range tests {
n, s := parseAnsiCode(x.In)
if n != x.N || s != x.Exp {
t.Fatalf("%q: got: (%d %q) want: (%d %q)", x.In, n, s, x.N, x.Exp)
}
}
}
// kernel/bpf/preload/iterators/README
const ansiBenchmarkString = "\x1b[38;5;81m\x1b[01;31m\x1b[Kkernel/\x1b[0m\x1b[38;5;81mbpf/" +
"\x1b[0m\x1b[38;5;81mpreload/\x1b[0m\x1b[38;5;81miterators/" +
"\x1b[0m\x1b[38;5;149mMakefile\x1b[m\x1b[K\x1b[0m"
func BenchmarkNextAnsiEscapeSequence(b *testing.B) {
b.SetBytes(int64(len(ansiBenchmarkString)))
for i := 0; i < b.N; i++ {
s := ansiBenchmarkString
for {
_, o := nextAnsiEscapeSequence(s)
if o == -1 {
break
}
s = s[o:]
}
}
}
// Baseline test to compare the speed of nextAnsiEscapeSequence() to the
// previously used regex based implementation.
func BenchmarkNextAnsiEscapeSequence_Regex(b *testing.B) {
b.SetBytes(int64(len(ansiBenchmarkString)))
for i := 0; i < b.N; i++ {
s := ansiBenchmarkString
for {
a := ansiRegexRefence.FindStringIndex(s)
if len(a) == 0 {
break
}
s = s[a[1]:]
}
}
}
func BenchmarkExtractColor(b *testing.B) {
b.SetBytes(int64(len(ansiBenchmarkString)))
for i := 0; i < b.N; i++ {
extractColor(ansiBenchmarkString, nil, nil)
}
}