From cddf3875cf90a085e974bf68a59ae36ea91c2437 Mon Sep 17 00:00:00 2001 From: Kevin Wan Date: Tue, 14 Feb 2023 10:18:02 +0800 Subject: [PATCH] refactor: simplify stringx.Replacer, and avoid potential infinite loops (#2877) * simplify replace * backup * refactor: simplify stringx.Replacer * chore: add comments and const * chore: add more tests * chore: rename variable --- core/stringx/node.go | 98 -------------- core/stringx/node_test.go | 234 ---------------------------------- core/stringx/replacer.go | 68 ++++++---- core/stringx/replacer_test.go | 10 ++ 4 files changed, 56 insertions(+), 354 deletions(-) diff --git a/core/stringx/node.go b/core/stringx/node.go index ff739185..c6fdb9a8 100644 --- a/core/stringx/node.go +++ b/core/stringx/node.go @@ -96,101 +96,3 @@ func (n *node) find(chars []rune) []scope { return scopes } - -func (n *node) longestMatch(chars []rune, paths []*node) (uselessLen, matchLen int, nextPaths []*node) { - cur := n - var longestMatched *node - - for i := len(paths); i < len(chars); i++ { - char := chars[i] - child, ok := cur.children[char] - if ok { - cur = child - if cur.end { - longestMatched = cur - } - paths = append(paths, cur) - } else { - if longestMatched != nil { - return 0, longestMatched.depth, nil - } - - if n.end { - return 0, n.depth, nil - } - - // new path match - var jump *node - // old path pre longest preMatch - preMatch, preStart := findMatch(paths) - icur := cur - for icur.fail != nil { - jump, ok = icur.fail.children[char] - if ok { - break - } - - icur = icur.fail - } - - switch { - case preMatch != nil && jump != nil: - if jumpStart := i - jump.depth + 1; preStart < jumpStart { - return preStart, preMatch.depth, nil - } else { - return jumpStart, 0, append(paths[jumpStart:], jump) - } - case preMatch != nil && jump == nil: - return preStart, preMatch.depth, nil - case preMatch == nil && jump != nil: - return i - jump.depth + 1, 0, append(paths[i-jump.depth+1:], jump) - case preMatch == nil && jump == nil: - return i + 1, 0, nil - } - } - } - - // this longest matched node - if longestMatched != nil { - return 0, longestMatched.depth, nil - } - - if n.end { - return 0, n.depth, nil - } - - match, start := findMatch(paths) - if match != nil { - return start, match.depth, nil - } - - return len(chars), 0, nil -} - -func findMatch(path []*node) (*node, int) { - var result *node - var start int - - for i := len(path) - 1; i >= 0; i-- { - icur := path[i] - var cur *node - for icur.fail != nil { - if icur.fail.end { - cur = icur.fail - break - } - icur = icur.fail - } - if cur != nil { - if result == nil { - result = cur - start = i - result.depth + 1 - } else if curStart := i - cur.depth + 1; curStart < start { - result = cur - start = curStart - } - } - } - - return result, start -} diff --git a/core/stringx/node_test.go b/core/stringx/node_test.go index d4895d5f..f9b8d1d0 100644 --- a/core/stringx/node_test.go +++ b/core/stringx/node_test.go @@ -6,15 +6,6 @@ import ( "github.com/stretchr/testify/assert" ) -func TestLongestMatchGuardedCondition(t *testing.T) { - n := new(node) - n.end = true - uselessLen, matchLen, jump := n.longestMatch([]rune(""), nil) - assert.Equal(t, 0, uselessLen) - assert.Nil(t, jump) - assert.Equal(t, 0, matchLen) -} - func TestFuzzNodeCase1(t *testing.T) { keywords := []string{ "cs8Zh", @@ -202,228 +193,3 @@ func BenchmarkNodeFind(b *testing.B) { trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演")) } } - -func TestNode_longestMatchCase0(t *testing.T) { - // match the longest word - keywords := []string{ - "a", - "ab", - "abc", - "abcd", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - uselessLen, matchLen, jump := trie.longestMatch([]rune("abcef"), nil) - assert.Equal(t, 0, uselessLen) - assert.Equal(t, 3, matchLen) - assert.Nil(t, jump) -} - -func TestNode_longestMatchCase1(t *testing.T) { - keywords := []string{ - "abcde", - "bcde", - "cde", - "de", - - "b", - "bc", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil) - assert.Equal(t, 1, uselessLen) - assert.Equal(t, 2, matchLen) - assert.Nil(t, jump) -} - -func TestNode_longestMatchCase2(t *testing.T) { - keywords := []string{ - "abcde", - "bcde", - "cde", - "de", - - "c", - "cd", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil) - assert.Equal(t, 2, uselessLen) - assert.Equal(t, 2, matchLen) - assert.Nil(t, jump) -} - -func TestNode_longestMatchCase3(t *testing.T) { - keywords := []string{ - "abcde", - "bcde", - "cde", - "de", - - "b", - "bc", - "c", - "cd", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil) - assert.Equal(t, 1, uselessLen) - assert.Equal(t, 2, matchLen) - assert.Nil(t, jump) -} - -func TestNode_longestMatchCase4(t *testing.T) { - keywords := []string{ - "abcde", - "bcdf", - "bcd", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - uselessLen, matchLen, paths := trie.longestMatch([]rune("abcdf"), nil) - assert.Equal(t, 1, uselessLen) - assert.Equal(t, 0, matchLen) - assert.Equal(t, 4, len(paths)) -} - -func TestNode_longestMatchCase5(t *testing.T) { - keywords := []string{ - "abcdef", - "bcde", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - uselessLen, matchLen, paths := trie.longestMatch([]rune("abcde"), nil) - assert.Equal(t, 1, uselessLen) - assert.Equal(t, 4, matchLen) - assert.Nil(t, paths) -} - -func TestNode_longestMatchCase6(t *testing.T) { - keywords := []string{ - "abcde", - "bc", - "d", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - uselessLen, matchLen, jump := trie.longestMatch([]rune("abcd"), nil) - assert.Equal(t, 1, uselessLen) - assert.Equal(t, 2, matchLen) - assert.Nil(t, jump) -} - -func TestNode_longestMatchCase7(t *testing.T) { - keywords := []string{ - "abcdeg", - "cdef", - "bcde", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - word := []rune("abcdef") - uselessLen, matchLen, paths := trie.longestMatch(word, nil) - assert.Equal(t, 1, uselessLen) - assert.Equal(t, 4, matchLen) - assert.Nil(t, paths) - uselessLen, matchLen, paths = trie.longestMatch(word[uselessLen+matchLen:], paths) - assert.Equal(t, 1, uselessLen) - assert.Equal(t, 0, matchLen) - assert.Nil(t, paths) -} - -func TestNode_longestMatchCase8(t *testing.T) { - keywords := []string{ - "abcdeg", - "cdef", - "cde", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - word := []rune("abcdef") - uselessLen, matchLen, paths := trie.longestMatch(word, nil) - assert.Equal(t, 2, uselessLen) - assert.Equal(t, 0, matchLen) - assert.NotNil(t, paths) -} - -func TestNode_longestMatchCase9(t *testing.T) { - keywords := []string{ - "abcdeg", - "cdef", - "cde", - "cd", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - - word := []rune("abcde") - uselessLen, matchLen, paths := trie.longestMatch(word, nil) - assert.Equal(t, 2, uselessLen) - assert.Equal(t, 3, matchLen) - assert.Nil(t, paths) -} - -func TestNode_jump(t *testing.T) { - keywords := []string{ - "de", - "fe", - } - trie := new(node) - for _, keyword := range keywords { - trie.add(keyword) - } - trie.build() - target := []rune("dfe") - - uselessLen, matchLen, paths := trie.longestMatch(target, nil) - assert.Equal(t, 1, uselessLen) - assert.Equal(t, 0, matchLen) - assert.NotNil(t, paths) - uselessLen, matchLen, paths = paths[len(paths)-1].longestMatch(target[uselessLen+matchLen:], paths) - assert.Equal(t, 0, uselessLen) - assert.Equal(t, 2, matchLen) - assert.Nil(t, paths) -} diff --git a/core/stringx/replacer.go b/core/stringx/replacer.go index f691d2ee..efb2c708 100644 --- a/core/stringx/replacer.go +++ b/core/stringx/replacer.go @@ -1,9 +1,14 @@ package stringx import ( + "sort" "strings" ) +// replace more than once to avoid overlapped keywords after replace. +// only try 2 times to avoid too many or infinite loops. +const replaceTimes = 2 + type ( // Replacer interface wraps the Replace method. Replacer interface { @@ -32,29 +37,48 @@ func NewReplacer(mapping map[string]string) Replacer { // Replace replaces text with given substitutes. func (r *replacer) Replace(text string) string { - var buf strings.Builder - var paths []*node - target := []rune(text) - cur := r.node - - for len(target) != 0 { - uselessLen, matchLen, nextPaths := cur.longestMatch(target, paths) - if uselessLen > 0 { - buf.WriteString(string(target[:uselessLen])) - target = target[uselessLen:] - } - if matchLen > 0 { - replaced := r.mapping[string(target[:matchLen])] - target = append([]rune(replaced), target[matchLen:]...) - } - if len(nextPaths) != 0 { - cur = nextPaths[len(nextPaths)-1] - paths = nextPaths - } else { - cur = r.node - paths = nil + for i := 0; i < replaceTimes; i++ { + var replaced bool + if text, replaced = r.doReplace(text); !replaced { + return text } } - return buf.String() + return text +} + +func (r *replacer) doReplace(text string) (string, bool) { + chars := []rune(text) + scopes := r.find(chars) + if len(scopes) == 0 { + return text, false + } + + sort.Slice(scopes, func(i, j int) bool { + if scopes[i].start < scopes[j].start { + return true + } + if scopes[i].start == scopes[j].start { + return scopes[i].stop > scopes[j].stop + } + return false + }) + + var buf strings.Builder + var index int + for i := 0; i < len(scopes); i++ { + scp := &scopes[i] + if scp.start < index { + continue + } + + buf.WriteString(string(chars[index:scp.start])) + buf.WriteString(r.mapping[string(chars[scp.start:scp.stop])]) + index = scp.stop + } + if index < len(chars) { + buf.WriteString(string(chars[index:])) + } + + return buf.String(), true } diff --git a/core/stringx/replacer_test.go b/core/stringx/replacer_test.go index 323874f6..5ccf9182 100644 --- a/core/stringx/replacer_test.go +++ b/core/stringx/replacer_test.go @@ -219,3 +219,13 @@ func TestReplacer_ReplaceLongestMatch(t *testing.T) { }) assert.Equal(t, "东京是东京", replacer.Replace("日本的首都是东京")) } + +func TestReplacer_ReplaceIndefinitely(t *testing.T) { + mapping := map[string]string{ + "日本的首都": "东京", + "东京": "日本的首都", + } + assert.NotPanics(t, func() { + NewReplacer(mapping).Replace("日本的首都是东京") + }) +}