refactor: simplify stringx.Replacer, and avoid potential infinite loops (#2877)

* simplify replace

* backup

* refactor: simplify stringx.Replacer

* chore: add comments and const

* chore: add more tests

* chore: rename variable
This commit is contained in:
Kevin Wan 2023-02-14 10:18:02 +08:00 committed by GitHub
parent 9be17a2d28
commit cddf3875cf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 56 additions and 354 deletions

View File

@ -96,101 +96,3 @@ func (n *node) find(chars []rune) []scope {
return scopes
}
func (n *node) longestMatch(chars []rune, paths []*node) (uselessLen, matchLen int, nextPaths []*node) {
cur := n
var longestMatched *node
for i := len(paths); i < len(chars); i++ {
char := chars[i]
child, ok := cur.children[char]
if ok {
cur = child
if cur.end {
longestMatched = cur
}
paths = append(paths, cur)
} else {
if longestMatched != nil {
return 0, longestMatched.depth, nil
}
if n.end {
return 0, n.depth, nil
}
// new path match
var jump *node
// old path pre longest preMatch
preMatch, preStart := findMatch(paths)
icur := cur
for icur.fail != nil {
jump, ok = icur.fail.children[char]
if ok {
break
}
icur = icur.fail
}
switch {
case preMatch != nil && jump != nil:
if jumpStart := i - jump.depth + 1; preStart < jumpStart {
return preStart, preMatch.depth, nil
} else {
return jumpStart, 0, append(paths[jumpStart:], jump)
}
case preMatch != nil && jump == nil:
return preStart, preMatch.depth, nil
case preMatch == nil && jump != nil:
return i - jump.depth + 1, 0, append(paths[i-jump.depth+1:], jump)
case preMatch == nil && jump == nil:
return i + 1, 0, nil
}
}
}
// this longest matched node
if longestMatched != nil {
return 0, longestMatched.depth, nil
}
if n.end {
return 0, n.depth, nil
}
match, start := findMatch(paths)
if match != nil {
return start, match.depth, nil
}
return len(chars), 0, nil
}
func findMatch(path []*node) (*node, int) {
var result *node
var start int
for i := len(path) - 1; i >= 0; i-- {
icur := path[i]
var cur *node
for icur.fail != nil {
if icur.fail.end {
cur = icur.fail
break
}
icur = icur.fail
}
if cur != nil {
if result == nil {
result = cur
start = i - result.depth + 1
} else if curStart := i - cur.depth + 1; curStart < start {
result = cur
start = curStart
}
}
}
return result, start
}

View File

@ -6,15 +6,6 @@ import (
"github.com/stretchr/testify/assert"
)
func TestLongestMatchGuardedCondition(t *testing.T) {
n := new(node)
n.end = true
uselessLen, matchLen, jump := n.longestMatch([]rune(""), nil)
assert.Equal(t, 0, uselessLen)
assert.Nil(t, jump)
assert.Equal(t, 0, matchLen)
}
func TestFuzzNodeCase1(t *testing.T) {
keywords := []string{
"cs8Zh",
@ -202,228 +193,3 @@ func BenchmarkNodeFind(b *testing.B) {
trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演"))
}
}
func TestNode_longestMatchCase0(t *testing.T) {
// match the longest word
keywords := []string{
"a",
"ab",
"abc",
"abcd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcef"), nil)
assert.Equal(t, 0, uselessLen)
assert.Equal(t, 3, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase1(t *testing.T) {
keywords := []string{
"abcde",
"bcde",
"cde",
"de",
"b",
"bc",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase2(t *testing.T) {
keywords := []string{
"abcde",
"bcde",
"cde",
"de",
"c",
"cd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
assert.Equal(t, 2, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase3(t *testing.T) {
keywords := []string{
"abcde",
"bcde",
"cde",
"de",
"b",
"bc",
"c",
"cd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase4(t *testing.T) {
keywords := []string{
"abcde",
"bcdf",
"bcd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, paths := trie.longestMatch([]rune("abcdf"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 0, matchLen)
assert.Equal(t, 4, len(paths))
}
func TestNode_longestMatchCase5(t *testing.T) {
keywords := []string{
"abcdef",
"bcde",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, paths := trie.longestMatch([]rune("abcde"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 4, matchLen)
assert.Nil(t, paths)
}
func TestNode_longestMatchCase6(t *testing.T) {
keywords := []string{
"abcde",
"bc",
"d",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcd"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase7(t *testing.T) {
keywords := []string{
"abcdeg",
"cdef",
"bcde",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
word := []rune("abcdef")
uselessLen, matchLen, paths := trie.longestMatch(word, nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 4, matchLen)
assert.Nil(t, paths)
uselessLen, matchLen, paths = trie.longestMatch(word[uselessLen+matchLen:], paths)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 0, matchLen)
assert.Nil(t, paths)
}
func TestNode_longestMatchCase8(t *testing.T) {
keywords := []string{
"abcdeg",
"cdef",
"cde",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
word := []rune("abcdef")
uselessLen, matchLen, paths := trie.longestMatch(word, nil)
assert.Equal(t, 2, uselessLen)
assert.Equal(t, 0, matchLen)
assert.NotNil(t, paths)
}
func TestNode_longestMatchCase9(t *testing.T) {
keywords := []string{
"abcdeg",
"cdef",
"cde",
"cd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
word := []rune("abcde")
uselessLen, matchLen, paths := trie.longestMatch(word, nil)
assert.Equal(t, 2, uselessLen)
assert.Equal(t, 3, matchLen)
assert.Nil(t, paths)
}
func TestNode_jump(t *testing.T) {
keywords := []string{
"de",
"fe",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
target := []rune("dfe")
uselessLen, matchLen, paths := trie.longestMatch(target, nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 0, matchLen)
assert.NotNil(t, paths)
uselessLen, matchLen, paths = paths[len(paths)-1].longestMatch(target[uselessLen+matchLen:], paths)
assert.Equal(t, 0, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, paths)
}

View File

@ -1,9 +1,14 @@
package stringx
import (
"sort"
"strings"
)
// replace more than once to avoid overlapped keywords after replace.
// only try 2 times to avoid too many or infinite loops.
const replaceTimes = 2
type (
// Replacer interface wraps the Replace method.
Replacer interface {
@ -32,29 +37,48 @@ func NewReplacer(mapping map[string]string) Replacer {
// Replace replaces text with given substitutes.
func (r *replacer) Replace(text string) string {
var buf strings.Builder
var paths []*node
target := []rune(text)
cur := r.node
for len(target) != 0 {
uselessLen, matchLen, nextPaths := cur.longestMatch(target, paths)
if uselessLen > 0 {
buf.WriteString(string(target[:uselessLen]))
target = target[uselessLen:]
}
if matchLen > 0 {
replaced := r.mapping[string(target[:matchLen])]
target = append([]rune(replaced), target[matchLen:]...)
}
if len(nextPaths) != 0 {
cur = nextPaths[len(nextPaths)-1]
paths = nextPaths
} else {
cur = r.node
paths = nil
for i := 0; i < replaceTimes; i++ {
var replaced bool
if text, replaced = r.doReplace(text); !replaced {
return text
}
}
return buf.String()
return text
}
func (r *replacer) doReplace(text string) (string, bool) {
chars := []rune(text)
scopes := r.find(chars)
if len(scopes) == 0 {
return text, false
}
sort.Slice(scopes, func(i, j int) bool {
if scopes[i].start < scopes[j].start {
return true
}
if scopes[i].start == scopes[j].start {
return scopes[i].stop > scopes[j].stop
}
return false
})
var buf strings.Builder
var index int
for i := 0; i < len(scopes); i++ {
scp := &scopes[i]
if scp.start < index {
continue
}
buf.WriteString(string(chars[index:scp.start]))
buf.WriteString(r.mapping[string(chars[scp.start:scp.stop])])
index = scp.stop
}
if index < len(chars) {
buf.WriteString(string(chars[index:]))
}
return buf.String(), true
}

View File

@ -219,3 +219,13 @@ func TestReplacer_ReplaceLongestMatch(t *testing.T) {
})
assert.Equal(t, "东京是东京", replacer.Replace("日本的首都是东京"))
}
func TestReplacer_ReplaceIndefinitely(t *testing.T) {
mapping := map[string]string{
"日本的首都": "东京",
"东京": "日本的首都",
}
assert.NotPanics(t, func() {
NewReplacer(mapping).Replace("日本的首都是东京")
})
}