mirror of
https://github.com/zeromicro/go-zero.git
synced 2025-02-02 16:28:39 +08:00
fix Replacer suffix match, and add test case (#2867)
* fix: replace should replace the longest match * feat: revert bytes.Buffer to strings.Builder * fix: loop reset nextStart * feat: add node longest match test * feat: add replacer suffix match test case * feat: multiple match * fix: partial match ends * fix: replace look back upon error * feat: rm unnecessary branch --------- Co-authored-by: hudahai <hscxrzs@gmail.com> Co-authored-by: hushichang <hushichang@sensetime.com>
This commit is contained in:
parent
3736dacf1e
commit
cacd5dc91a
@ -14,7 +14,6 @@ func (n *node) add(word string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
nd := n
|
nd := n
|
||||||
var depth int
|
|
||||||
for i, char := range chars {
|
for i, char := range chars {
|
||||||
if nd.children == nil {
|
if nd.children == nil {
|
||||||
child := new(node)
|
child := new(node)
|
||||||
@ -23,7 +22,6 @@ func (n *node) add(word string) {
|
|||||||
nd = child
|
nd = child
|
||||||
} else if child, ok := nd.children[char]; ok {
|
} else if child, ok := nd.children[char]; ok {
|
||||||
nd = child
|
nd = child
|
||||||
depth++
|
|
||||||
} else {
|
} else {
|
||||||
child := new(node)
|
child := new(node)
|
||||||
child.depth = i + 1
|
child.depth = i + 1
|
||||||
@ -99,51 +97,91 @@ func (n *node) find(chars []rune) []scope {
|
|||||||
return scopes
|
return scopes
|
||||||
}
|
}
|
||||||
|
|
||||||
func (n *node) longestMatch(chars []rune, start int) (used int, jump *node, matched bool) {
|
func (n *node) longestMatch(chars []rune, paths []*node) (uselessLen, matchLen int, nextPaths []*node) {
|
||||||
cur := n
|
cur := n
|
||||||
var matchedNode *node
|
var longestMatched *node
|
||||||
|
findMatch := func(path []*node) (*node, int) {
|
||||||
|
var (
|
||||||
|
result *node
|
||||||
|
start int
|
||||||
|
)
|
||||||
|
for i := len(path) - 1; i >= 0; i-- {
|
||||||
|
icur := path[i]
|
||||||
|
var cur *node
|
||||||
|
for icur.fail != nil {
|
||||||
|
if icur.fail.end {
|
||||||
|
cur = icur.fail
|
||||||
|
break
|
||||||
|
}
|
||||||
|
icur = icur.fail
|
||||||
|
}
|
||||||
|
if cur != nil {
|
||||||
|
if result == nil {
|
||||||
|
result = cur
|
||||||
|
start = i - result.depth + 1
|
||||||
|
} else if curStart := i - cur.depth + 1; curStart < start {
|
||||||
|
result = cur
|
||||||
|
start = curStart
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result, start
|
||||||
|
}
|
||||||
|
|
||||||
for i := start; i < len(chars); i++ {
|
for i := len(paths); i < len(chars); i++ {
|
||||||
child, ok := cur.children[chars[i]]
|
char := chars[i]
|
||||||
|
child, ok := cur.children[char]
|
||||||
if ok {
|
if ok {
|
||||||
cur = child
|
cur = child
|
||||||
if cur.end {
|
if cur.end {
|
||||||
matchedNode = cur
|
longestMatched = cur
|
||||||
}
|
}
|
||||||
|
paths = append(paths, cur)
|
||||||
} else {
|
} else {
|
||||||
if matchedNode != nil {
|
if longestMatched != nil {
|
||||||
return matchedNode.depth, nil, true
|
return 0, longestMatched.depth, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if n.end {
|
if n.end {
|
||||||
return start, nil, true
|
return 0, n.depth, nil
|
||||||
}
|
}
|
||||||
|
// earliest-starting keyword that already ended within the traversed path (reached via fail links)
|
||||||
|
preMatch, preStart := findMatch(paths)
|
||||||
|
// new path match
|
||||||
var jump *node
|
var jump *node
|
||||||
for cur.fail != nil {
|
icur := cur
|
||||||
jump, ok = cur.fail.children[chars[i]]
|
for icur.fail != nil {
|
||||||
|
jump, ok = icur.fail.children[char]
|
||||||
if ok {
|
if ok {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
cur = cur.fail
|
icur = icur.fail
|
||||||
}
|
}
|
||||||
if jump != nil {
|
switch {
|
||||||
return i + 1 - jump.depth, jump, false
|
case preMatch != nil && jump != nil:
|
||||||
|
if jumpStart := i - jump.depth + 1; preStart < jumpStart {
|
||||||
|
return preStart, preMatch.depth, nil
|
||||||
|
} else {
|
||||||
|
return jumpStart, 0, append(paths[jumpStart:], jump)
|
||||||
|
}
|
||||||
|
case preMatch != nil && jump == nil:
|
||||||
|
return preStart, preMatch.depth, nil
|
||||||
|
case preMatch == nil && jump != nil:
|
||||||
|
return i - jump.depth + 1, 0, append(paths[i-jump.depth+1:], jump)
|
||||||
|
case preMatch == nil && jump == nil:
|
||||||
|
return i + 1, 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return i + 1, nil, false
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// input exhausted: return the longest keyword matched along this path
|
||||||
// longest matched node
|
if longestMatched != nil {
|
||||||
if matchedNode != nil {
|
return 0, longestMatched.depth, nil
|
||||||
return matchedNode.depth, nil, true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// last matched node
|
|
||||||
if n.end {
|
if n.end {
|
||||||
return start, nil, true
|
return 0, n.depth, nil
|
||||||
}
|
}
|
||||||
|
match, start := findMatch(paths)
|
||||||
return len(chars), nil, false
|
if match != nil {
|
||||||
|
return start, match.depth, nil
|
||||||
|
}
|
||||||
|
return len(chars), 0, nil
|
||||||
}
|
}
|
||||||
|
@ -9,10 +9,10 @@ import (
|
|||||||
func TestLongestMatchGuardedCondition(t *testing.T) {
|
func TestLongestMatchGuardedCondition(t *testing.T) {
|
||||||
n := new(node)
|
n := new(node)
|
||||||
n.end = true
|
n.end = true
|
||||||
used, jump, matched := n.longestMatch([]rune(""), 0)
|
uselessLen, matchLen, jump := n.longestMatch([]rune(""), nil)
|
||||||
assert.Equal(t, 0, used)
|
assert.Equal(t, 0, uselessLen)
|
||||||
assert.Nil(t, jump)
|
assert.Nil(t, jump)
|
||||||
assert.True(t, matched)
|
assert.Equal(t, 0, matchLen)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestFuzzNodeCase1(t *testing.T) {
|
func TestFuzzNodeCase1(t *testing.T) {
|
||||||
@ -202,3 +202,228 @@ func BenchmarkNodeFind(b *testing.B) {
|
|||||||
trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演"))
|
trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase0(t *testing.T) {
|
||||||
|
// match the longest word
|
||||||
|
keywords := []string{
|
||||||
|
"a",
|
||||||
|
"ab",
|
||||||
|
"abc",
|
||||||
|
"abcd",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcef"), nil)
|
||||||
|
assert.Equal(t, 0, uselessLen)
|
||||||
|
assert.Equal(t, 3, matchLen)
|
||||||
|
assert.Nil(t, jump)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase1(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"abcde",
|
||||||
|
"bcde",
|
||||||
|
"cde",
|
||||||
|
"de",
|
||||||
|
|
||||||
|
"b",
|
||||||
|
"bc",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
|
||||||
|
assert.Equal(t, 1, uselessLen)
|
||||||
|
assert.Equal(t, 2, matchLen)
|
||||||
|
assert.Nil(t, jump)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase2(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"abcde",
|
||||||
|
"bcde",
|
||||||
|
"cde",
|
||||||
|
"de",
|
||||||
|
|
||||||
|
"c",
|
||||||
|
"cd",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
|
||||||
|
assert.Equal(t, 2, uselessLen)
|
||||||
|
assert.Equal(t, 2, matchLen)
|
||||||
|
assert.Nil(t, jump)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase3(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"abcde",
|
||||||
|
"bcde",
|
||||||
|
"cde",
|
||||||
|
"de",
|
||||||
|
|
||||||
|
"b",
|
||||||
|
"bc",
|
||||||
|
"c",
|
||||||
|
"cd",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
|
||||||
|
assert.Equal(t, 1, uselessLen)
|
||||||
|
assert.Equal(t, 2, matchLen)
|
||||||
|
assert.Nil(t, jump)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase4(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"abcde",
|
||||||
|
"bcdf",
|
||||||
|
"bcd",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
uselessLen, matchLen, paths := trie.longestMatch([]rune("abcdf"), nil)
|
||||||
|
assert.Equal(t, 1, uselessLen)
|
||||||
|
assert.Equal(t, 0, matchLen)
|
||||||
|
assert.Equal(t, 4, len(paths))
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase5(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"abcdef",
|
||||||
|
"bcde",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
uselessLen, matchLen, paths := trie.longestMatch([]rune("abcde"), nil)
|
||||||
|
assert.Equal(t, 1, uselessLen)
|
||||||
|
assert.Equal(t, 4, matchLen)
|
||||||
|
assert.Nil(t, paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase6(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"abcde",
|
||||||
|
"bc",
|
||||||
|
"d",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcd"), nil)
|
||||||
|
assert.Equal(t, 1, uselessLen)
|
||||||
|
assert.Equal(t, 2, matchLen)
|
||||||
|
assert.Nil(t, jump)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase7(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"abcdeg",
|
||||||
|
"cdef",
|
||||||
|
"bcde",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
word := []rune("abcdef")
|
||||||
|
uselessLen, matchLen, paths := trie.longestMatch(word, nil)
|
||||||
|
assert.Equal(t, 1, uselessLen)
|
||||||
|
assert.Equal(t, 4, matchLen)
|
||||||
|
assert.Nil(t, paths)
|
||||||
|
uselessLen, matchLen, paths = trie.longestMatch(word[uselessLen+matchLen:], paths)
|
||||||
|
assert.Equal(t, 1, uselessLen)
|
||||||
|
assert.Equal(t, 0, matchLen)
|
||||||
|
assert.Nil(t, paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase8(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"abcdeg",
|
||||||
|
"cdef",
|
||||||
|
"cde",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
word := []rune("abcdef")
|
||||||
|
uselessLen, matchLen, paths := trie.longestMatch(word, nil)
|
||||||
|
assert.Equal(t, 2, uselessLen)
|
||||||
|
assert.Equal(t, 0, matchLen)
|
||||||
|
assert.NotNil(t, paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_longestMatchCase9(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"abcdeg",
|
||||||
|
"cdef",
|
||||||
|
"cde",
|
||||||
|
"cd",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
|
||||||
|
word := []rune("abcde")
|
||||||
|
uselessLen, matchLen, paths := trie.longestMatch(word, nil)
|
||||||
|
assert.Equal(t, 2, uselessLen)
|
||||||
|
assert.Equal(t, 3, matchLen)
|
||||||
|
assert.Nil(t, paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNode_jump(t *testing.T) {
|
||||||
|
keywords := []string{
|
||||||
|
"de",
|
||||||
|
"fe",
|
||||||
|
}
|
||||||
|
trie := new(node)
|
||||||
|
for _, keyword := range keywords {
|
||||||
|
trie.add(keyword)
|
||||||
|
}
|
||||||
|
trie.build()
|
||||||
|
target := []rune("dfe")
|
||||||
|
|
||||||
|
uselessLen, matchLen, paths := trie.longestMatch(target, nil)
|
||||||
|
assert.Equal(t, 1, uselessLen)
|
||||||
|
assert.Equal(t, 0, matchLen)
|
||||||
|
assert.NotNil(t, paths)
|
||||||
|
uselessLen, matchLen, paths = paths[len(paths)-1].longestMatch(target[uselessLen+matchLen:], paths)
|
||||||
|
assert.Equal(t, 0, uselessLen)
|
||||||
|
assert.Equal(t, 2, matchLen)
|
||||||
|
assert.Nil(t, paths)
|
||||||
|
}
|
||||||
|
@ -33,29 +33,26 @@ func NewReplacer(mapping map[string]string) Replacer {
|
|||||||
// Replace replaces text with given substitutes.
|
// Replace replaces text with given substitutes.
|
||||||
func (r *replacer) Replace(text string) string {
|
func (r *replacer) Replace(text string) string {
|
||||||
var buf strings.Builder
|
var buf strings.Builder
|
||||||
var nextStart int
|
|
||||||
target := []rune(text)
|
target := []rune(text)
|
||||||
cur := r.node
|
cur := r.node
|
||||||
|
var paths []*node
|
||||||
for len(target) != 0 {
|
for len(target) != 0 {
|
||||||
used, jump, matched := cur.longestMatch(target, nextStart)
|
uselessLen, matchLen, nextPaths := cur.longestMatch(target, paths)
|
||||||
if matched {
|
if uselessLen > 0 {
|
||||||
replaced := r.mapping[string(target[:used])]
|
buf.WriteString(string(target[:uselessLen]))
|
||||||
target = append([]rune(replaced), target[used:]...)
|
target = target[uselessLen:]
|
||||||
cur = r.node
|
}
|
||||||
nextStart = 0
|
if matchLen > 0 {
|
||||||
|
replaced := r.mapping[string(target[:matchLen])]
|
||||||
|
target = append([]rune(replaced), target[matchLen:]...)
|
||||||
|
}
|
||||||
|
if len(nextPaths) != 0 {
|
||||||
|
cur = nextPaths[len(nextPaths)-1]
|
||||||
|
paths = nextPaths
|
||||||
} else {
|
} else {
|
||||||
buf.WriteString(string(target[:used]))
|
cur = r.node
|
||||||
target = target[used:]
|
paths = nil
|
||||||
if jump != nil {
|
|
||||||
cur = jump
|
|
||||||
nextStart = jump.depth
|
|
||||||
} else {
|
|
||||||
cur = r.node
|
|
||||||
nextStart = 0
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return buf.String()
|
return buf.String()
|
||||||
}
|
}
|
||||||
|
@ -15,6 +15,15 @@ func TestReplacer_Replace(t *testing.T) {
|
|||||||
assert.Equal(t, "零1234五", NewReplacer(mapping).Replace("零一二三四五"))
|
assert.Equal(t, "零1234五", NewReplacer(mapping).Replace("零一二三四五"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestReplacer_ReplaceJumpMatch(t *testing.T) {
|
||||||
|
mapping := map[string]string{
|
||||||
|
"abcdeg": "ABCDEG",
|
||||||
|
"cdef": "CDEF",
|
||||||
|
"cde": "CDE",
|
||||||
|
}
|
||||||
|
assert.Equal(t, "abCDEF", NewReplacer(mapping).Replace("abcdef"))
|
||||||
|
}
|
||||||
|
|
||||||
func TestReplacer_ReplaceOverlap(t *testing.T) {
|
func TestReplacer_ReplaceOverlap(t *testing.T) {
|
||||||
mapping := map[string]string{
|
mapping := map[string]string{
|
||||||
"3d": "34",
|
"3d": "34",
|
||||||
@ -44,6 +53,14 @@ func TestReplacer_ReplacePartialMatch(t *testing.T) {
|
|||||||
assert.Equal(t, "零一二三四五", NewReplacer(mapping).Replace("零一二三四五"))
|
assert.Equal(t, "零一二三四五", NewReplacer(mapping).Replace("零一二三四五"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestReplacer_ReplacePartialMatchEnds(t *testing.T) {
|
||||||
|
mapping := map[string]string{
|
||||||
|
"二三四七": "2347",
|
||||||
|
"三四": "34",
|
||||||
|
}
|
||||||
|
assert.Equal(t, "零一二34", NewReplacer(mapping).Replace("零一二三四"))
|
||||||
|
}
|
||||||
|
|
||||||
func TestReplacer_ReplaceMultiMatches(t *testing.T) {
|
func TestReplacer_ReplaceMultiMatches(t *testing.T) {
|
||||||
mapping := map[string]string{
|
mapping := map[string]string{
|
||||||
"二三": "23",
|
"二三": "23",
|
||||||
@ -60,6 +77,29 @@ func TestReplacer_ReplaceLongestMatching(t *testing.T) {
|
|||||||
assert.Equal(t, "东京在japan", replacer.Replace("日本的首都在日本"))
|
assert.Equal(t, "东京在japan", replacer.Replace("日本的首都在日本"))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestReplacer_ReplaceSuffixMatch(t *testing.T) {
|
||||||
|
// case1
|
||||||
|
{
|
||||||
|
keywords := map[string]string{
|
||||||
|
"abcde": "ABCDE",
|
||||||
|
"bcde": "BCDE",
|
||||||
|
"bcd": "BCD",
|
||||||
|
}
|
||||||
|
assert.Equal(t, "aBCDf", NewReplacer(keywords).Replace("abcdf"))
|
||||||
|
}
|
||||||
|
// case2
|
||||||
|
{
|
||||||
|
keywords := map[string]string{
|
||||||
|
"abcde": "ABCDE",
|
||||||
|
"bcde": "BCDE",
|
||||||
|
"cde": "CDE",
|
||||||
|
"c": "C",
|
||||||
|
"cd": "CD",
|
||||||
|
}
|
||||||
|
assert.Equal(t, "abCDf", NewReplacer(keywords).Replace("abcdf"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestReplacer_ReplaceLongestOverlap(t *testing.T) {
|
func TestReplacer_ReplaceLongestOverlap(t *testing.T) {
|
||||||
keywords := map[string]string{
|
keywords := map[string]string{
|
||||||
"456": "def",
|
"456": "def",
|
||||||
|
Loading…
Reference in New Issue
Block a user