fix Replacer suffix match, and add test case (#2867)

* fix: replace shoud replace the longest match

* feat: revert bytes.Buffer to strings.Builder

* fix: loop reset nextStart

* feat: add node longest match test

* feat: add replacer suffix match test case

* feat: multiple match

* fix: partial match ends

* fix: replace look back upon error

* feat: rm unnecessary branch

---------

Co-authored-by: hudahai <hscxrzs@gmail.com>
Co-authored-by: hushichang <hushichang@sensetime.com>
This commit is contained in:
dahaihu 2023-02-12 21:04:35 +08:00 committed by GitHub
parent 3736dacf1e
commit cacd5dc91a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 349 additions and 49 deletions

View File

@ -14,7 +14,6 @@ func (n *node) add(word string) {
}
nd := n
var depth int
for i, char := range chars {
if nd.children == nil {
child := new(node)
@ -23,7 +22,6 @@ func (n *node) add(word string) {
nd = child
} else if child, ok := nd.children[char]; ok {
nd = child
depth++
} else {
child := new(node)
child.depth = i + 1
@ -99,51 +97,91 @@ func (n *node) find(chars []rune) []scope {
return scopes
}
func (n *node) longestMatch(chars []rune, start int) (used int, jump *node, matched bool) {
func (n *node) longestMatch(chars []rune, paths []*node) (uselessLen, matchLen int, nextPaths []*node) {
cur := n
var matchedNode *node
var longestMatched *node
findMatch := func(path []*node) (*node, int) {
var (
result *node
start int
)
for i := len(path) - 1; i >= 0; i-- {
icur := path[i]
var cur *node
for icur.fail != nil {
if icur.fail.end {
cur = icur.fail
break
}
icur = icur.fail
}
if cur != nil {
if result == nil {
result = cur
start = i - result.depth + 1
} else if curStart := i - cur.depth + 1; curStart < start {
result = cur
start = curStart
}
}
}
return result, start
}
for i := start; i < len(chars); i++ {
child, ok := cur.children[chars[i]]
for i := len(paths); i < len(chars); i++ {
char := chars[i]
child, ok := cur.children[char]
if ok {
cur = child
if cur.end {
matchedNode = cur
longestMatched = cur
}
paths = append(paths, cur)
} else {
if matchedNode != nil {
return matchedNode.depth, nil, true
if longestMatched != nil {
return 0, longestMatched.depth, nil
}
if n.end {
return start, nil, true
return 0, n.depth, nil
}
// old path pre longest preMatch
preMatch, preStart := findMatch(paths)
// new path match
var jump *node
for cur.fail != nil {
jump, ok = cur.fail.children[chars[i]]
icur := cur
for icur.fail != nil {
jump, ok = icur.fail.children[char]
if ok {
break
}
cur = cur.fail
icur = icur.fail
}
if jump != nil {
return i + 1 - jump.depth, jump, false
switch {
case preMatch != nil && jump != nil:
if jumpStart := i - jump.depth + 1; preStart < jumpStart {
return preStart, preMatch.depth, nil
} else {
return jumpStart, 0, append(paths[jumpStart:], jump)
}
return i + 1, nil, false
case preMatch != nil && jump == nil:
return preStart, preMatch.depth, nil
case preMatch == nil && jump != nil:
return i - jump.depth + 1, 0, append(paths[i-jump.depth+1:], jump)
case preMatch == nil && jump == nil:
return i + 1, 0, nil
}
}
// longest matched node
if matchedNode != nil {
return matchedNode.depth, nil, true
}
// last matched node
// this longest matched node
if longestMatched != nil {
return 0, longestMatched.depth, nil
}
if n.end {
return start, nil, true
return 0, n.depth, nil
}
return len(chars), nil, false
match, start := findMatch(paths)
if match != nil {
return start, match.depth, nil
}
return len(chars), 0, nil
}

View File

@ -9,10 +9,10 @@ import (
func TestLongestMatchGuardedCondition(t *testing.T) {
n := new(node)
n.end = true
used, jump, matched := n.longestMatch([]rune(""), 0)
assert.Equal(t, 0, used)
uselessLen, matchLen, jump := n.longestMatch([]rune(""), nil)
assert.Equal(t, 0, uselessLen)
assert.Nil(t, jump)
assert.True(t, matched)
assert.Equal(t, 0, matchLen)
}
func TestFuzzNodeCase1(t *testing.T) {
@ -202,3 +202,228 @@ func BenchmarkNodeFind(b *testing.B) {
trie.find([]rune("日本AV演员兼电视、电影演员。无名氏AV女优是xx出道, 日本AV女优们最精彩的表演是AV演员色情表演"))
}
}
func TestNode_longestMatchCase0(t *testing.T) {
// match the longest word
keywords := []string{
"a",
"ab",
"abc",
"abcd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcef"), nil)
assert.Equal(t, 0, uselessLen)
assert.Equal(t, 3, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase1(t *testing.T) {
keywords := []string{
"abcde",
"bcde",
"cde",
"de",
"b",
"bc",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase2(t *testing.T) {
keywords := []string{
"abcde",
"bcde",
"cde",
"de",
"c",
"cd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
assert.Equal(t, 2, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase3(t *testing.T) {
keywords := []string{
"abcde",
"bcde",
"cde",
"de",
"b",
"bc",
"c",
"cd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcdf"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase4(t *testing.T) {
keywords := []string{
"abcde",
"bcdf",
"bcd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, paths := trie.longestMatch([]rune("abcdf"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 0, matchLen)
assert.Equal(t, 4, len(paths))
}
func TestNode_longestMatchCase5(t *testing.T) {
keywords := []string{
"abcdef",
"bcde",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, paths := trie.longestMatch([]rune("abcde"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 4, matchLen)
assert.Nil(t, paths)
}
func TestNode_longestMatchCase6(t *testing.T) {
keywords := []string{
"abcde",
"bc",
"d",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
uselessLen, matchLen, jump := trie.longestMatch([]rune("abcd"), nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, jump)
}
func TestNode_longestMatchCase7(t *testing.T) {
keywords := []string{
"abcdeg",
"cdef",
"bcde",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
word := []rune("abcdef")
uselessLen, matchLen, paths := trie.longestMatch(word, nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 4, matchLen)
assert.Nil(t, paths)
uselessLen, matchLen, paths = trie.longestMatch(word[uselessLen+matchLen:], paths)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 0, matchLen)
assert.Nil(t, paths)
}
func TestNode_longestMatchCase8(t *testing.T) {
keywords := []string{
"abcdeg",
"cdef",
"cde",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
word := []rune("abcdef")
uselessLen, matchLen, paths := trie.longestMatch(word, nil)
assert.Equal(t, 2, uselessLen)
assert.Equal(t, 0, matchLen)
assert.NotNil(t, paths)
}
func TestNode_longestMatchCase9(t *testing.T) {
keywords := []string{
"abcdeg",
"cdef",
"cde",
"cd",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
word := []rune("abcde")
uselessLen, matchLen, paths := trie.longestMatch(word, nil)
assert.Equal(t, 2, uselessLen)
assert.Equal(t, 3, matchLen)
assert.Nil(t, paths)
}
func TestNode_jump(t *testing.T) {
keywords := []string{
"de",
"fe",
}
trie := new(node)
for _, keyword := range keywords {
trie.add(keyword)
}
trie.build()
target := []rune("dfe")
uselessLen, matchLen, paths := trie.longestMatch(target, nil)
assert.Equal(t, 1, uselessLen)
assert.Equal(t, 0, matchLen)
assert.NotNil(t, paths)
uselessLen, matchLen, paths = paths[len(paths)-1].longestMatch(target[uselessLen+matchLen:], paths)
assert.Equal(t, 0, uselessLen)
assert.Equal(t, 2, matchLen)
assert.Nil(t, paths)
}

View File

@ -33,29 +33,26 @@ func NewReplacer(mapping map[string]string) Replacer {
// Replace replaces text with given substitutes.
func (r *replacer) Replace(text string) string {
var buf strings.Builder
var nextStart int
target := []rune(text)
cur := r.node
var paths []*node
for len(target) != 0 {
used, jump, matched := cur.longestMatch(target, nextStart)
if matched {
replaced := r.mapping[string(target[:used])]
target = append([]rune(replaced), target[used:]...)
cur = r.node
nextStart = 0
} else {
buf.WriteString(string(target[:used]))
target = target[used:]
if jump != nil {
cur = jump
nextStart = jump.depth
uselessLen, matchLen, nextPaths := cur.longestMatch(target, paths)
if uselessLen > 0 {
buf.WriteString(string(target[:uselessLen]))
target = target[uselessLen:]
}
if matchLen > 0 {
replaced := r.mapping[string(target[:matchLen])]
target = append([]rune(replaced), target[matchLen:]...)
}
if len(nextPaths) != 0 {
cur = nextPaths[len(nextPaths)-1]
paths = nextPaths
} else {
cur = r.node
nextStart = 0
paths = nil
}
}
}
return buf.String()
}

View File

@ -15,6 +15,15 @@ func TestReplacer_Replace(t *testing.T) {
assert.Equal(t, "零1234五", NewReplacer(mapping).Replace("零一二三四五"))
}
func TestReplacer_ReplaceJumpMatch(t *testing.T) {
mapping := map[string]string{
"abcdeg": "ABCDEG",
"cdef": "CDEF",
"cde": "CDE",
}
assert.Equal(t, "abCDEF", NewReplacer(mapping).Replace("abcdef"))
}
func TestReplacer_ReplaceOverlap(t *testing.T) {
mapping := map[string]string{
"3d": "34",
@ -44,6 +53,14 @@ func TestReplacer_ReplacePartialMatch(t *testing.T) {
assert.Equal(t, "零一二三四五", NewReplacer(mapping).Replace("零一二三四五"))
}
func TestReplacer_ReplacePartialMatchEnds(t *testing.T) {
mapping := map[string]string{
"二三四七": "2347",
"三四": "34",
}
assert.Equal(t, "零一二34", NewReplacer(mapping).Replace("零一二三四"))
}
func TestReplacer_ReplaceMultiMatches(t *testing.T) {
mapping := map[string]string{
"二三": "23",
@ -60,6 +77,29 @@ func TestReplacer_ReplaceLongestMatching(t *testing.T) {
assert.Equal(t, "东京在japan", replacer.Replace("日本的首都在日本"))
}
func TestReplacer_ReplaceSuffixMatch(t *testing.T) {
// case1
{
keywords := map[string]string{
"abcde": "ABCDE",
"bcde": "BCDE",
"bcd": "BCD",
}
assert.Equal(t, "aBCDf", NewReplacer(keywords).Replace("abcdf"))
}
// case2
{
keywords := map[string]string{
"abcde": "ABCDE",
"bcde": "BCDE",
"cde": "CDE",
"c": "C",
"cd": "CD",
}
assert.Equal(t, "abCDf", NewReplacer(keywords).Replace("abcdf"))
}
}
func TestReplacer_ReplaceLongestOverlap(t *testing.T) {
keywords := map[string]string{
"456": "def",