2020-07-26 17:09:05 +08:00
|
|
|
package stringx
|
|
|
|
|
2020-08-08 16:40:10 +08:00
|
|
|
import "github.com/tal-tech/go-zero/core/lang"
|
2020-07-26 17:09:05 +08:00
|
|
|
|
2020-08-19 14:54:59 +08:00
|
|
|
// defaultMask is the rune NewTrie falls back to for masking matched keywords
// when no option has set a non-zero mask.
const defaultMask = '*'
|
|
|
|
|
2020-07-26 17:09:05 +08:00
|
|
|
type (
|
2021-02-24 16:09:07 +08:00
|
|
|
// TrieOption defines the method to customize a Trie.
|
2020-08-19 14:54:59 +08:00
|
|
|
TrieOption func(trie *trieNode)
|
|
|
|
|
2021-02-24 16:09:07 +08:00
|
|
|
// A Trie is a tree implementation that used to find elements rapidly.
|
2020-07-26 17:09:05 +08:00
|
|
|
Trie interface {
|
|
|
|
Filter(text string) (string, []string, bool)
|
|
|
|
FindKeywords(text string) []string
|
|
|
|
}
|
|
|
|
|
|
|
|
trieNode struct {
|
|
|
|
node
|
2020-08-19 14:54:59 +08:00
|
|
|
mask rune
|
2020-07-26 17:09:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
scope struct {
|
|
|
|
start int
|
|
|
|
stop int
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
2021-02-24 16:09:07 +08:00
|
|
|
// NewTrie returns a Trie.
|
2020-08-19 14:54:59 +08:00
|
|
|
func NewTrie(words []string, opts ...TrieOption) Trie {
|
2020-07-26 17:09:05 +08:00
|
|
|
n := new(trieNode)
|
2020-08-19 14:54:59 +08:00
|
|
|
|
|
|
|
for _, opt := range opts {
|
|
|
|
opt(n)
|
|
|
|
}
|
|
|
|
if n.mask == 0 {
|
|
|
|
n.mask = defaultMask
|
|
|
|
}
|
2020-07-26 17:09:05 +08:00
|
|
|
for _, word := range words {
|
|
|
|
n.add(word)
|
|
|
|
}
|
|
|
|
|
|
|
|
return n
|
|
|
|
}
|
|
|
|
|
|
|
|
func (n *trieNode) Filter(text string) (sentence string, keywords []string, found bool) {
|
|
|
|
chars := []rune(text)
|
|
|
|
if len(chars) == 0 {
|
|
|
|
return text, nil, false
|
|
|
|
}
|
|
|
|
|
|
|
|
scopes := n.findKeywordScopes(chars)
|
|
|
|
keywords = n.collectKeywords(chars, scopes)
|
|
|
|
|
|
|
|
for _, match := range scopes {
|
|
|
|
// we don't care about overlaps, not bringing a performance improvement
|
|
|
|
n.replaceWithAsterisk(chars, match.start, match.stop)
|
|
|
|
}
|
|
|
|
|
|
|
|
return string(chars), keywords, len(keywords) > 0
|
|
|
|
}
|
|
|
|
|
|
|
|
func (n *trieNode) FindKeywords(text string) []string {
|
|
|
|
chars := []rune(text)
|
|
|
|
if len(chars) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
scopes := n.findKeywordScopes(chars)
|
|
|
|
return n.collectKeywords(chars, scopes)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (n *trieNode) collectKeywords(chars []rune, scopes []scope) []string {
|
|
|
|
set := make(map[string]lang.PlaceholderType)
|
|
|
|
for _, v := range scopes {
|
|
|
|
set[string(chars[v.start:v.stop])] = lang.Placeholder
|
|
|
|
}
|
|
|
|
|
|
|
|
var i int
|
|
|
|
keywords := make([]string, len(set))
|
|
|
|
for k := range set {
|
|
|
|
keywords[i] = k
|
|
|
|
i++
|
|
|
|
}
|
|
|
|
|
|
|
|
return keywords
|
|
|
|
}
|
|
|
|
|
|
|
|
func (n *trieNode) findKeywordScopes(chars []rune) []scope {
|
|
|
|
var scopes []scope
|
|
|
|
size := len(chars)
|
|
|
|
start := -1
|
|
|
|
|
|
|
|
for i := 0; i < size; i++ {
|
|
|
|
child, ok := n.children[chars[i]]
|
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if start < 0 {
|
|
|
|
start = i
|
|
|
|
}
|
|
|
|
if child.end {
|
|
|
|
scopes = append(scopes, scope{
|
|
|
|
start: start,
|
|
|
|
stop: i + 1,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
for j := i + 1; j < size; j++ {
|
|
|
|
grandchild, ok := child.children[chars[j]]
|
|
|
|
if !ok {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
child = grandchild
|
|
|
|
if child.end {
|
|
|
|
scopes = append(scopes, scope{
|
|
|
|
start: start,
|
|
|
|
stop: j + 1,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
start = -1
|
|
|
|
}
|
|
|
|
|
|
|
|
return scopes
|
|
|
|
}
|
|
|
|
|
|
|
|
func (n *trieNode) replaceWithAsterisk(chars []rune, start, stop int) {
|
|
|
|
for i := start; i < stop; i++ {
|
2020-08-19 14:54:59 +08:00
|
|
|
chars[i] = n.mask
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-24 16:09:07 +08:00
|
|
|
// WithMask customizes a Trie with keywords masked as given mask char.
|
2020-08-19 14:54:59 +08:00
|
|
|
func WithMask(mask rune) TrieOption {
|
|
|
|
return func(n *trieNode) {
|
|
|
|
n.mask = mask
|
2020-07-26 17:09:05 +08:00
|
|
|
}
|
|
|
|
}
|