// bleve/analysis/token/compound/dict.go
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package compound

import (
	"bytes"
	"fmt"
	"unicode/utf8"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/registry"
)
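
// Name is the name this token filter is registered under (see init below).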
const Name = "dict_compound"
const defaultMinWordSize = 5
const defaultMinSubWordSize = 2
const defaultMaxSubWordSize = 15
const defaultOnlyLongestMatch = false
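
// DictionaryCompoundFilter decomposes compound tokens by dictionary
// lookup: every sub-word of a token (within the configured size bounds)
// that appears in dict is emitted as an additional token.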
type DictionaryCompoundFilter struct {
	dict             analysis.TokenMap
	minWordSize      int
	minSubWordSize   int
	maxSubWordSize   int
	onlyLongestMatch bool
}
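
// NewDictionaryCompoundFilter returns a filter over the given token map
// with the supplied word and sub-word size bounds, measured in runes.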
func NewDictionaryCompoundFilter(dict analysis.TokenMap, minWordSize, minSubWordSize, maxSubWordSize int, onlyLongestMatch bool) *DictionaryCompoundFilter {
	return &DictionaryCompoundFilter{
		dict:             dict,
		minWordSize:      minWordSize,
		minSubWordSize:   minSubWordSize,
		maxSubWordSize:   maxSubWordSize,
		onlyLongestMatch: onlyLongestMatch,
	}
}
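
// Filter passes every input token through unchanged and, for tokens at
// least minWordSize runes long, appends any dictionary sub-word tokens
// found by decompose immediately after the original. For example, with
// "soft" and "ball" in the dictionary, "softball" yields the tokens
// "softball", "soft", "ball".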
func (f *DictionaryCompoundFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0, len(input))
	for _, token := range input {
		rv = append(rv, token)
		tokenLen := utf8.RuneCount(token.Term)
		if tokenLen >= f.minWordSize {
			newtokens := f.decompose(token)
			rv = append(rv, newtokens...)
		}
	}
	return rv
}
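
// decompose scans token.Term with a sliding window of
// [minSubWordSize, maxSubWordSize] runes, returning a token for each
// window found in the dictionary (or only the longest match per start
// position when onlyLongestMatch is set). Note that Start/End are
// advanced by rune counts, so they are exact byte offsets only for
// terms made of single-byte runes.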
func (f *DictionaryCompoundFilter) decompose(token *analysis.Token) []*analysis.Token {
	runes := bytes.Runes(token.Term)
	rv := make([]*analysis.Token, 0)
	rlen := len(runes)
	for i := 0; i <= (rlen - f.minSubWordSize); i++ {
		var longestMatchToken *analysis.Token
		for j := f.minSubWordSize; j <= f.maxSubWordSize; j++ {
			if i+j > rlen {
				break
			}
			_, inDict := f.dict[string(runes[i:i+j])]
			if inDict {
				newtoken := analysis.Token{
					Term:     []byte(string(runes[i : i+j])),
					Position: token.Position,
					Start:    token.Start + i,
					End:      token.Start + i + j,
					Type:     token.Type,
					KeyWord:  token.KeyWord,
				}
				if f.onlyLongestMatch {
					if longestMatchToken == nil || utf8.RuneCount(longestMatchToken.Term) < j {
						longestMatchToken = &newtoken
					}
				} else {
					rv = append(rv, &newtoken)
				}
			}
		}
		if f.onlyLongestMatch && longestMatchToken != nil {
			rv = append(rv, longestMatchToken)
		}
	}
	return rv
}
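
// DictionaryCompoundFilterConstructor builds the filter from a config
// map. Numeric sizes are asserted as float64 because JSON numbers decode
// to float64; dict_token_map (required) names a token map already
// registered with the cache.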
func DictionaryCompoundFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	minWordSize := defaultMinWordSize
	minSubWordSize := defaultMinSubWordSize
	maxSubWordSize := defaultMaxSubWordSize
	onlyLongestMatch := defaultOnlyLongestMatch

	minVal, ok := config["min_word_size"].(float64)
	if ok {
		minWordSize = int(minVal)
	}
	minSubVal, ok := config["min_subword_size"].(float64)
	if ok {
		minSubWordSize = int(minSubVal)
	}
	maxSubVal, ok := config["max_subword_size"].(float64)
	if ok {
		maxSubWordSize = int(maxSubVal)
	}
	onlyVal, ok := config["only_longest_match"].(bool)
	if ok {
		onlyLongestMatch = onlyVal
	}

	dictTokenMapName, ok := config["dict_token_map"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify dict_token_map")
	}
	dictTokenMap, err := cache.TokenMapNamed(dictTokenMapName)
	if err != nil {
		return nil, fmt.Errorf("error building dict compound words filter: %v", err)
	}

	return NewDictionaryCompoundFilter(dictTokenMap, minWordSize, minSubWordSize, maxSubWordSize, onlyLongestMatch), nil
}

func init() {
	registry.RegisterTokenFilter(Name, DictionaryCompoundFilterConstructor)
}