142 lines
3.9 KiB
Go
142 lines
3.9 KiB
Go
// Copyright (c) 2014 Couchbase, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package compound
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"unicode/utf8"
|
|
|
|
"github.com/blevesearch/bleve/analysis"
|
|
"github.com/blevesearch/bleve/registry"
|
|
)
|
|
|
|
// Name is the identifier under which this token filter is registered with the
// bleve registry (see init).
const Name = "dict_compound"

// Defaults used by DictionaryCompoundFilterConstructor for any option that is
// absent from the supplied config. All sizes are measured in runes.
const (
	// defaultMinWordSize is the minimum term length for decomposition to be attempted.
	defaultMinWordSize = 5
	// defaultMinSubWordSize is the shortest sub-word length that is looked up.
	defaultMinSubWordSize = 2
	// defaultMaxSubWordSize is the longest sub-word length that is looked up.
	defaultMaxSubWordSize = 15
	// defaultOnlyLongestMatch controls whether only the longest dictionary
	// match per start position is emitted.
	defaultOnlyLongestMatch = false
)
|
|
|
|
type DictionaryCompoundFilter struct {
|
|
dict analysis.TokenMap
|
|
minWordSize int
|
|
minSubWordSize int
|
|
maxSubWordSize int
|
|
onlyLongestMatch bool
|
|
}
|
|
|
|
func NewDictionaryCompoundFilter(dict analysis.TokenMap, minWordSize, minSubWordSize, maxSubWordSize int, onlyLongestMatch bool) *DictionaryCompoundFilter {
|
|
return &DictionaryCompoundFilter{
|
|
dict: dict,
|
|
minWordSize: minWordSize,
|
|
minSubWordSize: minSubWordSize,
|
|
maxSubWordSize: maxSubWordSize,
|
|
onlyLongestMatch: onlyLongestMatch,
|
|
}
|
|
}
|
|
|
|
func (f *DictionaryCompoundFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
|
rv := make(analysis.TokenStream, 0, len(input))
|
|
|
|
for _, token := range input {
|
|
rv = append(rv, token)
|
|
tokenLen := utf8.RuneCount(token.Term)
|
|
if tokenLen >= f.minWordSize {
|
|
newtokens := f.decompose(token)
|
|
for _, newtoken := range newtokens {
|
|
rv = append(rv, newtoken)
|
|
}
|
|
}
|
|
}
|
|
|
|
return rv
|
|
}
|
|
|
|
func (f *DictionaryCompoundFilter) decompose(token *analysis.Token) []*analysis.Token {
|
|
runes := bytes.Runes(token.Term)
|
|
rv := make([]*analysis.Token, 0)
|
|
rlen := len(runes)
|
|
for i := 0; i <= (rlen - f.minSubWordSize); i++ {
|
|
var longestMatchToken *analysis.Token
|
|
for j := f.minSubWordSize; j <= f.maxSubWordSize; j++ {
|
|
if i+j > rlen {
|
|
break
|
|
}
|
|
_, inDict := f.dict[string(runes[i:i+j])]
|
|
if inDict {
|
|
newtoken := analysis.Token{
|
|
Term: []byte(string(runes[i : i+j])),
|
|
Position: token.Position,
|
|
Start: token.Start + i,
|
|
End: token.Start + i + j,
|
|
Type: token.Type,
|
|
KeyWord: token.KeyWord,
|
|
}
|
|
if f.onlyLongestMatch {
|
|
if longestMatchToken == nil || utf8.RuneCount(longestMatchToken.Term) < j {
|
|
longestMatchToken = &newtoken
|
|
}
|
|
} else {
|
|
rv = append(rv, &newtoken)
|
|
}
|
|
}
|
|
}
|
|
if f.onlyLongestMatch && longestMatchToken != nil {
|
|
rv = append(rv, longestMatchToken)
|
|
}
|
|
}
|
|
return rv
|
|
}
|
|
|
|
func DictionaryCompoundFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
|
|
|
minWordSize := defaultMinWordSize
|
|
minSubWordSize := defaultMinSubWordSize
|
|
maxSubWordSize := defaultMaxSubWordSize
|
|
onlyLongestMatch := defaultOnlyLongestMatch
|
|
|
|
minVal, ok := config["min_word_size"].(float64)
|
|
if ok {
|
|
minWordSize = int(minVal)
|
|
}
|
|
minSubVal, ok := config["min_subword_size"].(float64)
|
|
if ok {
|
|
minSubWordSize = int(minSubVal)
|
|
}
|
|
maxSubVal, ok := config["max_subword_size"].(float64)
|
|
if ok {
|
|
maxSubWordSize = int(maxSubVal)
|
|
}
|
|
onlyVal, ok := config["only_longest_match"].(bool)
|
|
if ok {
|
|
onlyLongestMatch = onlyVal
|
|
}
|
|
|
|
dictTokenMapName, ok := config["dict_token_map"].(string)
|
|
if !ok {
|
|
return nil, fmt.Errorf("must specify dict_token_map")
|
|
}
|
|
dictTokenMap, err := cache.TokenMapNamed(dictTokenMapName)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error building dict compound words filter: %v", err)
|
|
}
|
|
return NewDictionaryCompoundFilter(dictTokenMap, minWordSize, minSubWordSize, maxSubWordSize, onlyLongestMatch), nil
|
|
}
|
|
|
|
func init() {
|
|
registry.RegisterTokenFilter(Name, DictionaryCompoundFilterConstructor)
|
|
}
|