// bleve/analysis/token/compound/dict.go
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package compound

import (
	"bytes"
	"fmt"
	"unicode/utf8"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/registry"
)
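
// Name is the name this token filter is registered under (see init below).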
const Name = "dict_compound"
const defaultMinWordSize = 5
const defaultMinSubWordSize = 2
const defaultMaxSubWordSize = 15
const defaultOnlyLongestMatch = false
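
// DictionaryCompoundFilter decomposes compound tokens by dictionary
// lookup: every sub-word of a token (within the configured size bounds)
// that appears in dict is emitted as an additional token.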
type DictionaryCompoundFilter struct {
	dict             analysis.TokenMap
	minWordSize      int
	minSubWordSize   int
	maxSubWordSize   int
	onlyLongestMatch bool
}
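
// NewDictionaryCompoundFilter returns a filter over the given token map
// with the supplied word and sub-word size bounds, measured in runes.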
func NewDictionaryCompoundFilter(dict analysis.TokenMap, minWordSize, minSubWordSize, maxSubWordSize int, onlyLongestMatch bool) *DictionaryCompoundFilter {
	return &DictionaryCompoundFilter{
		dict:             dict,
		minWordSize:      minWordSize,
		minSubWordSize:   minSubWordSize,
		maxSubWordSize:   maxSubWordSize,
		onlyLongestMatch: onlyLongestMatch,
	}
}
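
// Filter passes every input token through unchanged and, for tokens at
// least minWordSize runes long, appends any dictionary sub-word tokens
// found by decompose immediately after the original. For example, with
// "soft" and "ball" in the dictionary, "softball" yields the tokens
// "softball", "soft", "ball".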
func (f *DictionaryCompoundFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0, len(input))
	for _, token := range input {
		rv = append(rv, token)
		tokenLen := utf8.RuneCount(token.Term)
		if tokenLen >= f.minWordSize {
			newtokens := f.decompose(token)
			rv = append(rv, newtokens...)
		}
	}
	return rv
}
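
// decompose scans token.Term with a sliding window of
// [minSubWordSize, maxSubWordSize] runes, returning a token for each
// window found in the dictionary (or only the longest match per start
// position when onlyLongestMatch is set). Note that Start/End are
// advanced by rune counts, so they are exact byte offsets only for
// terms made of single-byte runes.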
func (f *DictionaryCompoundFilter) decompose(token *analysis.Token) []*analysis.Token {
	runes := bytes.Runes(token.Term)
	rv := make([]*analysis.Token, 0)
	rlen := len(runes)
	for i := 0; i <= (rlen - f.minSubWordSize); i++ {
		var longestMatchToken *analysis.Token
		for j := f.minSubWordSize; j <= f.maxSubWordSize; j++ {
			if i+j > rlen {
				break
			}
			_, inDict := f.dict[string(runes[i:i+j])]
			if inDict {
				newtoken := analysis.Token{
					Term:     []byte(string(runes[i : i+j])),
					Position: token.Position,
					Start:    token.Start + i,
					End:      token.Start + i + j,
					Type:     token.Type,
					KeyWord:  token.KeyWord,
				}
				if f.onlyLongestMatch {
					if longestMatchToken == nil || utf8.RuneCount(longestMatchToken.Term) < j {
						longestMatchToken = &newtoken
					}
				} else {
					rv = append(rv, &newtoken)
				}
			}
		}
		if f.onlyLongestMatch && longestMatchToken != nil {
			rv = append(rv, longestMatchToken)
		}
	}
	return rv
}
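
// DictionaryCompoundFilterConstructor builds the filter from a config
// map. Numeric sizes are asserted as float64 because JSON numbers decode
// to float64; dict_token_map (required) names a token map already
// registered with the cache.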
func DictionaryCompoundFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	minWordSize := defaultMinWordSize
	minSubWordSize := defaultMinSubWordSize
	maxSubWordSize := defaultMaxSubWordSize
	onlyLongestMatch := defaultOnlyLongestMatch

	minVal, ok := config["min_word_size"].(float64)
	if ok {
		minWordSize = int(minVal)
	}
	minSubVal, ok := config["min_subword_size"].(float64)
	if ok {
		minSubWordSize = int(minSubVal)
	}
	maxSubVal, ok := config["max_subword_size"].(float64)
	if ok {
		maxSubWordSize = int(maxSubVal)
	}
	onlyVal, ok := config["only_longest_match"].(bool)
	if ok {
		onlyLongestMatch = onlyVal
	}

	dictTokenMapName, ok := config["dict_token_map"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify dict_token_map")
	}
	dictTokenMap, err := cache.TokenMapNamed(dictTokenMapName)
	if err != nil {
		return nil, fmt.Errorf("error building dict compound words filter: %v", err)
	}

	return NewDictionaryCompoundFilter(dictTokenMap, minWordSize, minSubWordSize, maxSubWordSize, onlyLongestMatch), nil
}

func init() {
	registry.RegisterTokenFilter(Name, DictionaryCompoundFilterConstructor)
}