
add support for dictionary-based compound word filter

partially addresses #115
Marty Schoch 2014-11-18 15:18:42 -05:00
parent 47bc7caec3
commit d452b2a10e
3 changed files with 319 additions and 0 deletions
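
The filter walks each incoming token and, when the token is at least min_word_size runes long, looks for dictionary words of min_subword_size to max_subword_size runes inside it; any hits are emitted as extra tokens alongside the original. Below is a minimal sketch of how the filter might be wired up through the registry once this commit lands — the names "mydict" and "mycompound" are illustrative, and only calls that appear in this commit's code and tests are used:

package main

import (
    "fmt"

    "github.com/blevesearch/bleve/analysis"
    // blank import runs the package's init(), registering "dict_compound"
    _ "github.com/blevesearch/bleve/analysis/token_filters/compound"
    "github.com/blevesearch/bleve/analysis/token_map"
    "github.com/blevesearch/bleve/registry"
)

func main() {
    cache := registry.NewCache()

    // the dictionary of known sub-words, stored as a token map
    _, err := cache.DefineTokenMap("mydict", map[string]interface{}{
        "type":   token_map.Name,
        "tokens": []interface{}{"soft", "ball"},
    })
    if err != nil {
        panic(err)
    }

    // numeric options are asserted as float64 by the constructor,
    // matching values decoded from JSON
    filter, err := cache.DefineTokenFilter("mycompound", map[string]interface{}{
        "type":               "dict_compound",
        "dict_token_map":     "mydict",
        "min_word_size":      float64(5),
        "min_subword_size":   float64(2),
        "max_subword_size":   float64(15),
        "only_longest_match": false,
    })
    if err != nil {
        panic(err)
    }

    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("softball"), Start: 0, End: 8, Position: 1},
    }
    for _, tok := range filter.Filter(stream) {
        fmt.Printf("%s [%d:%d]\n", tok.Term, tok.Start, tok.End)
    }
}

Run against "softball", this would print the original token followed by soft [0:4] and ball [4:8].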

View File

@@ -0,0 +1,136 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package compound

import (
    "bytes"
    "fmt"
    "unicode/utf8"

    "github.com/blevesearch/bleve/analysis"
    "github.com/blevesearch/bleve/registry"
)

const Name = "dict_compound"

const defaultMinWordSize = 5
const defaultMinSubWordSize = 2
const defaultMaxSubWordSize = 15
const defaultOnlyLongestMatch = false

type DictionaryCompoundFilter struct {
    dict             analysis.TokenMap
    minWordSize      int
    minSubWordSize   int
    maxSubWordSize   int
    onlyLongestMatch bool
}

func NewDictionaryCompoundFilter(dict analysis.TokenMap, minWordSize, minSubWordSize, maxSubWordSize int, onlyLongestMatch bool) *DictionaryCompoundFilter {
    return &DictionaryCompoundFilter{
        dict:             dict,
        minWordSize:      minWordSize,
        minSubWordSize:   minSubWordSize,
        maxSubWordSize:   maxSubWordSize,
        onlyLongestMatch: onlyLongestMatch,
    }
}
// Filter passes every input token through unchanged and, for tokens at
// least minWordSize runes long, appends any dictionary sub-words found
// inside them.
func (f *DictionaryCompoundFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
    rv := make(analysis.TokenStream, 0, len(input))

    for _, token := range input {
        rv = append(rv, token)
        tokenLen := utf8.RuneCount(token.Term)
        if tokenLen >= f.minWordSize {
            newtokens := f.decompose(token)
            rv = append(rv, newtokens...)
        }
    }

    return rv
}
// decompose slides a window of minSubWordSize to maxSubWordSize runes
// across the token and returns a new token for every window found in the
// dictionary. Sub-token offsets are derived from rune indexes, so they
// only line up with the original byte offsets for single-byte runes.
func (f *DictionaryCompoundFilter) decompose(token *analysis.Token) []*analysis.Token {
    runes := bytes.Runes(token.Term)
    rv := make([]*analysis.Token, 0)
    rlen := len(runes)

    for i := 0; i <= (rlen - f.minSubWordSize); i++ {
        var longestMatchToken *analysis.Token
        for j := f.minSubWordSize; j <= f.maxSubWordSize; j++ {
            if i+j > rlen {
                break
            }
            _, inDict := f.dict[string(runes[i:i+j])]
            if inDict {
                newtoken := analysis.Token{
                    Term:     []byte(string(runes[i : i+j])),
                    Position: token.Position,
                    Start:    token.Start + i,
                    End:      token.Start + i + j,
                    Type:     token.Type,
                    KeyWord:  token.KeyWord,
                }
                if f.onlyLongestMatch {
                    if longestMatchToken == nil || utf8.RuneCount(longestMatchToken.Term) < j {
                        longestMatchToken = &newtoken
                    }
                } else {
                    rv = append(rv, &newtoken)
                }
            }
        }
        // at most one match (the longest) is kept per start offset
        if f.onlyLongestMatch && longestMatchToken != nil {
            rv = append(rv, longestMatchToken)
        }
    }

    return rv
}
func DictionaryCompoundFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
    minWordSize := defaultMinWordSize
    minSubWordSize := defaultMinSubWordSize
    maxSubWordSize := defaultMaxSubWordSize
    onlyLongestMatch := defaultOnlyLongestMatch

    // numeric values decoded from JSON arrive as float64
    minVal, ok := config["min_word_size"].(float64)
    if ok {
        minWordSize = int(minVal)
    }
    minSubVal, ok := config["min_subword_size"].(float64)
    if ok {
        minSubWordSize = int(minSubVal)
    }
    maxSubVal, ok := config["max_subword_size"].(float64)
    if ok {
        maxSubWordSize = int(maxSubVal)
    }
    onlyVal, ok := config["only_longest_match"].(bool)
    if ok {
        onlyLongestMatch = onlyVal
    }

    dictTokenMapName, ok := config["dict_token_map"].(string)
    if !ok {
        return nil, fmt.Errorf("must specify dict_token_map")
    }
    dictTokenMap, err := cache.TokenMapNamed(dictTokenMapName)
    if err != nil {
        return nil, fmt.Errorf("error building dict compound words filter: %v", err)
    }

    return NewDictionaryCompoundFilter(dictTokenMap, minWordSize, minSubWordSize, maxSubWordSize, onlyLongestMatch), nil
}

func init() {
    registry.RegisterTokenFilter(Name, DictionaryCompoundFilterConstructor)
}
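To trace decompose on a concrete input: with the default sizes and a dictionary containing "soft" and "ball", the token "softball" (8 runes) passes the min-word check, the outer loop tries start offsets 0 through 6, and windows of length 4 match at offsets 0 ("soft") and 4 ("ball"). Each match becomes a token that keeps the compound's Position, so the sub-words stack on top of the original in the output stream.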

View File

@@ -0,0 +1,182 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package compound

import (
    "reflect"
    "testing"

    "github.com/blevesearch/bleve/analysis"
    "github.com/blevesearch/bleve/analysis/token_map"
    "github.com/blevesearch/bleve/registry"
)
func TestDictionaryCompoundFilter(t *testing.T) {
    inputTokenStream := analysis.TokenStream{
        &analysis.Token{
            Term:     []byte("i"),
            Start:    0,
            End:      1,
            Position: 1,
        },
        &analysis.Token{
            Term:     []byte("like"),
            Start:    2,
            End:      6,
            Position: 2,
        },
        &analysis.Token{
            Term:     []byte("to"),
            Start:    7,
            End:      9,
            Position: 3,
        },
        &analysis.Token{
            Term:     []byte("play"),
            Start:    10,
            End:      14,
            Position: 4,
        },
        &analysis.Token{
            Term:     []byte("softball"),
            Start:    15,
            End:      23,
            Position: 5,
        },
    }

    // all original tokens survive; "softball" additionally yields the
    // dictionary sub-words "soft" and "ball" at the same position
    expectedTokenStream := analysis.TokenStream{
        &analysis.Token{
            Term:     []byte("i"),
            Start:    0,
            End:      1,
            Position: 1,
        },
        &analysis.Token{
            Term:     []byte("like"),
            Start:    2,
            End:      6,
            Position: 2,
        },
        &analysis.Token{
            Term:     []byte("to"),
            Start:    7,
            End:      9,
            Position: 3,
        },
        &analysis.Token{
            Term:     []byte("play"),
            Start:    10,
            End:      14,
            Position: 4,
        },
        &analysis.Token{
            Term:     []byte("softball"),
            Start:    15,
            End:      23,
            Position: 5,
        },
        &analysis.Token{
            Term:     []byte("soft"),
            Start:    15,
            End:      19,
            Position: 5,
        },
        &analysis.Token{
            Term:     []byte("ball"),
            Start:    19,
            End:      23,
            Position: 5,
        },
    }

    cache := registry.NewCache()

    dictListConfig := map[string]interface{}{
        "type":   token_map.Name,
        "tokens": []interface{}{"factor", "soft", "ball", "team"},
    }
    _, err := cache.DefineTokenMap("dict_test", dictListConfig)
    if err != nil {
        t.Fatal(err)
    }

    dictConfig := map[string]interface{}{
        "type":           "dict_compound",
        "dict_token_map": "dict_test",
    }
    dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
    if err != nil {
        t.Fatal(err)
    }

    outputTokenStream := dictFilter.Filter(inputTokenStream)
    if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
        t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
    }
}
func TestDictionaryCompoundFilterOnlyLongestMatch(t *testing.T) {
    inputTokenStream := analysis.TokenStream{
        &analysis.Token{
            Term:     []byte("softestball"),
            Start:    0,
            End:      11,
            Position: 1,
        },
    }

    expectedTokenStream := analysis.TokenStream{
        &analysis.Token{
            Term:     []byte("softestball"),
            Start:    0,
            End:      11,
            Position: 1,
        },
        &analysis.Token{
            Term:     []byte("softest"),
            Start:    0,
            End:      7,
            Position: 1,
        },
        &analysis.Token{
            Term:     []byte("ball"),
            Start:    7,
            End:      11,
            Position: 1,
        },
    }

    cache := registry.NewCache()

    dictListConfig := map[string]interface{}{
        "type":   token_map.Name,
        "tokens": []interface{}{"soft", "softest", "ball"},
    }
    _, err := cache.DefineTokenMap("dict_test", dictListConfig)
    if err != nil {
        t.Fatal(err)
    }

    dictConfig := map[string]interface{}{
        "type":               "dict_compound",
        "dict_token_map":     "dict_test",
        "only_longest_match": true,
    }
    dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
    if err != nil {
        t.Fatal(err)
    }

    outputTokenStream := dictFilter.Filter(inputTokenStream)
    if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
        t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
    }
}
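Worth noting: the longest-match pruning happens per start offset, not per token. "softest" suppresses the shorter "soft" that also begins at offset 0, but "ball" begins at offset 7 and is still emitted, which is exactly what the expected stream above encodes. The tests also reuse the name "dict_test" for both the token map and the filter, which works because the registry cache tracks each resource type in its own namespace.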

View File

@@ -42,6 +42,7 @@ import (
    // token filters
    _ "github.com/blevesearch/bleve/analysis/token_filters/apostrophe_filter"
    _ "github.com/blevesearch/bleve/analysis/token_filters/compound"
    _ "github.com/blevesearch/bleve/analysis/token_filters/edge_ngram_filter"
    _ "github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
    _ "github.com/blevesearch/bleve/analysis/token_filters/keyword_marker_filter"