
add support for dictionary-based compound word filter

partially addresses #115
Marty Schoch 2014-11-18 15:18:42 -05:00
parent 47bc7caec3
commit d452b2a10e
3 changed files with 319 additions and 0 deletions
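
The filter walks each incoming token and, when the token is at least min_word_size runes long, looks for dictionary words of min_subword_size to max_subword_size runes inside it; any hits are emitted as extra tokens alongside the original. Below is a minimal sketch of how the filter might be wired up through the registry once this commit lands — the names "mydict" and "mycompound" are illustrative, and only calls that appear in this commit's code and tests are used:

package main

import (
    "fmt"

    "github.com/blevesearch/bleve/analysis"
    // blank import runs the package's init(), registering "dict_compound"
    _ "github.com/blevesearch/bleve/analysis/token_filters/compound"
    "github.com/blevesearch/bleve/analysis/token_map"
    "github.com/blevesearch/bleve/registry"
)

func main() {
    cache := registry.NewCache()

    // the dictionary of known sub-words, stored as a token map
    _, err := cache.DefineTokenMap("mydict", map[string]interface{}{
        "type":   token_map.Name,
        "tokens": []interface{}{"soft", "ball"},
    })
    if err != nil {
        panic(err)
    }

    // numeric options are asserted as float64 by the constructor,
    // matching values decoded from JSON
    filter, err := cache.DefineTokenFilter("mycompound", map[string]interface{}{
        "type":               "dict_compound",
        "dict_token_map":     "mydict",
        "min_word_size":      float64(5),
        "min_subword_size":   float64(2),
        "max_subword_size":   float64(15),
        "only_longest_match": false,
    })
    if err != nil {
        panic(err)
    }

    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("softball"), Start: 0, End: 8, Position: 1},
    }
    for _, tok := range filter.Filter(stream) {
        fmt.Printf("%s [%d:%d]\n", tok.Term, tok.Start, tok.End)
    }
}

Run against "softball", this would print the original token followed by soft [0:4] and ball [4:8].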

View File

@@ -0,0 +1,136 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package compound

import (
    "bytes"
    "fmt"
    "unicode/utf8"

    "github.com/blevesearch/bleve/analysis"
    "github.com/blevesearch/bleve/registry"
)

const Name = "dict_compound"

const defaultMinWordSize = 5
const defaultMinSubWordSize = 2
const defaultMaxSubWordSize = 15
const defaultOnlyLongestMatch = false

type DictionaryCompoundFilter struct {
    dict             analysis.TokenMap
    minWordSize      int
    minSubWordSize   int
    maxSubWordSize   int
    onlyLongestMatch bool
}

func NewDictionaryCompoundFilter(dict analysis.TokenMap, minWordSize, minSubWordSize, maxSubWordSize int, onlyLongestMatch bool) *DictionaryCompoundFilter {
    return &DictionaryCompoundFilter{
        dict:             dict,
        minWordSize:      minWordSize,
        minSubWordSize:   minSubWordSize,
        maxSubWordSize:   maxSubWordSize,
        onlyLongestMatch: onlyLongestMatch,
    }
}
// Filter passes every input token through unchanged and, for tokens at
// least minWordSize runes long, appends any dictionary sub-words found
// inside them.
func (f *DictionaryCompoundFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
    rv := make(analysis.TokenStream, 0, len(input))

    for _, token := range input {
        rv = append(rv, token)
        tokenLen := utf8.RuneCount(token.Term)
        if tokenLen >= f.minWordSize {
            newtokens := f.decompose(token)
            rv = append(rv, newtokens...)
        }
    }

    return rv
}
// decompose slides a window of minSubWordSize to maxSubWordSize runes
// across the token and returns a new token for every window found in the
// dictionary. Sub-token offsets are derived from rune indexes, so they
// only line up with the original byte offsets for single-byte runes.
func (f *DictionaryCompoundFilter) decompose(token *analysis.Token) []*analysis.Token {
    runes := bytes.Runes(token.Term)
    rv := make([]*analysis.Token, 0)
    rlen := len(runes)

    for i := 0; i <= (rlen - f.minSubWordSize); i++ {
        var longestMatchToken *analysis.Token
        for j := f.minSubWordSize; j <= f.maxSubWordSize; j++ {
            if i+j > rlen {
                break
            }
            _, inDict := f.dict[string(runes[i:i+j])]
            if inDict {
                newtoken := analysis.Token{
                    Term:     []byte(string(runes[i : i+j])),
                    Position: token.Position,
                    Start:    token.Start + i,
                    End:      token.Start + i + j,
                    Type:     token.Type,
                    KeyWord:  token.KeyWord,
                }
                if f.onlyLongestMatch {
                    if longestMatchToken == nil || utf8.RuneCount(longestMatchToken.Term) < j {
                        longestMatchToken = &newtoken
                    }
                } else {
                    rv = append(rv, &newtoken)
                }
            }
        }
        // at most one match (the longest) is kept per start offset
        if f.onlyLongestMatch && longestMatchToken != nil {
            rv = append(rv, longestMatchToken)
        }
    }

    return rv
}
func DictionaryCompoundFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
    minWordSize := defaultMinWordSize
    minSubWordSize := defaultMinSubWordSize
    maxSubWordSize := defaultMaxSubWordSize
    onlyLongestMatch := defaultOnlyLongestMatch

    // numeric values decoded from JSON arrive as float64
    minVal, ok := config["min_word_size"].(float64)
    if ok {
        minWordSize = int(minVal)
    }
    minSubVal, ok := config["min_subword_size"].(float64)
    if ok {
        minSubWordSize = int(minSubVal)
    }
    maxSubVal, ok := config["max_subword_size"].(float64)
    if ok {
        maxSubWordSize = int(maxSubVal)
    }
    onlyVal, ok := config["only_longest_match"].(bool)
    if ok {
        onlyLongestMatch = onlyVal
    }

    dictTokenMapName, ok := config["dict_token_map"].(string)
    if !ok {
        return nil, fmt.Errorf("must specify dict_token_map")
    }
    dictTokenMap, err := cache.TokenMapNamed(dictTokenMapName)
    if err != nil {
        return nil, fmt.Errorf("error building dict compound words filter: %v", err)
    }

    return NewDictionaryCompoundFilter(dictTokenMap, minWordSize, minSubWordSize, maxSubWordSize, onlyLongestMatch), nil
}

func init() {
    registry.RegisterTokenFilter(Name, DictionaryCompoundFilterConstructor)
}
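To trace decompose on a concrete input: with the default sizes and a dictionary containing "soft" and "ball", the token "softball" (8 runes) passes the min-word check, the outer loop tries start offsets 0 through 6, and windows of length 4 match at offsets 0 ("soft") and 4 ("ball"). Each match becomes a token that keeps the compound's Position, so the sub-words stack on top of the original in the output stream.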

View File

@@ -0,0 +1,182 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package compound

import (
    "reflect"
    "testing"

    "github.com/blevesearch/bleve/analysis"
    "github.com/blevesearch/bleve/analysis/token_map"
    "github.com/blevesearch/bleve/registry"
)
func TestDictionaryCompoundFilter(t *testing.T) {
    inputTokenStream := analysis.TokenStream{
        &analysis.Token{
            Term:     []byte("i"),
            Start:    0,
            End:      1,
            Position: 1,
        },
        &analysis.Token{
            Term:     []byte("like"),
            Start:    2,
            End:      6,
            Position: 2,
        },
        &analysis.Token{
            Term:     []byte("to"),
            Start:    7,
            End:      9,
            Position: 3,
        },
        &analysis.Token{
            Term:     []byte("play"),
            Start:    10,
            End:      14,
            Position: 4,
        },
        &analysis.Token{
            Term:     []byte("softball"),
            Start:    15,
            End:      23,
            Position: 5,
        },
    }

    // all original tokens survive; "softball" additionally yields the
    // dictionary sub-words "soft" and "ball" at the same position
    expectedTokenStream := analysis.TokenStream{
        &analysis.Token{
            Term:     []byte("i"),
            Start:    0,
            End:      1,
            Position: 1,
        },
        &analysis.Token{
            Term:     []byte("like"),
            Start:    2,
            End:      6,
            Position: 2,
        },
        &analysis.Token{
            Term:     []byte("to"),
            Start:    7,
            End:      9,
            Position: 3,
        },
        &analysis.Token{
            Term:     []byte("play"),
            Start:    10,
            End:      14,
            Position: 4,
        },
        &analysis.Token{
            Term:     []byte("softball"),
            Start:    15,
            End:      23,
            Position: 5,
        },
        &analysis.Token{
            Term:     []byte("soft"),
            Start:    15,
            End:      19,
            Position: 5,
        },
        &analysis.Token{
            Term:     []byte("ball"),
            Start:    19,
            End:      23,
            Position: 5,
        },
    }

    cache := registry.NewCache()

    dictListConfig := map[string]interface{}{
        "type":   token_map.Name,
        "tokens": []interface{}{"factor", "soft", "ball", "team"},
    }
    _, err := cache.DefineTokenMap("dict_test", dictListConfig)
    if err != nil {
        t.Fatal(err)
    }

    dictConfig := map[string]interface{}{
        "type":           "dict_compound",
        "dict_token_map": "dict_test",
    }
    dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
    if err != nil {
        t.Fatal(err)
    }

    outputTokenStream := dictFilter.Filter(inputTokenStream)
    if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
        t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
    }
}
func TestDictionaryCompoundFilterOnlyLongestMatch(t *testing.T) {
    inputTokenStream := analysis.TokenStream{
        &analysis.Token{
            Term:     []byte("softestball"),
            Start:    0,
            End:      11,
            Position: 1,
        },
    }

    expectedTokenStream := analysis.TokenStream{
        &analysis.Token{
            Term:     []byte("softestball"),
            Start:    0,
            End:      11,
            Position: 1,
        },
        &analysis.Token{
            Term:     []byte("softest"),
            Start:    0,
            End:      7,
            Position: 1,
        },
        &analysis.Token{
            Term:     []byte("ball"),
            Start:    7,
            End:      11,
            Position: 1,
        },
    }

    cache := registry.NewCache()

    dictListConfig := map[string]interface{}{
        "type":   token_map.Name,
        "tokens": []interface{}{"soft", "softest", "ball"},
    }
    _, err := cache.DefineTokenMap("dict_test", dictListConfig)
    if err != nil {
        t.Fatal(err)
    }

    dictConfig := map[string]interface{}{
        "type":               "dict_compound",
        "dict_token_map":     "dict_test",
        "only_longest_match": true,
    }
    dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
    if err != nil {
        t.Fatal(err)
    }

    outputTokenStream := dictFilter.Filter(inputTokenStream)
    if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
        t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
    }
}
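Worth noting: the longest-match pruning happens per start offset, not per token. "softest" suppresses the shorter "soft" that also begins at offset 0, but "ball" begins at offset 7 and is still emitted, which is exactly what the expected stream above encodes. The tests also reuse the name "dict_test" for both the token map and the filter, which works because the registry cache tracks each resource type in its own namespace.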

View File

@@ -42,6 +42,7 @@ import (
    // token filters
    _ "github.com/blevesearch/bleve/analysis/token_filters/apostrophe_filter"
    _ "github.com/blevesearch/bleve/analysis/token_filters/compound"
    _ "github.com/blevesearch/bleve/analysis/token_filters/edge_ngram_filter"
    _ "github.com/blevesearch/bleve/analysis/token_filters/elision_filter"
    _ "github.com/blevesearch/bleve/analysis/token_filters/keyword_marker_filter"