diff --git a/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go
new file mode 100644
index 00000000..2667fdfc
--- /dev/null
+++ b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go
@@ -0,0 +1,43 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+package keyword_filter
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+)
+
+// KeyWordMarkerFilter is a token filter that sets the KeyWord flag on every
+// token whose term is present in its word map. Downstream filters can check
+// the flag to leave such tokens alone.
+type KeyWordMarkerFilter struct {
+	keyWords analysis.WordMap
+}
+
+// NewKeyWordMarkerFilter returns a KeyWordMarkerFilter that marks the terms
+// found in keyWords.
+func NewKeyWordMarkerFilter(keyWords analysis.WordMap) *KeyWordMarkerFilter {
+	return &KeyWordMarkerFilter{
+		keyWords: keyWords,
+	}
+}
+
+// Filter sets KeyWord on each token in input whose term is a key of the
+// filter's word map. Tokens are updated in place and input itself is
+// returned.
+func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for _, token := range input {
+		// Convert inside the index expression: the compiler can then do
+		// the map lookup without allocating a string per token.
+		if _, isKeyWord := f.keyWords[string(token.Term)]; isKeyWord {
+			token.KeyWord = true
+		}
+	}
+
+	return input
+}
diff --git a/analysis/token_filters/keyword_marker_filter/keyword_marker_filter_test.go b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter_test.go
new file mode 100644
index 00000000..8ac79d54
--- /dev/null
+++ b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter_test.go
@@ -0,0 +1,77 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+package keyword_filter
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/couchbaselabs/bleve/analysis"
+)
+
+// TestKeyWordMarkerFilter checks that the filter sets KeyWord on exactly the
+// tokens whose terms were added to the word map.
+func TestKeyWordMarkerFilter(t *testing.T) {
+	inputTokenStream := analysis.TokenStream{
+		&analysis.Token{
+			Term: []byte("a"),
+		},
+		&analysis.Token{
+			Term: []byte("walk"),
+		},
+		&analysis.Token{
+			Term: []byte("in"),
+		},
+		&analysis.Token{
+			Term: []byte("the"),
+		},
+		&analysis.Token{
+			Term: []byte("park"),
+		},
+	}
+
+	expectedTokenStream := analysis.TokenStream{
+		&analysis.Token{
+			Term: []byte("a"),
+		},
+		&analysis.Token{
+			Term:    []byte("walk"),
+			KeyWord: true,
+		},
+		&analysis.Token{
+			Term: []byte("in"),
+		},
+		&analysis.Token{
+			Term: []byte("the"),
+		},
+		&analysis.Token{
+			Term:    []byte("park"),
+			KeyWord: true,
+		},
+	}
+
+	keyWordsMap := analysis.NewWordMap()
+	keyWordsMap.AddWord("walk")
+	keyWordsMap.AddWord("park")
+
+	filter := NewKeyWordMarkerFilter(keyWordsMap)
+	outputTokenStream := filter.Filter(inputTokenStream)
+	if len(outputTokenStream) != len(expectedTokenStream) {
+		t.Fatalf("expected %d tokens got %d", len(expectedTokenStream), len(outputTokenStream))
+	}
+	// Report each mismatching token by index so a failure names the
+	// offending term instead of always printing element 0.
+	for i, expected := range expectedTokenStream {
+		actual := outputTokenStream[i]
+		if !reflect.DeepEqual(actual, expected) {
+			t.Errorf("token %d: expected term %q keyword %t, got term %q keyword %t",
+				i, expected.Term, expected.KeyWord, actual.Term, actual.KeyWord)
+		}
+	}
+}
diff --git a/analysis/token_filters/stemmer_filter/stemmer_filter.go b/analysis/token_filters/stemmer_filter/stemmer_filter.go
index fd825c88..c89bc291 100644
--- a/analysis/token_filters/stemmer_filter/stemmer_filter.go
+++ b/analysis/token_filters/stemmer_filter/stemmer_filter.go
@@ -45,8 +45,11 @@ func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream
 	rv := make(analysis.TokenStream, 0)
 
 	for _, token := range input {
-		stemmed := s.stemmer.Stem(string(token.Term))
-		token.Term = []byte(stemmed)
+		// if not protected keyword, stem it
+		if !token.KeyWord {
+			stemmed := s.stemmer.Stem(string(token.Term))
+			token.Term = []byte(stemmed)
+		}
 		rv = append(rv, token)
 	}
 
diff --git a/analysis/token_filters/stemmer_filter/stemmer_filter_test.go b/analysis/token_filters/stemmer_filter/stemmer_filter_test.go
index 645efa31..639e7d16 100644
--- a/analysis/token_filters/stemmer_filter/stemmer_filter_test.go
+++ b/analysis/token_filters/stemmer_filter/stemmer_filter_test.go
@@ -27,6 +27,10 @@ func TestStemmerFilter(t *testing.T) {
 		&analysis.Token{
 			Term: []byte("business"),
 		},
+		&analysis.Token{
+			Term:    []byte("protected"),
+			KeyWord: true,
+		},
 	}
 
 	expectedTokenStream := analysis.TokenStream{
@@ -39,6 +43,10 @@ func TestStemmerFilter(t *testing.T) {
 		&analysis.Token{
 			Term: []byte("busi"),
 		},
+		&analysis.Token{
+			Term:    []byte("protected"),
+			KeyWord: true,
+		},
 	}
 
 	filter, err := NewStemmerFilter("english")
@@ -47,6 +55,16 @@ func TestStemmerFilter(t *testing.T) {
 	}
 	ouputTokenStream := filter.Filter(inputTokenStream)
 	if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
-		t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
+		if len(ouputTokenStream) != len(expectedTokenStream) {
+			t.Fatalf("expected %d tokens got %d", len(expectedTokenStream), len(ouputTokenStream))
+		}
+		// report each mismatching token, not one fixed index
+		for i, expected := range expectedTokenStream {
+			actual := ouputTokenStream[i]
+			if !reflect.DeepEqual(actual, expected) {
+				t.Errorf("token %d: expected term %q keyword %t, got term %q keyword %t",
+					i, expected.Term, expected.KeyWord, actual.Term, actual.KeyWord)
+			}
+		}
 	}
 }
diff --git a/analysis/type.go b/analysis/type.go
index 54a1171d..7dc0a19f 100644
--- a/analysis/type.go
+++ b/analysis/type.go
@@ -31,6 +31,7 @@ type Token struct {
 	Term     []byte
 	Position int
 	Type     TokenType
+	KeyWord  bool // set by the keyword marker filter; the stemmer filter skips tokens with it set
 }
 
 func (t *Token) String() string {