From 1dc466a8002f917fe345650b36acbb4dac16cddb Mon Sep 17 00:00:00 2001
From: Marty Schoch
Date: Tue, 23 Sep 2014 18:41:32 -0400
Subject: [PATCH] modified token filters to avoid creating new token stream

often the result stream was the same length, so we can reuse the
existing token stream

also, in cases where a new stream was required, set capacity to the
length of the input stream. most output streams are at least as long
as the input, so this may avoid some subsequent resizing
---
 analysis/language/ar/arabic_normalize.go     |  6 +---
 analysis/language/cjk/cjk_bigram.go          |  2 +-
 analysis/language/ckb/sorani_normalize.go    |  6 +---
 .../language/ckb/sorani_stemmer_filter.go    |  6 +---
 analysis/language/de/german_normalize.go     |  6 +---
 analysis/language/en/possessive_filter_en.go |  2 --
 analysis/language/fa/persian_normalize.go    |  6 +---
 analysis/language/hi/hindi_normalize.go      |  6 +---
 analysis/language/hi/hindi_stemmer_filter.go |  6 +---
 analysis/language/ja/ja_morph_kagome.go      |  2 +-
 .../apostrophe_filter/apostrophe_filter.go   |  5 +---
 analysis/token_filters/cld2/cld2_filter.go   |  2 +-
 .../edge_ngram_filter/edge_ngram_filter.go   |  2 +-
 .../elision_filter/elision_filter.go         |  6 +---
 .../keyword_marker_filter.go                 |  2 --
 .../length_filter/length_filter.go           |  2 +-
 .../lower_case_filter/lower_case_filter.go   |  6 +---
 .../ngram_filter/ngram_filter.go             |  2 +-
 analysis/token_filters/shingle/shingle.go    |  2 +-
 .../token_filters/shingle/shingle_test.go    |  2 +-
 .../stemmer_filter/stemmer_filter.go         | 28 ++++++++++---------
 .../stop_tokens_filter/stop_tokens_filter.go |  2 +-
 .../truncate_token_filter.go                 |  6 +---
 .../unicode_normalize/unicode_normalize.go   |  6 +---
 24 files changed, 36 insertions(+), 85 deletions(-)

diff --git a/analysis/language/ar/arabic_normalize.go b/analysis/language/ar/arabic_normalize.go
index 70705258..abf86018 100644
--- a/analysis/language/ar/arabic_normalize.go
+++ b/analysis/language/ar/arabic_normalize.go
@@ -46,15 +46,11 @@ func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
 }
 
 func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
diff --git a/analysis/language/cjk/cjk_bigram.go b/analysis/language/cjk/cjk_bigram.go
index 36cfc88d..0596447e 100644
--- a/analysis/language/cjk/cjk_bigram.go
+++ b/analysis/language/cjk/cjk_bigram.go
@@ -32,7 +32,7 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStrea
 	r := ring.New(2)
 	itemsInRing := 0
 
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		if token.Type == analysis.Ideographic {
diff --git a/analysis/language/ckb/sorani_normalize.go b/analysis/language/ckb/sorani_normalize.go
index 9b414192..b264bb40 100644
--- a/analysis/language/ckb/sorani_normalize.go
+++ b/analysis/language/ckb/sorani_normalize.go
@@ -56,15 +56,11 @@ func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
 }
 
 func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
diff --git a/analysis/language/ckb/sorani_stemmer_filter.go b/analysis/language/ckb/sorani_stemmer_filter.go
index aabce1c6..68367a55 100644
--- a/analysis/language/ckb/sorani_stemmer_filter.go
+++ b/analysis/language/ckb/sorani_stemmer_filter.go
@@ -27,18 +27,14 @@ func NewSoraniStemmerFilter() *SoraniStemmerFilter {
 }
 
 func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
 			stemmed := stem(token.Term)
 			token.Term = stemmed
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func stem(input []byte) []byte {
diff --git a/analysis/language/de/german_normalize.go b/analysis/language/de/german_normalize.go
index cc73ea65..47cc97b4 100644
--- a/analysis/language/de/german_normalize.go
+++ b/analysis/language/de/german_normalize.go
@@ -32,15 +32,11 @@ func NewGermanNormalizeFilter() *GermanNormalizeFilter {
 }
 
 func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
diff --git a/analysis/language/en/possessive_filter_en.go b/analysis/language/en/possessive_filter_en.go
index 128fbac9..84a62c2f 100644
--- a/analysis/language/en/possessive_filter_en.go
+++ b/analysis/language/en/possessive_filter_en.go
@@ -32,7 +32,6 @@ func NewPossessiveFilter() *PossessiveFilter {
 }
 
 func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-
 	for _, token := range input {
 		runes := bytes.Runes(token.Term)
 		if len(runes) >= 2 {
@@ -46,7 +45,6 @@ func (s *PossessiveFilter) Filter(input analysis.TokenStre
 			}
 		}
 	}
-
 	return input
 }
 
diff --git a/analysis/language/fa/persian_normalize.go b/analysis/language/fa/persian_normalize.go
index 494b5597..ab67af3a 100644
--- a/analysis/language/fa/persian_normalize.go
+++ b/analysis/language/fa/persian_normalize.go
@@ -38,15 +38,11 @@ func NewPersianNormalizeFilter() *PersianNormalizeFilter {
 }
 
 func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
diff --git a/analysis/language/hi/hindi_normalize.go b/analysis/language/hi/hindi_normalize.go
index 1117c131..f9b607c6 100644
--- a/analysis/language/hi/hindi_normalize.go
+++ b/analysis/language/hi/hindi_normalize.go
@@ -26,15 +26,11 @@ func NewHindiNormalizeFilter() *HindiNormalizeFilter {
 }
 
 func (s *HindiNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
diff --git a/analysis/language/hi/hindi_stemmer_filter.go b/analysis/language/hi/hindi_stemmer_filter.go
index 0ebd3bde..b4e6ddd6 100644
--- a/analysis/language/hi/hindi_stemmer_filter.go
+++ b/analysis/language/hi/hindi_stemmer_filter.go
@@ -27,18 +27,14 @@ func NewHindiStemmerFilter() *HindiStemmerFilter {
 }
 
 func (s *HindiStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
 			stemmed := stem(token.Term)
 			token.Term = stemmed
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func stem(input []byte) []byte {
diff --git a/analysis/language/ja/ja_morph_kagome.go b/analysis/language/ja/ja_morph_kagome.go
index 9989526e..415183d2 100644
--- a/analysis/language/ja/ja_morph_kagome.go
+++ b/analysis/language/ja/ja_morph_kagome.go
@@ -46,7 +46,7 @@ func (t *KagomeMorphTokenizer) Tokenize(input []byte) analysis.TokenStream {
 		prevstart int
 	)
 
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	if len(input) < 1 {
 		return rv
 	}
diff --git a/analysis/token_filters/apostrophe_filter/apostrophe_filter.go b/analysis/token_filters/apostrophe_filter/apostrophe_filter.go
index 7f889ec5..853db823 100644
--- a/analysis/token_filters/apostrophe_filter/apostrophe_filter.go
+++ b/analysis/token_filters/apostrophe_filter/apostrophe_filter.go
@@ -29,18 +29,15 @@ func NewApostropheFilter() *ApostropheFilter {
 }
 
 func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
 		if firstApostrophe >= 0 {
 			// found an apostrophe
 			token.Term = token.Term[0:firstApostrophe]
 		}
-		rv = append(rv, token)
 	}
 
-	return rv
+	return input
 }
 
 func ApostropheFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
diff --git a/analysis/token_filters/cld2/cld2_filter.go b/analysis/token_filters/cld2/cld2_filter.go
index 2458564d..4349ef8a 100644
--- a/analysis/token_filters/cld2/cld2_filter.go
+++ b/analysis/token_filters/cld2/cld2_filter.go
@@ -33,7 +33,7 @@ func NewCld2Filter() *Cld2Filter {
 }
 
 func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	offset := 0
 	for _, token := range input {
diff --git a/analysis/token_filters/edge_ngram_filter/edge_ngram_filter.go b/analysis/token_filters/edge_ngram_filter/edge_ngram_filter.go
index 7372e4e8..f370aa9d 100644
--- a/analysis/token_filters/edge_ngram_filter/edge_ngram_filter.go
+++ b/analysis/token_filters/edge_ngram_filter/edge_ngram_filter.go
@@ -40,7 +40,7 @@ func NewEdgeNgramFilter(side Side, minLength, maxLength int) *EdgeNgramFilter {
 }
 
 func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		runeCount := utf8.RuneCount(token.Term)
diff --git a/analysis/token_filters/elision_filter/elision_filter.go b/analysis/token_filters/elision_filter/elision_filter.go
index e1baced8..61aa6137 100644
--- a/analysis/token_filters/elision_filter/elision_filter.go
+++ b/analysis/token_filters/elision_filter/elision_filter.go
@@ -35,8 +35,6 @@ func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
 }
 
 func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
 		if firstApostrophe >= 0 {
@@ -48,10 +46,8 @@ func (s *ElisionFilter) Filter(input analysis.TokenStream
 				token.Term = token.Term[firstApostrophe+1:]
 			}
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
diff --git a/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go
index e950331f..062d2172 100644
--- a/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go
+++ b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go
@@ -29,7 +29,6 @@ func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter {
 }
 
 func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-
 	for _, token := range input {
 		word := string(token.Term)
 		_, isKeyWord := f.keyWords[word]
@@ -37,7 +36,6 @@ func (f *KeyWordMarkerFilter) Filter(input analysis.TokenS
 			token.KeyWord = true
 		}
 	}
-
 	return input
 }
 
diff --git a/analysis/token_filters/length_filter/length_filter.go b/analysis/token_filters/length_filter/length_filter.go
index f42c8c7d..118c583c 100644
--- a/analysis/token_filters/length_filter/length_filter.go
+++ b/analysis/token_filters/length_filter/length_filter.go
@@ -32,7 +32,7 @@ func NewLengthFilter(min, max int) *LengthFilter {
 }
 
 func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		wordLen := utf8.RuneCount(token.Term)
diff --git a/analysis/token_filters/lower_case_filter/lower_case_filter.go b/analysis/token_filters/lower_case_filter/lower_case_filter.go
index 7668bf4b..44884924 100644
--- a/analysis/token_filters/lower_case_filter/lower_case_filter.go
+++ b/analysis/token_filters/lower_case_filter/lower_case_filter.go
@@ -26,16 +26,12 @@ func NewLowerCaseFilter() *LowerCaseFilter {
 }
 
 func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		word := string(token.Term)
 		wordLowerCase := strings.ToLower(word)
 		token.Term = []byte(wordLowerCase)
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
diff --git a/analysis/token_filters/ngram_filter/ngram_filter.go b/analysis/token_filters/ngram_filter/ngram_filter.go
index 903a81df..d1433e26 100644
--- a/analysis/token_filters/ngram_filter/ngram_filter.go
+++ b/analysis/token_filters/ngram_filter/ngram_filter.go
@@ -33,7 +33,7 @@ func NewNgramFilter(minLength, maxLength int) *NgramFilter {
 }
 
 func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		runeCount := utf8.RuneCount(token.Term)
diff --git a/analysis/token_filters/shingle/shingle.go b/analysis/token_filters/shingle/shingle.go
index 8b7f122f..90965092 100644
--- a/analysis/token_filters/shingle/shingle.go
+++ b/analysis/token_filters/shingle/shingle.go
@@ -32,7 +32,7 @@ func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *Shin
 }
 
 func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	currentPosition := 0
 	for _, token := range input {
diff --git a/analysis/token_filters/shingle/shingle_test.go b/analysis/token_filters/shingle/shingle_test.go
index 42f114a4..d81ac57e 100644
--- a/analysis/token_filters/shingle/shingle_test.go
+++ b/analysis/token_filters/shingle/shingle_test.go
@@ -16,7 +16,7 @@ import (
 	"github.com/blevesearch/bleve/analysis"
 )
 
-func TestNgramFilter(t *testing.T) {
+func TestShingleFilter(t *testing.T) {
 
 	tests := []struct {
 		min            int
diff --git a/analysis/token_filters/stemmer_filter/stemmer_filter.go b/analysis/token_filters/stemmer_filter/stemmer_filter.go
index 6c476840..3640681b 100644
--- a/analysis/token_filters/stemmer_filter/stemmer_filter.go
+++ b/analysis/token_filters/stemmer_filter/stemmer_filter.go
@@ -22,18 +22,22 @@ import (
 const Name = "stem"
 
 type StemmerFilter struct {
-	lang    string
-	stemmer *snowball.Stemmer
+	lang        string
+	stemmerPool chan *snowball.Stemmer
 }
 
 func NewStemmerFilter(lang string) (*StemmerFilter, error) {
-	stemmer, err := snowball.New(lang)
-	if err != nil {
-		return nil, err
+	stemmerPool := make(chan *snowball.Stemmer, 4)
+	for i := 0; i < 4; i++ {
+		stemmer, err := snowball.New(lang)
+		if err != nil {
+			return nil, err
+		}
+		stemmerPool <- stemmer
 	}
 	return &StemmerFilter{
-		lang:    lang,
-		stemmer: stemmer,
+		lang:        lang,
+		stemmerPool: stemmerPool,
 	}, nil
 }
 
@@ -50,18 +54,16 @@ func (s *StemmerFilter) List() []string {
 }
 
 func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
-			stemmed := s.stemmer.Stem(string(token.Term))
+			stemmer := <-s.stemmerPool
+			stemmed := stemmer.Stem(string(token.Term))
+			s.stemmerPool <- stemmer
 			token.Term = []byte(stemmed)
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
diff --git a/analysis/token_filters/stop_tokens_filter/stop_tokens_filter.go b/analysis/token_filters/stop_tokens_filter/stop_tokens_filter.go
index 8f429ff3..712302b8 100644
--- a/analysis/token_filters/stop_tokens_filter/stop_tokens_filter.go
+++ b/analysis/token_filters/stop_tokens_filter/stop_tokens_filter.go
@@ -29,7 +29,7 @@ func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
 }
 
 func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		tokenTerm := string(token.Term)
diff --git a/analysis/token_filters/truncate_token_filter/truncate_token_filter.go b/analysis/token_filters/truncate_token_filter/truncate_token_filter.go
index af648181..2b16cb08 100644
--- a/analysis/token_filters/truncate_token_filter/truncate_token_filter.go
+++ b/analysis/token_filters/truncate_token_filter/truncate_token_filter.go
@@ -31,8 +31,6 @@ func NewTruncateTokenFilter(length int) *TruncateTokenFilter {
 }
 
 func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		wordLen := utf8.RuneCount(token.Term)
 		if wordLen > s.length {
@@ -45,10 +43,8 @@ func (s *TruncateTokenFilter) Filter(input analysis.TokenS
 			}
 			token.Term = newterm
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
diff --git a/analysis/token_filters/unicode_normalize/unicode_normalize.go b/analysis/token_filters/unicode_normalize/unicode_normalize.go
index 798f3da3..12956bc0 100644
--- a/analysis/token_filters/unicode_normalize/unicode_normalize.go
+++ b/analysis/token_filters/unicode_normalize/unicode_normalize.go
@@ -54,14 +54,10 @@ func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter {
 }
 
 func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		token.Term = s.form.Bytes(token.Term)
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
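
A minimal, self-contained sketch of the three patterns this patch applies: mutate tokens in place and return the input stream when output has exactly one token per input token, preallocate capacity to len(input) when a new stream is unavoidable, and check a stateful resource in and out of a buffered-channel pool (as with the snowball stemmer above). Token and TokenStream here are simplified stand-ins for bleve's analysis types, and every filter name below is hypothetical, chosen only to illustrate the pattern.

package main

import (
	"bytes"
	"fmt"
	"strings"
)

// Token and TokenStream stand in for bleve's analysis.Token and
// analysis.TokenStream; only the Term field matters for this sketch.
type Token struct {
	Term []byte
}

type TokenStream []*Token

// upperCaseFilter shows the reuse pattern: one output token per input
// token, so mutate each token in place and return the input stream
// without allocating a new one.
func upperCaseFilter(input TokenStream) TokenStream {
	for _, token := range input {
		token.Term = bytes.ToUpper(token.Term)
	}
	return input
}

// dropShortFilter shows the preallocation pattern: tokens may be
// dropped, so a new stream is required, but capacity len(input)
// covers the common case where little or nothing is filtered out.
func dropShortFilter(input TokenStream) TokenStream {
	rv := make(TokenStream, 0, len(input))
	for _, token := range input {
		if len(token.Term) > 2 {
			rv = append(rv, token)
		}
	}
	return rv
}

// caser stands in for a stateful resource that should not be shared
// between goroutines, playing the role of the snowball stemmer.
type caser struct{}

func (c *caser) lower(s string) string { return strings.ToLower(s) }

// caserFilter mirrors the StemmerFilter change: a buffered channel
// acts as the pool; receive to check an instance out, send to check
// it back in.
type caserFilter struct {
	pool chan *caser
}

func newCaserFilter(size int) *caserFilter {
	pool := make(chan *caser, size)
	for i := 0; i < size; i++ {
		pool <- &caser{}
	}
	return &caserFilter{pool: pool}
}

func (f *caserFilter) filter(input TokenStream) TokenStream {
	for _, token := range input {
		c := <-f.pool // blocks until an instance is free
		token.Term = []byte(c.lower(string(token.Term)))
		f.pool <- c // check the instance back in for other goroutines
	}
	return input
}

func main() {
	stream := TokenStream{{Term: []byte("The")}, {Term: []byte("Go")}}
	stream = upperCaseFilter(stream)          // in place: THE, GO
	stream = dropShortFilter(stream)          // new stream: THE
	stream = newCaserFilter(4).filter(stream) // pooled: the
	for _, token := range stream {
		fmt.Println(string(token.Term))
	}
}

The trade-off of the first pattern is that Filter now mutates and returns its input: safe in a linear analyzer pipeline that owns the stream, but a caller keeping its own reference to the input will observe the changes. The pool size (4, matching the patch) bounds how many goroutines can stem at once, and since every checked-out instance is sent back on the same path, Filter never leaks pool slots.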