From 9a777aaa80cbc6890f8bbba7df4c950a2c72b63e Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 6 Aug 2014 20:39:42 -0400 Subject: [PATCH] added token truncate filter closes #49 --- .../truncate_token_filter.go | 47 ++++++++++++ .../truncate_token_filter_test.go | 73 +++++++++++++++++++ config.go | 2 + 3 files changed, 122 insertions(+) create mode 100644 analysis/token_filters/truncate_token_filter/truncate_token_filter.go create mode 100644 analysis/token_filters/truncate_token_filter/truncate_token_filter_test.go diff --git a/analysis/token_filters/truncate_token_filter/truncate_token_filter.go b/analysis/token_filters/truncate_token_filter/truncate_token_filter.go new file mode 100644 index 00000000..4e539550 --- /dev/null +++ b/analysis/token_filters/truncate_token_filter/truncate_token_filter.go @@ -0,0 +1,47 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package truncate_token_filter
+
+import (
+	"unicode/utf8"
+
+	"github.com/couchbaselabs/bleve/analysis"
+)
+
+// TruncateTokenFilter is a token filter that shortens every term to at most
+// a fixed number of runes (not bytes), so multi-byte characters are never
+// split.
+type TruncateTokenFilter struct {
+	length int
+}
+
+// NewTruncateTokenFilter returns a filter that keeps at most length runes of
+// each term. A length of zero or less truncates every non-empty term to an
+// empty term.
+func NewTruncateTokenFilter(length int) *TruncateTokenFilter {
+	return &TruncateTokenFilter{
+		length: length,
+	}
+}
+
+// Filter returns a stream holding the tokens of input in their original
+// order. Any token whose Term is longer than the configured rune count has
+// its Term replaced by the prefix made of exactly that many runes; shorter
+// terms are left untouched. Tokens are not copied: the returned stream holds
+// the same token values taken from input, with Term reassigned where
+// truncation applied.
+//
+// The kept prefix is a reslice of the original bytes, so its content is
+// preserved byte for byte. Bytes that are not valid UTF-8 count as one rune
+// each, matching utf8.RuneCount.
+func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	rv := make(analysis.TokenStream, 0, len(input))
+
+	for _, token := range input {
+		if end := runePrefixLen(token.Term, s.length); end < len(token.Term) {
+			// Cap the capacity so a later append on the shortened term
+			// cannot overwrite the bytes that were cut off.
+			token.Term = token.Term[:end:end]
+		}
+		rv = append(rv, token)
+	}
+
+	return rv
+}
+
+// runePrefixLen returns the number of leading bytes of term occupied by its
+// first maxRunes runes, or len(term) when term holds fewer runes than that.
+// It returns 0 when maxRunes is zero or negative.
+func runePrefixLen(term []byte, maxRunes int) int {
+	end := 0
+	for n := 0; n < maxRunes && end < len(term); n++ {
+		_, size := utf8.DecodeRune(term[end:])
+		end += size
+	}
+	return end
+}
diff --git a/analysis/token_filters/truncate_token_filter/truncate_token_filter_test.go b/analysis/token_filters/truncate_token_filter/truncate_token_filter_test.go
new file mode 100644
index 00000000..bf3ec0e4
--- /dev/null
+++ b/analysis/token_filters/truncate_token_filter/truncate_token_filter_test.go
@@ -0,0 +1,73 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+package truncate_token_filter
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/couchbaselabs/bleve/analysis"
+)
+
+// TestTruncateTokenFilter runs the filter over single-token streams in
+// ASCII, Japanese and Thai and checks that each term is cut to the requested
+// number of runes.
+func TestTruncateTokenFilter(t *testing.T) {
+	// stream wraps one term into a single-token stream.
+	stream := func(term string) analysis.TokenStream {
+		return analysis.TokenStream{
+			&analysis.Token{
+				Term: []byte(term),
+			},
+		}
+	}
+
+	cases := []struct {
+		length int
+		input  analysis.TokenStream
+		output analysis.TokenStream
+	}{
+		{length: 5, input: stream("abcdefgh"), output: stream("abcde")},
+		{length: 3, input: stream("こんにちは世界"), output: stream("こんに")},
+		{length: 10, input: stream("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), output: stream("แยกคำภาษาไ")},
+	}
+
+	for i := range cases {
+		tc := cases[i]
+		got := NewTruncateTokenFilter(tc.length).Filter(tc.input)
+		if reflect.DeepEqual(got, tc.output) {
+			continue
+		}
+		t.Errorf("expected %s, got %s", tc.output[0].Term, got[0].Term)
+	}
+}
diff --git a/config.go b/config.go index 63805923..09614b3e 100644 --- a/config.go +++ b/config.go @@ -30,6 +30,7 @@ import ( "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" "github.com/couchbaselabs/bleve/analysis/token_filters/stop_words_filter" + "github.com/couchbaselabs/bleve/analysis/token_filters/truncate_token_filter" "github.com/couchbaselabs/bleve/analysis/token_filters/unicode_normalize" "github.com/couchbaselabs/bleve/search" @@ -181,6 +182,7 @@ func init() { Config.Analysis.TokenFilters["long"] = length_filter.NewLengthFilter(-1, 255) Config.Analysis.TokenFilters["to_lower"] = lower_case_filter.NewLowerCaseFilter() Config.Analysis.TokenFilters["apostrophe"] = apostrophe_filter.NewApostropheFilter() + 
Config.Analysis.TokenFilters["truncate_token"] = truncate_token_filter.NewTruncateTokenFilter(25) // register stemmer filters Config.Analysis.TokenFilters["stemmer_da"] = stemmer_filter.MustNewStemmerFilter("danish")