// toLowerDeferredCopy returns the lower-cased form of s. The returned bytes
// are the same as those produced by bytes.ToLower(s), but the storage of s is
// reused (overwritten in place) whenever possible instead of being copied.
//
// The caller must treat s as consumed: its contents may be modified, and the
// result may alias it (possibly with a shorter length, when a lower-case rune
// has a narrower UTF-8 encoding than the original).
//
// When the lower-case form of a rune has a wider UTF-8 encoding than the
// original, the in-place rewrite cannot continue; a new slice is allocated
// and the remainder is delegated to bytes.ToLower. Invalid UTF-8 bytes take
// the same path, because they decode to utf8.RuneError (3 bytes encoded) with
// a width of 1, so they come out as U+FFFD exactly as with bytes.ToLower.
func toLowerDeferredCopy(s []byte) []byte {
	// j is the write offset into s. Lowered runes are never wider than the
	// runes consumed so far on this path, so j never overtakes the read
	// offset i and unread input is never clobbered.
	j := 0
	for i := 0; i < len(s); {
		wid := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
		}
		l := unicode.ToLower(r)
		lwid := utf8.RuneLen(l)
		if lwid > wid {
			// utf-8 encoded replacement is wider than the source rune;
			// punt and defer to bytes.ToLower() for the remainder.
			// Only known to happen with invalid bytes and with
			//   Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
			//   Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
			rest := bytes.ToLower(s[i:])
			rv := make([]byte, j+len(rest))
			copy(rv, s[:j])
			copy(rv[j:], rest)
			return rv
		}
		// Skip the store when the rune is unchanged and nothing before it
		// shrank: the bytes at s[j:] are then already the canonical encoding
		// of l. Once an earlier rune has shrunk (j < i), every following
		// rune must be moved down even if it is already lower case.
		if l != r || j != i {
			utf8.EncodeRune(s[j:], l)
		}
		i += wid
		j += lwid
	}
	return s[:j]
}