0
0

new version of lower case filter which tries to avoid copying bytes

This commit is contained in:
Marty Schoch 2015-01-14 11:34:30 -05:00
parent 7cc544adf2
commit 890b1abfe6
2 changed files with 65 additions and 1 deletions

View File

@ -11,6 +11,8 @@ package lower_case_filter
import (
"bytes"
"unicode"
"unicode/utf8"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
@ -27,7 +29,7 @@ func NewLowerCaseFilter() *LowerCaseFilter {
// Filter replaces the Term of every token in input with its
// lower-case form and returns the same input stream; no new
// stream is built.
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
// NOTE(review): both assignments are visible in this view; the
// commit appears to swap bytes.ToLower for toLowerDeferredCopy,
// so presumably only the second one remains in the real file —
// confirm against the checked-in source.
token.Term = bytes.ToLower(token.Term)
token.Term = toLowerDeferredCopy(token.Term)
}
return input
}
@ -39,3 +41,40 @@ func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.C
// init registers LowerCaseFilterConstructor with the registry as
// the token filter constructor for Name.
func init() {
registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
}
// toLowerDeferredCopy lower-cases s the way bytes.ToLower()
// does, but writes the result over the bytes of s itself
// whenever it fits, so the common case allocates nothing.
// NOTE: the returned slice may share storage with s, and the
// leading bytes of s are overwritten even when a new slice
// ends up being returned.
// A new byte array is only allocated when it's not possible
// to continue in place, i.e. when the lower-case form of a
// rune has a longer utf-8 encoding than the bytes it replaces.
func toLowerDeferredCopy(s []byte) []byte {
	read, write := 0, 0
	for read < len(s) {
		// single bytes below RuneSelf are their own rune;
		// anything else has to be decoded
		orig, origLen := rune(s[read]), 1
		if orig >= utf8.RuneSelf {
			orig, origLen = utf8.DecodeRune(s[read:])
		}

		lowered := unicode.ToLower(orig)
		loweredLen := utf8.RuneLen(lowered)

		if loweredLen > origLen {
			// the replacement does not fit in the space of the
			// rune it replaces; stop working in place and let
			// bytes.ToLower() convert everything from this rune
			// on, prefixed with what was already converted
			// for valid input only known to happen with chars
			// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
			// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
			// (an invalid byte also lands here: it decodes to
			// utf8.RuneError with width 1, which encodes to 3)
			tail := bytes.ToLower(s[read:])
			out := make([]byte, write+len(tail))
			copy(out, s[:write])
			copy(out[write:], tail)
			return out
		}

		// write never runs ahead of read, so this only
		// overwrites bytes that were already consumed
		utf8.EncodeRune(s[write:], lowered)
		read += origLen
		write += loweredLen
	}
	return s[:write]
}

View File

@ -31,6 +31,18 @@ func TestLowerCaseFilter(t *testing.T) {
&analysis.Token{
Term: []byte("steven's"),
},
// these characters are chosen in particular
// because the utf-8 encoding of the lower-case
// version has a different length
// Rune İ(304) width 2 - Lower i(105) width 1
// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
&analysis.Token{
Term: []byte("İȺȾCAT"),
},
&analysis.Token{
Term: []byte("ȺȾCAT"),
},
}
expectedTokenStream := analysis.TokenStream{
@ -46,12 +58,19 @@ func TestLowerCaseFilter(t *testing.T) {
&analysis.Token{
Term: []byte("steven's"),
},
&analysis.Token{
Term: []byte("iⱥⱦcat"),
},
&analysis.Token{
Term: []byte("ⱥⱦcat"),
},
}
filter := NewLowerCaseFilter()
ouputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
t.Errorf("expected %s got %s", expectedTokenStream[0].Term, ouputTokenStream[0].Term)
}
}
@ -120,6 +139,12 @@ func BenchmarkLowerCaseFilter(b *testing.B) {
&analysis.Token{
Term: []byte("point"),
},
&analysis.Token{
Term: []byte("İȺȾCAT"),
},
&analysis.Token{
Term: []byte("ȺȾCAT"),
},
}
filter := NewLowerCaseFilter()