new version of lower case filter which tries to avoid copying bytes
This commit is contained in:
parent
7cc544adf2
commit
890b1abfe6
|
@ -11,6 +11,8 @@ package lower_case_filter
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"unicode"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
"github.com/blevesearch/bleve/registry"
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
@ -27,7 +29,7 @@ func NewLowerCaseFilter() *LowerCaseFilter {
|
||||||
|
|
||||||
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
for _, token := range input {
|
for _, token := range input {
|
||||||
token.Term = bytes.ToLower(token.Term)
|
token.Term = toLowerDeferredCopy(token.Term)
|
||||||
}
|
}
|
||||||
return input
|
return input
|
||||||
}
|
}
|
||||||
|
@ -39,3 +41,40 @@ func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.C
|
||||||
func init() {
|
func init() {
|
||||||
registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
|
registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// toLowerDeferredCopy returns the same bytes that bytes.ToLower() would
// produce for s, but reuses (overwrites) the backing array of s whenever
// possible instead of allocating a new one.
//
// The contents of s are modified by this call and the returned slice may
// alias s, so callers must use the return value and must not rely on the
// previous contents of s.
//
// NOTE: the lower-case form of a rune can have a different utf-8 encoded
// length than the original. A shorter replacement simply shrinks the result.
// A longer replacement is still written in place when earlier shorter
// replacements have freed enough room; otherwise a new byte array is
// allocated for the result.
func toLowerDeferredCopy(s []byte) []byte {
	// i is the read offset and j the write offset into s. Every step keeps
	// j <= i, so a write never touches bytes that have not been decoded yet.
	j := 0
	for i := 0; i < len(s); {
		// fast path: a single byte below utf8.RuneSelf is its own rune
		wid := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
		}

		l := unicode.ToLower(r)
		lwid := utf8.RuneLen(l)

		if j+lwid > i+wid {
			// The utf-8 encoded replacement does not fit into the bytes
			// consumed so far, so writing it would clobber unread input.
			// Punt and defer to bytes.ToLower() for the remainder, e.g.
			//   Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
			//   Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
			rest := bytes.ToLower(s[i:])
			rv := make([]byte, j+len(rest))
			copy(rv, s[:j]) // prefix that was already lower-cased in place
			copy(rv[j:], rest)
			return rv
		}

		utf8.EncodeRune(s[j:], l)
		i += wid
		j += lwid
	}
	return s[:j]
}
|
||||||
|
|
|
@ -31,6 +31,18 @@ func TestLowerCaseFilter(t *testing.T) {
|
||||||
&analysis.Token{
|
&analysis.Token{
|
||||||
Term: []byte("steven's"),
|
Term: []byte("steven's"),
|
||||||
},
|
},
|
||||||
|
// these characters are chosen in particular
|
||||||
|
// because the utf-8 encoding of the lower-case
|
||||||
|
// version has a different length
|
||||||
|
// Rune İ(304) width 2 - Lower i(105) width 1
|
||||||
|
// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
|
||||||
|
// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("İȺȾCAT"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ȺȾCAT"),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
expectedTokenStream := analysis.TokenStream{
|
expectedTokenStream := analysis.TokenStream{
|
||||||
|
@ -46,12 +58,19 @@ func TestLowerCaseFilter(t *testing.T) {
|
||||||
&analysis.Token{
|
&analysis.Token{
|
||||||
Term: []byte("steven's"),
|
Term: []byte("steven's"),
|
||||||
},
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("iⱥⱦcat"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ⱥⱦcat"),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
filter := NewLowerCaseFilter()
|
filter := NewLowerCaseFilter()
|
||||||
ouputTokenStream := filter.Filter(inputTokenStream)
|
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||||
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||||
|
t.Errorf("expected %s got %s", expectedTokenStream[0].Term, ouputTokenStream[0].Term)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -120,6 +139,12 @@ func BenchmarkLowerCaseFilter(b *testing.B) {
|
||||||
&analysis.Token{
|
&analysis.Token{
|
||||||
Term: []byte("point"),
|
Term: []byte("point"),
|
||||||
},
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("İȺȾCAT"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ȺȾCAT"),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
filter := NewLowerCaseFilter()
|
filter := NewLowerCaseFilter()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user