0
0

Merge pull request #424 from mschoch/possessivefaster

speed up english possessive filter
This commit is contained in:
Marty Schoch 2016-09-11 13:26:50 -04:00 committed by GitHub
commit 5ed9f67b0b
2 changed files with 59 additions and 10 deletions

View File

@ -10,7 +10,7 @@
package en package en
import ( import (
"bytes" "unicode/utf8"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
@ -40,15 +40,13 @@ func NewPossessiveFilter() *PossessiveFilter {
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream { func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input { for _, token := range input {
runes := bytes.Runes(token.Term) lastRune, lastRuneSize := utf8.DecodeLastRune(token.Term)
if len(runes) >= 2 { if lastRune == 's' || lastRune == 'S' {
secondToLastRune := runes[len(runes)-2] nextLastRune, nextLastRuneSize := utf8.DecodeLastRune(token.Term[:len(token.Term)-lastRuneSize])
lastRune := runes[len(runes)-1] if nextLastRune == rightSingleQuotationMark ||
if (secondToLastRune == rightSingleQuotationMark || nextLastRune == apostrophe ||
secondToLastRune == apostrophe || nextLastRune == fullWidthApostrophe {
secondToLastRune == fullWidthApostrophe) && token.Term = token.Term[:len(token.Term)-lastRuneSize-nextLastRuneSize]
(lastRune == 's' || lastRune == 'S') {
token.Term = analysis.TruncateRunes(token.Term, 2)
} }
} }
} }

View File

@ -45,6 +45,12 @@ func TestEnglishPossessiveFilter(t *testing.T) {
&analysis.Token{ &analysis.Token{
Term: []byte("m"), Term: []byte("m"),
}, },
&analysis.Token{
Term: []byte("s"),
},
&analysis.Token{
Term: []byte("'s"),
},
}, },
output: analysis.TokenStream{ output: analysis.TokenStream{
&analysis.Token{ &analysis.Token{
@ -68,6 +74,12 @@ func TestEnglishPossessiveFilter(t *testing.T) {
&analysis.Token{ &analysis.Token{
Term: []byte("m"), Term: []byte("m"),
}, },
&analysis.Token{
Term: []byte("s"),
},
&analysis.Token{
Term: []byte(""),
},
}, },
}, },
} }
@ -84,3 +96,42 @@ func TestEnglishPossessiveFilter(t *testing.T) {
} }
} }
} }
// BenchmarkEnglishPossessiveFilter measures the token filter registered under
// PossessiveName over a small fixed token stream: two terms ending in an
// apostrophe followed by s/S, four terms ending in s/S with no apostrophe,
// and one single-letter term.
//
// The possessive filter rewrites token.Term in place, so the same tokens
// cannot simply be fed through it b.N times: after the first pass the
// possessive suffixes are already gone and every later iteration would only
// exercise the "nothing to strip" path. Each iteration therefore points every
// token back at its original term before filtering.
func BenchmarkEnglishPossessiveFilter(b *testing.B) {
	input := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("marty's"),
		},
		&analysis.Token{
			Term: []byte("MARTY'S"),
		},
		&analysis.Token{
			Term: []byte("martys"),
		},
		&analysis.Token{
			Term: []byte("MARTYS"),
		},
		&analysis.Token{
			Term: []byte("martys"),
		},
		&analysis.Token{
			Term: []byte("MARTYS"),
		},
		&analysis.Token{
			Term: []byte("m"),
		},
	}

	// Keep the untouched terms so they can be restored between iterations.
	// NOTE(review): this relies on the filter only re-slicing (or replacing)
	// token.Term and never writing into the original backing bytes — confirm
	// if the filter implementation changes.
	originals := make([][]byte, len(input))
	for i, token := range input {
		originals[i] = token.Term
	}

	cache := registry.NewCache()
	possessiveFilter, err := cache.TokenFilterNamed(PossessiveName)
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Restoring a slice header is allocation-free, so it adds only a
		// small constant cost to the timed region.
		for j, term := range originals {
			input[j].Term = term
		}
		possessiveFilter.Filter(input)
	}
}