0
0

Merge pull request #424 from mschoch/possessivefaster

Speed up the English possessive filter
This commit is contained in:
Marty Schoch 2016-09-11 13:26:50 -04:00 committed by GitHub
commit 5ed9f67b0b
2 changed files with 59 additions and 10 deletions

View File

@ -10,7 +10,7 @@
package en
import (
"bytes"
"unicode/utf8"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
@ -40,15 +40,13 @@ func NewPossessiveFilter() *PossessiveFilter {
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
runes := bytes.Runes(token.Term)
if len(runes) >= 2 {
secondToLastRune := runes[len(runes)-2]
lastRune := runes[len(runes)-1]
if (secondToLastRune == rightSingleQuotationMark ||
secondToLastRune == apostrophe ||
secondToLastRune == fullWidthApostrophe) &&
(lastRune == 's' || lastRune == 'S') {
token.Term = analysis.TruncateRunes(token.Term, 2)
lastRune, lastRuneSize := utf8.DecodeLastRune(token.Term)
if lastRune == 's' || lastRune == 'S' {
nextLastRune, nextLastRuneSize := utf8.DecodeLastRune(token.Term[:len(token.Term)-lastRuneSize])
if nextLastRune == rightSingleQuotationMark ||
nextLastRune == apostrophe ||
nextLastRune == fullWidthApostrophe {
token.Term = token.Term[:len(token.Term)-lastRuneSize-nextLastRuneSize]
}
}
}

View File

@ -45,6 +45,12 @@ func TestEnglishPossessiveFilter(t *testing.T) {
&analysis.Token{
Term: []byte("m"),
},
&analysis.Token{
Term: []byte("s"),
},
&analysis.Token{
Term: []byte("'s"),
},
},
output: analysis.TokenStream{
&analysis.Token{
@ -68,6 +74,12 @@ func TestEnglishPossessiveFilter(t *testing.T) {
&analysis.Token{
Term: []byte("m"),
},
&analysis.Token{
Term: []byte("s"),
},
&analysis.Token{
Term: []byte(""),
},
},
},
}
@ -84,3 +96,42 @@ func TestEnglishPossessiveFilter(t *testing.T) {
}
}
}
// BenchmarkEnglishPossessiveFilter measures the token filter registered under
// PossessiveName on a small mix of terms: two that end in an apostrophe
// followed by s/S and five that do not.
func BenchmarkEnglishPossessiveFilter(b *testing.B) {
	input := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("marty's"),
		},
		&analysis.Token{
			Term: []byte("MARTY'S"),
		},
		&analysis.Token{
			Term: []byte("martys"),
		},
		&analysis.Token{
			Term: []byte("MARTYS"),
		},
		&analysis.Token{
			Term: []byte("martys"),
		},
		&analysis.Token{
			Term: []byte("MARTYS"),
		},
		&analysis.Token{
			Term: []byte("m"),
		},
	}

	// Remember each token's original term so it can be restored between
	// iterations (see the loop below).
	originals := make([][]byte, len(input))
	for i, token := range input {
		originals[i] = token.Term
	}

	cache := registry.NewCache()
	possessiveFilter, err := cache.TokenFilterNamed(PossessiveName)
	if err != nil {
		b.Fatal(err)
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// The filter assigns a shortened slice to token.Term on the shared
		// *Token values, so without this reset only the first iteration would
		// exercise the stripping path. Restoring the slice header is
		// allocation-free; it relies on the filter reslicing rather than
		// rewriting the underlying bytes.
		for j, term := range originals {
			input[j].Term = term
		}
		possessiveFilter.Filter(input)
	}
}