Merge pull request #424 from mschoch/possessivefaster
Speed up English possessive filter
This commit is contained in:
commit
5ed9f67b0b
|
@ -10,7 +10,7 @@
|
|||
package en
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
@ -40,15 +40,13 @@ func NewPossessiveFilter() *PossessiveFilter {
|
|||
|
||||
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
runes := bytes.Runes(token.Term)
|
||||
if len(runes) >= 2 {
|
||||
secondToLastRune := runes[len(runes)-2]
|
||||
lastRune := runes[len(runes)-1]
|
||||
if (secondToLastRune == rightSingleQuotationMark ||
|
||||
secondToLastRune == apostrophe ||
|
||||
secondToLastRune == fullWidthApostrophe) &&
|
||||
(lastRune == 's' || lastRune == 'S') {
|
||||
token.Term = analysis.TruncateRunes(token.Term, 2)
|
||||
lastRune, lastRuneSize := utf8.DecodeLastRune(token.Term)
|
||||
if lastRune == 's' || lastRune == 'S' {
|
||||
nextLastRune, nextLastRuneSize := utf8.DecodeLastRune(token.Term[:len(token.Term)-lastRuneSize])
|
||||
if nextLastRune == rightSingleQuotationMark ||
|
||||
nextLastRune == apostrophe ||
|
||||
nextLastRune == fullWidthApostrophe {
|
||||
token.Term = token.Term[:len(token.Term)-lastRuneSize-nextLastRuneSize]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,6 +45,12 @@ func TestEnglishPossessiveFilter(t *testing.T) {
|
|||
&analysis.Token{
|
||||
Term: []byte("m"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("s"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("'s"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
|
@ -68,6 +74,12 @@ func TestEnglishPossessiveFilter(t *testing.T) {
|
|||
&analysis.Token{
|
||||
Term: []byte("m"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("s"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
@ -84,3 +96,42 @@ func TestEnglishPossessiveFilter(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkEnglishPossessiveFilter(b *testing.B) {
|
||||
|
||||
input := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("marty's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY'S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty’s"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY’S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY'S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("m"),
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
stemmerFilter.Filter(input)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user