Merge pull request #424 from mschoch/possessivefaster
speed up english possessive filter
This commit is contained in:
commit
5ed9f67b0b
@ -10,7 +10,7 @@
|
|||||||
package en
|
package en
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
"github.com/blevesearch/bleve/registry"
|
"github.com/blevesearch/bleve/registry"
|
||||||
@ -40,15 +40,13 @@ func NewPossessiveFilter() *PossessiveFilter {
|
|||||||
|
|
||||||
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
for _, token := range input {
|
for _, token := range input {
|
||||||
runes := bytes.Runes(token.Term)
|
lastRune, lastRuneSize := utf8.DecodeLastRune(token.Term)
|
||||||
if len(runes) >= 2 {
|
if lastRune == 's' || lastRune == 'S' {
|
||||||
secondToLastRune := runes[len(runes)-2]
|
nextLastRune, nextLastRuneSize := utf8.DecodeLastRune(token.Term[:len(token.Term)-lastRuneSize])
|
||||||
lastRune := runes[len(runes)-1]
|
if nextLastRune == rightSingleQuotationMark ||
|
||||||
if (secondToLastRune == rightSingleQuotationMark ||
|
nextLastRune == apostrophe ||
|
||||||
secondToLastRune == apostrophe ||
|
nextLastRune == fullWidthApostrophe {
|
||||||
secondToLastRune == fullWidthApostrophe) &&
|
token.Term = token.Term[:len(token.Term)-lastRuneSize-nextLastRuneSize]
|
||||||
(lastRune == 's' || lastRune == 'S') {
|
|
||||||
token.Term = analysis.TruncateRunes(token.Term, 2)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -45,6 +45,12 @@ func TestEnglishPossessiveFilter(t *testing.T) {
|
|||||||
&analysis.Token{
|
&analysis.Token{
|
||||||
Term: []byte("m"),
|
Term: []byte("m"),
|
||||||
},
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("s"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("'s"),
|
||||||
|
},
|
||||||
},
|
},
|
||||||
output: analysis.TokenStream{
|
output: analysis.TokenStream{
|
||||||
&analysis.Token{
|
&analysis.Token{
|
||||||
@ -68,6 +74,12 @@ func TestEnglishPossessiveFilter(t *testing.T) {
|
|||||||
&analysis.Token{
|
&analysis.Token{
|
||||||
Term: []byte("m"),
|
Term: []byte("m"),
|
||||||
},
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("s"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte(""),
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@ -84,3 +96,42 @@ func TestEnglishPossessiveFilter(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func BenchmarkEnglishPossessiveFilter(b *testing.B) {
|
||||||
|
|
||||||
|
input := analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("marty's"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("MARTY'S"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("marty’s"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("MARTY’S"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("marty＇s"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("MARTY＇S"),
|
||||||
|
},
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("m"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
cache := registry.NewCache()
|
||||||
|
stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
|
||||||
|
if err != nil {
|
||||||
|
b.Fatal(err)
|
||||||
|
}
|
||||||
|
b.ResetTimer()
|
||||||
|
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
stemmerFilter.Filter(input)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user