
improve perf of porter stemmer

1.  the porter stemmer offers a method that skips lowercasing, but
to use it we must first convert the term to runes ourselves, so we now do that

2.  with the runes in hand we invoke the variant that skips lowercasing,
since lowercasing is already done by a separate filter earlier in the chain
(see the sketch below)
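for context, a minimal sketch of that ordering assumption, not part of this commit; the lower_case_filter and porter import paths and the NewLowerCaseFilter constructor are assumed from bleve's usual layout and are not shown in this diff:

package main

import (
    "fmt"

    "github.com/blevesearch/bleve/analysis"
    "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
    "github.com/blevesearch/bleve/analysis/token_filters/porter"
)

func main() {
    // import paths above are assumptions; only the porter package appears in this diff
    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("Walking")},
    }
    // lowercasing already happens in its own filter, before the stemmer runs...
    stream = lower_case_filter.NewLowerCaseFilter().Filter(stream)
    // ...so the porter filter can safely use the stemmer variant that skips it
    stream = porter.NewPorterStemmer().Filter(stream)
    fmt.Println(string(stream[0].Term)) // walk
}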

because the stemmer modifies the runes in place, we have no way to know
whether anything changed, so we must always encode the result back into
the term byte slice

added a unit test that catches the problem found

NOTE: this uses analysis.BuildTermFromRunes, so the perf gain is
only visible once the other PR is also merged

future gains are possible if we update the stemmer to report whether it
made changes, letting us skip the re-encoding to []byte when nothing
actually changed (sketched below)
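to make that concrete, a minimal sketch of the idea, not current behavior: stemRunesReportingChange is a hypothetical helper invented here, and the go-porterstemmer import path is assumed; today StemWithoutLowerCasing returns only the runes, so the real filter in this commit must always rebuild token.Term:

package porter

import (
    "bytes"

    "github.com/blevesearch/bleve/analysis"
    "github.com/blevesearch/go-porterstemmer" // import path assumed, not shown in this diff
)

// stemRunesReportingChange is a hypothetical wrapper; a real gain would need
// the stemmer itself to track changes instead of comparing copies like this
func stemRunesReportingChange(input []rune) ([]rune, bool) {
    before := string(input) // copy first, since the stemmer mutates input in place
    stemmed := porterstemmer.StemWithoutLowerCasing(input)
    return stemmed, string(stemmed) != before
}

func filterSkippingUnchanged(input analysis.TokenStream) analysis.TokenStream {
    for _, token := range input {
        if !token.KeyWord {
            termRunes := bytes.Runes(token.Term)
            if stemmedRunes, changed := stemRunesReportingChange(termRunes); changed {
                // only re-encode to []byte when the stem actually differs
                token.Term = analysis.BuildTermFromRunes(stemmedRunes)
            }
        }
    }
    return input
}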
Marty Schoch 2016-09-11 13:33:46 -04:00
parent b961d742c1
commit 44ff6ced8a
2 changed files with 57 additions and 2 deletions


@@ -10,6 +10,8 @@
 package porter
 import (
+    "bytes"
+
     "github.com/blevesearch/bleve/analysis"
     "github.com/blevesearch/bleve/registry"
@@ -29,8 +31,9 @@ func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream
     for _, token := range input {
         // if it is not a protected keyword, stem it
         if !token.KeyWord {
-            stemmed := porterstemmer.StemString(string(token.Term))
-            token.Term = []byte(stemmed)
+            termRunes := bytes.Runes(token.Term)
+            stemmedRunes := porterstemmer.StemWithoutLowerCasing(termRunes)
+            token.Term = analysis.BuildTermFromRunes(stemmedRunes)
         }
     }
     return input


@@ -32,6 +32,16 @@ func TestPorterStemmer(t *testing.T) {
             Term:    []byte("protected"),
             KeyWord: true,
         },
+        &analysis.Token{
+            Term: []byte("cat"),
+        },
+        &analysis.Token{
+            Term: []byte("done"),
+        },
+        // a term which does stem, but does not change length
+        &analysis.Token{
+            Term: []byte("marty"),
+        },
     }
     expectedTokenStream := analysis.TokenStream{
@@ -48,6 +58,15 @@ func TestPorterStemmer(t *testing.T) {
             Term:    []byte("protected"),
             KeyWord: true,
         },
+        &analysis.Token{
+            Term: []byte("cat"),
+        },
+        &analysis.Token{
+            Term: []byte("done"),
+        },
+        &analysis.Token{
+            Term: []byte("marti"),
+        },
     }
     filter := NewPorterStemmer()
@@ -56,3 +75,36 @@ func TestPorterStemmer(t *testing.T) {
         t.Errorf("expected %#v got %#v", expectedTokenStream[3], ouputTokenStream[3])
     }
 }
+
+func BenchmarkPorterStemmer(b *testing.B) {
+    inputTokenStream := analysis.TokenStream{
+        &analysis.Token{
+            Term: []byte("walking"),
+        },
+        &analysis.Token{
+            Term: []byte("talked"),
+        },
+        &analysis.Token{
+            Term: []byte("business"),
+        },
+        &analysis.Token{
+            Term:    []byte("protected"),
+            KeyWord: true,
+        },
+        &analysis.Token{
+            Term: []byte("cat"),
+        },
+        &analysis.Token{
+            Term: []byte("done"),
+        },
+    }
+    filter := NewPorterStemmer()
+    b.ResetTimer()
+    for i := 0; i < b.N; i++ {
+        filter.Filter(inputTokenStream)
+    }
+}