0
0
Fork 0

Merge pull request #425 from mschoch/porterfaster

improve perf of porter stemmer
This commit is contained in:
Marty Schoch 2016-09-11 20:22:23 -04:00 committed by GitHub
commit ee61b2e866
2 changed files with 57 additions and 2 deletions

View File

@ -10,6 +10,8 @@
package porter
import (
"bytes"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
@ -29,8 +31,9 @@ func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream
for _, token := range input {
// if it is not a protected keyword, stem it
if !token.KeyWord {
stemmed := porterstemmer.StemString(string(token.Term))
token.Term = []byte(stemmed)
termRunes := bytes.Runes(token.Term)
stemmedRunes := porterstemmer.StemWithoutLowerCasing(termRunes)
token.Term = analysis.BuildTermFromRunes(stemmedRunes)
}
}
return input

View File

@ -32,6 +32,16 @@ func TestPorterStemmer(t *testing.T) {
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
// a term which does stem, but does not change length
&analysis.Token{
Term: []byte("marty"),
},
}
expectedTokenStream := analysis.TokenStream{
@ -48,6 +58,15 @@ func TestPorterStemmer(t *testing.T) {
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
&analysis.Token{
Term: []byte("marti"),
},
}
filter := NewPorterStemmer()
@ -56,3 +75,36 @@ func TestPorterStemmer(t *testing.T) {
t.Errorf("expected %#v got %#v", expectedTokenStream[3], ouputTokenStream[3])
}
}
// BenchmarkPorterStemmer measures the cost of running the porter stemmer
// filter over a small, fixed token stream containing stemmable terms,
// terms that are left unchanged, and one protected keyword.
func BenchmarkPorterStemmer(b *testing.B) {
	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("walking"),
		},
		&analysis.Token{
			Term: []byte("talked"),
		},
		&analysis.Token{
			Term: []byte("business"),
		},
		&analysis.Token{
			Term:    []byte("protected"),
			KeyWord: true,
		},
		&analysis.Token{
			Term: []byte("cat"),
		},
		&analysis.Token{
			Term: []byte("done"),
		},
	}

	// Filter replaces token.Term on the tokens it stems, so remember the
	// original terms; otherwise every iteration after the first would be
	// stemming already-stemmed input and under-report the real cost.
	originalTerms := make([][]byte, len(inputTokenStream))
	for i, token := range inputTokenStream {
		originalTerms[i] = token.Term
	}

	filter := NewPorterStemmer()
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// Restoring is only a slice-header assignment (no allocation), so
		// it adds negligible overhead to the timed region.
		for j, token := range inputTokenStream {
			token.Term = originalTerms[j]
		}
		filter.Filter(inputTokenStream)
	}
}