0
0
Fork 0

avoid allocation in stop token filter

the token stream resulting from the removal of stop words must
be shorter or the same length as the original, so we just
reuse it and truncate it at the end.
This commit is contained in:
Marty Schoch 2016-09-11 12:29:33 -04:00
parent b961d742c1
commit faa07ac3a6
2 changed files with 50 additions and 4 deletions

View File

@ -36,16 +36,16 @@ func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
}
// Filter removes stop tokens from the stream. A token is dropped when its
// Term appears in f.stopTokens; all other tokens are kept in order.
//
// The result can only be shorter than (or the same length as) the input,
// so the filtering is done in place: surviving tokens are compacted to the
// front of input and the truncated slice is returned. This avoids any
// allocation per call. NOTE: the caller's input slice is mutated.
func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	j := 0
	for _, token := range input {
		// string(token.Term) in a map index is a recognized pattern the
		// compiler optimizes to avoid an allocating []byte->string copy.
		_, isStopToken := f.stopTokens[string(token.Term)]
		if !isStopToken {
			input[j] = token
			j++
		}
	}
	return input[:j]
}
func StopTokensFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

View File

@ -71,3 +71,49 @@ func TestStopWordsFilter(t *testing.T) {
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
}
}
// BenchmarkStopWordsFilter measures the stop token filter on a small
// five-token stream containing three stop words ("a", "in", "the").
//
// Because Filter compacts the stream in place, the token pointers are
// restored from a pristine copy before every iteration; otherwise the
// first iteration would permanently reorder the input and subsequent
// iterations would benchmark a different (already-filtered) stream.
// The restore is a pointer copy — no per-iteration allocation.
func BenchmarkStopWordsFilter(b *testing.B) {
	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("a"),
		},
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("in"),
		},
		&analysis.Token{
			Term: []byte("the"),
		},
		&analysis.Token{
			Term: []byte("park"),
		},
	}
	cache := registry.NewCache()
	stopListConfig := map[string]interface{}{
		"type":   token_map.Name,
		"tokens": []interface{}{"a", "in", "the"},
	}
	_, err := cache.DefineTokenMap("stop_test", stopListConfig)
	if err != nil {
		b.Fatal(err)
	}
	stopConfig := map[string]interface{}{
		"type":           "stop_tokens",
		"stop_token_map": "stop_test",
	}
	stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
	if err != nil {
		b.Fatal(err)
	}
	// Pristine snapshot of the token pointers, used to undo the in-place
	// compaction performed by Filter on each iteration.
	pristine := make(analysis.TokenStream, len(inputTokenStream))
	copy(pristine, inputTokenStream)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		copy(inputTokenStream, pristine)
		stopFilter.Filter(inputTokenStream)
	}
}