modified token filters to avoid creating a new token stream

often the result stream is the same length as the input, so we can reuse the existing token stream. also, in cases where a new stream is required, set its capacity to the length of the input stream. most output streams are at least as long as the input, so this may avoid some subsequent resizing.
parent 95e6e37e67
commit 1dc466a800
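The change applies two patterns across the filters below. Here is the shape of both as a standalone sketch; Token and TokenStream are minimal stand-ins for bleve's analysis types (the real TokenStream is a slice of *Token, which is what makes in-place mutation work), and upperFilter/shortFilter are hypothetical filters, not code from this commit:

package main

import (
	"bytes"
	"fmt"
)

// Minimal stand-ins for bleve's analysis.Token / analysis.TokenStream.
type Token struct{ Term []byte }
type TokenStream []*Token

// Pattern 1: a filter that only rewrites terms mutates the tokens in
// place (the stream holds pointers) and returns the input stream itself,
// with no allocation at all.
func upperFilter(input TokenStream) TokenStream {
	for _, token := range input {
		token.Term = bytes.ToUpper(token.Term)
	}
	return input
}

// Pattern 2: a filter that can drop (or add) tokens still builds a new
// stream, but with capacity len(input); since the output is usually
// about as long as the input, append rarely has to reallocate.
func shortFilter(input TokenStream) TokenStream {
	rv := make(TokenStream, 0, len(input))
	for _, token := range input {
		if len(token.Term) > 1 {
			rv = append(rv, token)
		}
	}
	return rv
}

func main() {
	ts := TokenStream{{Term: []byte("a")}, {Term: []byte("bc")}}
	out := shortFilter(upperFilter(ts))
	fmt.Println(len(out), string(out[0].Term)) // 1 BC
}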
@@ -46,15 +46,11 @@ func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
 }
 
 func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
@@ -32,7 +32,7 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStrea
 	r := ring.New(2)
 	itemsInRing := 0
 
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		if token.Type == analysis.Ideographic {
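For reference on why the capacity hint helps: append reallocates and copies the backing array each time it runs out of room, so a slice grown from zero capacity regrows many times, while one preallocated at len(input) typically never does. A self-contained illustration (the exact growth steps are runtime-dependent):

package main

import "fmt"

func countGrowth(capacity, n int) int {
	s := make([]int, 0, capacity)
	last, grows := cap(s), 0
	for i := 0; i < n; i++ {
		s = append(s, i)
		if cap(s) != last { // backing array was reallocated
			grows++
			last = cap(s)
		}
	}
	return grows
}

func main() {
	// From zero capacity: roughly log2(n) reallocations.
	// From capacity n: none.
	fmt.Println(countGrowth(0, 1024), countGrowth(1024, 1024)) // e.g. 11 0
}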
@@ -56,15 +56,11 @@ func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
 }
 
 func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
@@ -27,18 +27,14 @@ func NewSoraniStemmerFilter() *SoraniStemmerFilter {
 }
 
 func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
 			stemmed := stem(token.Term)
 			token.Term = stemmed
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func stem(input []byte) []byte {
@@ -32,15 +32,11 @@ func NewGermanNormalizeFilter() *GermanNormalizeFilter {
 }
 
 func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
@@ -32,7 +32,6 @@ func NewPossessiveFilter() *PossessiveFilter {
 }
 
 func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-
 	for _, token := range input {
 		runes := bytes.Runes(token.Term)
 		if len(runes) >= 2 {
@@ -46,7 +45,6 @@ func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStre
 			}
 		}
 	}
-
 	return input
 }
 
@@ -38,15 +38,11 @@ func NewPersianNormalizeFilter() *PersianNormalizeFilter {
 }
 
 func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
@@ -26,15 +26,11 @@ func NewHindiNormalizeFilter() *HindiNormalizeFilter {
 }
 
 func (s *HindiNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func normalize(input []byte) []byte {
@@ -27,18 +27,14 @@ func NewHindiStemmerFilter() *HindiStemmerFilter {
 }
 
 func (s *HindiStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
 			stemmed := stem(token.Term)
 			token.Term = stemmed
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func stem(input []byte) []byte {
@@ -46,7 +46,7 @@ func (t *KagomeMorphTokenizer) Tokenize(input []byte) analysis.TokenStream {
 		prevstart int
 	)
 
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	if len(input) < 1 {
 		return rv
 	}
@@ -29,18 +29,15 @@ func NewApostropheFilter() *ApostropheFilter {
 }
 
 func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
 		if firstApostrophe >= 0 {
 			// found an apostrophe
 			token.Term = token.Term[0:firstApostrophe]
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func ApostropheFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
@@ -33,7 +33,7 @@ func NewCld2Filter() *Cld2Filter {
 }
 
 func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	offset := 0
 	for _, token := range input {
@@ -40,7 +40,7 @@ func NewEdgeNgramFilter(side Side, minLength, maxLength int) *EdgeNgramFilter {
 }
 
 func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		runeCount := utf8.RuneCount(token.Term)
@@ -35,8 +35,6 @@ func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
 }
 
 func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
 		if firstApostrophe >= 0 {
@@ -48,10 +46,8 @@ func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream
 				token.Term = token.Term[firstApostrophe+1:]
 			}
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
@@ -29,7 +29,6 @@ func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter {
 }
 
 func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-
 	for _, token := range input {
 		word := string(token.Term)
 		_, isKeyWord := f.keyWords[word]
@@ -37,7 +36,6 @@ func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenS
 			token.KeyWord = true
 		}
 	}
-
 	return input
 }
 
@@ -32,7 +32,7 @@ func NewLengthFilter(min, max int) *LengthFilter {
 }
 
 func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		wordLen := utf8.RuneCount(token.Term)
@@ -26,16 +26,12 @@ func NewLowerCaseFilter() *LowerCaseFilter {
 }
 
 func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		word := string(token.Term)
 		wordLowerCase := strings.ToLower(word)
 		token.Term = []byte(wordLowerCase)
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
@@ -33,7 +33,7 @@ func NewNgramFilter(minLength, maxLength int) *NgramFilter {
 }
 
 func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		runeCount := utf8.RuneCount(token.Term)
@@ -32,7 +32,7 @@ func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *Shin
 }
 
 func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	currentPosition := 0
 	for _, token := range input {
@@ -16,7 +16,7 @@ import (
 	"github.com/blevesearch/bleve/analysis"
 )
 
-func TestNgramFilter(t *testing.T) {
+func TestShingleFilter(t *testing.T) {
 
 	tests := []struct {
 		min int
@@ -22,18 +22,22 @@ import (
 const Name = "stem"
 
 type StemmerFilter struct {
-	lang    string
-	stemmer *snowball.Stemmer
+	lang        string
+	stemmerPool chan *snowball.Stemmer
 }
 
 func NewStemmerFilter(lang string) (*StemmerFilter, error) {
-	stemmer, err := snowball.New(lang)
-	if err != nil {
-		return nil, err
+	stemmerPool := make(chan *snowball.Stemmer, 4)
+	for i := 0; i < 4; i++ {
+		stemmer, err := snowball.New(lang)
+		if err != nil {
+			return nil, err
+		}
+		stemmerPool <- stemmer
 	}
 	return &StemmerFilter{
-		lang:    lang,
-		stemmer: stemmer,
+		lang:        lang,
+		stemmerPool: stemmerPool,
 	}, nil
 }
 
@@ -50,18 +54,16 @@ func (s *StemmerFilter) List() []string {
 }
 
 func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
-			stemmed := s.stemmer.Stem(string(token.Term))
+			stemmer := <-s.stemmerPool
+			stemmed := stemmer.Stem(string(token.Term))
+			s.stemmerPool <- stemmer
 			token.Term = []byte(stemmed)
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
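The NewStemmerFilter change above swaps the single shared stemmer for a pool of four, implemented as a buffered channel: Filter receives a stemmer to check it out and sends it back when done, so each snowball stemmer (presumably not safe for concurrent use) is only ever used by one goroutine at a time, while up to four Filter calls can stem in parallel. A standalone sketch of that checkout/check-in pattern, with a hypothetical resource type:

package main

import "fmt"

type resource struct{ id int }

// A fixed-size pool built on a buffered channel: receive to check an
// item out, send to check it back in. Receives block while every item
// is checked out, which caps concurrent users at the pool size.
type pool chan *resource

func newPool(size int) pool {
	p := make(pool, size)
	for i := 0; i < size; i++ {
		p <- &resource{id: i}
	}
	return p
}

func (p pool) withResource(fn func(*resource)) {
	r := <-p                  // check out (blocks if pool is empty)
	defer func() { p <- r }() // check back in, even if fn panics
	fn(r)
}

func main() {
	p := newPool(4)
	p.withResource(func(r *resource) {
		fmt.Println("using resource", r.id)
	})
}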
@@ -29,7 +29,7 @@ func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
 }
 
 func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 
 	for _, token := range input {
 		tokenTerm := string(token.Term)
@@ -31,8 +31,6 @@ func NewTruncateTokenFilter(length int) *TruncateTokenFilter {
 }
 
 func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		wordLen := utf8.RuneCount(token.Term)
 		if wordLen > s.length {
@@ -45,10 +43,8 @@ func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenS
 			}
 			token.Term = newterm
 		}
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
@@ -54,14 +54,10 @@ func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter {
 }
 
 func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
 	for _, token := range input {
 		token.Term = s.form.Bytes(token.Term)
-		rv = append(rv, token)
 	}
-
-	return rv
+	return input
 }
 
 func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {