
modified token filters to avoid creating new token streams

often the result stream is the same length as the input, so the
existing token stream can be reused
also, where a new stream is still required, its capacity is now set to
the length of the input stream.  most output streams are at least
as long as the input, so this may avoid some subsequent resizing
Marty Schoch 2014-09-23 18:41:32 -04:00
parent 95e6e37e67
commit 1dc466a800
24 changed files with 36 additions and 85 deletions
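The per-file diffs below apply this in two ways. Filters that emit exactly one token per input token now mutate each token in place and return the input stream; filters that can change the token count still allocate a new stream, but with its capacity pre-sized to len(input) (see the sketch after the CJK bigram diff below). Here is a minimal, self-contained sketch of the first pattern, using stand-in Token and TokenStream types rather than bleve's analysis package; the diffs imply the real TokenStream is a slice of token pointers, which is what makes in-place mutation visible to the caller.

package main

import (
	"fmt"
	"strings"
)

// Stand-in types mirroring what the diffs imply about bleve's analysis
// package: a TokenStream is a slice of token pointers, so mutating a
// token's Term inside a range loop changes the stream the caller holds.
type Token struct {
	Term    []byte
	KeyWord bool
}

type TokenStream []*Token

// Pattern 1: a filter that emits exactly one token per input token can
// rewrite each token in place and return the input stream, allocating
// nothing.
func lowerCaseInPlace(input TokenStream) TokenStream {
	for _, token := range input {
		token.Term = []byte(strings.ToLower(string(token.Term)))
	}
	return input
}

func main() {
	ts := TokenStream{{Term: []byte("Hello")}, {Term: []byte("World")}}
	lowerCaseInPlace(ts)
	fmt.Println(string(ts[0].Term), string(ts[1].Term)) // prints: hello world
}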

View File

@@ -46,15 +46,11 @@ func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
 }
 func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -32,7 +32,7 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	r := ring.New(2)
 	itemsInRing := 0
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		if token.Type == analysis.Ideographic {
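Filters like this bigram filter can emit a different number of tokens than they receive, so they still allocate a result stream; the change only pre-sizes its capacity. A sketch of that second pattern, reusing the stand-in Token/TokenStream types from the sketch above (the minimum-length rule is illustrative, loosely modeled on the length filter changed later in this commit, and counts bytes rather than runes for simplicity):

// Pattern 2: a filter that may drop (or add) tokens still allocates a
// result slice, but with capacity len(input) so append rarely has to
// grow the backing array.
func dropShortTokens(input TokenStream, minLen int) TokenStream {
	rv := make(TokenStream, 0, len(input))
	for _, token := range input {
		if len(token.Term) >= minLen {
			rv = append(rv, token)
		}
	}
	return rv
}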

View File

@@ -56,15 +56,11 @@ func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
 }
 func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -27,18 +27,14 @@ func NewSoraniStemmerFilter() *SoraniStemmerFilter {
 }
 func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
 			stemmed := stem(token.Term)
 			token.Term = stemmed
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func stem(input []byte) []byte {

View File

@@ -32,15 +32,11 @@ func NewGermanNormalizeFilter() *GermanNormalizeFilter {
 }
 func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -32,7 +32,6 @@ func NewPossessiveFilter() *PossessiveFilter {
 }
 func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	for _, token := range input {
 		runes := bytes.Runes(token.Term)
 		if len(runes) >= 2 {
@@ -46,7 +45,6 @@ func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 			}
 		}
 	}
 	return input
 }

View File

@@ -38,15 +38,11 @@ func NewPersianNormalizeFilter() *PersianNormalizeFilter {
 }
 func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -26,15 +26,11 @@ func NewHindiNormalizeFilter() *HindiNormalizeFilter {
 }
 func (s *HindiNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -27,18 +27,14 @@ func NewHindiStemmerFilter() *HindiStemmerFilter {
 }
 func (s *HindiStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
 			stemmed := stem(token.Term)
 			token.Term = stemmed
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func stem(input []byte) []byte {

View File

@@ -46,7 +46,7 @@ func (t *KagomeMorphTokenizer) Tokenize(input []byte) analysis.TokenStream {
 		prevstart int
 	)
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	if len(input) < 1 {
 		return rv
 	}

View File

@@ -29,18 +29,15 @@ func NewApostropheFilter() *ApostropheFilter {
 }
 func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
 		if firstApostrophe >= 0 {
 			// found an apostrophe
 			token.Term = token.Term[0:firstApostrophe]
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func ApostropheFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

View File

@@ -33,7 +33,7 @@ func NewCld2Filter() *Cld2Filter {
 }
 func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	offset := 0
 	for _, token := range input {

View File

@@ -40,7 +40,7 @@ func NewEdgeNgramFilter(side Side, minLength, maxLength int) *EdgeNgramFilter {
 }
 func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		runeCount := utf8.RuneCount(token.Term)

View File

@@ -35,8 +35,6 @@ func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
 }
 func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
 		if firstApostrophe >= 0 {
@@ -48,10 +46,8 @@ func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 				token.Term = token.Term[firstApostrophe+1:]
 			}
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

View File

@@ -29,7 +29,6 @@ func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter {
 }
 func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	for _, token := range input {
 		word := string(token.Term)
 		_, isKeyWord := f.keyWords[word]
@@ -37,7 +36,6 @@ func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 			token.KeyWord = true
 		}
 	}
 	return input
 }

View File

@@ -32,7 +32,7 @@ func NewLengthFilter(min, max int) *LengthFilter {
 }
 func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		wordLen := utf8.RuneCount(token.Term)

View File

@@ -26,16 +26,12 @@ func NewLowerCaseFilter() *LowerCaseFilter {
 }
 func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		word := string(token.Term)
 		wordLowerCase := strings.ToLower(word)
 		token.Term = []byte(wordLowerCase)
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

View File

@@ -33,7 +33,7 @@ func NewNgramFilter(minLength, maxLength int) *NgramFilter {
 }
 func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		runeCount := utf8.RuneCount(token.Term)

View File

@@ -32,7 +32,7 @@ func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter {
 }
 func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	currentPosition := 0
 	for _, token := range input {

View File

@@ -16,7 +16,7 @@ import (
 	"github.com/blevesearch/bleve/analysis"
 )
-func TestNgramFilter(t *testing.T) {
+func TestShingleFilter(t *testing.T) {
 	tests := []struct {
 		min int

View File

@@ -22,18 +22,22 @@ import (
 const Name = "stem"
 type StemmerFilter struct {
 	lang string
-	stemmer *snowball.Stemmer
+	stemmerPool chan *snowball.Stemmer
 }
 func NewStemmerFilter(lang string) (*StemmerFilter, error) {
-	stemmer, err := snowball.New(lang)
-	if err != nil {
-		return nil, err
+	stemmerPool := make(chan *snowball.Stemmer, 4)
+	for i := 0; i < 4; i++ {
+		stemmer, err := snowball.New(lang)
+		if err != nil {
+			return nil, err
+		}
+		stemmerPool <- stemmer
 	}
 	return &StemmerFilter{
 		lang: lang,
-		stemmer: stemmer,
+		stemmerPool: stemmerPool,
 	}, nil
 }
@@ -50,18 +54,16 @@ func (s *StemmerFilter) List() []string {
 }
 func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
-			stemmed := s.stemmer.Stem(string(token.Term))
+			stemmer := <-s.stemmerPool
+			stemmed := stemmer.Stem(string(token.Term))
+			s.stemmerPool <- stemmer
 			token.Term = []byte(stemmed)
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
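Beyond the stream reuse, this file swaps the single shared *snowball.Stemmer for a pool of four stemmers held in a buffered channel: Filter checks one out and returns it around each Stem call, so concurrent filter runs no longer contend on a single stemmer instance. A minimal, self-contained sketch of that channel-as-pool pattern with a stand-in Resource type (only the pool size of 4 comes from the diff; the names below are illustrative, not bleve's):

package main

import "fmt"

// Resource stands in for an expensive, non-thread-safe object such as a
// snowball stemmer instance.
type Resource struct{ id int }

func (r *Resource) Stem(s string) string {
	// placeholder "work"; a real stemmer would transform the term
	return fmt.Sprintf("%s (stemmed by #%d)", s, r.id)
}

// newPool fills a buffered channel with n resources. The channel itself
// is the pool: receive to check a resource out, send to return it.
func newPool(n int) chan *Resource {
	pool := make(chan *Resource, n)
	for i := 0; i < n; i++ {
		pool <- &Resource{id: i}
	}
	return pool
}

func main() {
	pool := newPool(4) // the diff uses a pool of 4 stemmers

	// Check out, use, and return, as the new Filter body does around
	// each Stem call; callers block only when all 4 resources are busy.
	r := <-pool
	out := r.Stem("running")
	pool <- r

	fmt.Println(out)
}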

View File

@@ -29,7 +29,7 @@ func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
 }
 func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		tokenTerm := string(token.Term)

View File

@@ -31,8 +31,6 @@ func NewTruncateTokenFilter(length int) *TruncateTokenFilter {
 }
 func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		wordLen := utf8.RuneCount(token.Term)
 		if wordLen > s.length {
@@ -45,10 +43,8 @@ func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 			}
 			token.Term = newterm
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

View File

@@ -54,14 +54,10 @@ func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter {
 }
 func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		token.Term = s.form.Bytes(token.Term)
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {