
modified token filters to avoid creating new token streams

often the result stream was the same length, so can reuse the
existing token stream
also, in cases where a new stream was required, set the capacity to
the length of the input stream.  most output streams are at least
as long as the input, so this may avoid some subsequent resizing
Marty Schoch 2014-09-23 18:41:32 -04:00
parent 95e6e37e67
commit 1dc466a800
24 changed files with 36 additions and 85 deletions
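
The change applies two patterns across the filters below; a minimal sketch of both (the filter types and the stand-in transform/predicate are hypothetical, not code from this commit) looks like this:

package example

import (
	"bytes"

	"github.com/blevesearch/bleve/analysis"
)

// Hypothetical filters illustrating the two patterns applied in this commit.
type InPlaceFilter struct{}
type DroppingFilter struct{}

// One-to-one case: analysis.TokenStream is a slice of *analysis.Token, so a
// filter that emits exactly one token per input token can mutate the tokens
// in place and hand back the input slice, allocating nothing new.
func (f *InPlaceFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		token.Term = bytes.ToLower(token.Term) // stand-in transformation
	}
	return input
}

// Variable-length case: a filter that may drop or add tokens still builds a
// new stream, but giving it capacity len(input) up front avoids most of the
// growth that repeated append calls would otherwise trigger.
func (f *DroppingFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0, len(input))
	for _, token := range input {
		if len(token.Term) > 0 { // stand-in predicate
			rv = append(rv, token)
		}
	}
	return rv
}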

View File

@@ -46,15 +46,11 @@ func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
 }
 func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -32,7 +32,7 @@ func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	r := ring.New(2)
 	itemsInRing := 0
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		if token.Type == analysis.Ideographic {

View File

@@ -56,15 +56,11 @@ func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
 }
 func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -27,18 +27,14 @@ func NewSoraniStemmerFilter() *SoraniStemmerFilter {
 }
 func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
 			stemmed := stem(token.Term)
 			token.Term = stemmed
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func stem(input []byte) []byte {

View File

@@ -32,15 +32,11 @@ func NewGermanNormalizeFilter() *GermanNormalizeFilter {
 }
 func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -32,7 +32,6 @@ func NewPossessiveFilter() *PossessiveFilter {
 }
 func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		runes := bytes.Runes(token.Term)
 		if len(runes) >= 2 {
@@ -46,7 +45,6 @@ func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 			}
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }

View File

@@ -38,15 +38,11 @@ func NewPersianNormalizeFilter() *PersianNormalizeFilter {
 }
 func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -26,15 +26,11 @@ func NewHindiNormalizeFilter() *HindiNormalizeFilter {
 }
 func (s *HindiNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		term := normalize(token.Term)
 		token.Term = term
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func normalize(input []byte) []byte {

View File

@@ -27,18 +27,14 @@ func NewHindiStemmerFilter() *HindiStemmerFilter {
 }
 func (s *HindiStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
 			stemmed := stem(token.Term)
 			token.Term = stemmed
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func stem(input []byte) []byte {

View File

@@ -46,7 +46,7 @@ func (t *KagomeMorphTokenizer) Tokenize(input []byte) analysis.TokenStream {
 		prevstart int
 	)
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	if len(input) < 1 {
 		return rv
 	}

View File

@@ -29,18 +29,15 @@ func NewApostropheFilter() *ApostropheFilter {
 }
 func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
 		if firstApostrophe >= 0 {
 			// found an apostrophe
 			token.Term = token.Term[0:firstApostrophe]
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func ApostropheFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

View File

@@ -33,7 +33,7 @@ func NewCld2Filter() *Cld2Filter {
 }
 func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	offset := 0
 	for _, token := range input {

View File

@@ -40,7 +40,7 @@ func NewEdgeNgramFilter(side Side, minLength, maxLength int) *EdgeNgramFilter {
 }
 func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		runeCount := utf8.RuneCount(token.Term)

View File

@@ -35,8 +35,6 @@ func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
 }
 func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
 		if firstApostrophe >= 0 {
@@ -48,10 +46,8 @@ func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 				token.Term = token.Term[firstApostrophe+1:]
 			}
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

View File

@@ -29,7 +29,6 @@ func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter {
 }
 func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		word := string(token.Term)
 		_, isKeyWord := f.keyWords[word]
@@ -37,7 +36,6 @@ func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 			token.KeyWord = true
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }

View File

@@ -32,7 +32,7 @@ func NewLengthFilter(min, max int) *LengthFilter {
 }
 func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		wordLen := utf8.RuneCount(token.Term)

View File

@@ -26,16 +26,12 @@ func NewLowerCaseFilter() *LowerCaseFilter {
 }
 func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		word := string(token.Term)
 		wordLowerCase := strings.ToLower(word)
 		token.Term = []byte(wordLowerCase)
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

View File

@@ -33,7 +33,7 @@ func NewNgramFilter(minLength, maxLength int) *NgramFilter {
 }
 func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		runeCount := utf8.RuneCount(token.Term)

View File

@@ -32,7 +32,7 @@ func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter {
 }
 func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	currentPosition := 0
 	for _, token := range input {

View File

@@ -16,7 +16,7 @@ import (
 	"github.com/blevesearch/bleve/analysis"
 )
-func TestNgramFilter(t *testing.T) {
+func TestShingleFilter(t *testing.T) {
 	tests := []struct {
 		min int

View File

@@ -22,18 +22,22 @@ import (
 const Name = "stem"
 type StemmerFilter struct {
-	lang    string
-	stemmer *snowball.Stemmer
+	lang        string
+	stemmerPool chan *snowball.Stemmer
 }
 func NewStemmerFilter(lang string) (*StemmerFilter, error) {
-	stemmer, err := snowball.New(lang)
-	if err != nil {
-		return nil, err
+	stemmerPool := make(chan *snowball.Stemmer, 4)
+	for i := 0; i < 4; i++ {
+		stemmer, err := snowball.New(lang)
+		if err != nil {
+			return nil, err
+		}
+		stemmerPool <- stemmer
 	}
 	return &StemmerFilter{
-		lang:    lang,
-		stemmer: stemmer,
+		lang:        lang,
+		stemmerPool: stemmerPool,
 	}, nil
 }
@@ -50,18 +54,16 @@ func (s *StemmerFilter) List() []string {
 }
 func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		// if not protected keyword, stem it
 		if !token.KeyWord {
-			stemmed := s.stemmer.Stem(string(token.Term))
+			stemmer := <-s.stemmerPool
+			stemmed := stemmer.Stem(string(token.Term))
+			s.stemmerPool <- stemmer
 			token.Term = []byte(stemmed)
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
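
Beyond the stream reuse, the stemmer hunk above also swaps the single shared snowball stemmer for a pool of four fed through a buffered channel, presumably because a stemmer instance is not safe to share across concurrent Filter calls. A minimal sketch of that check-out/check-in pattern, with hypothetical names and a stand-in resource type rather than the real snowball.Stemmer, is:

package example

// A buffered channel acts as a fixed-size pool: fill it with n reusable
// resources up front; each caller receives one (blocking while all are
// checked out), uses it, and sends it back when done.
type resource struct{} // stand-in for something like *snowball.Stemmer

func newPool(n int) chan *resource {
	pool := make(chan *resource, n)
	for i := 0; i < n; i++ {
		pool <- &resource{}
	}
	return pool
}

func withResource(pool chan *resource, work func(*resource)) {
	r := <-pool // check out; blocks if all n resources are in use
	work(r)
	pool <- r // check back in for the next caller
}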

View File

@@ -29,7 +29,7 @@ func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
 }
 func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
+	rv := make(analysis.TokenStream, 0, len(input))
 	for _, token := range input {
 		tokenTerm := string(token.Term)

View File

@@ -31,8 +31,6 @@ func NewTruncateTokenFilter(length int) *TruncateTokenFilter {
 }
 func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		wordLen := utf8.RuneCount(token.Term)
 		if wordLen > s.length {
@@ -45,10 +43,8 @@ func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 			}
 			token.Term = newterm
 		}
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

View File

@@ -54,14 +54,10 @@ func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter {
 }
 func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
 	for _, token := range input {
 		token.Term = s.form.Bytes(token.Term)
-		rv = append(rv, token)
 	}
-	return rv
+	return input
 }
 func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {