
changed many components to not have defaults

many of these defaults were arbitrary, and not having
defaults lets us more easily flag them for configuration
added a shingle filter
introduced a new token type for shingles
Marty Schoch 2014-09-09 18:15:14 -04:00
parent 8dd8fb8910
commit 8debf26cb7
13 changed files with 554 additions and 39 deletions
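For context, a minimal sketch (not part of the commit) of what removing the defaults means for callers: constructors such as the ngram filter's now return an error when a setting is omitted, instead of silently falling back to an arbitrary value. Passing nil for the cache is fine here only because this particular constructor never consults it.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis/token_filters/ngram_filter"
)

func main() {
	// JSON numbers decode to float64, hence the float literals.
	_, err := ngram_filter.NgramFilterConstructor(map[string]interface{}{
		"min": 2.0,
		"max": 3.0,
	}, nil)
	fmt.Println(err) // <nil>

	// Omitting "max" is now an error instead of a silent default of 2.
	_, err = ngram_filter.NgramFilterConstructor(map[string]interface{}{
		"min": 2.0,
	}, nil)
	fmt.Println(err) // must specify max
}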

View File

@@ -21,15 +21,20 @@ const Name = "custom"
 func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
 	var charFilters []analysis.CharFilter
-	charFilterNames, ok := config["char_filters"].([]string)
+	charFilterNames, ok := config["char_filters"].([]interface{})
 	if ok {
 		charFilters = make([]analysis.CharFilter, len(charFilterNames))
 		for i, charFilterName := range charFilterNames {
-			charFilter, err := cache.CharFilterNamed(charFilterName)
-			if err != nil {
-				return nil, err
+			charFilterNameString, ok := charFilterName.(string)
+			if ok {
+				charFilter, err := cache.CharFilterNamed(charFilterNameString)
+				if err != nil {
+					return nil, err
+				}
+				charFilters[i] = charFilter
+			} else {
+				return nil, fmt.Errorf("char filter name must be a string")
 			}
-			charFilters[i] = charFilter
 		}
 	}
@@ -44,15 +49,20 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
 	}
 	var tokenFilters []analysis.TokenFilter
-	tokenFilterNames, ok := config["token_filters"].([]string)
+	tokenFilterNames, ok := config["token_filters"].([]interface{})
 	if ok {
 		tokenFilters = make([]analysis.TokenFilter, len(tokenFilterNames))
 		for i, tokenFilterName := range tokenFilterNames {
-			tokenFilter, err := cache.TokenFilterNamed(tokenFilterName)
-			if err != nil {
-				return nil, err
+			tokenFilterNameString, ok := tokenFilterName.(string)
+			if ok {
+				tokenFilter, err := cache.TokenFilterNamed(tokenFilterNameString)
+				if err != nil {
+					return nil, err
+				}
+				tokenFilters[i] = tokenFilter
+			} else {
+				return nil, fmt.Errorf("token filter name must be a string")
 			}
-			tokenFilters[i] = tokenFilter
 		}
 	}
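Why the assertion changed from []string to []interface{}: encoding/json decodes every JSON array into []interface{} when the target is map[string]interface{}, so asserting []string always failed for configs that arrived via JSON. A small standalone demonstration (the filter name here is a placeholder):

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	var config map[string]interface{}
	// "html_strip" is just an illustrative name
	json.Unmarshal([]byte(`{"char_filters": ["html_strip"]}`), &config)

	_, isStringSlice := config["char_filters"].([]string)
	_, isAnySlice := config["char_filters"].([]interface{})
	fmt.Println(isStringSlice, isAnySlice) // false true
}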

View File

@@ -11,6 +11,7 @@ package edge_ngram_filter
 import (
 	"bytes"
+	"fmt"
 	"unicode/utf8"
 
 	"github.com/blevesearch/bleve/analysis"
@@ -100,16 +101,16 @@ func EdgeNgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
 	if ok && back {
 		side = BACK
 	}
-	min := 1
 	minVal, ok := config["min"].(float64)
-	if ok {
-		min = int(minVal)
+	if !ok {
+		return nil, fmt.Errorf("must specify min")
 	}
-	max := 2
+	min := int(minVal)
 	maxVal, ok := config["max"].(float64)
-	if ok {
-		max = int(maxVal)
+	if !ok {
+		return nil, fmt.Errorf("must specify max")
 	}
+	max := int(maxVal)
 	return NewEdgeNgramFilter(side, min, max), nil
 }

View File

@@ -10,6 +10,7 @@
 package length_filter
 
 import (
+	"fmt"
 	"unicode/utf8"
 
 	"github.com/blevesearch/bleve/analysis"
@@ -59,6 +60,9 @@ func LengthFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
 	if ok {
 		max = int(maxVal)
 	}
+	if min == max && max == 0 {
+		return nil, fmt.Errorf("either min or max must be non-zero")
+	}
 	return NewLengthFilter(min, max), nil
 }

View File

@@ -11,6 +11,7 @@ package ngram_filter
 import (
 	"bytes"
+	"fmt"
 	"unicode/utf8"
 
 	"github.com/blevesearch/bleve/analysis"
@@ -70,16 +71,16 @@ func buildTermFromRunes(runes []rune) []byte {
 }
 
 func NgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	min := 1
 	minVal, ok := config["min"].(float64)
-	if ok {
-		min = int(minVal)
+	if !ok {
+		return nil, fmt.Errorf("must specify min")
 	}
-	max := 2
+	min := int(minVal)
 	maxVal, ok := config["max"].(float64)
-	if ok {
-		max = int(maxVal)
+	if !ok {
+		return nil, fmt.Errorf("must specify max")
 	}
+	max := int(maxVal)
 	return NewNgramFilter(min, max), nil
 }

View File

@@ -0,0 +1,157 @@
package shingle

import (
	"container/ring"
	"fmt"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/registry"
)

const Name = "shingle"

// ShingleFilter produces shingles (token n-grams) of every size from min
// to max, joining the constituent terms with tokenSeparator. A ring
// buffer of capacity max holds the most recently seen tokens.
type ShingleFilter struct {
	min            int
	max            int
	outputOriginal bool
	tokenSeparator string
	fill           string
	ring           *ring.Ring
	itemsInRing    int
}

func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter {
	return &ShingleFilter{
		min:            min,
		max:            max,
		outputOriginal: outputOriginal,
		tokenSeparator: sep,
		fill:           fill,
		ring:           ring.New(max),
	}
}

func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)

	currentPosition := 0
	for _, token := range input {
		if s.outputOriginal {
			rv = append(rv, token)
		}

		// if there are gaps, insert filler tokens
		offset := token.Position - currentPosition
		for offset > 1 {
			fillerToken := analysis.Token{
				Position: 0,
				Start:    -1,
				End:      -1,
				Type:     analysis.AlphaNumeric,
				Term:     []byte(s.fill),
			}
			s.ring.Value = &fillerToken
			if s.itemsInRing < s.max {
				s.itemsInRing++
			}
			rv = append(rv, s.shingleCurrentRingState()...)
			s.ring = s.ring.Next()
			offset--
		}
		currentPosition = token.Position

		// add this token to the ring and emit every shingle it completes
		s.ring.Value = token
		if s.itemsInRing < s.max {
			s.itemsInRing++
		}
		rv = append(rv, s.shingleCurrentRingState()...)
		s.ring = s.ring.Next()
	}

	return rv
}

// shingleCurrentRingState emits one shingle for each size between min and
// max for which enough tokens have been buffered, each ending at the
// token most recently placed in the ring.
func (s *ShingleFilter) shingleCurrentRingState() analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	for shingleN := s.min; shingleN <= s.max; shingleN++ {
		// if there are enough items in the ring
		// to produce a shingle of this size
		if s.itemsInRing >= shingleN {
			thisShingleRing := s.ring.Move(-(shingleN - 1))
			shingledBytes := make([]byte, 0)
			pos := 0
			start := -1
			end := 0
			for i := 0; i < shingleN; i++ {
				if i != 0 {
					shingledBytes = append(shingledBytes, []byte(s.tokenSeparator)...)
				}
				curr := thisShingleRing.Value.(*analysis.Token)
				// position and start come from the first non-filler
				// token, end from the last token that has one
				if pos == 0 && curr.Position != 0 {
					pos = curr.Position
				}
				if start == -1 && curr.Start != -1 {
					start = curr.Start
				}
				if curr.End != -1 {
					end = curr.End
				}
				shingledBytes = append(shingledBytes, curr.Term...)
				thisShingleRing = thisShingleRing.Next()
			}
			token := analysis.Token{
				Type: analysis.Shingle,
				Term: shingledBytes,
			}
			if pos != 0 {
				token.Position = pos
			}
			if start != -1 {
				token.Start = start
			}
			if end != -1 {
				token.End = end
			}
			rv = append(rv, &token)
		}
	}
	return rv
}

func ShingleFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	minVal, ok := config["min"].(float64)
	if !ok {
		return nil, fmt.Errorf("must specify min")
	}
	min := int(minVal)
	maxVal, ok := config["max"].(float64)
	if !ok {
		return nil, fmt.Errorf("must specify max")
	}
	max := int(maxVal)
	outputOriginal := false
	outVal, ok := config["output_original"].(bool)
	if ok {
		outputOriginal = outVal
	}
	sep := " "
	sepVal, ok := config["separator"].(string)
	if ok {
		sep = sepVal
	}
	fill := "_"
	fillVal, ok := config["filler"].(string)
	if ok {
		fill = fillVal
	}
	return NewShingleFilter(min, max, outputOriginal, sep, fill), nil
}

func init() {
	registry.RegisterTokenFilter(Name, ShingleFilterConstructor)
}
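A minimal usage sketch of the new filter, mirroring the first test case below: bigrams only (min == max == 2), original tokens suppressed.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/analysis/token_filters/shingle"
)

func main() {
	filter := shingle.NewShingleFilter(2, 2, false, " ", "_")
	input := analysis.TokenStream{
		&analysis.Token{Term: []byte("the"), Position: 1},
		&analysis.Token{Term: []byte("quick"), Position: 2},
		&analysis.Token{Term: []byte("brown"), Position: 3},
	}
	for _, token := range filter.Filter(input) {
		fmt.Println(string(token.Term)) // "the quick", then "quick brown"
	}
}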

View File

@@ -0,0 +1,330 @@
//  Copyright (c) 2014 Couchbase, Inc.
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
//  except in compliance with the License. You may obtain a copy of the License at
//    http://www.apache.org/licenses/LICENSE-2.0
//  Unless required by applicable law or agreed to in writing, software distributed under the
//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
//  either express or implied. See the License for the specific language governing permissions
//  and limitations under the License.
package shingle

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/analysis"
)

func TestShingleFilter(t *testing.T) {

	tests := []struct {
		min            int
		max            int
		outputOriginal bool
		separator      string
		filler         string
		input          analysis.TokenStream
		output         analysis.TokenStream
	}{
		{
			min:            2,
			max:            2,
			outputOriginal: false,
			separator:      " ",
			filler:         "_",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the"),
				},
				&analysis.Token{
					Term: []byte("quick"),
				},
				&analysis.Token{
					Term: []byte("brown"),
				},
				&analysis.Token{
					Term: []byte("fox"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the quick"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("quick brown"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("brown fox"),
					Type: analysis.Shingle,
				},
			},
		},
		{
			min:            3,
			max:            3,
			outputOriginal: false,
			separator:      " ",
			filler:         "_",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the"),
				},
				&analysis.Token{
					Term: []byte("quick"),
				},
				&analysis.Token{
					Term: []byte("brown"),
				},
				&analysis.Token{
					Term: []byte("fox"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the quick brown"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("quick brown fox"),
					Type: analysis.Shingle,
				},
			},
		},
		{
			min:            2,
			max:            3,
			outputOriginal: false,
			separator:      " ",
			filler:         "_",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the"),
				},
				&analysis.Token{
					Term: []byte("quick"),
				},
				&analysis.Token{
					Term: []byte("brown"),
				},
				&analysis.Token{
					Term: []byte("fox"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the quick"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("quick brown"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("the quick brown"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("brown fox"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("quick brown fox"),
					Type: analysis.Shingle,
				},
			},
		},
		{
			min:            3,
			max:            3,
			outputOriginal: false,
			separator:      " ",
			filler:         "_",
			input: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("ugly"),
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("quick"),
					Position: 3,
				},
				&analysis.Token{
					Term:     []byte("brown"),
					Position: 4,
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("ugly _ quick"),
					Type:     analysis.Shingle,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("_ quick brown"),
					Type:     analysis.Shingle,
					Position: 3,
				},
			},
		},
		{
			min:            1,
			max:            5,
			outputOriginal: false,
			separator:      " ",
			filler:         "_",
			input: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("text"),
					Position: 2,
				},
				// token 3 removed by stop filter
				&analysis.Token{
					Term:     []byte("see"),
					Position: 4,
				},
				&analysis.Token{
					Term:     []byte("shingles"),
					Position: 5,
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.Shingle,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("text"),
					Type:     analysis.Shingle,
					Position: 2,
				},
				&analysis.Token{
					Term:     []byte("test text"),
					Type:     analysis.Shingle,
					Position: 1,
				},
				&analysis.Token{
					Term: []byte("_"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term:     []byte("text _"),
					Type:     analysis.Shingle,
					Position: 2,
				},
				&analysis.Token{
					Term:     []byte("test text _"),
					Type:     analysis.Shingle,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("see"),
					Type:     analysis.Shingle,
					Position: 4,
				},
				&analysis.Token{
					Term:     []byte("_ see"),
					Type:     analysis.Shingle,
					Position: 4,
				},
				&analysis.Token{
					Term:     []byte("text _ see"),
					Type:     analysis.Shingle,
					Position: 2,
				},
				&analysis.Token{
					Term:     []byte("test text _ see"),
					Type:     analysis.Shingle,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("shingles"),
					Type:     analysis.Shingle,
					Position: 5,
				},
				&analysis.Token{
					Term:     []byte("see shingles"),
					Type:     analysis.Shingle,
					Position: 4,
				},
				&analysis.Token{
					Term:     []byte("_ see shingles"),
					Type:     analysis.Shingle,
					Position: 4,
				},
				&analysis.Token{
					Term:     []byte("text _ see shingles"),
					Type:     analysis.Shingle,
					Position: 2,
				},
				&analysis.Token{
					Term:     []byte("test text _ see shingles"),
					Type:     analysis.Shingle,
					Position: 1,
				},
			},
		},
		{
			min:            2,
			max:            2,
			outputOriginal: true,
			separator:      " ",
			filler:         "_",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the"),
				},
				&analysis.Token{
					Term: []byte("quick"),
				},
				&analysis.Token{
					Term: []byte("brown"),
				},
				&analysis.Token{
					Term: []byte("fox"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the"),
				},
				&analysis.Token{
					Term: []byte("quick"),
				},
				&analysis.Token{
					Term: []byte("the quick"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("brown"),
				},
				&analysis.Token{
					Term: []byte("quick brown"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("fox"),
				},
				&analysis.Token{
					Term: []byte("brown fox"),
					Type: analysis.Shingle,
				},
			},
		},
	}

	for _, test := range tests {
		shingleFilter := NewShingleFilter(test.min, test.max, test.outputOriginal, test.separator, test.filler)
		actual := shingleFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output, actual)
		}
	}
}

View File

@@ -12,6 +12,8 @@
 package stemmer_filter
 
 import (
+	"fmt"
+
 	"bitbucket.org/tebeka/snowball"
 
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
@@ -63,11 +65,11 @@ func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 }
 
 func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	lang := "en"
 	langVal, ok := config["lang"].(string)
-	if ok {
-		lang = langVal
+	if !ok {
+		return nil, fmt.Errorf("must specify stemmer language")
 	}
+	lang := langVal
 	return NewStemmerFilter(lang)
 }

View File

@@ -11,6 +11,7 @@ package truncate_token_filter
 import (
 	"bytes"
+	"fmt"
 	"unicode/utf8"
 
 	"github.com/blevesearch/bleve/analysis"
@@ -51,12 +52,11 @@ func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 }
 
 func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	length := 25
 	lenVal, ok := config["length"].(float64)
-	if ok {
-		length = int(lenVal)
+	if !ok {
+		return nil, fmt.Errorf("must specify length")
 	}
+	length := int(lenVal)
 	return NewTruncateTokenFilter(length), nil
 }

View File

@@ -65,11 +65,11 @@ func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 }
 
 func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	form := NFKC
 	formVal, ok := config["form"].(string)
-	if ok {
-		form = formVal
+	if !ok {
+		return nil, fmt.Errorf("must specify form")
 	}
+	form := formVal
 	return NewUnicodeNormalizeFilter(form)
 }

View File

@@ -19,7 +19,7 @@ import (
 const Name = "whitespace"
 
-var whitespaceTokenizerRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
+var whitespaceTokenizerRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|[^\p{Z}\p{P}\p{C}]+`)
 
 func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
 	return regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp), nil
View File

@@ -25,15 +25,16 @@ const (
 	Ideographic
 	Numeric
 	DateTime
+	Shingle
 )
 
 type Token struct {
-	Start    int
-	End      int
-	Term     []byte
-	Position int
-	Type     TokenType
-	KeyWord  bool
+	Start    int       `json:"start"`
+	End      int       `json:"end"`
+	Term     []byte    `json:"term"`
+	Position int       `json:"position"`
+	Type     TokenType `json:"type"`
+	KeyWord  bool      `json:"keyword"`
 }
 
 func (t *Token) String() string {
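With the new struct tags, tokens serialize to lower-case JSON keys. Note that the []byte Term field marshals as base64, per encoding/json's default handling of byte slices. A quick sketch:

package main

import (
	"encoding/json"
	"fmt"

	"github.com/blevesearch/bleve/analysis"
)

func main() {
	token := analysis.Token{
		Term:     []byte("quick brown"),
		Type:     analysis.Shingle,
		Position: 1,
		End:      11,
	}
	b, _ := json.Marshal(&token)
	fmt.Println(string(b))
	// prints something like:
	// {"start":0,"end":11,"term":"cXVpY2sgYnJvd24=","position":1,"type":4,"keyword":false}
}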

View File

@@ -46,6 +46,7 @@ import (
 	_ "github.com/blevesearch/bleve/analysis/token_filters/length_filter"
 	_ "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
 	_ "github.com/blevesearch/bleve/analysis/token_filters/ngram_filter"
+	_ "github.com/blevesearch/bleve/analysis/token_filters/shingle"
 	_ "github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
 	_ "github.com/blevesearch/bleve/analysis/token_filters/truncate_token_filter"
 	_ "github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"

View File

@@ -443,3 +443,11 @@ func (im *IndexMapping) datetimeParserNameForPath(path string) string {
 	return im.DefaultDateTimeParser
 }
+
+func (im *IndexMapping) AnalyzeText(analyzerName string, text []byte) (analysis.TokenStream, error) {
+	analyzer, err := im.cache.AnalyzerNamed(analyzerName)
+	if err != nil {
+		return nil, err
+	}
+	return analyzer.Analyze(text), nil
+}
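A hedged sketch of how the new AnalyzeText helper might be used to inspect an analyzer's output. bleve.NewIndexMapping and the registered "standard" analyzer name are assumptions from the surrounding library, not part of this commit.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve"
)

func main() {
	mapping := bleve.NewIndexMapping()
	tokens, err := mapping.AnalyzeText("standard", []byte("the quick brown fox"))
	if err != nil {
		fmt.Println(err)
		return
	}
	for _, token := range tokens {
		fmt.Printf("%d: %s\n", token.Position, string(token.Term))
	}
}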