
introduced token type

Marty Schoch 2014-07-31 13:54:12 -04:00
parent c8918fe41a
commit 25540c736a
11 changed files with 110 additions and 115 deletions

View File

@@ -38,6 +38,7 @@ func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
 		}
 		token.Start = offset
 		token.End = token.Start + len(token.Term)
+		token.Type = analysis.AlphaNumeric
 		rv = append(rv, token)
 		offset = token.End + 1
 	}

View File

@@ -27,6 +27,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 19,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -35,6 +36,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 2,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -45,6 +47,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 21,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -53,6 +56,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 2,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -63,6 +67,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 72,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -71,6 +76,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 2,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -81,6 +87,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 26,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -89,6 +96,7 @@ func TestCld2Filter(t *testing.T) {
 					Start: 0,
 					End: 2,
 					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},

View File

@@ -12,14 +12,6 @@ import (
 	"github.com/couchbaselabs/bleve/analysis"
 )

-// var DEFAULT_STOP_WORDS []string = []string{
-// 	"a", "an", "and", "are", "as", "at", "be", "but", "by",
-// 	"for", "if", "in", "into", "is", "it",
-// 	"no", "not", "of", "on", "or", "such",
-// 	"that", "the", "their", "then", "there", "these",
-// 	"they", "this", "to", "was", "will", "with",
-// }
-
 type StopWordsFilter struct {
 	stopWords StopWordsMap
 }
@@ -43,11 +35,3 @@ func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	return rv
 }

-// func buildStopWordMap(words []string) map[string]bool {
-// 	rv := make(map[string]bool, len(words))
-// 	for _, word := range words {
-// 		rv[word] = true
-// 	}
-// 	return rv
-// }

View File

@@ -33,6 +33,7 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
 			Start: match[0],
 			End: match[1],
 			Position: i + 1,
+			Type: analysis.AlphaNumeric,
 		}
 		rv[i] = &token
 	}

View File

@@ -6,10 +6,11 @@
 // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 // either express or implied. See the License for the specific language governing permissions
 // and limitations under the License.
-package simple_word_boundary
+package regexp_tokenizer

 import (
 	"reflect"
+	"regexp"
 	"testing"

 	"github.com/couchbaselabs/bleve/analysis"
@@ -17,6 +18,8 @@ import (

 func TestBoundary(t *testing.T) {

+	wordRegex := regexp.MustCompile(`\w+`)
+
 	tests := []struct {
 		input  []byte
 		output analysis.TokenStream
@@ -25,23 +28,25 @@ func TestBoundary(t *testing.T) {
 			[]byte("Hello World."),
 			analysis.TokenStream{
 				{
-					0,
-					5,
-					[]byte("Hello"),
-					1,
+					Start: 0,
+					End: 5,
+					Term: []byte("Hello"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					6,
-					11,
-					[]byte("World"),
-					2,
+					Start: 6,
+					End: 11,
+					Term: []byte("World"),
+					Position: 2,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
 	}

 	for _, test := range tests {
-		tokenizer := NewSimpleWordBoundaryTokenizer()
+		tokenizer := NewRegexpTokenizer(wordRegex)
 		actual := tokenizer.Tokenize(test.input)
 		if !reflect.DeepEqual(actual, test.output) {

View File

@@ -1,29 +0,0 @@
-// Copyright (c) 2014 Couchbase, Inc.
-// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-// except in compliance with the License. You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software distributed under the
-// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-// either express or implied. See the License for the specific language governing permissions
-// and limitations under the License.
-package simple_word_boundary
-
-import (
-	"regexp"
-
-	"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
-)
-
-const wordPattern = `\w+`
-
-var wordRegex = regexp.MustCompile(wordPattern)
-
-type SimpleWordBoundaryTokenizer struct {
-	*regexp_tokenizer.RegexpTokenizer
-}
-
-func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer {
-	return &SimpleWordBoundaryTokenizer{
-		regexp_tokenizer.NewRegexpTokenizer(wordRegex),
-	}
-}
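
Note: the deleted SimpleWordBoundaryTokenizer above was only a thin wrapper around the generic regexp tokenizer. The sketch below (not part of this commit; the standalone main package is hypothetical) shows the equivalent direct usage, based only on the calls visible in this diff:

package main

import (
	"fmt"
	"regexp"

	"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
)

func main() {
	// The same `\w+` word pattern the removed wrapper compiled is now
	// passed straight to the generic regexp tokenizer, as the rewritten
	// test above does.
	wordRegex := regexp.MustCompile(`\w+`)
	tokenizer := regexp_tokenizer.NewRegexpTokenizer(wordRegex)

	// After this commit each token also carries Type: analysis.AlphaNumeric.
	for _, token := range tokenizer.Tokenize([]byte("Hello World.")) {
		fmt.Printf("%d-%d %s\n", token.Start, token.End, token.Term)
	}
}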

View File

@@ -26,6 +26,7 @@ func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream {
 			Position: 1,
 			Start: 0,
 			End: len(input),
+			Type: analysis.AlphaNumeric,
 		},
 	}
 }

View File

@@ -25,10 +25,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
 			[]byte("Hello World"),
 			analysis.TokenStream{
 				{
-					0,
-					11,
-					[]byte("Hello World"),
-					1,
+					Start: 0,
+					End: 11,
+					Term: []byte("Hello World"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -36,10 +37,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
 			[]byte("こんにちは世界"),
 			analysis.TokenStream{
 				{
-					0,
-					21,
-					[]byte("こんにちは世界"),
-					1,
+					Start: 0,
+					End: 21,
+					Term: []byte("こんにちは世界"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -47,10 +49,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
 			[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
 			analysis.TokenStream{
 				{
-					0,
-					72,
-					[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
-					1,
+					Start: 0,
+					End: 72,
+					Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},

View File

@@ -103,6 +103,7 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStream {
 			End: int(indexB),
 			Term: input[indexA:indexB],
 			Position: position,
+			Type: analysis.AlphaNumeric,
 		}
 		rv = append(rv, &token)
 	}

View File

@@ -27,16 +27,18 @@ func TestBoundary(t *testing.T) {
 			"en_US",
 			analysis.TokenStream{
 				{
-					0,
-					5,
-					[]byte("Hello"),
-					1,
+					Start: 0,
+					End: 5,
+					Term: []byte("Hello"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					6,
-					11,
-					[]byte("World"),
-					2,
+					Start: 6,
+					End: 11,
+					Term: []byte("World"),
+					Position: 2,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -45,16 +47,18 @@ func TestBoundary(t *testing.T) {
 			"en_US",
 			analysis.TokenStream{
 				{
-					0,
-					15,
-					[]byte("こんにちは"),
-					1,
+					Start: 0,
+					End: 15,
+					Term: []byte("こんにちは"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					15,
-					21,
-					[]byte("世界"),
-					2,
+					Start: 15,
+					End: 21,
+					Term: []byte("世界"),
+					Position: 2,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -63,52 +67,60 @@ func TestBoundary(t *testing.T) {
 			"th_TH",
 			analysis.TokenStream{
 				{
-					0,
-					9,
-					[]byte("แยก"),
-					1,
+					Start: 0,
+					End: 9,
+					Term: []byte("แยก"),
+					Position: 1,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					9,
-					15,
-					[]byte("คำ"),
-					2,
+					Start: 9,
+					End: 15,
+					Term: []byte("คำ"),
+					Position: 2,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					15,
-					27,
-					[]byte("ภาษา"),
-					3,
+					Start: 15,
+					End: 27,
+					Term: []byte("ภาษา"),
+					Position: 3,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					27,
-					36,
-					[]byte("ไทย"),
-					4,
+					Start: 27,
+					End: 36,
+					Term: []byte("ไทย"),
+					Position: 4,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					36,
-					42,
-					[]byte("ก็"),
-					5,
+					Start: 36,
+					End: 42,
+					Term: []byte("ก็"),
+					Position: 5,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					42,
-					57,
-					[]byte("ทำได้"),
-					6,
+					Start: 42,
+					End: 57,
+					Term: []byte("ทำได้"),
+					Position: 6,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					57,
-					63,
-					[]byte("นะ"),
-					7,
+					Start: 57,
+					End: 63,
+					Term: []byte("นะ"),
+					Position: 7,
+					Type: analysis.AlphaNumeric,
 				},
 				{
-					63,
-					72,
-					[]byte("จ้ะ"),
-					8,
+					Start: 63,
+					End: 72,
+					Term: []byte("จ้ะ"),
+					Position: 8,
+					Type: analysis.AlphaNumeric,
 				},
 			},
 		},

View File

@@ -16,11 +16,19 @@ type CharFilter interface {
 	Filter([]byte) []byte
 }

+type TokenType int
+
+const (
+	AlphaNumeric TokenType = iota
+	Numeric
+)
+
 type Token struct {
 	Start int
 	End int
 	Term []byte
 	Position int
+	Type TokenType
 }

 func (t *Token) String() string {
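
For illustration only, a minimal sketch of building a token with the new Type field, assuming nothing beyond the Token and TokenType definitions shown above (the standalone main package is hypothetical and not part of this commit):

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
)

func main() {
	// Construct a token the way the tokenizers in this commit now do,
	// stamping the new Type field with the AlphaNumeric constant.
	token := analysis.Token{
		Start:    0,
		End:      5,
		Term:     []byte("Hello"),
		Position: 1,
		Type:     analysis.AlphaNumeric,
	}
	fmt.Println(token.String())
}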