introduced token type

2014-07-31 13:54:12 -04:00 · 2014-07-31 13:54:12 -04:00 · 25540c736a
commit 25540c736a
parent c8918fe41a
11 changed files with 110 additions and 115 deletions
--- a/analysis/token_filters/cld2/cld2_filter.go
+++ b/analysis/token_filters/cld2/cld2_filter.go
@@ -38,6 +38,7 @@ func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream {
 		}
 		token.Start = offset
 		token.End = token.Start + len(token.Term)
+		token.Type = analysis.AlphaNumeric
 		rv = append(rv, token)
 		offset = token.End + 1
 	}
--- a/analysis/token_filters/cld2/cld2_filter_test.go
+++ b/analysis/token_filters/cld2/cld2_filter_test.go
@@ -27,6 +27,7 @@ func TestCld2Filter(t *testing.T) {
 					Start:    0,
 					End:      19,
 					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -35,6 +36,7 @@ func TestCld2Filter(t *testing.T) {
 					Start:    0,
 					End:      2,
 					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -45,6 +47,7 @@ func TestCld2Filter(t *testing.T) {
 					Start:    0,
 					End:      21,
 					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -53,6 +56,7 @@ func TestCld2Filter(t *testing.T) {
 					Start:    0,
 					End:      2,
 					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -63,6 +67,7 @@ func TestCld2Filter(t *testing.T) {
 					Start:    0,
 					End:      72,
 					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -71,6 +76,7 @@ func TestCld2Filter(t *testing.T) {
 					Start:    0,
 					End:      2,
 					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -81,6 +87,7 @@ func TestCld2Filter(t *testing.T) {
 					Start:    0,
 					End:      26,
 					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 			output: analysis.TokenStream{
@@ -89,6 +96,7 @@ func TestCld2Filter(t *testing.T) {
 					Start:    0,
 					End:      2,
 					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
--- a/analysis/token_filters/stop_words_filter/stop_words_filter.go
+++ b/analysis/token_filters/stop_words_filter/stop_words_filter.go
@@ -12,14 +12,6 @@ import (
 	"github.com/couchbaselabs/bleve/analysis"
 )

-// var DEFAULT_STOP_WORDS []string = []string{
-// 	"a", "an", "and", "are", "as", "at", "be", "but", "by",
-// 	"for", "if", "in", "into", "is", "it",
-// 	"no", "not", "of", "on", "or", "such",
-// 	"that", "the", "their", "then", "there", "these",
-// 	"they", "this", "to", "was", "will", "with",
-// }
-
 type StopWordsFilter struct {
 	stopWords StopWordsMap
 }
@@ -43,11 +35,3 @@ func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStrea

 	return rv
 }
-
-// func buildStopWordMap(words []string) map[string]bool {
-// 	rv := make(map[string]bool, len(words))
-// 	for _, word := range words {
-// 		rv[word] = true
-// 	}
-// 	return rv
-// }
--- a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go
+++ b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go
@@ -33,6 +33,7 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
 			Start:    match[0],
 			End:      match[1],
 			Position: i + 1,
+			Type:     analysis.AlphaNumeric,
 		}
 		rv[i] = &token
 	}
--- a/analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go
+++ b/analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go
@@ -6,10 +6,11 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package simple_word_boundary
+package regexp_tokenizer

 import (
 	"reflect"
+	"regexp"
 	"testing"

 	"github.com/couchbaselabs/bleve/analysis"
@@ -17,6 +18,8 @@ import (

 func TestBoundary(t *testing.T) {

+	wordRegex := regexp.MustCompile(`\w+`)
+
 	tests := []struct {
 		input  []byte
 		output analysis.TokenStream
@@ -25,23 +28,25 @@ func TestBoundary(t *testing.T) {
 			[]byte("Hello World."),
 			analysis.TokenStream{
 				{
-					0,
-					5,
-					[]byte("Hello"),
-					1,
+					Start:    0,
+					End:      5,
+					Term:     []byte("Hello"),
+					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					6,
-					11,
-					[]byte("World"),
-					2,
+					Start:    6,
+					End:      11,
+					Term:     []byte("World"),
+					Position: 2,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
 	}

 	for _, test := range tests {
-		tokenizer := NewSimpleWordBoundaryTokenizer()
+		tokenizer := NewRegexpTokenizer(wordRegex)
 		actual := tokenizer.Tokenize(test.input)

 		if !reflect.DeepEqual(actual, test.output) {
--- a/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go
+++ b/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go
@@ -1,29 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-package simple_word_boundary
-
-import (
-	"regexp"
-
-	"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
-)
-
-const wordPattern = `\w+`
-
-var wordRegex = regexp.MustCompile(wordPattern)
-
-type SimpleWordBoundaryTokenizer struct {
-	*regexp_tokenizer.RegexpTokenizer
-}
-
-func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer {
-	return &SimpleWordBoundaryTokenizer{
-		regexp_tokenizer.NewRegexpTokenizer(wordRegex),
-	}
-}
--- a/analysis/tokenizers/single_token/single_token.go
+++ b/analysis/tokenizers/single_token/single_token.go
@@ -26,6 +26,7 @@ func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream {
 			Position: 1,
 			Start:    0,
 			End:      len(input),
+			Type:     analysis.AlphaNumeric,
 		},
 	}
 }
--- a/analysis/tokenizers/single_token/single_token_test.go
+++ b/analysis/tokenizers/single_token/single_token_test.go
@@ -25,10 +25,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
 			[]byte("Hello World"),
 			analysis.TokenStream{
 				{
-					0,
-					11,
-					[]byte("Hello World"),
-					1,
+					Start:    0,
+					End:      11,
+					Term:     []byte("Hello World"),
+					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -36,10 +37,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
 			[]byte("こんにちは世界"),
 			analysis.TokenStream{
 				{
-					0,
-					21,
-					[]byte("こんにちは世界"),
-					1,
+					Start:    0,
+					End:      21,
+					Term:     []byte("こんにちは世界"),
+					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -47,10 +49,11 @@ func TestSingleTokenTokenizer(t *testing.T) {
 			[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
 			analysis.TokenStream{
 				{
-					0,
-					72,
-					[]byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
-					1,
+					Start:    0,
+					End:      72,
+					Term:     []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
+					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
--- a/analysis/tokenizers/unicode_word_boundary/boundary.go
+++ b/analysis/tokenizers/unicode_word_boundary/boundary.go
@@ -103,6 +103,7 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStre
 				End:      int(indexB),
 				Term:     input[indexA:indexB],
 				Position: position,
+				Type:     analysis.AlphaNumeric,
 			}
 			rv = append(rv, &token)
 		}
--- a/analysis/tokenizers/unicode_word_boundary/boundary_test.go
+++ b/analysis/tokenizers/unicode_word_boundary/boundary_test.go
@@ -27,16 +27,18 @@ func TestBoundary(t *testing.T) {
 			"en_US",
 			analysis.TokenStream{
 				{
-					0,
-					5,
-					[]byte("Hello"),
-					1,
+					Start:    0,
+					End:      5,
+					Term:     []byte("Hello"),
+					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					6,
-					11,
-					[]byte("World"),
-					2,
+					Start:    6,
+					End:      11,
+					Term:     []byte("World"),
+					Position: 2,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -45,16 +47,18 @@ func TestBoundary(t *testing.T) {
 			"en_US",
 			analysis.TokenStream{
 				{
-					0,
-					15,
-					[]byte("こんにちは"),
-					1,
+					Start:    0,
+					End:      15,
+					Term:     []byte("こんにちは"),
+					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					15,
-					21,
-					[]byte("世界"),
-					2,
+					Start:    15,
+					End:      21,
+					Term:     []byte("世界"),
+					Position: 2,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
@@ -63,52 +67,60 @@ func TestBoundary(t *testing.T) {
 			"th_TH",
 			analysis.TokenStream{
 				{
-					0,
-					9,
-					[]byte("แยก"),
-					1,
+					Start:    0,
+					End:      9,
+					Term:     []byte("แยก"),
+					Position: 1,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					9,
-					15,
-					[]byte("คำ"),
-					2,
+					Start:    9,
+					End:      15,
+					Term:     []byte("คำ"),
+					Position: 2,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					15,
-					27,
-					[]byte("ภาษา"),
-					3,
+					Start:    15,
+					End:      27,
+					Term:     []byte("ภาษา"),
+					Position: 3,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					27,
-					36,
-					[]byte("ไทย"),
-					4,
+					Start:    27,
+					End:      36,
+					Term:     []byte("ไทย"),
+					Position: 4,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					36,
-					42,
-					[]byte("ก็"),
-					5,
+					Start:    36,
+					End:      42,
+					Term:     []byte("ก็"),
+					Position: 5,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					42,
-					57,
-					[]byte("ทำได้"),
-					6,
+					Start:    42,
+					End:      57,
+					Term:     []byte("ทำได้"),
+					Position: 6,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					57,
-					63,
-					[]byte("นะ"),
-					7,
+					Start:    57,
+					End:      63,
+					Term:     []byte("นะ"),
+					Position: 7,
+					Type:     analysis.AlphaNumeric,
 				},
 				{
-					63,
-					72,
-					[]byte("จ้ะ"),
-					8,
+					Start:    63,
+					End:      72,
+					Term:     []byte("จ้ะ"),
+					Position: 8,
+					Type:     analysis.AlphaNumeric,
 				},
 			},
 		},
--- a/analysis/type.go
+++ b/analysis/type.go
@@ -16,11 +16,19 @@ type CharFilter interface {
 	Filter([]byte) []byte
 }

+type TokenType int
+
+const (
+	AlphaNumeric TokenType = iota
+	Numeric
+)
+
 type Token struct {
 	Start    int
 	End      int
 	Term     []byte
 	Position int
+	Type     TokenType
 }

 func (t *Token) String() string {