From 25540c736a05b035cea637c6bf21c09593551cd7 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 31 Jul 2014 13:54:12 -0400 Subject: [PATCH] introduced token type --- analysis/token_filters/cld2/cld2_filter.go | 1 + .../token_filters/cld2/cld2_filter_test.go | 8 ++ .../stop_words_filter/stop_words_filter.go | 16 --- .../regexp_tokenizer/regexp_tokenizer.go | 1 + .../regexp_tokenizer_test.go} | 25 ++-- .../simple_word_boundary.go | 29 ----- .../tokenizers/single_token/single_token.go | 1 + .../single_token/single_token_test.go | 27 +++-- .../unicode_word_boundary/boundary.go | 1 + .../unicode_word_boundary/boundary_test.go | 108 ++++++++++-------- analysis/type.go | 8 ++ 11 files changed, 110 insertions(+), 115 deletions(-) rename analysis/tokenizers/{simple_word_boundary/simple_word_boundary_test.go => regexp_tokenizer/regexp_tokenizer_test.go} (74%) delete mode 100644 analysis/tokenizers/simple_word_boundary/simple_word_boundary.go diff --git a/analysis/token_filters/cld2/cld2_filter.go b/analysis/token_filters/cld2/cld2_filter.go index b037592f..6aa957f7 100644 --- a/analysis/token_filters/cld2/cld2_filter.go +++ b/analysis/token_filters/cld2/cld2_filter.go @@ -38,6 +38,7 @@ func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream { } token.Start = offset token.End = token.Start + len(token.Term) + token.Type = analysis.AlphaNumeric rv = append(rv, token) offset = token.End + 1 } diff --git a/analysis/token_filters/cld2/cld2_filter_test.go b/analysis/token_filters/cld2/cld2_filter_test.go index a0c0c749..193ce985 100644 --- a/analysis/token_filters/cld2/cld2_filter_test.go +++ b/analysis/token_filters/cld2/cld2_filter_test.go @@ -27,6 +27,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 19, Position: 1, + Type: analysis.AlphaNumeric, }, }, output: analysis.TokenStream{ @@ -35,6 +36,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 2, Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -45,6 +47,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 21, Position: 1, + Type: analysis.AlphaNumeric, }, }, output: analysis.TokenStream{ @@ -53,6 +56,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 2, Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -63,6 +67,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 72, Position: 1, + Type: analysis.AlphaNumeric, }, }, output: analysis.TokenStream{ @@ -71,6 +76,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 2, Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -81,6 +87,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 26, Position: 1, + Type: analysis.AlphaNumeric, }, }, output: analysis.TokenStream{ @@ -89,6 +96,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 2, Position: 1, + Type: analysis.AlphaNumeric, }, }, }, diff --git a/analysis/token_filters/stop_words_filter/stop_words_filter.go b/analysis/token_filters/stop_words_filter/stop_words_filter.go index 149d52e6..7c4c6c8e 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_filter.go +++ b/analysis/token_filters/stop_words_filter/stop_words_filter.go @@ -12,14 +12,6 @@ import ( "github.com/couchbaselabs/bleve/analysis" ) -// var DEFAULT_STOP_WORDS []string = []string{ -// "a", "an", "and", "are", "as", "at", "be", "but", "by", -// "for", "if", "in", "into", "is", "it", -// "no", "not", "of", "on", "or", "such", -// "that", "the", "their", "then", "there", "these", -// "they", "this", "to", "was", "will", "with", -// } - type StopWordsFilter struct { stopWords StopWordsMap } @@ -43,11 +35,3 @@ func 
(f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStrea return rv } - -// func buildStopWordMap(words []string) map[string]bool { -// rv := make(map[string]bool, len(words)) -// for _, word := range words { -// rv[word] = true -// } -// return rv -// } diff --git a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go index b9286cf3..8e720024 100644 --- a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go +++ b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go @@ -33,6 +33,7 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream { Start: match[0], End: match[1], Position: i + 1, + Type: analysis.AlphaNumeric, } rv[i] = &token } diff --git a/analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go similarity index 74% rename from analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go rename to analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go index 73f2af11..eef0d77f 100644 --- a/analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go +++ b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go @@ -6,10 +6,11 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package simple_word_boundary +package regexp_tokenizer import ( "reflect" + "regexp" "testing" "github.com/couchbaselabs/bleve/analysis" @@ -17,6 +18,8 @@ import ( func TestBoundary(t *testing.T) { + wordRegex := regexp.MustCompile(`\w+`) + tests := []struct { input []byte output analysis.TokenStream @@ -25,23 +28,25 @@ func TestBoundary(t *testing.T) { []byte("Hello World."), analysis.TokenStream{ { - 0, - 5, - []byte("Hello"), - 1, + Start: 0, + End: 5, + Term: []byte("Hello"), + Position: 1, + Type: analysis.AlphaNumeric, }, { - 6, - 11, - []byte("World"), - 2, + Start: 6, + End: 11, + Term: []byte("World"), + Position: 2, + Type: analysis.AlphaNumeric, }, }, }, } for _, test := range tests { - tokenizer := NewSimpleWordBoundaryTokenizer() + tokenizer := NewRegexpTokenizer(wordRegex) actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { diff --git a/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go b/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go deleted file mode 100644 index dea53856..00000000 --- a/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2014 Couchbase, Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file -// except in compliance with the License. You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software distributed under the -// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -// either express or implied. See the License for the specific language governing permissions -// and limitations under the License. 
-package simple_word_boundary - -import ( - "regexp" - - "github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer" -) - -const wordPattern = `\w+` - -var wordRegex = regexp.MustCompile(wordPattern) - -type SimpleWordBoundaryTokenizer struct { - *regexp_tokenizer.RegexpTokenizer -} - -func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer { - return &SimpleWordBoundaryTokenizer{ - regexp_tokenizer.NewRegexpTokenizer(wordRegex), - } -} diff --git a/analysis/tokenizers/single_token/single_token.go b/analysis/tokenizers/single_token/single_token.go index 0f73bcce..4b3dbf5f 100644 --- a/analysis/tokenizers/single_token/single_token.go +++ b/analysis/tokenizers/single_token/single_token.go @@ -26,6 +26,7 @@ func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream { Position: 1, Start: 0, End: len(input), + Type: analysis.AlphaNumeric, }, } } diff --git a/analysis/tokenizers/single_token/single_token_test.go b/analysis/tokenizers/single_token/single_token_test.go index 8f29dc85..8a55087c 100644 --- a/analysis/tokenizers/single_token/single_token_test.go +++ b/analysis/tokenizers/single_token/single_token_test.go @@ -25,10 +25,11 @@ func TestSingleTokenTokenizer(t *testing.T) { []byte("Hello World"), analysis.TokenStream{ { - 0, - 11, - []byte("Hello World"), - 1, + Start: 0, + End: 11, + Term: []byte("Hello World"), + Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -36,10 +37,11 @@ func TestSingleTokenTokenizer(t *testing.T) { []byte("こんにちは世界"), analysis.TokenStream{ { - 0, - 21, - []byte("こんにちは世界"), - 1, + Start: 0, + End: 21, + Term: []byte("こんにちは世界"), + Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -47,10 +49,11 @@ func TestSingleTokenTokenizer(t *testing.T) { []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), analysis.TokenStream{ { - 0, - 72, - []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), - 1, + Start: 0, + End: 72, + Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), + Position: 1, + Type: analysis.AlphaNumeric, }, }, }, diff --git a/analysis/tokenizers/unicode_word_boundary/boundary.go b/analysis/tokenizers/unicode_word_boundary/boundary.go index b853ea13..f05e9f53 100644 --- a/analysis/tokenizers/unicode_word_boundary/boundary.go +++ b/analysis/tokenizers/unicode_word_boundary/boundary.go @@ -103,6 +103,7 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStre End: int(indexB), Term: input[indexA:indexB], Position: position, + Type: analysis.AlphaNumeric, } rv = append(rv, &token) } diff --git a/analysis/tokenizers/unicode_word_boundary/boundary_test.go b/analysis/tokenizers/unicode_word_boundary/boundary_test.go index d8ffcecd..0eeb6ce4 100644 --- a/analysis/tokenizers/unicode_word_boundary/boundary_test.go +++ b/analysis/tokenizers/unicode_word_boundary/boundary_test.go @@ -27,16 +27,18 @@ func TestBoundary(t *testing.T) { "en_US", analysis.TokenStream{ { - 0, - 5, - []byte("Hello"), - 1, + Start: 0, + End: 5, + Term: []byte("Hello"), + Position: 1, + Type: analysis.AlphaNumeric, }, { - 6, - 11, - []byte("World"), - 2, + Start: 6, + End: 11, + Term: []byte("World"), + Position: 2, + Type: analysis.AlphaNumeric, }, }, }, @@ -45,16 +47,18 @@ func TestBoundary(t *testing.T) { "en_US", analysis.TokenStream{ { - 0, - 15, - []byte("こんにちは"), - 1, + Start: 0, + End: 15, + Term: []byte("こんにちは"), + Position: 1, + Type: analysis.AlphaNumeric, }, { - 15, - 21, - []byte("世界"), - 2, + Start: 15, + End: 21, + Term: []byte("世界"), + Position: 2, + Type: analysis.AlphaNumeric, }, }, }, @@ -63,52 +67,60 @@ func TestBoundary(t *testing.T) { "th_TH", 
analysis.TokenStream{ { - 0, - 9, - []byte("แยก"), - 1, + Start: 0, + End: 9, + Term: []byte("แยก"), + Position: 1, + Type: analysis.AlphaNumeric, }, { - 9, - 15, - []byte("คำ"), - 2, + Start: 9, + End: 15, + Term: []byte("คำ"), + Position: 2, + Type: analysis.AlphaNumeric, }, { - 15, - 27, - []byte("ภาษา"), - 3, + Start: 15, + End: 27, + Term: []byte("ภาษา"), + Position: 3, + Type: analysis.AlphaNumeric, }, { - 27, - 36, - []byte("ไทย"), - 4, + Start: 27, + End: 36, + Term: []byte("ไทย"), + Position: 4, + Type: analysis.AlphaNumeric, }, { - 36, - 42, - []byte("ก็"), - 5, + Start: 36, + End: 42, + Term: []byte("ก็"), + Position: 5, + Type: analysis.AlphaNumeric, }, { - 42, - 57, - []byte("ทำได้"), - 6, + Start: 42, + End: 57, + Term: []byte("ทำได้"), + Position: 6, + Type: analysis.AlphaNumeric, }, { - 57, - 63, - []byte("นะ"), - 7, + Start: 57, + End: 63, + Term: []byte("นะ"), + Position: 7, + Type: analysis.AlphaNumeric, }, { - 63, - 72, - []byte("จ้ะ"), - 8, + Start: 63, + End: 72, + Term: []byte("จ้ะ"), + Position: 8, + Type: analysis.AlphaNumeric, }, }, }, diff --git a/analysis/type.go b/analysis/type.go index daa48cee..caf3e70c 100644 --- a/analysis/type.go +++ b/analysis/type.go @@ -16,11 +16,19 @@ type CharFilter interface { Filter([]byte) []byte } +type TokenType int + +const ( + AlphaNumeric TokenType = iota + Numeric +) + type Token struct { Start int End int Term []byte Position int + Type TokenType } func (t *Token) String() string {
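
Note on the new token type (commentary, not part of the patch): with this change the regexp, single-token, and unicode word boundary tokenizers, as well as the cld2 filter, stamp every token they emit as analysis.AlphaNumeric; the Numeric constant is declared in type.go but nothing in this patch assigns it yet. A minimal sketch of how downstream code might branch on the new Token.Type field follows; the describe helper is hypothetical, for illustration only, and uses only the Token fields visible in the type.go hunk above.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
)

// describe is a hypothetical helper showing how a consumer can
// switch on the Token.Type field introduced by this patch.
func describe(t *analysis.Token) string {
	switch t.Type {
	case analysis.AlphaNumeric:
		return fmt.Sprintf("word %q at [%d,%d)", t.Term, t.Start, t.End)
	case analysis.Numeric:
		return fmt.Sprintf("number %q at [%d,%d)", t.Term, t.Start, t.End)
	default:
		return fmt.Sprintf("unknown token type %d", t.Type)
	}
}

func main() {
	fmt.Println(describe(&analysis.Token{
		Term:     []byte("Hello"),
		Start:    0,
		End:      5,
		Position: 1,
		Type:     analysis.AlphaNumeric,
	}))
}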
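Migration note for the deleted simple_word_boundary package: callers of the removed NewSimpleWordBoundaryTokenizer can get identical behavior by constructing a RegexpTokenizer directly with the \w+ pattern, exactly as the renamed test now does. A sketch, assuming only the APIs visible in this diff:

package main

import (
	"fmt"
	"regexp"

	"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
)

// wordRegex matches runs of word characters, the same pattern the
// deleted SimpleWordBoundaryTokenizer used internally.
var wordRegex = regexp.MustCompile(`\w+`)

func main() {
	// Equivalent to the removed NewSimpleWordBoundaryTokenizer().
	tokenizer := regexp_tokenizer.NewRegexpTokenizer(wordRegex)
	for _, token := range tokenizer.Tokenize([]byte("Hello World.")) {
		fmt.Printf("%s [%d,%d) position %d\n", token.Term, token.Start, token.End, token.Position)
	}
}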