From 25540c736a05b035cea637c6bf21c09593551cd7 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 31 Jul 2014 13:54:12 -0400 Subject: [PATCH] introduced token type --- analysis/token_filters/cld2/cld2_filter.go | 1 + .../token_filters/cld2/cld2_filter_test.go | 8 ++ .../stop_words_filter/stop_words_filter.go | 16 --- .../regexp_tokenizer/regexp_tokenizer.go | 1 + .../regexp_tokenizer_test.go} | 25 ++-- .../simple_word_boundary.go | 29 ----- .../tokenizers/single_token/single_token.go | 1 + .../single_token/single_token_test.go | 27 +++-- .../unicode_word_boundary/boundary.go | 1 + .../unicode_word_boundary/boundary_test.go | 108 ++++++++++-------- analysis/type.go | 8 ++ 11 files changed, 110 insertions(+), 115 deletions(-) rename analysis/tokenizers/{simple_word_boundary/simple_word_boundary_test.go => regexp_tokenizer/regexp_tokenizer_test.go} (74%) delete mode 100644 analysis/tokenizers/simple_word_boundary/simple_word_boundary.go diff --git a/analysis/token_filters/cld2/cld2_filter.go b/analysis/token_filters/cld2/cld2_filter.go index b037592f..6aa957f7 100644 --- a/analysis/token_filters/cld2/cld2_filter.go +++ b/analysis/token_filters/cld2/cld2_filter.go @@ -38,6 +38,7 @@ func (f *Cld2Filter) Filter(input analysis.TokenStream) analysis.TokenStream { } token.Start = offset token.End = token.Start + len(token.Term) + token.Type = analysis.AlphaNumeric rv = append(rv, token) offset = token.End + 1 } diff --git a/analysis/token_filters/cld2/cld2_filter_test.go b/analysis/token_filters/cld2/cld2_filter_test.go index a0c0c749..193ce985 100644 --- a/analysis/token_filters/cld2/cld2_filter_test.go +++ b/analysis/token_filters/cld2/cld2_filter_test.go @@ -27,6 +27,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 19, Position: 1, + Type: analysis.AlphaNumeric, }, }, output: analysis.TokenStream{ @@ -35,6 +36,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 2, Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -45,6 +47,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 21, Position: 1, + Type: analysis.AlphaNumeric, }, }, output: analysis.TokenStream{ @@ -53,6 +56,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 2, Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -63,6 +67,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 72, Position: 1, + Type: analysis.AlphaNumeric, }, }, output: analysis.TokenStream{ @@ -71,6 +76,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 2, Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -81,6 +87,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 26, Position: 1, + Type: analysis.AlphaNumeric, }, }, output: analysis.TokenStream{ @@ -89,6 +96,7 @@ func TestCld2Filter(t *testing.T) { Start: 0, End: 2, Position: 1, + Type: analysis.AlphaNumeric, }, }, }, diff --git a/analysis/token_filters/stop_words_filter/stop_words_filter.go b/analysis/token_filters/stop_words_filter/stop_words_filter.go index 149d52e6..7c4c6c8e 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_filter.go +++ b/analysis/token_filters/stop_words_filter/stop_words_filter.go @@ -12,14 +12,6 @@ import ( "github.com/couchbaselabs/bleve/analysis" ) -// var DEFAULT_STOP_WORDS []string = []string{ -// "a", "an", "and", "are", "as", "at", "be", "but", "by", -// "for", "if", "in", "into", "is", "it", -// "no", "not", "of", "on", "or", "such", -// "that", "the", "their", "then", "there", "these", -// "they", "this", "to", "was", "will", "with", -// } - type StopWordsFilter struct { stopWords StopWordsMap } @@ -43,11 +35,3 @@ func 
(f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStrea return rv } - -// func buildStopWordMap(words []string) map[string]bool { -// rv := make(map[string]bool, len(words)) -// for _, word := range words { -// rv[word] = true -// } -// return rv -// } diff --git a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go index b9286cf3..8e720024 100644 --- a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go +++ b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go @@ -33,6 +33,7 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream { Start: match[0], End: match[1], Position: i + 1, + Type: analysis.AlphaNumeric, } rv[i] = &token } diff --git a/analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go similarity index 74% rename from analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go rename to analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go index 73f2af11..eef0d77f 100644 --- a/analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go +++ b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go @@ -6,10 +6,11 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package simple_word_boundary +package regexp_tokenizer import ( "reflect" + "regexp" "testing" "github.com/couchbaselabs/bleve/analysis" @@ -17,6 +18,8 @@ import ( func TestBoundary(t *testing.T) { + wordRegex := regexp.MustCompile(`\w+`) + tests := []struct { input []byte output analysis.TokenStream @@ -25,23 +28,25 @@ func TestBoundary(t *testing.T) { []byte("Hello World."), analysis.TokenStream{ { - 0, - 5, - []byte("Hello"), - 1, + Start: 0, + End: 5, + Term: []byte("Hello"), + Position: 1, + Type: analysis.AlphaNumeric, }, { - 6, - 11, - []byte("World"), - 2, + Start: 6, + End: 11, + Term: []byte("World"), + Position: 2, + Type: analysis.AlphaNumeric, }, }, }, } for _, test := range tests { - tokenizer := NewSimpleWordBoundaryTokenizer() + tokenizer := NewRegexpTokenizer(wordRegex) actual := tokenizer.Tokenize(test.input) if !reflect.DeepEqual(actual, test.output) { diff --git a/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go b/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go deleted file mode 100644 index dea53856..00000000 --- a/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2014 Couchbase, Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file -// except in compliance with the License. You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software distributed under the -// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -// either express or implied. See the License for the specific language governing permissions -// and limitations under the License. 
-package simple_word_boundary - -import ( - "regexp" - - "github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer" -) - -const wordPattern = `\w+` - -var wordRegex = regexp.MustCompile(wordPattern) - -type SimpleWordBoundaryTokenizer struct { - *regexp_tokenizer.RegexpTokenizer -} - -func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer { - return &SimpleWordBoundaryTokenizer{ - regexp_tokenizer.NewRegexpTokenizer(wordRegex), - } -} diff --git a/analysis/tokenizers/single_token/single_token.go b/analysis/tokenizers/single_token/single_token.go index 0f73bcce..4b3dbf5f 100644 --- a/analysis/tokenizers/single_token/single_token.go +++ b/analysis/tokenizers/single_token/single_token.go @@ -26,6 +26,7 @@ func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream { Position: 1, Start: 0, End: len(input), + Type: analysis.AlphaNumeric, }, } } diff --git a/analysis/tokenizers/single_token/single_token_test.go b/analysis/tokenizers/single_token/single_token_test.go index 8f29dc85..8a55087c 100644 --- a/analysis/tokenizers/single_token/single_token_test.go +++ b/analysis/tokenizers/single_token/single_token_test.go @@ -25,10 +25,11 @@ func TestSingleTokenTokenizer(t *testing.T) { []byte("Hello World"), analysis.TokenStream{ { - 0, - 11, - []byte("Hello World"), - 1, + Start: 0, + End: 11, + Term: []byte("Hello World"), + Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -36,10 +37,11 @@ func TestSingleTokenTokenizer(t *testing.T) { []byte("こんにちは世界"), analysis.TokenStream{ { - 0, - 21, - []byte("こんにちは世界"), - 1, + Start: 0, + End: 21, + Term: []byte("こんにちは世界"), + Position: 1, + Type: analysis.AlphaNumeric, }, }, }, @@ -47,10 +49,11 @@ func TestSingleTokenTokenizer(t *testing.T) { []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), analysis.TokenStream{ { - 0, - 72, - []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), - 1, + Start: 0, + End: 72, + Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), + Position: 1, + Type: analysis.AlphaNumeric, }, }, }, diff --git a/analysis/tokenizers/unicode_word_boundary/boundary.go b/analysis/tokenizers/unicode_word_boundary/boundary.go index b853ea13..f05e9f53 100644 --- a/analysis/tokenizers/unicode_word_boundary/boundary.go +++ b/analysis/tokenizers/unicode_word_boundary/boundary.go @@ -103,6 +103,7 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStre End: int(indexB), Term: input[indexA:indexB], Position: position, + Type: analysis.AlphaNumeric, } rv = append(rv, &token) } diff --git a/analysis/tokenizers/unicode_word_boundary/boundary_test.go b/analysis/tokenizers/unicode_word_boundary/boundary_test.go index d8ffcecd..0eeb6ce4 100644 --- a/analysis/tokenizers/unicode_word_boundary/boundary_test.go +++ b/analysis/tokenizers/unicode_word_boundary/boundary_test.go @@ -27,16 +27,18 @@ func TestBoundary(t *testing.T) { "en_US", analysis.TokenStream{ { - 0, - 5, - []byte("Hello"), - 1, + Start: 0, + End: 5, + Term: []byte("Hello"), + Position: 1, + Type: analysis.AlphaNumeric, }, { - 6, - 11, - []byte("World"), - 2, + Start: 6, + End: 11, + Term: []byte("World"), + Position: 2, + Type: analysis.AlphaNumeric, }, }, }, @@ -45,16 +47,18 @@ func TestBoundary(t *testing.T) { "en_US", analysis.TokenStream{ { - 0, - 15, - []byte("こんにちは"), - 1, + Start: 0, + End: 15, + Term: []byte("こんにちは"), + Position: 1, + Type: analysis.AlphaNumeric, }, { - 15, - 21, - []byte("世界"), - 2, + Start: 15, + End: 21, + Term: []byte("世界"), + Position: 2, + Type: analysis.AlphaNumeric, }, }, }, @@ -63,52 +67,60 @@ func TestBoundary(t *testing.T) { "th_TH", 
analysis.TokenStream{ { - 0, - 9, - []byte("แยก"), - 1, + Start: 0, + End: 9, + Term: []byte("แยก"), + Position: 1, + Type: analysis.AlphaNumeric, }, { - 9, - 15, - []byte("คำ"), - 2, + Start: 9, + End: 15, + Term: []byte("คำ"), + Position: 2, + Type: analysis.AlphaNumeric, }, { - 15, - 27, - []byte("ภาษา"), - 3, + Start: 15, + End: 27, + Term: []byte("ภาษา"), + Position: 3, + Type: analysis.AlphaNumeric, }, { - 27, - 36, - []byte("ไทย"), - 4, + Start: 27, + End: 36, + Term: []byte("ไทย"), + Position: 4, + Type: analysis.AlphaNumeric, }, { - 36, - 42, - []byte("ก็"), - 5, + Start: 36, + End: 42, + Term: []byte("ก็"), + Position: 5, + Type: analysis.AlphaNumeric, }, { - 42, - 57, - []byte("ทำได้"), - 6, + Start: 42, + End: 57, + Term: []byte("ทำได้"), + Position: 6, + Type: analysis.AlphaNumeric, }, { - 57, - 63, - []byte("นะ"), - 7, + Start: 57, + End: 63, + Term: []byte("นะ"), + Position: 7, + Type: analysis.AlphaNumeric, }, { - 63, - 72, - []byte("จ้ะ"), - 8, + Start: 63, + End: 72, + Term: []byte("จ้ะ"), + Position: 8, + Type: analysis.AlphaNumeric, }, }, }, diff --git a/analysis/type.go b/analysis/type.go index daa48cee..caf3e70c 100644 --- a/analysis/type.go +++ b/analysis/type.go @@ -16,11 +16,19 @@ type CharFilter interface { Filter([]byte) []byte } +type TokenType int + +const ( + AlphaNumeric TokenType = iota + Numeric +) + type Token struct { Start int End int Term []byte Position int + Type TokenType } func (t *Token) String() string {
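
Note on the new token type (commentary, not part of the patch): with this change the regexp, single-token, and unicode word boundary tokenizers, as well as the cld2 filter, stamp every token they emit as analysis.AlphaNumeric; the Numeric constant is declared in type.go but nothing in this patch assigns it yet. A minimal sketch of how downstream code might branch on the new Token.Type field follows; the describe helper is hypothetical, for illustration only, and uses only the Token fields visible in the type.go hunk above.

package main

import (
	"fmt"

	"github.com/couchbaselabs/bleve/analysis"
)

// describe is a hypothetical helper showing how a consumer can
// switch on the Token.Type field introduced by this patch.
func describe(t *analysis.Token) string {
	switch t.Type {
	case analysis.AlphaNumeric:
		return fmt.Sprintf("word %q at [%d,%d)", t.Term, t.Start, t.End)
	case analysis.Numeric:
		return fmt.Sprintf("number %q at [%d,%d)", t.Term, t.Start, t.End)
	default:
		return fmt.Sprintf("unknown token type %d", t.Type)
	}
}

func main() {
	fmt.Println(describe(&analysis.Token{
		Term:     []byte("Hello"),
		Start:    0,
		End:      5,
		Position: 1,
		Type:     analysis.AlphaNumeric,
	}))
}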
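Migration note for the deleted simple_word_boundary package: callers of the removed NewSimpleWordBoundaryTokenizer can get identical behavior by constructing a RegexpTokenizer directly with the \w+ pattern, exactly as the renamed test now does. A sketch, assuming only the APIs visible in this diff:

package main

import (
	"fmt"
	"regexp"

	"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
)

// wordRegex matches runs of word characters, the same pattern the
// deleted SimpleWordBoundaryTokenizer used internally.
var wordRegex = regexp.MustCompile(`\w+`)

func main() {
	// Equivalent to the removed NewSimpleWordBoundaryTokenizer().
	tokenizer := regexp_tokenizer.NewRegexpTokenizer(wordRegex)
	for _, token := range tokenizer.Tokenize([]byte("Hello World.")) {
		fmt.Printf("%s [%d,%d) position %d\n", token.Term, token.Start, token.End, token.Position)
	}
}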