updated whitespace tokenizer to behave more like Lucene/Elasticsearch
This commit is contained in:
parent
9c9dbcc90a
commit
fedb46269e
|
@ -10,19 +10,21 @@
|
|||
package whitespace_tokenizer
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"unicode"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizers/character"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
const Name = "whitespace"
|
||||
|
||||
var whitespaceTokenizerRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|[^\p{Z}\p{P}\p{C}\p{Han}\p{Hangul}\p{Hiragana}\p{Katakana}]+`)
|
||||
|
||||
func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
return regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp), nil
|
||||
return character.NewCharacterTokenizer(notSpace), nil
|
||||
}
|
||||
|
||||
func notSpace(r rune) bool {
|
||||
return !unicode.IsSpace(r)
|
||||
}
|
||||
|
||||
func init() {
|
||||
|
|
|
@ -14,7 +14,7 @@ import (
|
|||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizers/character"
|
||||
)
|
||||
|
||||
func TestBoundary(t *testing.T) {
|
||||
|
@ -35,8 +35,8 @@ func TestBoundary(t *testing.T) {
|
|||
},
|
||||
{
|
||||
Start: 6,
|
||||
End: 11,
|
||||
Term: []byte("World"),
|
||||
End: 12,
|
||||
Term: []byte("World."),
|
||||
Position: 2,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
|
@ -47,52 +47,10 @@ func TestBoundary(t *testing.T) {
|
|||
analysis.TokenStream{
|
||||
{
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Term: []byte("こ"),
|
||||
Position: 1,
|
||||
Type: analysis.Ideographic,
|
||||
},
|
||||
{
|
||||
Start: 3,
|
||||
End: 6,
|
||||
Term: []byte("ん"),
|
||||
Position: 2,
|
||||
Type: analysis.Ideographic,
|
||||
},
|
||||
{
|
||||
Start: 6,
|
||||
End: 9,
|
||||
Term: []byte("に"),
|
||||
Position: 3,
|
||||
Type: analysis.Ideographic,
|
||||
},
|
||||
{
|
||||
Start: 9,
|
||||
End: 12,
|
||||
Term: []byte("ち"),
|
||||
Position: 4,
|
||||
Type: analysis.Ideographic,
|
||||
},
|
||||
{
|
||||
Start: 12,
|
||||
End: 15,
|
||||
Term: []byte("は"),
|
||||
Position: 5,
|
||||
Type: analysis.Ideographic,
|
||||
},
|
||||
{
|
||||
Start: 15,
|
||||
End: 18,
|
||||
Term: []byte("世"),
|
||||
Position: 6,
|
||||
Type: analysis.Ideographic,
|
||||
},
|
||||
{
|
||||
Start: 18,
|
||||
End: 21,
|
||||
Term: []byte("界"),
|
||||
Position: 7,
|
||||
Type: analysis.Ideographic,
|
||||
Term: []byte("こんにちは世界"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
},
|
||||
},
|
||||
|
@ -105,24 +63,17 @@ func TestBoundary(t *testing.T) {
|
|||
analysis.TokenStream{
|
||||
{
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Term: []byte("abc"),
|
||||
End: 6,
|
||||
Term: []byte("abc界"),
|
||||
Position: 1,
|
||||
Type: analysis.AlphaNumeric,
|
||||
},
|
||||
{
|
||||
Start: 3,
|
||||
End: 6,
|
||||
Term: []byte("界"),
|
||||
Position: 2,
|
||||
Type: analysis.Ideographic,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
tokenizer := regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp)
|
||||
tokenizer := character.NewCharacterTokenizer(notSpace)
|
||||
actual := tokenizer.Tokenize(test.input)
|
||||
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
|
@ -140,7 +91,7 @@ If the pressurized vessel, containing liquid at high temperature (which may be r
|
|||
|
||||
func BenchmarkTokenizeEnglishText(b *testing.B) {
|
||||
|
||||
tokenizer := regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp)
|
||||
tokenizer := character.NewCharacterTokenizer(notSpace)
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
|
|
Loading…
Reference in New Issue