changed whitespace tokenizer to work better on CJK input
now it returns each CJK character as a separate token, which pairs well with a CJK bigram filter for indexing
parent 933d99c576
commit 6b4c86b35a
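
For callers, the visible effect is that CJK input no longer disappears from the token stream. A minimal usage sketch, assuming the import paths and a registry.NewCache constructor from the surrounding repository (neither is shown in this diff); the constructor signature itself is the one in the whitespace tokenizer hunk below:

package main

import (
	"fmt"

	// import paths assumed from the repository layout; not shown in this diff
	"github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
	"github.com/blevesearch/bleve/registry"
)

func main() {
	// registry.NewCache is assumed to be the cache constructor
	tokenizer, err := whitespace_tokenizer.TokenizerConstructor(nil, registry.NewCache())
	if err != nil {
		panic(err)
	}
	// "hello" stays one AlphaNumeric token; 世 and 界 each come back
	// as their own single-character Ideographic token
	for _, token := range tokenizer.Tokenize([]byte("hello 世界")) {
		fmt.Println(token)
	}
}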
@@ -12,6 +12,7 @@ package regexp_tokenizer
 import (
 	"fmt"
 	"regexp"
+	"strconv"
 
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
@@ -19,6 +20,8 @@ import (
 
 const Name = "regexp"
 
+var IdeographRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
+
 type RegexpTokenizer struct {
 	r *regexp.Regexp
 }
@@ -33,12 +36,13 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	matches := rt.r.FindAllIndex(input, -1)
 	rv := make(analysis.TokenStream, len(matches))
 	for i, match := range matches {
+		matchBytes := input[match[0]:match[1]]
 		token := analysis.Token{
-			Term:     input[match[0]:match[1]],
+			Term:     matchBytes,
 			Start:    match[0],
 			End:      match[1],
 			Position: i + 1,
-			Type:     analysis.AlphaNumeric,
+			Type:     detectTokenType(matchBytes),
 		}
 		rv[i] = &token
 	}
@@ -60,3 +64,14 @@ func RegexpTokenizerConstructor(config map[string]interface{}, cache *registry.C
 func init() {
 	registry.RegisterTokenizer(Name, RegexpTokenizerConstructor)
 }
+
+func detectTokenType(termBytes []byte) analysis.TokenType {
+	if IdeographRegexp.Match(termBytes) {
+		return analysis.Ideographic
+	}
+	_, err := strconv.ParseFloat(string(termBytes), 64)
+	if err == nil {
+		return analysis.Numeric
+	}
+	return analysis.AlphaNumeric
+}
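The new detectTokenType is unexported, but its classification order (ideographs first, then numbers, else alphanumeric) is easy to check standalone. A minimal sketch of the same logic; the names here are illustrative, not part of the commit:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// same pattern as IdeographRegexp in the hunk above
var ideographRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)

// classify mirrors detectTokenType: ideograph check first, then numeric, else alphanumeric
func classify(term string) string {
	if ideographRegexp.MatchString(term) {
		return "Ideographic"
	}
	if _, err := strconv.ParseFloat(term, 64); err == nil {
		return "Numeric"
	}
	return "AlphaNumeric"
}

func main() {
	for _, term := range []string{"世", "한", "3.14", "hello"} {
		// 世 and 한 -> Ideographic, 3.14 -> Numeric, hello -> AlphaNumeric
		fmt.Printf("%q -> %s\n", term, classify(term))
	}
}

The next two hunks update the tokenizer's boundary test for the new behavior.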
@@ -19,7 +19,7 @@ import (
 
 func TestBoundary(t *testing.T) {
 
-	wordRegex := regexp.MustCompile(`\w+`)
+	wordRegex := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
 
 	tests := []struct {
 		input []byte
@@ -44,6 +44,60 @@ func TestBoundary(t *testing.T) {
 				},
 			},
 		},
+		{
+			[]byte("こんにちは世界"),
+			analysis.TokenStream{
+				{
+					Start:    0,
+					End:      3,
+					Term:     []byte("こ"),
+					Position: 1,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    3,
+					End:      6,
+					Term:     []byte("ん"),
+					Position: 2,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    6,
+					End:      9,
+					Term:     []byte("に"),
+					Position: 3,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    9,
+					End:      12,
+					Term:     []byte("ち"),
+					Position: 4,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    12,
+					End:      15,
+					Term:     []byte("は"),
+					Position: 5,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    15,
+					End:      18,
+					Term:     []byte("世"),
+					Position: 6,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    18,
+					End:      21,
+					Term:     []byte("界"),
+					Position: 7,
+					Type:     analysis.Ideographic,
+				},
+			},
+		},
 		{
 			[]byte(""),
 			analysis.TokenStream{},
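The expected Start and End values step by three because they are byte offsets into the UTF-8 input, and each of these kana/han characters encodes to three bytes. A quick sanity check:

package main

import "fmt"

func main() {
	// each of these runes is three bytes in UTF-8, matching the offsets in the test above
	fmt.Println(len("こ"), len("世")) // prints: 3 3
}

The next hunk gives the whitespace tokenizer's pattern the same CJK alternation.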
@@ -19,7 +19,7 @@ import (
 
 const Name = "whitespace"
 
-var whitespaceTokenizerRegexp = regexp.MustCompile(`\w+`)
+var whitespaceTokenizerRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
 
 func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
 	return regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp), nil
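Why the alternation is needed at all: in Go's RE2 syntax, \w is ASCII-only ([0-9A-Za-z_]), so the old pattern silently dropped CJK runs. A quick before/after comparison:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	oldRe := regexp.MustCompile(`\w+`)
	newRe := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
	input := "hello 世界"
	fmt.Println(oldRe.FindAllString(input, -1)) // [hello] — CJK characters were dropped entirely
	fmt.Println(newRe.FindAllString(input, -1)) // [hello 世 界] — one match per ideograph
}

The final hunk adds the token's Type to its String output, so test failures now show type mismatches too.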
@@ -37,7 +37,7 @@ type Token struct {
 }
 
 func (t *Token) String() string {
-	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s", t.Start, t.End, t.Position, string(t.Term))
+	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s Type: %d", t.Start, t.End, t.Position, string(t.Term), t.Type)
}
 
 type TokenStream []*Token
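A sketch of the new output, assuming the analysis import path; the printed number is whatever value the Ideographic constant has, which this diff does not show:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis" // path assumed, not shown in this diff
)

func main() {
	t := analysis.Token{Start: 0, End: 3, Position: 1, Term: []byte("こ"), Type: analysis.Ideographic}
	// prints "Start: 0 End: 3 Position: 1 Token: こ Type: N", where N is the
	// numeric value of the Ideographic constant
	fmt.Println(t.String())
}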