changed whitespace tokenizer to work better on CJK input

now it will return each CJK character as a separate token
this will pair well with a CJK bigram filter for indexing
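
For illustration only (not part of this commit): a minimal standalone sketch of how the new pattern behaves on mixed Latin/CJK input. The sample string and package layout are arbitrary; only the regular expression comes from the diff below.

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// The pattern the whitespace tokenizer switches to in this commit:
	// each Han/Hangul/Hiragana/Katakana character matches on its own,
	// while runs of \w+ still come back as single terms.
	pattern := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
	for _, term := range pattern.FindAllString("bleve 全文検索 test", -1) {
		fmt.Printf("%q\n", term)
	}
	// Prints "bleve", "全", "文", "検", "索", "test": single-character CJK
	// terms that a downstream CJK bigram filter can join into pairs.
}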
Marty Schoch 2014-09-07 14:11:01 -04:00
parent 933d99c576
commit 6b4c86b35a
4 changed files with 74 additions and 5 deletions

View File

@@ -12,6 +12,7 @@ package regexp_tokenizer
 import (
 	"fmt"
 	"regexp"
+	"strconv"
 
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
@@ -19,6 +20,8 @@ import (
 
 const Name = "regexp"
 
+var IdeographRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
+
 type RegexpTokenizer struct {
 	r *regexp.Regexp
 }
@@ -33,12 +36,13 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	matches := rt.r.FindAllIndex(input, -1)
 	rv := make(analysis.TokenStream, len(matches))
 	for i, match := range matches {
+		matchBytes := input[match[0]:match[1]]
 		token := analysis.Token{
-			Term: input[match[0]:match[1]],
+			Term: matchBytes,
 			Start: match[0],
 			End: match[1],
 			Position: i + 1,
-			Type: analysis.AlphaNumeric,
+			Type: detectTokenType(matchBytes),
 		}
 		rv[i] = &token
 	}
@@ -60,3 +64,14 @@ func RegexpTokenizerConstructor(config map[string]interface{}, cache *registry.C
 func init() {
 	registry.RegisterTokenizer(Name, RegexpTokenizerConstructor)
 }
+
+func detectTokenType(termBytes []byte) analysis.TokenType {
+	if IdeographRegexp.Match(termBytes) {
+		return analysis.Ideographic
+	}
+	_, err := strconv.ParseFloat(string(termBytes), 64)
+	if err == nil {
+		return analysis.Numeric
+	}
+	return analysis.AlphaNumeric
+}
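
A rough usage sketch of the tokenizer as modified above, assuming the regexp_tokenizer import path matches the repository layout of the time; the sample input is arbitrary.

package main

import (
	"fmt"
	"regexp"

	// Assumed import path for the package changed above.
	"github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
)

func main() {
	// Same pattern the whitespace tokenizer registers further down in this commit.
	pattern := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
	tokenizer := regexp_tokenizer.NewRegexpTokenizer(pattern)
	for _, token := range tokenizer.Tokenize([]byte("世界 42 hello")) {
		// Token.String now includes the type (see the last file in this diff).
		fmt.Println(token)
	}
	// "世" and "界" should come back as Ideographic, "42" as Numeric,
	// and "hello" as AlphaNumeric, per detectTokenType above.
}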

View File

@@ -19,7 +19,7 @@ import (
 func TestBoundary(t *testing.T) {
-	wordRegex := regexp.MustCompile(`\w+`)
+	wordRegex := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
 	tests := []struct {
 		input []byte
@@ -44,6 +44,60 @@ func TestBoundary(t *testing.T) {
 				},
 			},
 		},
+		{
+			[]byte("こんにちは世界"),
+			analysis.TokenStream{
+				{
+					Start: 0,
+					End: 3,
+					Term: []byte("こ"),
+					Position: 1,
+					Type: analysis.Ideographic,
+				},
+				{
+					Start: 3,
+					End: 6,
+					Term: []byte("ん"),
+					Position: 2,
+					Type: analysis.Ideographic,
+				},
+				{
+					Start: 6,
+					End: 9,
+					Term: []byte("に"),
+					Position: 3,
+					Type: analysis.Ideographic,
+				},
+				{
+					Start: 9,
+					End: 12,
+					Term: []byte("ち"),
+					Position: 4,
+					Type: analysis.Ideographic,
+				},
+				{
+					Start: 12,
+					End: 15,
+					Term: []byte("は"),
+					Position: 5,
+					Type: analysis.Ideographic,
+				},
+				{
+					Start: 15,
+					End: 18,
+					Term: []byte("世"),
+					Position: 6,
+					Type: analysis.Ideographic,
+				},
+				{
+					Start: 18,
+					End: 21,
+					Term: []byte("界"),
+					Position: 7,
+					Type: analysis.Ideographic,
+				},
+			},
+		},
 		{
 			[]byte(""),
 			analysis.TokenStream{},

View File

@@ -19,7 +19,7 @@ import (
 
 const Name = "whitespace"
 
-var whitespaceTokenizerRegexp = regexp.MustCompile(`\w+`)
+var whitespaceTokenizerRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
 
 func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
 	return regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp), nil

View File

@@ -37,7 +37,7 @@ type Token struct {
 }
 
 func (t *Token) String() string {
-	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s", t.Start, t.End, t.Position, string(t.Term))
+	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s Type: %d", t.Start, t.End, t.Position, string(t.Term), t.Type)
 }
 
 type TokenStream []*Token
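
The CJK bigram filter mentioned in the commit message is not part of this diff. As a purely hypothetical sketch of the intended pairing, adjacent single-character ideographic terms could be combined into overlapping bigrams along these lines (the function name and shape are illustrative only, not bleve's filter):

package main

import "fmt"

// bigramIdeographs joins each adjacent pair of terms into an overlapping
// bigram, e.g. ["世", "界", "平", "和"] -> ["世界", "界平", "平和"].
// A real token filter would also carry positions and byte offsets through.
func bigramIdeographs(terms []string) []string {
	var out []string
	for i := 0; i+1 < len(terms); i++ {
		out = append(out, terms[i]+terms[i+1])
	}
	return out
}

func main() {
	fmt.Println(bigramIdeographs([]string{"世", "界", "平", "和"}))
	// Output: [世界 界平 平和]
}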