From 6b4c86b35a58039134bc7895d1f17b37667dce3c Mon Sep 17 00:00:00 2001
From: Marty Schoch
Date: Sun, 7 Sep 2014 14:11:01 -0400
Subject: [PATCH] changed whitespace tokenizer to work better on cjk input

now it will return each cjk character as a separate token
this will pair well with a cjk bigram filter for indexing
---
 .../regexp_tokenizer/regexp_tokenizer.go      | 19 ++++++-
 .../regexp_tokenizer/regexp_tokenizer_test.go | 56 ++++++++++++++++++-
 .../whitespace_tokenizer.go                   |  2 +-
 analysis/type.go                              |  2 +-
 4 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go
index c5b543f7..20e04cd5 100644
--- a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go
+++ b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go
@@ -12,6 +12,7 @@ package regexp_tokenizer
 import (
 	"fmt"
 	"regexp"
+	"strconv"
 
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
@@ -19,6 +20,8 @@ import (
 
 const Name = "regexp"
 
+var IdeographRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
+
 type RegexpTokenizer struct {
 	r *regexp.Regexp
 }
@@ -33,12 +36,13 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
 	matches := rt.r.FindAllIndex(input, -1)
 	rv := make(analysis.TokenStream, len(matches))
 	for i, match := range matches {
+		matchBytes := input[match[0]:match[1]]
 		token := analysis.Token{
-			Term:     input[match[0]:match[1]],
+			Term:     matchBytes,
 			Start:    match[0],
 			End:      match[1],
 			Position: i + 1,
-			Type:     analysis.AlphaNumeric,
+			Type:     detectTokenType(matchBytes),
 		}
 		rv[i] = &token
 	}
@@ -60,3 +64,14 @@ func RegexpTokenizerConstructor(config map[string]interface{}, cache *registry.C
 func init() {
 	registry.RegisterTokenizer(Name, RegexpTokenizerConstructor)
 }
+
+func detectTokenType(termBytes []byte) analysis.TokenType {
+	if IdeographRegexp.Match(termBytes) {
+		return analysis.Ideographic
+	}
+	_, err := strconv.ParseFloat(string(termBytes), 64)
+	if err == nil {
+		return analysis.Numeric
+	}
+	return analysis.AlphaNumeric
+}
diff --git a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go
index 6b27855c..d1d119f6 100644
--- a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go
+++ b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go
@@ -19,7 +19,7 @@ import (
 
 func TestBoundary(t *testing.T) {
 
-	wordRegex := regexp.MustCompile(`\w+`)
+	wordRegex := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
 
 	tests := []struct {
 		input  []byte
@@ -44,6 +44,60 @@ func TestBoundary(t *testing.T) {
 				},
 			},
 		},
+		{
+			[]byte("こんにちは世界"),
+			analysis.TokenStream{
+				{
+					Start:    0,
+					End:      3,
+					Term:     []byte("こ"),
+					Position: 1,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    3,
+					End:      6,
+					Term:     []byte("ん"),
+					Position: 2,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    6,
+					End:      9,
+					Term:     []byte("に"),
+					Position: 3,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    9,
+					End:      12,
+					Term:     []byte("ち"),
+					Position: 4,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    12,
+					End:      15,
+					Term:     []byte("は"),
+					Position: 5,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    15,
+					End:      18,
+					Term:     []byte("世"),
+					Position: 6,
+					Type:     analysis.Ideographic,
+				},
+				{
+					Start:    18,
+					End:      21,
+					Term:     []byte("界"),
+					Position: 7,
+					Type:     analysis.Ideographic,
+				},
+			},
+		},
 		{
 			[]byte(""),
 			analysis.TokenStream{},
diff --git a/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer.go b/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer.go
index fb983d57..9dd3cdfb 100644
--- a/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer.go
+++ b/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer.go
@@ -19,7 +19,7 @@ import (
 
 const Name = "whitespace"
 
-var whitespaceTokenizerRegexp = regexp.MustCompile(`\w+`)
+var whitespaceTokenizerRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
 
 func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
 	return regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp), nil
diff --git a/analysis/type.go b/analysis/type.go
index 9bdb088a..b30ad8a0 100644
--- a/analysis/type.go
+++ b/analysis/type.go
@@ -37,7 +37,7 @@ type Token struct {
 }
 
 func (t *Token) String() string {
-	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s", t.Start, t.End, t.Position, string(t.Term))
+	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s Type: %d", t.Start, t.End, t.Position, string(t.Term), t.Type)
 }
 
 type TokenStream []*Token
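For reference, a minimal sketch (not part of the patch) of how a caller might exercise the changed behaviour, assuming the package path and the NewRegexpTokenizer constructor used above: each Han/Hangul/Hiragana/Katakana rune comes back as its own Ideographic token, while \w+ runs are still returned whole.

package main

import (
	"fmt"
	"regexp"

	"github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
)

func main() {
	// Same pattern the whitespace tokenizer now compiles: CJK runes match
	// one at a time, everything else still matches as \w+ runs.
	pattern := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
	tokenizer := regexp_tokenizer.NewRegexpTokenizer(pattern)

	// "bleve 世界" should yield one AlphaNumeric token ("bleve") followed by
	// two single-rune Ideographic tokens ("世", "界"), each spanning 3 bytes.
	for _, token := range tokenizer.Tokenize([]byte("bleve 世界")) {
		fmt.Println(token) // Token.String now reports the Type field as well
	}
}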