From ebf100c097dfcb7cd96de176b3d601514cd60abe Mon Sep 17 00:00:00 2001 From: Silvan Jegen Date: Sat, 13 Sep 2014 16:51:56 +0200 Subject: [PATCH] Add the Kagome tokenizer for Japanese --- .../ja_morph_kagome/ja_morph_kagome.go | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 analysis/tokenizers/ja_morph_kagome/ja_morph_kagome.go diff --git a/analysis/tokenizers/ja_morph_kagome/ja_morph_kagome.go b/analysis/tokenizers/ja_morph_kagome/ja_morph_kagome.go new file mode 100644 index 00000000..e7599e97 --- /dev/null +++ b/analysis/tokenizers/ja_morph_kagome/ja_morph_kagome.go @@ -0,0 +1,80 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +// +build kagome full + +package ja_morph_kagome + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/ikawaha/kagome/dic" + "github.com/ikawaha/kagome/tokenizer" +) + +const Name = "kagome" + +type KagomeMorphTokenizer struct { + tok *tokenizer.Tokenizer +} + +func NewKagomeMorphTokenizer() *KagomeMorphTokenizer { + return &KagomeMorphTokenizer{ + tok: tokenizer.NewTokenizer(), + } +} + +func NewKagomeMorphTokenizerWithUserDic(userdic *dic.UserDic) *KagomeMorphTokenizer { + kagome := tokenizer.NewTokenizer() + kagome.SetUserDic(userdic) + return &KagomeMorphTokenizer{ + tok: kagome, + } +} + +func (t *KagomeMorphTokenizer) Tokenize(input []byte) analysis.TokenStream { + var morphs []tokenizer.Morph + var err error + + rv := make(analysis.TokenStream, 0) + if len(input) < 1 { + return rv + } + + morphs, err = t.tok.Tokenize(string(input)) + if err != nil { + return rv + } + + for i, m := range morphs { + if m.Surface == "EOS" { + continue + } + + token := &analysis.Token{ + Term: []byte(m.Surface), + Position: i + 1, + Start: m.Start, + End: m.End, + Type: analysis.Ideographic, + } + rv = append(rv, token) + } + + return rv +} + +func KagomeMorphTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) { + return NewKagomeMorphTokenizer(), nil +} + +func init() { + registry.RegisterTokenizer(Name, KagomeMorphTokenizerConstructor) +}