diff --git a/analysis/language/cjk/analyzer_cjk.go b/analysis/language/cjk/analyzer_cjk.go new file mode 100644 index 00000000..f1841a6f --- /dev/null +++ b/analysis/language/cjk/analyzer_cjk.go @@ -0,0 +1,50 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package cjk + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter" + "github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize" + "github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer" +) + +const AnalyzerName = "cjk" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + whitespaceTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name) + if err != nil { + return nil, err + } + normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD) + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + bigramFilter, err := cache.TokenFilterNamed(BigramName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: whitespaceTokenizer, + TokenFilters: []analysis.TokenFilter{ + normalizeFilter, + toLowerFilter, + bigramFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/cjk/analyzer_cjk_test.go b/analysis/language/cjk/analyzer_cjk_test.go new file mode 100644 index 00000000..2c676928 --- /dev/null +++ b/analysis/language/cjk/analyzer_cjk_test.go @@ -0,0 +1,620 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package cjk + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestCJKAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + { + input: []byte("こんにちは世界"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こん"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("んに"), + Type: analysis.Double, + Position: 2, + Start: 3, + End: 9, + }, + &analysis.Token{ + Term: []byte("にち"), + Type: analysis.Double, + Position: 3, + Start: 6, + End: 12, + }, + &analysis.Token{ + Term: []byte("ちは"), + Type: analysis.Double, + Position: 4, + Start: 9, + End: 15, + }, + &analysis.Token{ + Term: []byte("は世"), + Type: analysis.Double, + Position: 5, + Start: 12, + End: 18, + }, + &analysis.Token{ + Term: []byte("世界"), + Type: analysis.Double, + Position: 6, + Start: 15, + End: 21, + }, + }, + }, + { + input: []byte("一二三四五六七八九十"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("一二"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("二三"), + Type: analysis.Double, + Position: 2, + Start: 3, + End: 9, + }, + &analysis.Token{ + Term: []byte("三四"), + Type: analysis.Double, + Position: 3, + Start: 6, + End: 12, + }, + &analysis.Token{ + Term: []byte("四五"), + Type: analysis.Double, + Position: 4, + Start: 9, + End: 15, + }, + &analysis.Token{ + Term: []byte("五六"), + Type: analysis.Double, + Position: 5, + Start: 12, + End: 18, + }, + &analysis.Token{ + Term: []byte("六七"), + Type: analysis.Double, + Position: 6, + Start: 15, + End: 21, + }, + &analysis.Token{ + Term: []byte("七八"), + Type: analysis.Double, + Position: 7, + Start: 18, + End: 24, + }, + &analysis.Token{ + Term: []byte("八九"), + Type: analysis.Double, + Position: 8, + Start: 21, + End: 27, + }, + &analysis.Token{ + Term: []byte("九十"), + Type: analysis.Double, + Position: 9, + Start: 24, + End: 30, + }, + }, + }, + { + input: []byte("一 二三四 五六七八九 十"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("一"), + Type: analysis.Single, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("二三"), + Type: analysis.Double, + Position: 2, + Start: 4, + End: 10, + }, + &analysis.Token{ + Term: []byte("三四"), + Type: analysis.Double, + Position: 3, + Start: 7, + End: 13, + }, + &analysis.Token{ + Term: []byte("五六"), + Type: analysis.Double, + Position: 5, + Start: 14, + End: 20, + }, + &analysis.Token{ + Term: []byte("六七"), + Type: analysis.Double, + Position: 6, + Start: 17, + End: 23, + }, + &analysis.Token{ + Term: []byte("七八"), + Type: analysis.Double, + Position: 7, + Start: 20, + End: 26, + }, + &analysis.Token{ + Term: []byte("八九"), + Type: analysis.Double, + Position: 8, + Start: 23, + End: 29, + }, + &analysis.Token{ + Term: []byte("十"), + Type: analysis.Single, + Position: 10, + Start: 30, + End: 33, + }, + }, + }, + { + input: []byte("abc defgh ijklmn opqrstu vwxy z"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("abc"), + Type: analysis.AlphaNumeric, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("defgh"), + Type: analysis.AlphaNumeric, + Position: 2, + Start: 4, + End: 9, + }, + &analysis.Token{ + Term: []byte("ijklmn"), + Type: analysis.AlphaNumeric, + Position: 3, + Start: 10, + End: 16, + }, + &analysis.Token{ + Term: []byte("opqrstu"), + Type: analysis.AlphaNumeric, + Position: 4, + Start: 17, + End: 24, + }, + &analysis.Token{ + Term: []byte("vwxy"), + Type: analysis.AlphaNumeric, + Position: 5, + Start: 25, + End: 29, + }, + &analysis.Token{ + Term: []byte("z"), + Type: analysis.AlphaNumeric, + Position: 6, + Start: 30, + End: 31, + }, + }, + }, + { + input: []byte("あい"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("あい"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + }, + }, + { + input: []byte("あい "), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("あい"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + }, + }, + { + input: []byte("test"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("test"), + Type: analysis.AlphaNumeric, + Position: 1, + Start: 0, + End: 4, + }, + }, + }, + { + input: []byte("test "), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("test"), + Type: analysis.AlphaNumeric, + Position: 1, + Start: 0, + End: 4, + }, + }, + }, + { + input: []byte("あいtest"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("あい"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("test"), + Type: analysis.AlphaNumeric, + Position: 3, + Start: 6, + End: 10, + }, + }, + }, + { + input: []byte("testあい "), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("test"), + Type: analysis.AlphaNumeric, + Position: 1, + Start: 0, + End: 4, + }, + &analysis.Token{ + Term: []byte("あい"), + Type: analysis.Double, + Position: 2, + Start: 4, + End: 10, + }, + }, + }, + { + input: []byte("あいうえおabcかきくけこ"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("あい"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("いう"), + Type: analysis.Double, + Position: 2, + Start: 3, + End: 9, + }, + &analysis.Token{ + Term: []byte("うえ"), + Type: analysis.Double, + Position: 3, + Start: 6, + End: 12, + }, + &analysis.Token{ + Term: []byte("えお"), + Type: analysis.Double, + Position: 4, + Start: 9, + End: 15, + }, + &analysis.Token{ + Term: []byte("abc"), + Type: analysis.AlphaNumeric, + Position: 6, + Start: 15, + End: 18, + }, + &analysis.Token{ + Term: []byte("かき"), + Type: analysis.Double, + Position: 7, + Start: 18, + End: 24, + }, + &analysis.Token{ + Term: []byte("きく"), + Type: analysis.Double, + Position: 8, + Start: 21, + End: 27, + }, + &analysis.Token{ + Term: []byte("くけ"), + Type: analysis.Double, + Position: 9, + Start: 24, + End: 30, + }, + &analysis.Token{ + Term: []byte("けこ"), + Type: analysis.Double, + Position: 10, + Start: 27, + End: 33, + }, + }, + }, + { + input: []byte("あいうえおabんcかきくけ こ"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("あい"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("いう"), + Type: analysis.Double, + Position: 2, + Start: 3, + End: 9, + }, + &analysis.Token{ + Term: []byte("うえ"), + Type: analysis.Double, + Position: 3, + Start: 6, + End: 12, + }, + &analysis.Token{ + Term: []byte("えお"), + Type: analysis.Double, + Position: 4, + Start: 9, + End: 15, + }, + &analysis.Token{ + Term: []byte("ab"), + Type: analysis.AlphaNumeric, + Position: 6, + Start: 15, + End: 17, + }, + &analysis.Token{ + Term: []byte("ん"), + Type: analysis.Single, + Position: 7, + Start: 17, + End: 20, + }, + &analysis.Token{ + Term: []byte("c"), + Type: analysis.AlphaNumeric, + Position: 8, + Start: 20, + End: 21, + }, + &analysis.Token{ + Term: []byte("かき"), + Type: analysis.Double, + Position: 9, + Start: 21, + End: 27, + }, + &analysis.Token{ + Term: []byte("きく"), + Type: analysis.Double, + Position: 10, + Start: 24, + End: 30, + }, + &analysis.Token{ + Term: []byte("くけ"), + Type: analysis.Double, + Position: 11, + Start: 27, + End: 33, + }, + &analysis.Token{ + Term: []byte("こ"), + Type: analysis.Single, + Position: 13, + Start: 34, + End: 37, + }, + }, + }, + { + input: []byte("一 روبرت موير"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("一"), + Type: analysis.Single, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("روبرت"), + Type: analysis.AlphaNumeric, + Position: 2, + Start: 4, + End: 14, + }, + &analysis.Token{ + Term: []byte("موير"), + Type: analysis.AlphaNumeric, + Position: 3, + Start: 15, + End: 23, + }, + }, + }, + { + input: []byte("一 رُوبرت موير"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("一"), + Type: analysis.Single, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("رُوبرت"), + Type: analysis.AlphaNumeric, + Position: 2, + Start: 4, + End: 16, + }, + &analysis.Token{ + Term: []byte("موير"), + Type: analysis.AlphaNumeric, + Position: 3, + Start: 17, + End: 25, + }, + }, + }, + { + input: []byte("𩬅艱鍟䇹愯瀛"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("𩬅艱"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 7, + }, + &analysis.Token{ + Term: []byte("艱鍟"), + Type: analysis.Double, + Position: 2, + Start: 4, + End: 10, + }, + &analysis.Token{ + Term: []byte("鍟䇹"), + Type: analysis.Double, + Position: 3, + Start: 7, + End: 13, + }, + &analysis.Token{ + Term: []byte("䇹愯"), + Type: analysis.Double, + Position: 4, + Start: 10, + End: 16, + }, + &analysis.Token{ + Term: []byte("愯瀛"), + Type: analysis.Double, + Position: 5, + Start: 13, + End: 19, + }, + }, + }, + { + input: []byte("一"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("一"), + Type: analysis.Single, + Position: 1, + Start: 0, + End: 3, + }, + }, + }, + { + input: []byte("一丁丂"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("一丁"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("丁丂"), + Type: analysis.Double, + Position: 2, + Start: 3, + End: 9, + }, + }, + }, + } + + cache := registry.NewCache() + for _, test := range tests { + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + actual := analyzer.Analyze(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %v, got %v", test.output, actual) + } + } +} diff --git a/analysis/language/cjk/cjk_bigram.go b/analysis/language/cjk/cjk_bigram.go new file mode 100644 index 00000000..36cfc88d --- /dev/null +++ b/analysis/language/cjk/cjk_bigram.go @@ -0,0 +1,166 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package cjk + +import ( + "container/ring" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const BigramName = "cjk_bigram" + +type CJKBigramFilter struct { + outputUnigram bool +} + +func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter { + return &CJKBigramFilter{ + outputUnigram: outputUnigram, + } +} + +func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + r := ring.New(2) + itemsInRing := 0 + + rv := make(analysis.TokenStream, 0) + + for _, token := range input { + if token.Type == analysis.Ideographic { + if itemsInRing > 0 { + // if items already buffered + // check to see if this is aligned + curr := r.Value.(*analysis.Token) + if token.Start-curr.End != 0 { + // not aligned flush + flushToken := s.flush(r, &itemsInRing) + if flushToken != nil { + rv = append(rv, flushToken) + } + } + } + // now we can add this token to the buffer + r = r.Next() + r.Value = token + if itemsInRing < 2 { + itemsInRing++ + } + if itemsInRing > 1 && s.outputUnigram { + unigram := s.buildUnigram(r, &itemsInRing) + if unigram != nil { + rv = append(rv, unigram) + } + } + bigramToken := s.outputBigram(r, &itemsInRing) + if bigramToken != nil { + rv = append(rv, bigramToken) + } + } else { + // flush anything already buffered + flushToken := s.flush(r, &itemsInRing) + if flushToken != nil { + rv = append(rv, flushToken) + } + // output this token as is + rv = append(rv, token) + } + } + + // deal with possible trailing unigram + if itemsInRing == 1 || s.outputUnigram { + if itemsInRing == 2 { + r = r.Next() + } + unigram := s.buildUnigram(r, &itemsInRing) + if unigram != nil { + rv = append(rv, unigram) + } + } + return rv +} + +func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int) *analysis.Token { + var rv *analysis.Token + if *itemsInRing == 1 { + rv = s.buildUnigram(r, itemsInRing) + } + r.Value = nil + *itemsInRing = 0 + return rv +} + +func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int) *analysis.Token { + if *itemsInRing == 2 { + thisShingleRing := r.Move(-1) + shingledBytes := make([]byte, 0) + + // do first token + prev := thisShingleRing.Value.(*analysis.Token) + shingledBytes = append(shingledBytes, prev.Term...) + + // do second token + thisShingleRing = thisShingleRing.Next() + curr := thisShingleRing.Value.(*analysis.Token) + shingledBytes = append(shingledBytes, curr.Term...) + + token := analysis.Token{ + Type: analysis.Double, + Term: shingledBytes, + Position: prev.Position, + Start: prev.Start, + End: curr.End, + } + return &token + } + return nil +} + +func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int) *analysis.Token { + if *itemsInRing == 2 { + thisShingleRing := r.Move(-1) + // do first token + prev := thisShingleRing.Value.(*analysis.Token) + token := analysis.Token{ + Type: analysis.Single, + Term: prev.Term, + Position: prev.Position, + Start: prev.Start, + End: prev.End, + } + return &token + } else if *itemsInRing == 1 { + // do first token + prev := r.Value.(*analysis.Token) + token := analysis.Token{ + Type: analysis.Single, + Term: prev.Term, + Position: prev.Position, + Start: prev.Start, + End: prev.End, + } + return &token + } + return nil +} + +func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + outputUnigram := false + outVal, ok := config["output_unigram"].(bool) + if ok { + outputUnigram = outVal + } + return NewCJKBigramFilter(outputUnigram), nil +} + +func init() { + registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor) +} diff --git a/analysis/language/cjk/cjk_bigram_test.go b/analysis/language/cjk/cjk_bigram_test.go new file mode 100644 index 00000000..e30cacfe --- /dev/null +++ b/analysis/language/cjk/cjk_bigram_test.go @@ -0,0 +1,420 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package cjk + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" +) + +func TestCJKBigramFilter(t *testing.T) { + + tests := []struct { + outputUnigram bool + input analysis.TokenStream + output analysis.TokenStream + }{ + { + outputUnigram: false, + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こ"), + Type: analysis.Ideographic, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("ん"), + Type: analysis.Ideographic, + Position: 2, + Start: 5, + End: 7, + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こ"), + Type: analysis.Single, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("ん"), + Type: analysis.Single, + Position: 2, + Start: 5, + End: 7, + }, + }, + }, + { + outputUnigram: false, + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こ"), + Type: analysis.Ideographic, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("ん"), + Type: analysis.Ideographic, + Position: 2, + Start: 3, + End: 6, + }, + &analysis.Token{ + Term: []byte("に"), + Type: analysis.Ideographic, + Position: 3, + Start: 6, + End: 9, + }, + &analysis.Token{ + Term: []byte("ち"), + Type: analysis.Ideographic, + Position: 4, + Start: 9, + End: 12, + }, + &analysis.Token{ + Term: []byte("は"), + Type: analysis.Ideographic, + Position: 5, + Start: 12, + End: 15, + }, + &analysis.Token{ + Term: []byte("世"), + Type: analysis.Ideographic, + Position: 6, + Start: 15, + End: 18, + }, + &analysis.Token{ + Term: []byte("界"), + Type: analysis.Ideographic, + Position: 7, + Start: 18, + End: 21, + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こん"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("んに"), + Type: analysis.Double, + Position: 2, + Start: 3, + End: 9, + }, + &analysis.Token{ + Term: []byte("にち"), + Type: analysis.Double, + Position: 3, + Start: 6, + End: 12, + }, + &analysis.Token{ + Term: []byte("ちは"), + Type: analysis.Double, + Position: 4, + Start: 9, + End: 15, + }, + &analysis.Token{ + Term: []byte("は世"), + Type: analysis.Double, + Position: 5, + Start: 12, + End: 18, + }, + &analysis.Token{ + Term: []byte("世界"), + Type: analysis.Double, + Position: 6, + Start: 15, + End: 21, + }, + }, + }, + { + outputUnigram: true, + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こ"), + Type: analysis.Ideographic, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("ん"), + Type: analysis.Ideographic, + Position: 2, + Start: 3, + End: 6, + }, + &analysis.Token{ + Term: []byte("に"), + Type: analysis.Ideographic, + Position: 3, + Start: 6, + End: 9, + }, + &analysis.Token{ + Term: []byte("ち"), + Type: analysis.Ideographic, + Position: 4, + Start: 9, + End: 12, + }, + &analysis.Token{ + Term: []byte("は"), + Type: analysis.Ideographic, + Position: 5, + Start: 12, + End: 15, + }, + &analysis.Token{ + Term: []byte("世"), + Type: analysis.Ideographic, + Position: 6, + Start: 15, + End: 18, + }, + &analysis.Token{ + Term: []byte("界"), + Type: analysis.Ideographic, + Position: 7, + Start: 18, + End: 21, + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こ"), + Type: analysis.Single, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("こん"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("ん"), + Type: analysis.Single, + Position: 2, + Start: 3, + End: 6, + }, + &analysis.Token{ + Term: []byte("んに"), + Type: analysis.Double, + Position: 2, + Start: 3, + End: 9, + }, + &analysis.Token{ + Term: []byte("に"), + Type: analysis.Single, + Position: 3, + Start: 6, + End: 9, + }, + &analysis.Token{ + Term: []byte("にち"), + Type: analysis.Double, + Position: 3, + Start: 6, + End: 12, + }, + &analysis.Token{ + Term: []byte("ち"), + Type: analysis.Single, + Position: 4, + Start: 9, + End: 12, + }, + &analysis.Token{ + Term: []byte("ちは"), + Type: analysis.Double, + Position: 4, + Start: 9, + End: 15, + }, + &analysis.Token{ + Term: []byte("は"), + Type: analysis.Single, + Position: 5, + Start: 12, + End: 15, + }, + &analysis.Token{ + Term: []byte("は世"), + Type: analysis.Double, + Position: 5, + Start: 12, + End: 18, + }, + &analysis.Token{ + Term: []byte("世"), + Type: analysis.Single, + Position: 6, + Start: 15, + End: 18, + }, + &analysis.Token{ + Term: []byte("世界"), + Type: analysis.Double, + Position: 6, + Start: 15, + End: 21, + }, + &analysis.Token{ + Term: []byte("界"), + Type: analysis.Single, + Position: 7, + Start: 18, + End: 21, + }, + }, + }, + { + outputUnigram: false, + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こ"), + Type: analysis.Ideographic, + Position: 1, + Start: 0, + End: 3, + }, + &analysis.Token{ + Term: []byte("ん"), + Type: analysis.Ideographic, + Position: 2, + Start: 3, + End: 6, + }, + &analysis.Token{ + Term: []byte("に"), + Type: analysis.Ideographic, + Position: 3, + Start: 6, + End: 9, + }, + &analysis.Token{ + Term: []byte("ち"), + Type: analysis.Ideographic, + Position: 4, + Start: 9, + End: 12, + }, + &analysis.Token{ + Term: []byte("は"), + Type: analysis.Ideographic, + Position: 5, + Start: 12, + End: 15, + }, + &analysis.Token{ + Term: []byte("cat"), + Type: analysis.AlphaNumeric, + Position: 6, + Start: 12, + End: 15, + }, + &analysis.Token{ + Term: []byte("世"), + Type: analysis.Ideographic, + Position: 7, + Start: 18, + End: 21, + }, + &analysis.Token{ + Term: []byte("界"), + Type: analysis.Ideographic, + Position: 8, + Start: 21, + End: 24, + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("こん"), + Type: analysis.Double, + Position: 1, + Start: 0, + End: 6, + }, + &analysis.Token{ + Term: []byte("んに"), + Type: analysis.Double, + Position: 2, + Start: 3, + End: 9, + }, + &analysis.Token{ + Term: []byte("にち"), + Type: analysis.Double, + Position: 3, + Start: 6, + End: 12, + }, + &analysis.Token{ + Term: []byte("ちは"), + Type: analysis.Double, + Position: 4, + Start: 9, + End: 15, + }, + &analysis.Token{ + Term: []byte("cat"), + Type: analysis.AlphaNumeric, + Position: 6, + Start: 12, + End: 15, + }, + &analysis.Token{ + Term: []byte("世界"), + Type: analysis.Double, + Position: 7, + Start: 18, + End: 24, + }, + }, + }, + } + + for _, test := range tests { + cjkBigramFilter := NewCJKBigramFilter(test.outputUnigram) + actual := cjkBigramFilter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output, actual) + } + } +} diff --git a/analysis/type.go b/analysis/type.go index 38c513e9..0f32ece1 100644 --- a/analysis/type.go +++ b/analysis/type.go @@ -26,6 +26,8 @@ const ( Numeric DateTime Shingle + Single + Double ) type Token struct {