0
0

introducing cjk_bigram filter and cjk analyzer

closes #34
This commit is contained in:
Marty Schoch 2014-09-11 10:39:05 -04:00
parent cb5ccd2b1d
commit 1a1cf32a86
5 changed files with 1258 additions and 0 deletions

View File

@ -0,0 +1,50 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
"github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
)
// AnalyzerName is the name under which the CJK analyzer is registered.
const AnalyzerName = "cjk"
// AnalyzerConstructor builds the "cjk" analyzer: whitespace tokenization,
// NFKD unicode normalization, lower-casing, and CJK bigram generation.
// It resolves the tokenizer and filters from the registry cache and
// returns an error if any of them is not registered.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
	if err != nil {
		return nil, err
	}
	// NFKD normalization folds width/compatibility variants before casing
	norm := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD)
	lowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
	if err != nil {
		return nil, err
	}
	bigram, err := cache.TokenFilterNamed(BigramName)
	if err != nil {
		return nil, err
	}
	return &analysis.Analyzer{
		Tokenizer: tokenizer,
		TokenFilters: []analysis.TokenFilter{
			norm,
			lowerFilter,
			bigram,
		},
	}, nil
}
func init() {
	// register the cjk analyzer so it can be resolved by name via the cache
	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
}

View File

@ -0,0 +1,620 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// TestCJKAnalyzer runs the full "cjk" analyzer pipeline over a table of
// inputs and compares the resulting token streams. Start/End are byte
// offsets into the UTF-8 input (most CJK runes are 3 bytes; supplementary
// ideographs such as 𩬅 are 4).
func TestCJKAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// adjacent kana produce overlapping bigrams
		{
			input: []byte("こんにちは世界"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こん"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("んに"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("にち"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("ちは"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("は世"),
					Type:     analysis.Double,
					Position: 5,
					Start:    12,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("世界"),
					Type:     analysis.Double,
					Position: 6,
					Start:    15,
					End:      21,
				},
			},
		},
		{
			input: []byte("一二三四五六七八九十"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一二"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("二三"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("三四"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("四五"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("五六"),
					Type:     analysis.Double,
					Position: 5,
					Start:    12,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("六七"),
					Type:     analysis.Double,
					Position: 6,
					Start:    15,
					End:      21,
				},
				&analysis.Token{
					Term:     []byte("七八"),
					Type:     analysis.Double,
					Position: 7,
					Start:    18,
					End:      24,
				},
				&analysis.Token{
					Term:     []byte("八九"),
					Type:     analysis.Double,
					Position: 8,
					Start:    21,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("九十"),
					Type:     analysis.Double,
					Position: 9,
					Start:    24,
					End:      30,
				},
			},
		},
		// whitespace breaks bigram runs; isolated runes come out as Single
		{
			input: []byte("一 二三四 五六七八九 十"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("二三"),
					Type:     analysis.Double,
					Position: 2,
					Start:    4,
					End:      10,
				},
				&analysis.Token{
					Term:     []byte("三四"),
					Type:     analysis.Double,
					Position: 3,
					Start:    7,
					End:      13,
				},
				&analysis.Token{
					Term:     []byte("五六"),
					Type:     analysis.Double,
					Position: 5,
					Start:    14,
					End:      20,
				},
				&analysis.Token{
					Term:     []byte("六七"),
					Type:     analysis.Double,
					Position: 6,
					Start:    17,
					End:      23,
				},
				&analysis.Token{
					Term:     []byte("七八"),
					Type:     analysis.Double,
					Position: 7,
					Start:    20,
					End:      26,
				},
				&analysis.Token{
					Term:     []byte("八九"),
					Type:     analysis.Double,
					Position: 8,
					Start:    23,
					End:      29,
				},
				&analysis.Token{
					Term:     []byte("十"),
					Type:     analysis.Single,
					Position: 10,
					Start:    30,
					End:      33,
				},
			},
		},
		// pure latin text passes through unbigrammed
		{
			input: []byte("abc defgh ijklmn opqrstu vwxy z"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("abc"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("defgh"),
					Type:     analysis.AlphaNumeric,
					Position: 2,
					Start:    4,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("ijklmn"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    10,
					End:      16,
				},
				&analysis.Token{
					Term:     []byte("opqrstu"),
					Type:     analysis.AlphaNumeric,
					Position: 4,
					Start:    17,
					End:      24,
				},
				&analysis.Token{
					Term:     []byte("vwxy"),
					Type:     analysis.AlphaNumeric,
					Position: 5,
					Start:    25,
					End:      29,
				},
				&analysis.Token{
					Term:     []byte("z"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    30,
					End:      31,
				},
			},
		},
		{
			input: []byte("あい"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
			},
		},
		// trailing whitespace does not change the output
		{
			input: []byte("あい "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
			},
		},
		{
			input: []byte("test"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      4,
				},
			},
		},
		{
			input: []byte("test "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      4,
				},
			},
		},
		// mixed CJK and latin split into separate token runs
		{
			input: []byte("あいtest"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    6,
					End:      10,
				},
			},
		},
		{
			input: []byte("testあい    "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Type:     analysis.AlphaNumeric,
					Position: 1,
					Start:    0,
					End:      4,
				},
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 2,
					Start:    4,
					End:      10,
				},
			},
		},
		{
			input: []byte("あいうえおabcかきくけこ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("いう"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("うえ"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("えお"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("abc"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    15,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("かき"),
					Type:     analysis.Double,
					Position: 7,
					Start:    18,
					End:      24,
				},
				&analysis.Token{
					Term:     []byte("きく"),
					Type:     analysis.Double,
					Position: 8,
					Start:    21,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("くけ"),
					Type:     analysis.Double,
					Position: 9,
					Start:    24,
					End:      30,
				},
				&analysis.Token{
					Term:     []byte("けこ"),
					Type:     analysis.Double,
					Position: 10,
					Start:    27,
					End:      33,
				},
			},
		},
		// latin interruptions leave isolated CJK runes as Single tokens
		{
			input: []byte("あいうえおabんcかきくけ こ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("あい"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("いう"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("うえ"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("えお"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("ab"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    15,
					End:      17,
				},
				&analysis.Token{
					Term:     []byte("ん"),
					Type:     analysis.Single,
					Position: 7,
					Start:    17,
					End:      20,
				},
				&analysis.Token{
					Term:     []byte("c"),
					Type:     analysis.AlphaNumeric,
					Position: 8,
					Start:    20,
					End:      21,
				},
				&analysis.Token{
					Term:     []byte("かき"),
					Type:     analysis.Double,
					Position: 9,
					Start:    21,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("きく"),
					Type:     analysis.Double,
					Position: 10,
					Start:    24,
					End:      30,
				},
				&analysis.Token{
					Term:     []byte("くけ"),
					Type:     analysis.Double,
					Position: 11,
					Start:    27,
					End:      33,
				},
				&analysis.Token{
					Term:     []byte("こ"),
					Type:     analysis.Single,
					Position: 13,
					Start:    34,
					End:      37,
				},
			},
		},
		// arabic text is treated as alphanumeric, not ideographic
		{
			input: []byte("一 روبرت موير"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("روبرت"),
					Type:     analysis.AlphaNumeric,
					Position: 2,
					Start:    4,
					End:      14,
				},
				&analysis.Token{
					Term:     []byte("موير"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    15,
					End:      23,
				},
			},
		},
		// arabic with combining mark (ُ) — offsets include the mark's bytes
		{
			input: []byte("一 رُوبرت موير"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("رُوبرت"),
					Type:     analysis.AlphaNumeric,
					Position: 2,
					Start:    4,
					End:      16,
				},
				&analysis.Token{
					Term:     []byte("موير"),
					Type:     analysis.AlphaNumeric,
					Position: 3,
					Start:    17,
					End:      25,
				},
			},
		},
		// supplementary-plane ideograph 𩬅 occupies 4 bytes
		{
			input: []byte("𩬅艱鍟䇹愯瀛"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("𩬅艱"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      7,
				},
				&analysis.Token{
					Term:     []byte("艱鍟"),
					Type:     analysis.Double,
					Position: 2,
					Start:    4,
					End:      10,
				},
				&analysis.Token{
					Term:     []byte("鍟䇹"),
					Type:     analysis.Double,
					Position: 3,
					Start:    7,
					End:      13,
				},
				&analysis.Token{
					Term:     []byte("䇹愯"),
					Type:     analysis.Double,
					Position: 4,
					Start:    10,
					End:      16,
				},
				&analysis.Token{
					Term:     []byte("愯瀛"),
					Type:     analysis.Double,
					Position: 5,
					Start:    13,
					End:      19,
				},
			},
		},
		{
			input: []byte("一"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
			},
		},
		{
			input: []byte("一丁丂"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("一丁"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("丁丂"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
			},
		},
	}
	cache := registry.NewCache()
	// the analyzer is loop-invariant; resolve it once instead of per case
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for i, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("test %d: expected %v, got %v", i, test.output, actual)
		}
	}
}

View File

@ -0,0 +1,166 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"container/ring"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
// BigramName is the name under which the CJK bigram token filter is registered.
const BigramName = "cjk_bigram"

// CJKBigramFilter combines runs of adjacent ideographic tokens into
// overlapping bigrams; see Filter for details.
type CJKBigramFilter struct {
	outputUnigram bool // when true, also emit each CJK token as a Single unigram
}
// NewCJKBigramFilter returns a CJK bigram filter; when outputUnigram is
// true the filter also emits each CJK token individually.
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
	return &CJKBigramFilter{outputUnigram: outputUnigram}
}
// Filter combines runs of adjacent Ideographic tokens into overlapping
// bigrams (type Double). A two-slot ring buffer holds the most recent CJK
// tokens; a non-ideographic token, or a byte-offset gap between consecutive
// CJK tokens, flushes the buffer. When outputUnigram is set, every CJK
// token is additionally emitted on its own (type Single) before the bigram
// that starts with it. Non-ideographic tokens pass through unchanged.
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	r := ring.New(2)
	itemsInRing := 0
	rv := make(analysis.TokenStream, 0)
	for _, token := range input {
		if token.Type == analysis.Ideographic {
			if itemsInRing > 0 {
				// if items already buffered
				// check to see if this one is adjacent (aligned) to the last
				curr := r.Value.(*analysis.Token)
				if token.Start-curr.End != 0 {
					// not aligned, flush the buffer
					flushToken := s.flush(r, &itemsInRing)
					if flushToken != nil {
						rv = append(rv, flushToken)
					}
				}
			}
			// now we can add this token to the buffer
			r = r.Next()
			r.Value = token
			if itemsInRing < 2 {
				itemsInRing++
			}
			// emit the unigram of the OLDER buffered token before the
			// bigram that begins with it
			if itemsInRing > 1 && s.outputUnigram {
				unigram := s.buildUnigram(r, &itemsInRing)
				if unigram != nil {
					rv = append(rv, unigram)
				}
			}
			bigramToken := s.outputBigram(r, &itemsInRing)
			if bigramToken != nil {
				rv = append(rv, bigramToken)
			}
		} else {
			// flush anything already buffered
			flushToken := s.flush(r, &itemsInRing)
			if flushToken != nil {
				rv = append(rv, flushToken)
			}
			// output this token as is
			rv = append(rv, token)
		}
	}
	// deal with possible trailing unigram
	if itemsInRing == 1 || s.outputUnigram {
		if itemsInRing == 2 {
			// advance to the newest buffered token; its bigram has already
			// been emitted, only its unigram remains outstanding
			r = r.Next()
		}
		unigram := s.buildUnigram(r, &itemsInRing)
		if unigram != nil {
			rv = append(rv, unigram)
		}
	}
	return rv
}
// flush empties the ring buffer. A lone buffered CJK token is returned as
// a Single unigram; otherwise nil is returned.
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int) *analysis.Token {
	// reset the buffer regardless of what (if anything) is returned
	defer func() {
		r.Value = nil
		*itemsInRing = 0
	}()
	if *itemsInRing == 1 {
		return s.buildUnigram(r, itemsInRing)
	}
	return nil
}
// outputBigram joins the two buffered tokens into one Double token whose
// term is their concatenated bytes and whose span covers both. Returns nil
// unless exactly two tokens are buffered.
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
	if *itemsInRing != 2 {
		return nil
	}
	first := r.Move(-1).Value.(*analysis.Token)
	second := r.Value.(*analysis.Token)
	// concatenate both terms into a single buffer
	term := make([]byte, 0, len(first.Term)+len(second.Term))
	term = append(term, first.Term...)
	term = append(term, second.Term...)
	return &analysis.Token{
		Type:     analysis.Double,
		Term:     term,
		Position: first.Position,
		Start:    first.Start,
		End:      second.End,
	}
}
// buildUnigram returns a Single token copied from the oldest buffered CJK
// token, or nil when the buffer is empty.
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
	var src *analysis.Token
	switch *itemsInRing {
	case 2:
		// oldest token sits one slot behind the current position
		src = r.Move(-1).Value.(*analysis.Token)
	case 1:
		src = r.Value.(*analysis.Token)
	default:
		return nil
	}
	return &analysis.Token{
		Type:     analysis.Single,
		Term:     src.Term,
		Position: src.Position,
		Start:    src.Start,
		End:      src.End,
	}
}
// CJKBigramFilterConstructor builds a CJK bigram filter from config; the
// optional boolean "output_unigram" key (default false) controls whether
// unigrams are emitted alongside bigrams.
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	outputUnigram := false
	if v, ok := config["output_unigram"].(bool); ok {
		outputUnigram = v
	}
	return NewCJKBigramFilter(outputUnigram), nil
}
func init() {
	// register the cjk_bigram token filter so it can be resolved by name
	registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)
}

View File

@ -0,0 +1,420 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package cjk
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
)
// TestCJKBigramFilter exercises the bigram filter directly with pre-built
// Ideographic token streams, covering non-adjacent tokens (flushed as
// unigrams), a full adjacent run, unigram+bigram interleaving, and a run
// interrupted by an alphanumeric token.
func TestCJKBigramFilter(t *testing.T) {
	tests := []struct {
		outputUnigram bool
		input         analysis.TokenStream
		output        analysis.TokenStream
	}{
		// non-adjacent tokens (byte gap) cannot pair; each becomes a Single
		{
			outputUnigram: false,
			input: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こ"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("ん"),
					Type:     analysis.Ideographic,
					Position: 2,
					Start:    5,
					End:      7,
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こ"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("ん"),
					Type:     analysis.Single,
					Position: 2,
					Start:    5,
					End:      7,
				},
			},
		},
		// adjacent run produces only overlapping bigrams
		{
			outputUnigram: false,
			input: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こ"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("ん"),
					Type:     analysis.Ideographic,
					Position: 2,
					Start:    3,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("に"),
					Type:     analysis.Ideographic,
					Position: 3,
					Start:    6,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("ち"),
					Type:     analysis.Ideographic,
					Position: 4,
					Start:    9,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("は"),
					Type:     analysis.Ideographic,
					Position: 5,
					Start:    12,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("世"),
					Type:     analysis.Ideographic,
					Position: 6,
					Start:    15,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("界"),
					Type:     analysis.Ideographic,
					Position: 7,
					Start:    18,
					End:      21,
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こん"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("んに"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("にち"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("ちは"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("は世"),
					Type:     analysis.Double,
					Position: 5,
					Start:    12,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("世界"),
					Type:     analysis.Double,
					Position: 6,
					Start:    15,
					End:      21,
				},
			},
		},
		// outputUnigram interleaves each Single before its starting bigram
		{
			outputUnigram: true,
			input: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こ"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("ん"),
					Type:     analysis.Ideographic,
					Position: 2,
					Start:    3,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("に"),
					Type:     analysis.Ideographic,
					Position: 3,
					Start:    6,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("ち"),
					Type:     analysis.Ideographic,
					Position: 4,
					Start:    9,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("は"),
					Type:     analysis.Ideographic,
					Position: 5,
					Start:    12,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("世"),
					Type:     analysis.Ideographic,
					Position: 6,
					Start:    15,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("界"),
					Type:     analysis.Ideographic,
					Position: 7,
					Start:    18,
					End:      21,
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こ"),
					Type:     analysis.Single,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("こん"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("ん"),
					Type:     analysis.Single,
					Position: 2,
					Start:    3,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("んに"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("に"),
					Type:     analysis.Single,
					Position: 3,
					Start:    6,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("にち"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("ち"),
					Type:     analysis.Single,
					Position: 4,
					Start:    9,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("ちは"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("は"),
					Type:     analysis.Single,
					Position: 5,
					Start:    12,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("は世"),
					Type:     analysis.Double,
					Position: 5,
					Start:    12,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("世"),
					Type:     analysis.Single,
					Position: 6,
					Start:    15,
					End:      18,
				},
				&analysis.Token{
					Term:     []byte("世界"),
					Type:     analysis.Double,
					Position: 6,
					Start:    15,
					End:      21,
				},
				&analysis.Token{
					Term:     []byte("界"),
					Type:     analysis.Single,
					Position: 7,
					Start:    18,
					End:      21,
				},
			},
		},
		// an alphanumeric token interrupts the run and passes through as-is
		{
			outputUnigram: false,
			input: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こ"),
					Type:     analysis.Ideographic,
					Position: 1,
					Start:    0,
					End:      3,
				},
				&analysis.Token{
					Term:     []byte("ん"),
					Type:     analysis.Ideographic,
					Position: 2,
					Start:    3,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("に"),
					Type:     analysis.Ideographic,
					Position: 3,
					Start:    6,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("ち"),
					Type:     analysis.Ideographic,
					Position: 4,
					Start:    9,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("は"),
					Type:     analysis.Ideographic,
					Position: 5,
					Start:    12,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("cat"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    12,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("世"),
					Type:     analysis.Ideographic,
					Position: 7,
					Start:    18,
					End:      21,
				},
				&analysis.Token{
					Term:     []byte("界"),
					Type:     analysis.Ideographic,
					Position: 8,
					Start:    21,
					End:      24,
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("こん"),
					Type:     analysis.Double,
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("んに"),
					Type:     analysis.Double,
					Position: 2,
					Start:    3,
					End:      9,
				},
				&analysis.Token{
					Term:     []byte("にち"),
					Type:     analysis.Double,
					Position: 3,
					Start:    6,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("ちは"),
					Type:     analysis.Double,
					Position: 4,
					Start:    9,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("cat"),
					Type:     analysis.AlphaNumeric,
					Position: 6,
					Start:    12,
					End:      15,
				},
				&analysis.Token{
					Term:     []byte("世界"),
					Type:     analysis.Double,
					Position: 7,
					Start:    18,
					End:      24,
				},
			},
		},
	}
	for i, test := range tests {
		cjkBigramFilter := NewCJKBigramFilter(test.outputUnigram)
		actual := cjkBigramFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			// %v (not %s): token streams hold structs with numeric fields,
			// which %s renders as unreadable %!s(...) noise
			t.Errorf("test %d: expected %v, got %v", i, test.output, actual)
		}
	}
}

View File

@ -26,6 +26,8 @@ const (
Numeric
DateTime
Shingle
Single
Double
)
type Token struct {