parent
cb5ccd2b1d
commit
1a1cf32a86
50
analysis/language/cjk/analyzer_cjk.go
Normal file
50
analysis/language/cjk/analyzer_cjk.go
Normal file
|
@ -0,0 +1,50 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
|
||||
)
|
||||
|
||||
const AnalyzerName = "cjk"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
|
||||
whitespaceTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD)
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
bigramFilter, err := cache.TokenFilterNamed(BigramName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.Analyzer{
|
||||
Tokenizer: whitespaceTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
normalizeFilter,
|
||||
toLowerFilter,
|
||||
bigramFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
}
|
620
analysis/language/cjk/analyzer_cjk_test.go
Normal file
620
analysis/language/cjk/analyzer_cjk_test.go
Normal file
|
@ -0,0 +1,620 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
func TestCJKAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: []byte("こんにちは世界"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は世"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一二三四五六七八九十"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一二"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("二三"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("三四"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("四五"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("五六"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("六七"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("七八"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("八九"),
|
||||
Type: analysis.Double,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 27,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("九十"),
|
||||
Type: analysis.Double,
|
||||
Position: 9,
|
||||
Start: 24,
|
||||
End: 30,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一 二三四 五六七八九 十"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("二三"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 10,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("三四"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 7,
|
||||
End: 13,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("五六"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 14,
|
||||
End: 20,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("六七"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 17,
|
||||
End: 23,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("七八"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 20,
|
||||
End: 26,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("八九"),
|
||||
Type: analysis.Double,
|
||||
Position: 8,
|
||||
Start: 23,
|
||||
End: 29,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("十"),
|
||||
Type: analysis.Single,
|
||||
Position: 10,
|
||||
Start: 30,
|
||||
End: 33,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("abc defgh ijklmn opqrstu vwxy z"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abc"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("defgh"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ijklmn"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 10,
|
||||
End: 16,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("opqrstu"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 4,
|
||||
Start: 17,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vwxy"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 5,
|
||||
Start: 25,
|
||||
End: 29,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("z"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 30,
|
||||
End: 31,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あい"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あい "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("test"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("test "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あいtest"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("testあい "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あいうえおabcかきくけこ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("いう"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("うえ"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("えお"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("abc"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("かき"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("きく"),
|
||||
Type: analysis.Double,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 27,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("くけ"),
|
||||
Type: analysis.Double,
|
||||
Position: 9,
|
||||
Start: 24,
|
||||
End: 30,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("けこ"),
|
||||
Type: analysis.Double,
|
||||
Position: 10,
|
||||
Start: 27,
|
||||
End: 33,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あいうえおabんcかきくけ こ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("いう"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("うえ"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("えお"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ab"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 17,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 7,
|
||||
Start: 17,
|
||||
End: 20,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 8,
|
||||
Start: 20,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("かき"),
|
||||
Type: analysis.Double,
|
||||
Position: 9,
|
||||
Start: 21,
|
||||
End: 27,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("きく"),
|
||||
Type: analysis.Double,
|
||||
Position: 10,
|
||||
Start: 24,
|
||||
End: 30,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("くけ"),
|
||||
Type: analysis.Double,
|
||||
Position: 11,
|
||||
Start: 27,
|
||||
End: 33,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 13,
|
||||
Start: 34,
|
||||
End: 37,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一 روبرت موير"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("روبرت"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 14,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("موير"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 15,
|
||||
End: 23,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一 رُوبرت موير"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("رُوبرت"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 16,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("موير"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 17,
|
||||
End: 25,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("𩬅艱鍟䇹愯瀛"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("𩬅艱"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("艱鍟"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 10,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("鍟䇹"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 7,
|
||||
End: 13,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("䇹愯"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 10,
|
||||
End: 16,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("愯瀛"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 13,
|
||||
End: 19,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一丁丂"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一丁"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("丁丂"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
for _, test := range tests {
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
166
analysis/language/cjk/cjk_bigram.go
Normal file
166
analysis/language/cjk/cjk_bigram.go
Normal file
|
@ -0,0 +1,166 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"container/ring"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
)
|
||||
|
||||
// BigramName is the name under which the CJK bigram filter is registered.
const BigramName = "cjk_bigram"

// CJKBigramFilter folds runs of adjacent ideographic tokens into
// overlapping bigrams, optionally emitting the unigrams as well.
type CJKBigramFilter struct {
	outputUnigram bool
}

// NewCJKBigramFilter returns a CJK bigram filter. When outputUnigram is
// true, each single-character token is emitted alongside the bigrams.
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
	return &CJKBigramFilter{outputUnigram: outputUnigram}
}
|
||||
|
||||
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
r := ring.New(2)
|
||||
itemsInRing := 0
|
||||
|
||||
rv := make(analysis.TokenStream, 0)
|
||||
|
||||
for _, token := range input {
|
||||
if token.Type == analysis.Ideographic {
|
||||
if itemsInRing > 0 {
|
||||
// if items already buffered
|
||||
// check to see if this is aligned
|
||||
curr := r.Value.(*analysis.Token)
|
||||
if token.Start-curr.End != 0 {
|
||||
// not aligned flush
|
||||
flushToken := s.flush(r, &itemsInRing)
|
||||
if flushToken != nil {
|
||||
rv = append(rv, flushToken)
|
||||
}
|
||||
}
|
||||
}
|
||||
// now we can add this token to the buffer
|
||||
r = r.Next()
|
||||
r.Value = token
|
||||
if itemsInRing < 2 {
|
||||
itemsInRing++
|
||||
}
|
||||
if itemsInRing > 1 && s.outputUnigram {
|
||||
unigram := s.buildUnigram(r, &itemsInRing)
|
||||
if unigram != nil {
|
||||
rv = append(rv, unigram)
|
||||
}
|
||||
}
|
||||
bigramToken := s.outputBigram(r, &itemsInRing)
|
||||
if bigramToken != nil {
|
||||
rv = append(rv, bigramToken)
|
||||
}
|
||||
} else {
|
||||
// flush anything already buffered
|
||||
flushToken := s.flush(r, &itemsInRing)
|
||||
if flushToken != nil {
|
||||
rv = append(rv, flushToken)
|
||||
}
|
||||
// output this token as is
|
||||
rv = append(rv, token)
|
||||
}
|
||||
}
|
||||
|
||||
// deal with possible trailing unigram
|
||||
if itemsInRing == 1 || s.outputUnigram {
|
||||
if itemsInRing == 2 {
|
||||
r = r.Next()
|
||||
}
|
||||
unigram := s.buildUnigram(r, &itemsInRing)
|
||||
if unigram != nil {
|
||||
rv = append(rv, unigram)
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int) *analysis.Token {
|
||||
var rv *analysis.Token
|
||||
if *itemsInRing == 1 {
|
||||
rv = s.buildUnigram(r, itemsInRing)
|
||||
}
|
||||
r.Value = nil
|
||||
*itemsInRing = 0
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
|
||||
if *itemsInRing == 2 {
|
||||
thisShingleRing := r.Move(-1)
|
||||
shingledBytes := make([]byte, 0)
|
||||
|
||||
// do first token
|
||||
prev := thisShingleRing.Value.(*analysis.Token)
|
||||
shingledBytes = append(shingledBytes, prev.Term...)
|
||||
|
||||
// do second token
|
||||
thisShingleRing = thisShingleRing.Next()
|
||||
curr := thisShingleRing.Value.(*analysis.Token)
|
||||
shingledBytes = append(shingledBytes, curr.Term...)
|
||||
|
||||
token := analysis.Token{
|
||||
Type: analysis.Double,
|
||||
Term: shingledBytes,
|
||||
Position: prev.Position,
|
||||
Start: prev.Start,
|
||||
End: curr.End,
|
||||
}
|
||||
return &token
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int) *analysis.Token {
|
||||
if *itemsInRing == 2 {
|
||||
thisShingleRing := r.Move(-1)
|
||||
// do first token
|
||||
prev := thisShingleRing.Value.(*analysis.Token)
|
||||
token := analysis.Token{
|
||||
Type: analysis.Single,
|
||||
Term: prev.Term,
|
||||
Position: prev.Position,
|
||||
Start: prev.Start,
|
||||
End: prev.End,
|
||||
}
|
||||
return &token
|
||||
} else if *itemsInRing == 1 {
|
||||
// do first token
|
||||
prev := r.Value.(*analysis.Token)
|
||||
token := analysis.Token{
|
||||
Type: analysis.Single,
|
||||
Term: prev.Term,
|
||||
Position: prev.Position,
|
||||
Start: prev.Start,
|
||||
End: prev.End,
|
||||
}
|
||||
return &token
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
outputUnigram := false
|
||||
outVal, ok := config["output_unigram"].(bool)
|
||||
if ok {
|
||||
outputUnigram = outVal
|
||||
}
|
||||
return NewCJKBigramFilter(outputUnigram), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)
|
||||
}
|
420
analysis/language/cjk/cjk_bigram_test.go
Normal file
420
analysis/language/cjk/cjk_bigram_test.go
Normal file
|
@ -0,0 +1,420 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
// except in compliance with the License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
// either express or implied. See the License for the specific language governing permissions
|
||||
// and limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
)
|
||||
|
||||
func TestCJKBigramFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
outputUnigram bool
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 5,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 2,
|
||||
Start: 5,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は世"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: true,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Single,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Single,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Single,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は世"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Single,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Single,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 24,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
cjkBigramFilter := NewCJKBigramFilter(test.outputUnigram)
|
||||
actual := cjkBigramFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -26,6 +26,8 @@ const (
|
|||
Numeric
|
||||
DateTime
|
||||
Shingle
|
||||
Single
|
||||
Double
|
||||
)
|
||||
|
||||
type Token struct {
|
||||
|
|
Loading…
Reference in New Issue
Block a user