bleve/analysis/tokenizers/cld2/cld2_tokenizer_test.go

package cld2

import (
	"reflect"
	"testing"

	"github.com/couchbaselabs/bleve/analysis"
)

// TestCld2Tokenizer verifies that the CLD2 tokenizer emits a single token
// whose term is the detected language code for each input text.
func TestCld2Tokenizer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("the quick brown fox"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("en"),
					Start:    0,
					End:      2,
					Position: 1,
				},
			},
		},
		{
			input: []byte("こんにちは世界"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("ja"),
					Start:    0,
					End:      2,
					Position: 1,
				},
			},
		},
		{
			input: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("th"),
					Start:    0,
					End:      2,
					Position: 1,
				},
			},
		},
		{
			input: []byte("مرحبا، العالم!"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("ar"),
					Start:    0,
					End:      2,
					Position: 1,
				},
			},
		},
	}

	tokenizer := NewCld2Tokenizer()
	for _, test := range tests {
		res := tokenizer.Tokenize(test.input)
		if !reflect.DeepEqual(res, test.output) {
			t.Errorf("expected:")
			for _, token := range test.output {
				t.Errorf("%#v - %s", token, token.Term)
			}
			t.Errorf("got:")
			for _, token := range res {
				t.Errorf("%#v - %s", token, token.Term)
			}
		}
	}
}