0f16eccd6b
The tokenizer named "exception" is configured with a list of regexp strings under the key "exceptions"; these regexps match sequences you want treated as a single token. Matching sequences are NOT sent to the underlying tokenizer. The "tokenizer" option names the tokenizer that should be used for processing all text regions not matching any exception. An example configuration with simple patterns to match URLs and email addresses: map[string]interface{}{ "type": "exception", "tokenizer": "unicode", "exceptions": []interface{}{ `[hH][tT][tT][pP][sS]?://(\S)*`, `[fF][iI][lL][eE]://(\S)*`, `[fF][tT][pP]://(\S)*`, `\S+@\S+`, } }
158 lines
3.4 KiB
Go
158 lines
3.4 KiB
Go
package exception
|
|
|
|
import (
|
|
"reflect"
|
|
"testing"
|
|
|
|
"github.com/blevesearch/bleve/analysis"
|
|
_ "github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
|
"github.com/blevesearch/bleve/registry"
|
|
)
|
|
|
|
func TestExceptionsTokenizer(t *testing.T) {
|
|
tests := []struct {
|
|
config map[string]interface{}
|
|
input []byte
|
|
patterns []string
|
|
result analysis.TokenStream
|
|
}{
|
|
{
|
|
input: []byte("test http://blevesearch.com/ words"),
|
|
config: map[string]interface{}{
|
|
"type": "exception",
|
|
"tokenizer": "unicode",
|
|
"exceptions": []interface{}{
|
|
`[hH][tT][tT][pP][sS]?://(\S)*`,
|
|
`[fF][iI][lL][eE]://(\S)*`,
|
|
`[fF][tT][pP]://(\S)*`,
|
|
},
|
|
},
|
|
result: analysis.TokenStream{
|
|
&analysis.Token{
|
|
Term: []byte("test"),
|
|
Position: 1,
|
|
Start: 0,
|
|
End: 4,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("http://blevesearch.com/"),
|
|
Position: 2,
|
|
Start: 5,
|
|
End: 28,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("words"),
|
|
Position: 3,
|
|
Start: 29,
|
|
End: 34,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
input: []byte("what ftp://blevesearch.com/ songs"),
|
|
config: map[string]interface{}{
|
|
"type": "exception",
|
|
"tokenizer": "unicode",
|
|
"exceptions": []interface{}{
|
|
`[hH][tT][tT][pP][sS]?://(\S)*`,
|
|
`[fF][iI][lL][eE]://(\S)*`,
|
|
`[fF][tT][pP]://(\S)*`,
|
|
},
|
|
},
|
|
result: analysis.TokenStream{
|
|
&analysis.Token{
|
|
Term: []byte("what"),
|
|
Position: 1,
|
|
Start: 0,
|
|
End: 4,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("ftp://blevesearch.com/"),
|
|
Position: 2,
|
|
Start: 5,
|
|
End: 27,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("songs"),
|
|
Position: 3,
|
|
Start: 28,
|
|
End: 33,
|
|
},
|
|
},
|
|
},
|
|
{
|
|
input: []byte("please email marty@couchbase.com the URL https://blevesearch.com/"),
|
|
config: map[string]interface{}{
|
|
"type": "exception",
|
|
"tokenizer": "unicode",
|
|
"exceptions": []interface{}{
|
|
`[hH][tT][tT][pP][sS]?://(\S)*`,
|
|
`[fF][iI][lL][eE]://(\S)*`,
|
|
`[fF][tT][pP]://(\S)*`,
|
|
`\S+@\S+`,
|
|
},
|
|
},
|
|
result: analysis.TokenStream{
|
|
&analysis.Token{
|
|
Term: []byte("please"),
|
|
Position: 1,
|
|
Start: 0,
|
|
End: 6,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("email"),
|
|
Position: 2,
|
|
Start: 7,
|
|
End: 12,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("marty@couchbase.com"),
|
|
Position: 3,
|
|
Start: 13,
|
|
End: 32,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("the"),
|
|
Position: 4,
|
|
Start: 33,
|
|
End: 36,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("URL"),
|
|
Position: 5,
|
|
Start: 37,
|
|
End: 40,
|
|
},
|
|
&analysis.Token{
|
|
Term: []byte("https://blevesearch.com/"),
|
|
Position: 6,
|
|
Start: 41,
|
|
End: 65,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
// remaining := unicode.NewUnicodeTokenizer()
|
|
for _, test := range tests {
|
|
|
|
// build the requested exception tokenizer
|
|
cache := registry.NewCache()
|
|
tokenizer, err := cache.DefineTokenizer("custom", test.config)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// pattern := strings.Join(test.patterns, "|")
|
|
// r, err := regexp.Compile(pattern)
|
|
// if err != nil {
|
|
// t.Fatal(err)
|
|
// }
|
|
// tokenizer := NewExceptionsTokenizer(r, remaining)
|
|
actual := tokenizer.Tokenize(test.input)
|
|
if !reflect.DeepEqual(actual, test.result) {
|
|
t.Errorf("expected %v, got %v", test.result, actual)
|
|
}
|
|
}
|
|
}
|