bleve/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer_test.go

//  Copyright (c) 2014 Couchbase, Inc.
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
//  except in compliance with the License. You may obtain a copy of the License at
//    http://www.apache.org/licenses/LICENSE-2.0
//  Unless required by applicable law or agreed to in writing, software distributed under the
//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
//  either express or implied. See the License for the specific language governing permissions
//  and limitations under the License.

package regexp_tokenizer

import (
	"reflect"
	"regexp"
	"testing"

	"github.com/blevesearch/bleve/analysis"
)

// TestBoundary checks the token stream produced by a regexp tokenizer whose
// pattern matches either a single Han, Hangul, Hiragana or Katakana character
// or a run of word characters. It covers Latin text, Japanese text and empty
// input.
func TestBoundary(t *testing.T) {
	type boundaryCase struct {
		input  []byte
		output analysis.TokenStream
	}

	pattern := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)

	cases := []boundaryCase{
		{
			// The trailing period is not expected to appear in any token.
			input: []byte("Hello World."),
			output: analysis.TokenStream{
				{Start: 0, End: 5, Term: []byte("Hello"), Position: 1, Type: analysis.AlphaNumeric},
				{Start: 6, End: 11, Term: []byte("World"), Position: 2, Type: analysis.AlphaNumeric},
			},
		},
		{
			// One token per character; Start/End are byte offsets, and each
			// of these characters is three bytes long in UTF-8.
			input: []byte("こんにちは世界"),
			output: analysis.TokenStream{
				{Start: 0, End: 3, Term: []byte("こ"), Position: 1, Type: analysis.Ideographic},
				{Start: 3, End: 6, Term: []byte("ん"), Position: 2, Type: analysis.Ideographic},
				{Start: 6, End: 9, Term: []byte("に"), Position: 3, Type: analysis.Ideographic},
				{Start: 9, End: 12, Term: []byte("ち"), Position: 4, Type: analysis.Ideographic},
				{Start: 12, End: 15, Term: []byte("は"), Position: 5, Type: analysis.Ideographic},
				{Start: 15, End: 18, Term: []byte("世"), Position: 6, Type: analysis.Ideographic},
				{Start: 18, End: 21, Term: []byte("界"), Position: 7, Type: analysis.Ideographic},
			},
		},
		{
			// Empty input must yield an empty, non-nil stream:
			// reflect.DeepEqual treats a nil slice and an empty slice as
			// different values.
			input:  []byte(""),
			output: analysis.TokenStream{},
		},
	}

	for i := range cases {
		tc := cases[i]

		tk := NewRegexpTokenizer(pattern)
		got := tk.Tokenize(tc.input)

		if reflect.DeepEqual(got, tc.output) {
			continue
		}
		t.Errorf("Expected %v, got %v for %s", tc.output, got, string(tc.input))
	}
}

// TestBugProducingEmptyTokens tokenizes with a pattern that uses `*` and can
// therefore match the empty string. The expected stream contains only the
// three non-empty words of the input, with no zero-length tokens.
func TestBugProducingEmptyTokens(t *testing.T) {
	type emptyTokenCase struct {
		input  []byte
		output analysis.TokenStream
	}

	pattern := regexp.MustCompile(`[0-9a-zA-Z_]*`)

	cases := []emptyTokenCase{
		{
			// Spaces and the trailing period lie between or after the
			// expected tokens and are not expected to produce tokens.
			input: []byte("Chatha Edwards Sr."),
			output: analysis.TokenStream{
				{Start: 0, End: 6, Term: []byte("Chatha"), Position: 1, Type: analysis.AlphaNumeric},
				{Start: 7, End: 14, Term: []byte("Edwards"), Position: 2, Type: analysis.AlphaNumeric},
				{Start: 15, End: 17, Term: []byte("Sr"), Position: 3, Type: analysis.AlphaNumeric},
			},
		},
	}

	for i := range cases {
		tc := cases[i]

		tk := NewRegexpTokenizer(pattern)
		got := tk.Tokenize(tc.input)

		if reflect.DeepEqual(got, tc.output) {
			continue
		}
		t.Errorf("Expected %v, got %v for %s", tc.output, got, string(tc.input))
	}
}
initial commit 2014-04-17 22:55:53 +02:00			`// Copyright (c) 2014 Couchbase, Inc.`
			`// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file`
			`// except in compliance with the License. You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`// Unless required by applicable law or agreed to in writing, software distributed under the`
			`// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,`
			`// either express or implied. See the License for the specific language governing permissions`
			`// and limitations under the License.`
add newline between license and package this avoids cluttering godocs with the license 2014-09-02 16:54:50 +02:00
introduced token type 2014-07-31 19:54:12 +02:00			`package regexp_tokenizer`
initial commit 2014-04-17 22:55:53 +02:00
			`import (`
			`"reflect"`
introduced token type 2014-07-31 19:54:12 +02:00			`"regexp"`
initial commit 2014-04-17 22:55:53 +02:00			`"testing"`

rename imports from couchbaselabs to blevesearch 2014-08-28 21:38:57 +02:00			`"github.com/blevesearch/bleve/analysis"`
initial commit 2014-04-17 22:55:53 +02:00			`)`

			`func TestBoundary(t *testing.T) {`

changed whitespace tokenizer to work better on cjk input now it will return each cjk character as a separate token this will pair well with a cjk bigram filter for indexing 2014-09-07 20:11:01 +02:00			wordRegex := regexp.MustCompile(`\p{Han}\|\p{Hangul}\|\p{Hiragana}\|\p{Katakana}\|\w+`)
introduced token type 2014-07-31 19:54:12 +02:00
initial commit 2014-04-17 22:55:53 +02:00			`tests := []struct {`
			`input []byte`
			`output analysis.TokenStream`
			`}{`
			`{`
			`[]byte("Hello World."),`
			`analysis.TokenStream{`
			`{`
introduced token type 2014-07-31 19:54:12 +02:00			`Start: 0,`
			`End: 5,`
			`Term: []byte("Hello"),`
			`Position: 1,`
			`Type: analysis.AlphaNumeric,`
initial commit 2014-04-17 22:55:53 +02:00			`},`
			`{`
introduced token type 2014-07-31 19:54:12 +02:00			`Start: 6,`
			`End: 11,`
			`Term: []byte("World"),`
			`Position: 2,`
			`Type: analysis.AlphaNumeric,`
initial commit 2014-04-17 22:55:53 +02:00			`},`
			`},`
			`},`
changed whitespace tokenizer to work better on cjk input now it will return each cjk character as a separate token this will pair well with a cjk bigram filter for indexing 2014-09-07 20:11:01 +02:00			`{`
			`[]byte("こんにちは世界"),`
			`analysis.TokenStream{`
			`{`
			`Start: 0,`
			`End: 3,`
			`Term: []byte("こ"),`
			`Position: 1,`
			`Type: analysis.Ideographic,`
			`},`
			`{`
			`Start: 3,`
			`End: 6,`
			`Term: []byte("ん"),`
			`Position: 2,`
			`Type: analysis.Ideographic,`
			`},`
			`{`
			`Start: 6,`
			`End: 9,`
			`Term: []byte("に"),`
			`Position: 3,`
			`Type: analysis.Ideographic,`
			`},`
			`{`
			`Start: 9,`
			`End: 12,`
			`Term: []byte("ち"),`
			`Position: 4,`
			`Type: analysis.Ideographic,`
			`},`
			`{`
			`Start: 12,`
			`End: 15,`
			`Term: []byte("は"),`
			`Position: 5,`
			`Type: analysis.Ideographic,`
			`},`
			`{`
			`Start: 15,`
			`End: 18,`
			`Term: []byte("世"),`
			`Position: 6,`
			`Type: analysis.Ideographic,`
			`},`
			`{`
			`Start: 18,`
			`End: 21,`
			`Term: []byte("界"),`
			`Position: 7,`
			`Type: analysis.Ideographic,`
			`},`
			`},`
			`},`
added test case clarifying whitespace tokenizer on empty input 2014-08-19 16:43:52 +02:00			`{`
			`[]byte(""),`
			`analysis.TokenStream{},`
			`},`
initial commit 2014-04-17 22:55:53 +02:00			`}`

			`for _, test := range tests {`
introduced token type 2014-07-31 19:54:12 +02:00			`tokenizer := NewRegexpTokenizer(wordRegex)`
initial commit 2014-04-17 22:55:53 +02:00			`actual := tokenizer.Tokenize(test.input)`

			`if !reflect.DeepEqual(actual, test.output) {`
			`t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))`
			`}`
			`}`
			`}`
fixed regexp tokenizers to not produce empty tokens 2016-09-14 22:22:20 +02:00
			`func TestBugProducingEmptyTokens(t *testing.T) {`

			wordRegex := regexp.MustCompile(`[0-9a-zA-Z_]*`)

			`tests := []struct {`
			`input []byte`
			`output analysis.TokenStream`
			`}{`
			`{`
			`[]byte("Chatha Edwards Sr."),`
			`analysis.TokenStream{`
			`{`
			`Start: 0,`
			`End: 6,`
			`Term: []byte("Chatha"),`
			`Position: 1,`
			`Type: analysis.AlphaNumeric,`
			`},`
			`{`
			`Start: 7,`
			`End: 14,`
			`Term: []byte("Edwards"),`
			`Position: 2,`
			`Type: analysis.AlphaNumeric,`
			`},`
			`{`
			`Start: 15,`
			`End: 17,`
			`Term: []byte("Sr"),`
			`Position: 3,`
			`Type: analysis.AlphaNumeric,`
			`},`
			`},`
			`},`
			`}`

			`for _, test := range tests {`
			`tokenizer := NewRegexpTokenizer(wordRegex)`
			`actual := tokenizer.Tokenize(test.input)`

			`if !reflect.DeepEqual(actual, test.output) {`
			`t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))`
			`}`
			`}`
			`}`