0
0
Fork 0

fixed regexp tokenizers to not produce empty tokens

This commit is contained in:
Marty Schoch 2016-09-14 16:22:20 -04:00
parent d01ff4ad8a
commit ffee3c3764
2 changed files with 56 additions and 8 deletions

View File

@ -34,17 +34,19 @@ func NewRegexpTokenizer(r *regexp.Regexp) *RegexpTokenizer {
func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
matches := rt.r.FindAllIndex(input, -1)
rv := make(analysis.TokenStream, len(matches))
rv := make(analysis.TokenStream, 0, len(matches))
for i, match := range matches {
matchBytes := input[match[0]:match[1]]
token := analysis.Token{
Term: matchBytes,
Start: match[0],
End: match[1],
Position: i + 1,
Type: detectTokenType(matchBytes),
if match[1]-match[0] > 0 {
token := analysis.Token{
Term: matchBytes,
Start: match[0],
End: match[1],
Position: i + 1,
Type: detectTokenType(matchBytes),
}
rv = append(rv, &token)
}
rv[i] = &token
}
return rv
}

View File

@ -113,3 +113,49 @@ func TestBoundary(t *testing.T) {
}
}
}
// TestBugProducingEmptyTokens is a regression test: a pattern that can match
// the empty string (note the `*` quantifier) must not yield empty tokens.
func TestBugProducingEmptyTokens(t *testing.T) {
	wordRegex := regexp.MustCompile(`[0-9a-zA-Z_]*`)
	cases := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input: []byte("Chatha Edwards Sr."),
			output: analysis.TokenStream{
				{
					Term:     []byte("Chatha"),
					Start:    0,
					End:      6,
					Position: 1,
					Type:     analysis.AlphaNumeric,
				},
				{
					Term:     []byte("Edwards"),
					Start:    7,
					End:      14,
					Position: 2,
					Type:     analysis.AlphaNumeric,
				},
				{
					Term:     []byte("Sr"),
					Start:    15,
					End:      17,
					Position: 3,
					Type:     analysis.AlphaNumeric,
				},
			},
		},
	}
	for _, tc := range cases {
		tokenizer := NewRegexpTokenizer(wordRegex)
		actual := tokenizer.Tokenize(tc.input)
		if !reflect.DeepEqual(actual, tc.output) {
			t.Errorf("Expected %v, got %v for %s", tc.output, actual, string(tc.input))
		}
	}
}