elision_filter: correctly strip multi-byte quotation marks

2015-11-04 10:59:10 +01:00 · 2015-11-04 10:59:10 +01:00 · eb26402924
commit eb26402924
parent bae2079eb2
2 changed files with 28 additions and 14 deletions
--- a/analysis/token_filters/elision_filter/elision_filter.go
+++ b/analysis/token_filters/elision_filter/elision_filter.go
@ -10,8 +10,8 @@
 package elision_filter

 import (
-	"bytes"
 	"fmt"
+	"unicode/utf8"

 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
@ -19,10 +19,8 @@ import (

 const Name = "elision"

-const RightSingleQuotationMark = "’"
-const Apostrophe = "'"
-
-const Apostrophes = Apostrophe + RightSingleQuotationMark
+const RightSingleQuotationMark = '’'
+const Apostrophe = '\''

 type ElisionFilter struct {
 	articles analysis.TokenMap
@ -36,15 +34,19 @@ func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {

 func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	for _, token := range input {
-		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
-		if firstApostrophe >= 0 {
-			// found an apostrophe
-			prefix := token.Term[0:firstApostrophe]
-			// see if the prefix matches one of the articles
-			_, articleMatch := s.articles[string(prefix)]
-			if articleMatch {
-				token.Term = token.Term[firstApostrophe+1:]
+		term := token.Term
+		for i := 0; i < len(term); {
+			r, size := utf8.DecodeRune(term[i:])
+			if r == Apostrophe || r == RightSingleQuotationMark {
+				// see if the prefix matches one of the articles
+				prefix := term[0:i]
+				_, articleMatch := s.articles[string(prefix)]
+				if articleMatch {
+					token.Term = term[i+size:]
+					break
+				}
 			}
+			i += size
 		}
 	}
 	return input
--- a/analysis/token_filters/elision_filter/elision_filter_test.go
+++ b/analysis/token_filters/elision_filter/elision_filter_test.go
@ -27,7 +27,19 @@ func TestElisionFilter(t *testing.T) {
 		{
 			input: analysis.TokenStream{
 				&analysis.Token{
-					Term: []byte("ar'word"),
+					Term: []byte("ar" + string(Apostrophe) + "word"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("word"),
+				},
+			},
+		},
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
 				},
 			},
 			output: analysis.TokenStream{