elision_filter: correctly strip multi-byte quotation marks

2015-11-04 10:59:10 +01:00 · 2015-11-04 10:59:10 +01:00 · eb26402924
commit eb26402924
parent bae2079eb2
2 changed files with 28 additions and 14 deletions
--- a/analysis/token_filters/elision_filter/elision_filter.go
+++ b/analysis/token_filters/elision_filter/elision_filter.go
@ -10,8 +10,8 @@
 package elision_filter
 import (
 	"bytes"
 	"fmt"
 	"unicode/utf8"
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/registry"
@ -19,10 +19,8 @@ import (
 const Name = "elision"
-const RightSingleQuotationMark = "’"
+const RightSingleQuotationMark = '’'
-const Apostrophe = "'"
+const Apostrophe = '\''
 const Apostrophes = Apostrophe + RightSingleQuotationMark
 type ElisionFilter struct {
 	articles analysis.TokenMap
@ -36,15 +34,19 @@ func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
 func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 	for _, token := range input {
-		firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
+		term := token.Term
-		if firstApostrophe >= 0 {
+		for i := 0; i < len(term); {
-			// found an apostrophe
+			r, size := utf8.DecodeRune(term[i:])
-			prefix := token.Term[0:firstApostrophe]
+			if r == Apostrophe || r == RightSingleQuotationMark {
-			// see if the prefix matches one of the articles
+				// see if the prefix matches one of the articles
-			_, articleMatch := s.articles[string(prefix)]
+				prefix := term[0:i]
-			if articleMatch {
+				_, articleMatch := s.articles[string(prefix)]
-				token.Term = token.Term[firstApostrophe+1:]
+				if articleMatch {
 					token.Term = term[i+size:]
 					break
 				}
 			}
 			i += size
 		}
 	}
 	return input
--- a/analysis/token_filters/elision_filter/elision_filter_test.go
+++ b/analysis/token_filters/elision_filter/elision_filter_test.go
@ -27,7 +27,19 @@ func TestElisionFilter(t *testing.T) {
 		{
 			input: analysis.TokenStream{
 				&analysis.Token{
-					Term: []byte("ar'word"),
+					Term: []byte("ar" + string(Apostrophe) + "word"),
 				},
 			},
 			output: analysis.TokenStream{
 				&analysis.Token{
 					Term: []byte("word"),
 				},
 			},
 		},
 		{
 			input: analysis.TokenStream{
 				&analysis.Token{
 					Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
 				},
 			},
 			output: analysis.TokenStream{