0
0

elision_filter: correctly strip multi-byte quotation marks

This commit is contained in:
Patrick Mezard 2015-11-04 10:59:10 +01:00
parent bae2079eb2
commit eb26402924
2 changed files with 28 additions and 14 deletions

View File

@ -10,8 +10,8 @@
package elision_filter package elision_filter
import ( import (
"bytes"
"fmt" "fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
@ -19,10 +19,8 @@ import (
const Name = "elision" const Name = "elision"
const RightSingleQuotationMark = "’" const RightSingleQuotationMark = '’'
const Apostrophe = "'" const Apostrophe = '\''
const Apostrophes = Apostrophe + RightSingleQuotationMark
type ElisionFilter struct { type ElisionFilter struct {
articles analysis.TokenMap articles analysis.TokenMap
@ -36,15 +34,19 @@ func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream { func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input { for _, token := range input {
firstApostrophe := bytes.IndexAny(token.Term, Apostrophes) term := token.Term
if firstApostrophe >= 0 { for i := 0; i < len(term); {
// found an apostrophe r, size := utf8.DecodeRune(term[i:])
prefix := token.Term[0:firstApostrophe] if r == Apostrophe || r == RightSingleQuotationMark {
// see if the prefix matches one of the articles // see if the prefix matches one of the articles
_, articleMatch := s.articles[string(prefix)] prefix := term[0:i]
if articleMatch { _, articleMatch := s.articles[string(prefix)]
token.Term = token.Term[firstApostrophe+1:] if articleMatch {
token.Term = term[i+size:]
break
}
} }
i += size
} }
} }
return input return input

View File

@ -27,7 +27,19 @@ func TestElisionFilter(t *testing.T) {
{ {
input: analysis.TokenStream{ input: analysis.TokenStream{
&analysis.Token{ &analysis.Token{
Term: []byte("ar'word"), Term: []byte("ar" + string(Apostrophe) + "word"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("word"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
}, },
}, },
output: analysis.TokenStream{ output: analysis.TokenStream{