elision_filter: correctly strip multi-byte quotation marks
This commit is contained in:
parent
bae2079eb2
commit
eb26402924
|
@ -10,8 +10,8 @@
|
||||||
package elision_filter
|
package elision_filter
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
"github.com/blevesearch/bleve/registry"
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
@ -19,10 +19,8 @@ import (
|
||||||
|
|
||||||
const Name = "elision"
|
const Name = "elision"
|
||||||
|
|
||||||
const RightSingleQuotationMark = "’"
|
const RightSingleQuotationMark = '’'
|
||||||
const Apostrophe = "'"
|
const Apostrophe = '\''
|
||||||
|
|
||||||
const Apostrophes = Apostrophe + RightSingleQuotationMark
|
|
||||||
|
|
||||||
type ElisionFilter struct {
|
type ElisionFilter struct {
|
||||||
articles analysis.TokenMap
|
articles analysis.TokenMap
|
||||||
|
@ -36,15 +34,19 @@ func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
|
||||||
|
|
||||||
func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||||
for _, token := range input {
|
for _, token := range input {
|
||||||
firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
|
term := token.Term
|
||||||
if firstApostrophe >= 0 {
|
for i := 0; i < len(term); {
|
||||||
// found an apostrophe
|
r, size := utf8.DecodeRune(term[i:])
|
||||||
prefix := token.Term[0:firstApostrophe]
|
if r == Apostrophe || r == RightSingleQuotationMark {
|
||||||
// see if the prefix matches one of the articles
|
// see if the prefix matches one of the articles
|
||||||
_, articleMatch := s.articles[string(prefix)]
|
prefix := term[0:i]
|
||||||
if articleMatch {
|
_, articleMatch := s.articles[string(prefix)]
|
||||||
token.Term = token.Term[firstApostrophe+1:]
|
if articleMatch {
|
||||||
|
token.Term = term[i+size:]
|
||||||
|
break
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
i += size
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return input
|
return input
|
||||||
|
|
|
@ -27,7 +27,19 @@ func TestElisionFilter(t *testing.T) {
|
||||||
{
|
{
|
||||||
input: analysis.TokenStream{
|
input: analysis.TokenStream{
|
||||||
&analysis.Token{
|
&analysis.Token{
|
||||||
Term: []byte("ar'word"),
|
Term: []byte("ar" + string(Apostrophe) + "word"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("word"),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
output: analysis.TokenStream{
|
output: analysis.TokenStream{
|
||||||
|
|
Loading…
Reference in New Issue
Block a user