0
0

Merge pull request #470 from daschl/sigma

Address special unicode sigma at end of term when lowercasing.
This commit is contained in:
Marty Schoch 2016-10-12 14:03:17 +02:00 committed by GitHub
commit 4160fb296f
2 changed files with 13 additions and 0 deletions

View File

@ -75,6 +75,13 @@ func toLowerDeferredCopy(s []byte) []byte {
continue
}
// Handle the Unicode edge case for the Greek capital sigma (Σ):
// when it is the last rune of the term, it lowercases to the
// final form 'ς' instead of the regular 'σ'.
if l == 'σ' && i + 2 == len(s) {
l = 'ς'
}
lwid := utf8.RuneLen(l)
if lwid > wid {
// utf-8 encoded replacement is wider

View File

@ -48,6 +48,9 @@ func TestLowerCaseFilter(t *testing.T) {
&analysis.Token{
Term: []byte("ȺȾCAT"),
},
&analysis.Token{
Term: []byte("ὈΔΥΣΣ"),
},
}
expectedTokenStream := analysis.TokenStream{
@ -69,6 +72,9 @@ func TestLowerCaseFilter(t *testing.T) {
&analysis.Token{
Term: []byte("ⱥⱦcat"),
},
&analysis.Token{
Term: []byte("ὀδυσς"),
},
}
filter := NewLowerCaseFilter()