0
0
Fork 0

Handle the special Unicode final sigma at the end of a term when lowercasing.

When lowercasing, Σ maps to σ, except at the end of a word, where it maps
to ς (final sigma). This is the only conditional (context-sensitive) but
language-independent case mapping in Unicode.
This commit is contained in:
Michael Nitschinger 2016-10-11 12:31:59 +02:00
parent ff35d75aa4
commit 7e656dad32
2 changed files with 13 additions and 0 deletions

View File

@ -75,6 +75,13 @@ func toLowerDeferredCopy(s []byte) []byte {
continue
}
// Handle the Unicode special case for the Greek capital sigma:
// Σ lowercases to σ, except when it is the last rune of the
// input, where it becomes the final-sigma form ς.
if l == 'σ' && i + 2 == len(s) {
l = 'ς'
}
lwid := utf8.RuneLen(l)
if lwid > wid {
// utf-8 encoded replacement is wider

View File

@ -48,6 +48,9 @@ func TestLowerCaseFilter(t *testing.T) {
&analysis.Token{
Term: []byte("ȺȾCAT"),
},
&analysis.Token{
Term: []byte("ὈΔΥΣΣ"),
},
}
expectedTokenStream := analysis.TokenStream{
@ -69,6 +72,9 @@ func TestLowerCaseFilter(t *testing.T) {
&analysis.Token{
Term: []byte("ⱥⱦcat"),
},
&analysis.Token{
Term: []byte("ὀδυσς"),
},
}
filter := NewLowerCaseFilter()