Address special Unicode sigma at end of term when lowercasing.
Σ maps to σ, except at the end of a word, where it maps to ς. This is the only conditional (contextual) but language-independent case mapping in Unicode.
This commit is contained in:
parent
ff35d75aa4
commit
7e656dad32
|
@ -75,6 +75,13 @@ func toLowerDeferredCopy(s []byte) []byte {
|
|||
continue
|
||||
}
|
||||
|
||||
// Handles the Unicode edge-case where the last
|
||||
// rune in a word, when it is the Greek Σ, needs to be converted
|
||||
// differently.
|
||||
if l == 'σ' && i + 2 == len(s) {
|
||||
l = 'ς'
|
||||
}
|
||||
|
||||
lwid := utf8.RuneLen(l)
|
||||
if lwid > wid {
|
||||
// utf-8 encoded replacement is wider
|
||||
|
|
|
@ -48,6 +48,9 @@ func TestLowerCaseFilter(t *testing.T) {
|
|||
&analysis.Token{
|
||||
Term: []byte("ȺȾCAT"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ὈΔΥΣΣ"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
|
@ -69,6 +72,9 @@ func TestLowerCaseFilter(t *testing.T) {
|
|||
&analysis.Token{
|
||||
Term: []byte("ⱥⱦcat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ὀδυσς"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewLowerCaseFilter()
|
||||
|
|
Loading…
Reference in New Issue