Merge pull request #470 from daschl/sigma
Address the special Unicode final sigma (ς) at the end of a term when lowercasing.
This commit is contained in:
commit
4160fb296f
|
@ -75,6 +75,13 @@ func toLowerDeferredCopy(s []byte) []byte {
|
|||
continue
|
||||
}
|
||||
|
||||
// Handles the Unicode edge-case where the last
|
||||
// rune in a word, when it is the Greek Σ, needs to be converted
|
||||
// differently.
|
||||
if l == 'σ' && i + 2 == len(s) {
|
||||
l = 'ς'
|
||||
}
|
||||
|
||||
lwid := utf8.RuneLen(l)
|
||||
if lwid > wid {
|
||||
// utf-8 encoded replacement is wider
|
||||
|
|
|
@ -48,6 +48,9 @@ func TestLowerCaseFilter(t *testing.T) {
|
|||
&analysis.Token{
|
||||
Term: []byte("ȺȾCAT"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ὈΔΥΣΣ"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
|
@ -69,6 +72,9 @@ func TestLowerCaseFilter(t *testing.T) {
|
|||
&analysis.Token{
|
||||
Term: []byte("ⱥⱦcat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ὀδυσς"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewLowerCaseFilter()
|
||||
|
|
Loading…
Reference in New Issue
Block a user