0
0

arabic: check minimum length before stemming

This involves converting tokens to a rune slice in the filter, but
at least we're now compatible with Lucene's stemmer.
This commit is contained in:
Salmān Aljammāz 2015-02-06 19:23:49 +03:00
parent 0470f93955
commit 91a8d5da9f

View File

@ -19,26 +19,26 @@ import (
const StemmerName = "stemmer_ar" const StemmerName = "stemmer_ar"
// These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer // These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer
var prefixes = [][]byte{ var prefixes = [][]rune{
[]byte("ال"), []rune("ال"),
[]byte("وال"), []rune("وال"),
[]byte("بال"), []rune("بال"),
[]byte("كال"), []rune("كال"),
[]byte("فال"), []rune("فال"),
[]byte("لل"), []rune("لل"),
[]byte("و"), []rune("و"),
} }
var suffixes = [][]byte{ var suffixes = [][]rune{
[]byte("ها"), []rune("ها"),
[]byte("ان"), []rune("ان"),
[]byte("ات"), []rune("ات"),
[]byte("ون"), []rune("ون"),
[]byte("ين"), []rune("ين"),
[]byte("يه"), []rune("يه"),
[]byte("ية"), []rune("ية"),
[]byte("ه"), []rune("ه"),
[]byte("ة"), []rune("ة"),
[]byte("ي"), []rune("ي"),
} }
type ArabicStemmerFilter struct{} type ArabicStemmerFilter struct{}
@ -55,21 +55,53 @@ func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenS
return input return input
} }
// canStemPrefix reports whether prefix can be stripped from the front of
// input. Both arguments are rune slices, so every length below is counted
// in characters rather than bytes.
//
// It returns false when input is too short for the prefix to be removed,
// or when input does not begin with prefix.
func canStemPrefix(input, prefix []rune) bool {
	remaining := len(input) - len(prefix)
	switch {
	case len(prefix) == 1 && len(input) < 4:
		// A one-character prefix (the Wa- prefix) must leave at least
		// 3 characters behind.
		return false
	case remaining < 2:
		// Every other prefix must leave at least 2 characters behind.
		return false
	}
	// The guards above ensure input is longer than prefix, so indexing
	// input with prefix positions is always in range.
	for pos, want := range prefix {
		if input[pos] != want {
			return false
		}
	}
	return true
}
// canStemSuffix reports whether suffix can be stripped from the end of
// input. Both arguments are rune slices, so lengths are counted in
// characters rather than bytes.
//
// It returns false when fewer than 2 characters would remain after
// removing the suffix, or when input does not end with suffix.
func canStemSuffix(input, suffix []rune) bool {
	// Index in input where the candidate suffix would start; it is also
	// the number of characters that would be left after stemming.
	start := len(input) - len(suffix)
	if start < 2 {
		return false
	}
	tail := input[start:]
	for pos, want := range suffix {
		if tail[pos] != want {
			return false
		}
	}
	return true
}
func stem(input []byte) []byte { func stem(input []byte) []byte {
runes := bytes.Runes(input)
// Strip a single prefix. // Strip a single prefix.
for _, p := range prefixes { for _, p := range prefixes {
if bytes.HasPrefix(input, p) { if canStemPrefix(runes, p) {
input = input[len(p):] runes = runes[len(p):]
break break
} }
} }
// Strip off multiple suffixes, in their order in the suffixes array. // Strip off multiple suffixes, in their order in the suffixes array.
for _, s := range suffixes { for _, s := range suffixes {
if bytes.HasSuffix(input, s) { if canStemSuffix(runes, s) {
input = input[:len(input)-len(s)] runes = runes[:len(runes)-len(s)]
} }
} }
return input return analysis.BuildTermFromRunes(runes)
} }
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {