0
0

arabic stemmer: strip multiple suffixes

updates #150
This commit is contained in:
Salmān Aljammāz 2015-02-05 16:07:58 +03:00
parent 4be974f489
commit e461fed92a
2 changed files with 20 additions and 20 deletions

View File

@ -69,18 +69,17 @@ func TestArabicAnalyzer(t *testing.T) {
}, },
}, },
// plural -in // plural -in
// currently fails {
// { input: []byte("أمريكيين"),
// input: []byte("أمريكيين"), output: analysis.TokenStream{
// output: analysis.TokenStream{ &analysis.Token{
// &analysis.Token{ Term: []byte("امريك"),
// Term: []byte("امريك"), Position: 1,
// Position: 1, Start: 0,
// Start: 0, End: 16,
// End: 16, },
// }, },
// }, },
// },
// singular with bare alif // singular with bare alif
{ {
input: []byte("امريكي"), input: []byte("امريكي"),

View File

@ -29,16 +29,16 @@ var prefixes = [][]byte{
[]byte("و"), []byte("و"),
} }
var suffixes = [][]byte{ var suffixes = [][]byte{
[]byte("ه"), []byte("ها"),
[]byte("ة"), []byte("ان"),
[]byte("ات"),
[]byte("ون"),
[]byte("ين"),
[]byte("يه"), []byte("يه"),
[]byte("ية"), []byte("ية"),
[]byte("ها"), []byte("ه"),
[]byte("ة"),
[]byte("ي"), []byte("ي"),
[]byte("ان"),
[]byte("ين"),
[]byte("ون"),
[]byte("ات"),
} }
type ArabicStemmerFilter struct{} type ArabicStemmerFilter struct{}
@ -56,16 +56,17 @@ func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenS
} }
func stem(input []byte) []byte { func stem(input []byte) []byte {
// Strip a single prefix.
for _, p := range prefixes { for _, p := range prefixes {
if bytes.HasPrefix(input, p) { if bytes.HasPrefix(input, p) {
input = input[len(p):] input = input[len(p):]
break break
} }
} }
// Strip off multiple suffixes, in their order in the suffixes array.
for _, s := range suffixes { for _, s := range suffixes {
if bytes.HasSuffix(input, s) { if bytes.HasSuffix(input, s) {
input = input[:len(input)-len(s)] input = input[:len(input)-len(s)]
break
} }
} }
return input return input