0
0

arabic stemmer: strip multiple suffixes

updates #150
This commit is contained in:
Salmān Aljammāz 2015-02-05 16:07:58 +03:00
parent 4be974f489
commit e461fed92a
2 changed files with 20 additions and 20 deletions

View File

@ -69,18 +69,17 @@ func TestArabicAnalyzer(t *testing.T) {
},
},
// plural -in
// currently fails
// {
// input: []byte("أمريكيين"),
// output: analysis.TokenStream{
// &analysis.Token{
// Term: []byte("امريك"),
// Position: 1,
// Start: 0,
// End: 16,
// },
// },
// },
{
input: []byte("أمريكيين"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("امريك"),
Position: 1,
Start: 0,
End: 16,
},
},
},
// singular with bare alif
{
input: []byte("امريكي"),

View File

@ -29,16 +29,16 @@ var prefixes = [][]byte{
[]byte("و"),
}
// suffixes lists the Arabic word endings, as UTF-8 byte slices, that stem
// compares against the end of a token with bytes.HasSuffix and trims off.
// stem ranges over this slice from first to last, so the position of an
// entry decides when it is tried.
// NOTE(review): several endings appear twice in this listing; that looks like
// the removed and added lines of a diff rendered together rather than an
// intentional list — confirm against the actual file before relying on it.
var suffixes = [][]byte{
[]byte("ه"),
[]byte("ة"),
[]byte("ها"),
[]byte("ان"),
[]byte("ات"),
[]byte("ون"),
[]byte("ين"),
[]byte("يه"),
[]byte("ية"),
[]byte("ها"),
[]byte("ه"),
[]byte("ة"),
[]byte("ي"),
[]byte("ان"),
[]byte("ين"),
[]byte("ون"),
[]byte("ات"),
}
// ArabicStemmerFilter is an empty struct (it has no fields) that serves as
// the receiver for the Filter method, which accepts an analysis.TokenStream.
// NOTE(review): presumably Filter runs stem over each token's term — the
// method body is not visible in this view; confirm.
type ArabicStemmerFilter struct{}
@ -56,16 +56,17 @@ func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenS
}
func stem(input []byte) []byte {
// Strip a single prefix.
for _, p := range prefixes {
if bytes.HasPrefix(input, p) {
input = input[len(p):]
break
}
}
// Strip off multiple suffixes, in their order in the suffixes array.
for _, s := range suffixes {
if bytes.HasSuffix(input, s) {
input = input[:len(input)-len(s)]
break
}
}
return input