Merge pull request #155 from saljam/ar_stemmer
Arabic stemmer: strip multiple suffixes
This commit is contained in:
commit
41cd64337b
@ -69,18 +69,17 @@ func TestArabicAnalyzer(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
// plural -in
|
// plural -in
|
||||||
// currently fails
|
{
|
||||||
// {
|
input: []byte("أمريكيين"),
|
||||||
// input: []byte("أمريكيين"),
|
output: analysis.TokenStream{
|
||||||
// output: analysis.TokenStream{
|
&analysis.Token{
|
||||||
// &analysis.Token{
|
Term: []byte("امريك"),
|
||||||
// Term: []byte("امريك"),
|
Position: 1,
|
||||||
// Position: 1,
|
Start: 0,
|
||||||
// Start: 0,
|
End: 16,
|
||||||
// End: 16,
|
},
|
||||||
// },
|
},
|
||||||
// },
|
},
|
||||||
// },
|
|
||||||
// singular with bare alif
|
// singular with bare alif
|
||||||
{
|
{
|
||||||
input: []byte("امريكي"),
|
input: []byte("امريكي"),
|
||||||
|
@ -29,16 +29,16 @@ var prefixes = [][]byte{
|
|||||||
[]byte("و"),
|
[]byte("و"),
|
||||||
}
|
}
|
||||||
var suffixes = [][]byte{
|
var suffixes = [][]byte{
|
||||||
[]byte("ه"),
|
[]byte("ها"),
|
||||||
[]byte("ة"),
|
[]byte("ان"),
|
||||||
|
[]byte("ات"),
|
||||||
|
[]byte("ون"),
|
||||||
|
[]byte("ين"),
|
||||||
[]byte("يه"),
|
[]byte("يه"),
|
||||||
[]byte("ية"),
|
[]byte("ية"),
|
||||||
[]byte("ها"),
|
[]byte("ه"),
|
||||||
|
[]byte("ة"),
|
||||||
[]byte("ي"),
|
[]byte("ي"),
|
||||||
[]byte("ان"),
|
|
||||||
[]byte("ين"),
|
|
||||||
[]byte("ون"),
|
|
||||||
[]byte("ات"),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type ArabicStemmerFilter struct{}
|
type ArabicStemmerFilter struct{}
|
||||||
@ -56,16 +56,17 @@ func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenS
|
|||||||
}
|
}
|
||||||
|
|
||||||
func stem(input []byte) []byte {
|
func stem(input []byte) []byte {
|
||||||
|
// Strip a single prefix.
|
||||||
for _, p := range prefixes {
|
for _, p := range prefixes {
|
||||||
if bytes.HasPrefix(input, p) {
|
if bytes.HasPrefix(input, p) {
|
||||||
input = input[len(p):]
|
input = input[len(p):]
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Strip off multiple suffixes, in the order they appear in the suffixes slice.
|
||||||
for _, s := range suffixes {
|
for _, s := range suffixes {
|
||||||
if bytes.HasSuffix(input, s) {
|
if bytes.HasSuffix(input, s) {
|
||||||
input = input[:len(input)-len(s)]
|
input = input[:len(input)-len(s)]
|
||||||
break
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return input
|
return input
|
||||||
|
Loading…
Reference in New Issue
Block a user