From e461fed92ae8b0070dd710bfd9f255242471f52f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Salm=C4=81n=20Aljamm=C4=81z?= Date: Thu, 5 Feb 2015 16:07:58 +0300 Subject: [PATCH] arabic stemmer: strip multiple suffixes updates #150 --- analysis/language/ar/analyzer_ar_test.go | 23 +++++++++++------------ analysis/language/ar/stemmer_ar.go | 17 +++++++++-------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/analysis/language/ar/analyzer_ar_test.go b/analysis/language/ar/analyzer_ar_test.go index 50fd36cf..5b4cce36 100644 --- a/analysis/language/ar/analyzer_ar_test.go +++ b/analysis/language/ar/analyzer_ar_test.go @@ -69,18 +69,17 @@ func TestArabicAnalyzer(t *testing.T) { }, }, // plural -in - // currently fails - // { - // input: []byte("أمريكيين"), - // output: analysis.TokenStream{ - // &analysis.Token{ - // Term: []byte("امريك"), - // Position: 1, - // Start: 0, - // End: 16, - // }, - // }, - // }, + { + input: []byte("أمريكيين"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("امريك"), + Position: 1, + Start: 0, + End: 16, + }, + }, + }, // singular with bare alif { input: []byte("امريكي"), diff --git a/analysis/language/ar/stemmer_ar.go b/analysis/language/ar/stemmer_ar.go index 6343746e..c6047291 100644 --- a/analysis/language/ar/stemmer_ar.go +++ b/analysis/language/ar/stemmer_ar.go @@ -29,16 +29,16 @@ var prefixes = [][]byte{ []byte("و"), } var suffixes = [][]byte{ - []byte("ه"), - []byte("ة"), + []byte("ها"), + []byte("ان"), + []byte("ات"), + []byte("ون"), + []byte("ين"), []byte("يه"), []byte("ية"), - []byte("ها"), + []byte("ه"), + []byte("ة"), []byte("ي"), - []byte("ان"), - []byte("ين"), - []byte("ون"), - []byte("ات"), } type ArabicStemmerFilter struct{} @@ -56,16 +56,17 @@ func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenS } func stem(input []byte) []byte { + // Strip a single prefix. 
for _, p := range prefixes { if bytes.HasPrefix(input, p) { input = input[len(p):] break } } + // Strip off multiple suffixes, in the order they appear in the suffixes slice. for _, s := range suffixes { if bytes.HasSuffix(input, s) { input = input[:len(input)-len(s)] - break } } return input