Arabic stemmer: strip multiple suffixes

updates #150
2015-02-05 16:07:58 +03:00 · 2015-02-05 16:07:58 +03:00 · e461fed92a
commit e461fed92a
parent 4be974f489
2 changed files with 20 additions and 20 deletions
--- a/analysis/language/ar/analyzer_ar_test.go
+++ b/analysis/language/ar/analyzer_ar_test.go
@@ -69,18 +69,17 @@ func TestArabicAnalyzer(t *testing.T) {
 			},
 		},
 		// plural -in
-		// currently fails
-		// {
-		// 	input: []byte("أمريكيين"),
-		// 	output: analysis.TokenStream{
-		// 		&analysis.Token{
-		// 			Term:     []byte("امريك"),
-		// 			Position: 1,
-		// 			Start:    0,
-		// 			End:      16,
-		// 		},
-		// 	},
-		// },
+		{
+			input: []byte("أمريكيين"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("امريك"),
+					Position: 1,
+					Start:    0,
+					End:      16,
+				},
+			},
+		},
 		// singular with bare alif
 		{
 			input: []byte("امريكي"),
--- a/analysis/language/ar/stemmer_ar.go
+++ b/analysis/language/ar/stemmer_ar.go
@@ -29,16 +29,16 @@ var prefixes = [][]byte{
 	[]byte("و"),
 }
 var suffixes = [][]byte{
-	[]byte("ه"),
-	[]byte("ة"),
+	[]byte("ها"),
+	[]byte("ان"),
+	[]byte("ات"),
+	[]byte("ون"),
+	[]byte("ين"),
 	[]byte("يه"),
 	[]byte("ية"),
-	[]byte("ها"),
+	[]byte("ه"),
+	[]byte("ة"),
 	[]byte("ي"),
-	[]byte("ان"),
-	[]byte("ين"),
-	[]byte("ون"),
-	[]byte("ات"),
 }

 type ArabicStemmerFilter struct{}
@@ -56,16 +56,17 @@ func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenS
 }

 func stem(input []byte) []byte {
+	// Strip a single prefix.
 	for _, p := range prefixes {
 		if bytes.HasPrefix(input, p) {
 			input = input[len(p):]
 			break
 		}
 	}
+	// Strip every matching suffix, testing each one once in the order of the suffixes slice.
 	for _, s := range suffixes {
 		if bytes.HasSuffix(input, s) {
 			input = input[:len(input)-len(s)]
-			break
 		}
 	}
 	return input