From 0470f93955d942544f12898a02900e223e2ccb1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Salm=C4=81n=20Aljamm=C4=81z?=
Date: Fri, 6 Feb 2015 17:43:03 +0300
Subject: [PATCH 1/3] arabic: add more stemmer tests

These came from org.apache.lucene.analysis.ar.
---
 analysis/language/ar/stemmer_ar_test.go | 275 +++++++++++++++++++++++-
 1 file changed, 274 insertions(+), 1 deletion(-)

diff --git a/analysis/language/ar/stemmer_ar_test.go b/analysis/language/ar/stemmer_ar_test.go
index 23684a8d..62ed6ef1 100644
--- a/analysis/language/ar/stemmer_ar_test.go
+++ b/analysis/language/ar/stemmer_ar_test.go
@@ -21,6 +21,279 @@ func TestArabicStemmerFilter(t *testing.T) {
 		input  analysis.TokenStream
 		output analysis.TokenStream
 	}{
+		// AlPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("الحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// WalPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("والحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// BalPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("بالحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// KalPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("كالحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// FalPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("فالحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// LlPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("للاخر"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("اخر"),
+				},
+			},
+		},
+		// WaPrefix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("وحسن"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("حسن"),
+				},
+			},
+		},
+		// AhSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("زوجها"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("زوج"),
+				},
+			},
+		},
+		// AnSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدان"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// AtSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدات"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// WnSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدون"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// YnSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدين"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// YhSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهديه"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// YpSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدية"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// HSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهده"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// PSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدة"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// YSuffix
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدي"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// ComboPrefSuf
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("وساهدون"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// ComboSuf
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهدهات"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ساهد"),
+				},
+			},
+		},
+		// ShouldntStem
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("الو"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("الو"),
+				},
+			},
+		},
+		// NonArabic
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("English"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("English"),
+				},
+			},
+		},
 		{
 			input: analysis.TokenStream{
 				&analysis.Token{
@@ -93,7 +93,7 @@ func TestArabicStemmerFilter(t *testing.T) {
 				},
 			},
 		},
-		// empty
+		// Empty
 		{
 			input: analysis.TokenStream{
 				&analysis.Token{

From 91a8d5da9f01148166142f0c0c8afadee738962e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Salm=C4=81n=20Aljamm=C4=81z?=
Date: Fri, 6 Feb 2015 19:23:49 +0300
Subject: [PATCH 2/3] arabic: check minimum length before stemming

This involves converting tokens to a rune slice in the filter, but at
least we're now compatible with Lucene's stemmer.
---
 analysis/language/ar/stemmer_ar.go | 80 +++++++++++++++++++++---------
 1 file changed, 56 insertions(+), 24 deletions(-)

diff --git a/analysis/language/ar/stemmer_ar.go b/analysis/language/ar/stemmer_ar.go
index c6047291..c76757b4 100644
--- a/analysis/language/ar/stemmer_ar.go
+++ b/analysis/language/ar/stemmer_ar.go
@@ -19,26 +19,26 @@ import (
 const StemmerName = "stemmer_ar"
 
 // These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer
-var prefixes = [][]byte{
-	[]byte("ال"),
-	[]byte("وال"),
-	[]byte("بال"),
-	[]byte("كال"),
-	[]byte("فال"),
-	[]byte("لل"),
-	[]byte("و"),
+var prefixes = [][]rune{
+	[]rune("ال"),
+	[]rune("وال"),
+	[]rune("بال"),
+	[]rune("كال"),
+	[]rune("فال"),
+	[]rune("لل"),
+	[]rune("و"),
 }
 
-var suffixes = [][]byte{
-	[]byte("ها"),
-	[]byte("ان"),
-	[]byte("ات"),
-	[]byte("ون"),
-	[]byte("ين"),
-	[]byte("يه"),
-	[]byte("ية"),
-	[]byte("ه"),
-	[]byte("ة"),
-	[]byte("ي"),
+var suffixes = [][]rune{
+	[]rune("ها"),
+	[]rune("ان"),
+	[]rune("ات"),
+	[]rune("ون"),
+	[]rune("ين"),
+	[]rune("يه"),
+	[]rune("ية"),
+	[]rune("ه"),
+	[]rune("ة"),
+	[]rune("ي"),
 }
 
 type ArabicStemmerFilter struct{}
@@ -55,21 +55,53 @@ func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenS
 	return input
 }
 
+func canStemPrefix(input, prefix []rune) bool {
+	// The Wa- prefix requires at least 3 characters remaining after stemming.
+	if len(prefix) == 1 && len(input) < 4 {
+		return false
+	}
+	// Other prefixes require only 2.
+	if len(input)-len(prefix) < 2 {
+		return false
+	}
+	for i := range prefix {
+		if prefix[i] != input[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func canStemSuffix(input, suffix []rune) bool {
+	// All suffixes require at least 2 characters after stemming.
+	if len(input)-len(suffix) < 2 {
+		return false
+	}
+	stemEnd := len(input) - len(suffix)
+	for i := range suffix {
+		if suffix[i] != input[stemEnd+i] {
+			return false
+		}
+	}
+	return true
+}
+
 func stem(input []byte) []byte {
+	runes := bytes.Runes(input)
 	// Strip a single prefix.
 	for _, p := range prefixes {
-		if bytes.HasPrefix(input, p) {
-			input = input[len(p):]
+		if canStemPrefix(runes, p) {
+			runes = runes[len(p):]
 			break
 		}
 	}
 	// Strip off multiple suffixes, in their order in the suffixes array.
 	for _, s := range suffixes {
-		if bytes.HasSuffix(input, s) {
-			input = input[:len(input)-len(s)]
+		if canStemSuffix(runes, s) {
+			runes = runes[:len(runes)-len(s)]
 		}
 	}
-	return input
+	return analysis.BuildTermFromRunes(runes)
 }
 
 func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {

From 9444af9366846b9c150656dbbcff70c906874c27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Salm=C4=81n=20Aljamm=C4=81z?=
Date: Fri, 6 Feb 2015 19:48:06 +0300
Subject: [PATCH 3/3] arabic: add unicode normalization to analyzer

---
 analysis/language/ar/analyzer_ar.go      |  3 +++
 analysis/language/ar/analyzer_ar_test.go | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/analysis/language/ar/analyzer_ar.go b/analysis/language/ar/analyzer_ar.go
index d05aed22..1d7149af 100644
--- a/analysis/language/ar/analyzer_ar.go
+++ b/analysis/language/ar/analyzer_ar.go
@@ -14,6 +14,7 @@ import (
 	"github.com/blevesearch/bleve/registry"
 
 	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
 	"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
 )
 
@@ -28,6 +29,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
 	if err != nil {
 		return nil, err
 	}
+	normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
 	stopArFilter, err := cache.TokenFilterNamed(StopName)
 	if err != nil {
 		return nil, err
@@ -44,6 +46,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
 		Tokenizer: tokenizer,
 		TokenFilters: []analysis.TokenFilter{
 			toLowerFilter,
+			normalizeFilter,
 			stopArFilter,
 			normalizeArFilter,
 			stemmerArFilter,
diff --git a/analysis/language/ar/analyzer_ar_test.go b/analysis/language/ar/analyzer_ar_test.go
index 5b4cce36..ecade6b1 100644
--- a/analysis/language/ar/analyzer_ar_test.go
+++ b/analysis/language/ar/analyzer_ar_test.go
@@ -150,6 +150,18 @@ func TestArabicAnalyzer(t *testing.T) {
 			},
 		},
 	},
+		// presentation form normalization
+		{
+			input: []byte("ﺍﻟﺴﻼﻢ"),
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term:     []byte("سلام"),
+					Position: 1,
+					Start:    0,
+					End:      15,
+				},
+			},
+		},
 	}
 
 	cache := registry.NewCache()
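
A note on patch 2: the switch from byte slices to rune slices is what makes
the minimum-length rules enforceable, because every Arabic letter is two
bytes in UTF-8. A minimal standalone sketch of this (plain Go, not part of
the patches; the words come from the tests above):

package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	// Byte-based HasPrefix/HasSuffix checks cannot enforce Lucene's
	// minimum *character* counts: "الو" (the ShouldntStem case) is
	// 6 bytes but only 3 runes.
	for _, w := range []string{"الو", "الحسن"} {
		fmt.Printf("%s: %d bytes, %d runes\n",
			w, len(w), utf8.RuneCountInString(w))
	}
}

Stripping the two-rune prefix "ال" from the three-rune "الو" would leave a
single rune, below the two-rune minimum, so canStemPrefix rejects it; the
old byte-based check would have stemmed it to "و".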
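
A note on patch 3: the new test input is spelled with Arabic
Presentation Forms-B codepoints, which the downstream stop, normalize,
and stemmer filters do not recognize. A sketch of the folding the new
NFKC filter performs, using golang.org/x/text/unicode/norm as a
stand-in for bleve's unicode_normalize filter (an assumption for
illustration only; the analyzer itself uses MustNewUnicodeNormalizeFilter):

package main

import (
	"fmt"

	"golang.org/x/text/unicode/norm"
)

func main() {
	// "ﺍﻟﺴﻼﻢ" uses presentation forms (U+FE8D, U+FEDF, U+FEB4,
	// U+FEFC, U+FEE2), 3 bytes each, hence End: 15 in the test.
	// NFKC folds them to the ordinary U+06xx letters, after which
	// the stemmer can strip "ال" as usual, leaving "سلام".
	in := "ﺍﻟﺴﻼﻢ"
	fmt.Println(norm.NFKC.String(in)) // السلام
}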