diff --git a/analysis/language/ar/analyzer_ar.go b/analysis/language/ar/analyzer_ar.go index d05aed22..1d7149af 100644 --- a/analysis/language/ar/analyzer_ar.go +++ b/analysis/language/ar/analyzer_ar.go @@ -14,6 +14,7 @@ import ( "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter" + "github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize" "github.com/blevesearch/bleve/analysis/tokenizers/unicode" ) @@ -28,6 +29,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) ( if err != nil { return nil, err } + normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC) stopArFilter, err := cache.TokenFilterNamed(StopName) if err != nil { return nil, err @@ -44,6 +46,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) ( Tokenizer: tokenizer, TokenFilters: []analysis.TokenFilter{ toLowerFilter, + normalizeFilter, stopArFilter, normalizeArFilter, stemmerArFilter, diff --git a/analysis/language/ar/analyzer_ar_test.go b/analysis/language/ar/analyzer_ar_test.go index 5b4cce36..ecade6b1 100644 --- a/analysis/language/ar/analyzer_ar_test.go +++ b/analysis/language/ar/analyzer_ar_test.go @@ -150,6 +150,18 @@ func TestArabicAnalyzer(t *testing.T) { }, }, }, + // presentation form normalization + { + input: []byte("ﺍﻟﺴﻼﻢ"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("سلام"), + Position: 1, + Start: 0, + End: 15, + }, + }, + }, } cache := registry.NewCache()