arabic: add Unicode normalization to analyzer
This commit is contained in:
parent
91a8d5da9f
commit
9444af9366
|
@ -14,6 +14,7 @@ import (
|
|||
"github.com/blevesearch/bleve/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||
"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
|
||||
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||
)
|
||||
|
||||
|
@ -28,6 +29,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
|||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
|
||||
stopArFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -44,6 +46,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
|||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
normalizeFilter,
|
||||
stopArFilter,
|
||||
normalizeArFilter,
|
||||
stemmerArFilter,
|
||||
|
|
|
@ -150,6 +150,18 @@ func TestArabicAnalyzer(t *testing.T) {
|
|||
},
|
||||
},
|
||||
},
|
||||
// Arabic presentation forms in the input are normalized to their base letters
|
||||
{
|
||||
input: []byte("ﺍﻟﺴﻼﻢ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 15,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
|
|
Loading…
Reference in New Issue
Block a user