arabic: add unicode normalization to analyzer
This commit is contained in:
parent
91a8d5da9f
commit
9444af9366
|
@ -14,6 +14,7 @@ import (
|
||||||
"github.com/blevesearch/bleve/registry"
|
"github.com/blevesearch/bleve/registry"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
|
||||||
|
"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
|
||||||
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -28,6 +29,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
|
||||||
stopArFilter, err := cache.TokenFilterNamed(StopName)
|
stopArFilter, err := cache.TokenFilterNamed(StopName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -44,6 +46,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
|
||||||
Tokenizer: tokenizer,
|
Tokenizer: tokenizer,
|
||||||
TokenFilters: []analysis.TokenFilter{
|
TokenFilters: []analysis.TokenFilter{
|
||||||
toLowerFilter,
|
toLowerFilter,
|
||||||
|
normalizeFilter,
|
||||||
stopArFilter,
|
stopArFilter,
|
||||||
normalizeArFilter,
|
normalizeArFilter,
|
||||||
stemmerArFilter,
|
stemmerArFilter,
|
||||||
|
|
|
@ -150,6 +150,18 @@ func TestArabicAnalyzer(t *testing.T) {
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
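// Arabic presentation-form code points (positional glyphs and the lam-alef ligature) must be folded to their base letters by the NFKC normalize filter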
|
||||||
|
{
|
||||||
|
input: []byte("ﺍﻟﺴﻼﻢ"),
|
||||||
|
output: analysis.TokenStream{
|
||||||
|
&analysis.Token{
|
||||||
|
Term: []byte("سلام"),
|
||||||
|
Position: 1,
|
||||||
|
Start: 0,
|
||||||
|
End: 15,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
cache := registry.NewCache()
|
cache := registry.NewCache()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user