0
0

arabic: add unicode normalization to analyzer

This commit is contained in:
Salmān Aljammāz 2015-02-06 19:48:06 +03:00
parent 91a8d5da9f
commit 9444af9366
2 changed files with 15 additions and 0 deletions

View File

@@ -14,6 +14,7 @@ import (
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
)
@@ -28,6 +29,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
if err != nil {
return nil, err
}
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
stopArFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
@@ -44,6 +46,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
normalizeFilter,
stopArFilter,
normalizeArFilter,
stemmerArFilter,

View File

@@ -150,6 +150,18 @@ func TestArabicAnalyzer(t *testing.T) {
},
},
},
// presentation form normalization
{
input: []byte("ﺍﻟﺴﻼﻢ"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
Position: 1,
Start: 0,
End: 15,
},
},
},
}
cache := registry.NewCache()