0
0

arabic: add unicode normalization to analyzer

This commit is contained in:
Salmān Aljammāz 2015-02-06 19:48:06 +03:00
parent 91a8d5da9f
commit 9444af9366
2 changed files with 15 additions and 0 deletions

View File

@@ -14,6 +14,7 @@ import (
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter" "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
"github.com/blevesearch/bleve/analysis/token_filters/unicode_normalize"
"github.com/blevesearch/bleve/analysis/tokenizers/unicode" "github.com/blevesearch/bleve/analysis/tokenizers/unicode"
) )
@@ -28,6 +29,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
if err != nil { if err != nil {
return nil, err return nil, err
} }
normalizeFilter := unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
stopArFilter, err := cache.TokenFilterNamed(StopName) stopArFilter, err := cache.TokenFilterNamed(StopName)
if err != nil { if err != nil {
return nil, err return nil, err
@@ -44,6 +46,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
Tokenizer: tokenizer, Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{ TokenFilters: []analysis.TokenFilter{
toLowerFilter, toLowerFilter,
normalizeFilter,
stopArFilter, stopArFilter,
normalizeArFilter, normalizeArFilter,
stemmerArFilter, stemmerArFilter,

View File

@@ -150,6 +150,18 @@ func TestArabicAnalyzer(t *testing.T) {
}, },
}, },
}, },
// presentation form normalization
{
input: []byte("ﺍﻟﺴﻼﻢ"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
Position: 1,
Start: 0,
End: 15,
},
},
},
} }
cache := registry.NewCache() cache := registry.NewCache()