configured zero width non-joiner char filter and Persian analyzer
parent 4ccd69ed45
commit a4707ebb4e
@@ -55,3 +55,27 @@ func TestRegexpCharFilter(t *testing.T) {
 		}
 	}
 }
+
+func TestZeroWidthNonJoinerCharFilter(t *testing.T) {
+
+	zeroWidthNonJoinerPattern := `\x{200C}`
+	zeroWidthNonJoinerRegex := regexp.MustCompile(zeroWidthNonJoinerPattern)
+
+	tests := []struct {
+		input  []byte
+		output []byte
+	}{
+		{
+			input:  []byte("water\u200Cunder\u200Cthe\u200Cbridge"),
+			output: []byte("water under the bridge"),
+		},
+	}
+
+	for _, test := range tests {
+		filter := NewRegexpCharFilter(zeroWidthNonJoinerRegex, []byte{' '})
+		output := filter.Filter(test.input)
+		if !reflect.DeepEqual(output, test.output) {
+			t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input))
+		}
+	}
+}
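For context on what the new test asserts: U+200C is the zero width non-joiner (ZWNJ), an invisible character that suppresses the cursive join between adjacent letters. Below is a minimal standalone sketch of the same transformation using only the standard library; it assumes RegexpCharFilter.Filter amounts to a regex ReplaceAll over the input bytes, which is what the expected output in the test implies.

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// U+200C written with Go's regexp \x{...} escape, as in the test above.
	zwnj := regexp.MustCompile(`\x{200C}`)
	// Replace each ZWNJ with a single space (assumption: this mirrors
	// what NewRegexpCharFilter(zwnj, []byte{' '}) does to its input).
	out := zwnj.ReplaceAll([]byte("water\u200Cunder\u200Cthe\u200Cbridge"), []byte{' '})
	fmt.Printf("%s\n", out) // prints: water under the bridge
}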
@@ -180,6 +180,9 @@ func init() {
 	htmlCharFilterRegexp := regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
 	htmlCharFilter := regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, []byte{' '})
 	Config.Analysis.CharFilters["html"] = htmlCharFilter
+	zeroWidthNonJoinerRegexp := regexp.MustCompile(`\x{200C}`)
+	zeroWidthNonJoinerCharFilter := regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, []byte{' '})
+	Config.Analysis.CharFilters["zero_width_spaces"] = zeroWidthNonJoinerCharFilter

 	// register tokenizers
 	whitespaceTokenizerRegexp := regexp.MustCompile(`\w+`)
@@ -339,6 +342,8 @@ func init() {
 	Config.Analysis.Analyzers["th"] = thaiAnalyzer
 	soraniAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"normalize_ckb", "to_lower", "stop_token_ckb", "stemmer_ckb"})
 	Config.Analysis.Analyzers["ckb"] = soraniAnalyzer
+	persianAnalyzer := Config.MustBuildNewAnalyzer([]string{"zero_width_spaces"}, "unicode", []string{"to_lower", "normalize_ar", "normalize_fa", "stop_token_fa"})
+	Config.Analysis.Analyzers["fa"] = persianAnalyzer

 	// register ansi highlighter
 	Config.Highlight.Highlighters["ansi"] = search.NewSimpleHighlighter()
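A note on why the "fa" analyzer lists "zero_width_spaces" as its only char filter: Persian orthography uses ZWNJ inside compound and inflected words, so replacing it with a space before tokenization lets a word-based tokenizer see the parts as separate tokens. The sketch below illustrates that intent with the standard library only; it is not a call into this package's API, and the sample word is merely one common ZWNJ-joined form.

package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// "می\u200Cروم" (Persian "I go") joins two parts with a ZWNJ.
	zwnj := regexp.MustCompile(`\x{200C}`)
	filtered := zwnj.ReplaceAllString("می\u200Cروم", " ")
	// After filtering, whitespace-based tokenization yields two tokens.
	fmt.Println(strings.Fields(filtered)) // prints: [می روم]
}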