0
0
Fork 0

Configured the zero-width non-joiner char filter and the Persian analyzer

This commit is contained in:
Marty Schoch 2014-08-11 18:57:04 -04:00
parent 4ccd69ed45
commit a4707ebb4e
2 changed files with 29 additions and 0 deletions

View File

@ -55,3 +55,27 @@ func TestRegexpCharFilter(t *testing.T) {
}
}
}
// TestZeroWidthNonJoinerCharFilter builds a regexp char filter from the
// U+200C (zero-width non-joiner) pattern with a single-space replacement and
// expects every occurrence in the input to come back as a plain space.
func TestZeroWidthNonJoinerCharFilter(t *testing.T) {
	zwnjRegex := regexp.MustCompile(`\x{200C}`)

	// filterCase pairs raw input bytes with the bytes expected after filtering.
	type filterCase struct {
		input  []byte
		output []byte
	}
	cases := []filterCase{
		{
			input:  []byte("water\u200Cunder\u200Cthe\u200Cbridge"),
			output: []byte("water under the bridge"),
		},
	}

	for i := range cases {
		tc := cases[i]
		// A fresh filter is constructed for each case, as in the original test.
		got := NewRegexpCharFilter(zwnjRegex, []byte{' '}).Filter(tc.input)
		if reflect.DeepEqual(got, tc.output) {
			continue
		}
		t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(tc.output), string(got), string(tc.input))
	}
}

View File

@ -180,6 +180,9 @@ func init() {
htmlCharFilterRegexp := regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
htmlCharFilter := regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, []byte{' '})
Config.Analysis.CharFilters["html"] = htmlCharFilter
zeroWidthNonJoinerRegexp := regexp.MustCompile(`\x{200C}`)
zeroWidthNonJoinerCharFilter := regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, []byte{' '})
Config.Analysis.CharFilters["zero_width_spaces"] = zeroWidthNonJoinerCharFilter
// register tokenizers
whitespaceTokenizerRegexp := regexp.MustCompile(`\w+`)
@ -339,6 +342,8 @@ func init() {
Config.Analysis.Analyzers["th"] = thaiAnalyzer
soraniAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"normalize_ckb", "to_lower", "stop_token_ckb", "stemmer_ckb"})
Config.Analysis.Analyzers["ckb"] = soraniAnalyzer
persianAnalyzer := Config.MustBuildNewAnalyzer([]string{"zero_width_spaces"}, "unicode", []string{"to_lower", "normalize_ar", "normalize_fa", "stop_token_fa"})
Config.Analysis.Analyzers["fa"] = persianAnalyzer
// register ansi highlighter
Config.Highlight.Highlighters["ansi"] = search.NewSimpleHighlighter()