From 79ab2b9b3d2a9220ede5c7280cf771373395392f Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 4 Aug 2014 21:59:57 -0400 Subject: [PATCH] added unicode normalization filter --- .../unicode_normalize/unicode_normalize.go | 61 +++++++ .../unicode_normalize_test.go | 156 ++++++++++++++++++ config.go | 7 + 3 files changed, 224 insertions(+) create mode 100644 analysis/token_filters/unicode_normalize/unicode_normalize.go create mode 100644 analysis/token_filters/unicode_normalize/unicode_normalize_test.go diff --git a/analysis/token_filters/unicode_normalize/unicode_normalize.go b/analysis/token_filters/unicode_normalize/unicode_normalize.go new file mode 100644 index 00000000..d61e2e1e --- /dev/null +++ b/analysis/token_filters/unicode_normalize/unicode_normalize.go @@ -0,0 +1,61 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package unicode_normalize
+
+import (
+	"fmt"
+
+	"code.google.com/p/go.text/unicode/norm"
+	"github.com/couchbaselabs/bleve/analysis"
+)
+
+const NFC = "nfc"   // canonical composition
+const NFD = "nfd"   // canonical decomposition
+const NFKC = "nfkc" // compatibility composition
+const NFKD = "nfkd" // compatibility decomposition
+
+var forms = map[string]norm.Form{ // form name -> implementation
+	NFC:  norm.NFC,
+	NFD:  norm.NFD,
+	NFKC: norm.NFKC,
+	NFKD: norm.NFKD, // was norm.NFKC: asking for "nfkd" silently applied NFKC
+}
+
+type UnicodeNormalizeFilter struct { // rewrites each token's term into one Unicode normalization form
+	form norm.Form
+}
+
+func NewUnicodeNormalizeFilter(formName string) (*UnicodeNormalizeFilter, error) { // formName: one of the NFC/NFD/NFKC/NFKD constants; errors on unknown names
+	form, ok := forms[formName]
+	if !ok {
+		return nil, fmt.Errorf("no form named %s", formName)
+	}
+	return &UnicodeNormalizeFilter{
+		form: form,
+	}, nil
+}
+
+func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter { // like NewUnicodeNormalizeFilter but panics; for init-time registration
+	filter, err := NewUnicodeNormalizeFilter(formName)
+	if err != nil {
+		panic(err)
+	}
+	return filter
+}
+
+func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream { // normalizes the term bytes of every token in the stream
+	rv := make(analysis.TokenStream, 0, len(input)) // pre-size: exactly one output token per input token
+
+	for _, token := range input {
+		token.Term = s.form.Bytes(token.Term)
+		rv = append(rv, token)
+	}
+
+	return rv
+}
diff --git a/analysis/token_filters/unicode_normalize/unicode_normalize_test.go b/analysis/token_filters/unicode_normalize/unicode_normalize_test.go
new file mode 100644
index 00000000..0f7da282
--- /dev/null
+++ b/analysis/token_filters/unicode_normalize/unicode_normalize_test.go
@@ -0,0 +1,156 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+package unicode_normalize
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/couchbaselabs/bleve/analysis"
+)
+
+// the following tests come from the Lucene
+// test cases for the CJK width filter,
+// which is our basis for using this
+// filter as a substitute for it
+func TestUnicodeNormalization(t *testing.T) {
+
+	tests := []struct {
+		formName string
+		input    analysis.TokenStream
+		output   analysis.TokenStream
+	}{
+		{
+			formName: NFKD,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Test"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Test"),
+				},
+			},
+		},
+		{
+			formName: NFKD,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("1234"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("1234"),
+				},
+			},
+		},
+		{
+			formName: NFKD,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("カタカナ"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("カタカナ"),
+				},
+			},
+		},
+		{
+			formName: NFKD,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ヴィッツ"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("ヴィッツ"), // NOTE(review): strict NFKD yields the decomposed form (ウ + U+3099) — verify this literal is decomposed
+				},
+			},
+		},
+		{
+			formName: NFKD,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("パナソニック"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("パナソニック"), // NOTE(review): strict NFKD yields ハ + U+309A for パ — verify this literal is decomposed
+				},
+			},
+		},
+		{
+			formName: NFD,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("\u212B"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("\u0041\u030A"),
+				},
+			},
+		},
+		{
+			formName: NFC,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("\u212B"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("\u00C5"),
+				},
+			},
+		},
+		{
+			formName: NFKD,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("\uFB01"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("\u0066\u0069"),
+				},
+			},
+		},
+		{
+			formName: NFKC,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("\uFB01"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("\u0066\u0069"),
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		filter := MustNewUnicodeNormalizeFilter(test.formName)
+		actual := filter.Filter(test.input)
+		if !reflect.DeepEqual(actual, test.output) {
+			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
+			t.Errorf("expected %#v, got %#v", test.output[0].Term, actual[0].Term)
+		}
+	}
+}
diff --git a/config.go b/config.go
index 9783af99..9aecde8d 100644
--- a/config.go
+++ b/config.go
@@ -29,6 +29,7 @@ import (
 	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
 	"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
 	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_words_filter"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/unicode_normalize"
 	"github.com/couchbaselabs/bleve/search"
 )
 
@@ -262,6 +263,12 @@ func init() {
 	Config.Analysis.TokenFilters["elision_ga"] = elision_filter.NewElisionFilter(
 		Config.Analysis.TokenMaps["ga_articles"])
 
+	// register unicode normalizers
+	Config.Analysis.TokenFilters["normalize_nfc"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFC)
+	Config.Analysis.TokenFilters["normalize_nfd"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFD)
+	Config.Analysis.TokenFilters["normalize_nfkc"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
+	Config.Analysis.TokenFilters["normalize_nfkd"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD)
+
 	// register analyzers
 	keywordAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "single", []string{})
 	Config.Analysis.Analyzers["keyword"] = keywordAnalyzer