diff --git a/analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go b/analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go new file mode 100644 index 00000000..fe623ece --- /dev/null +++ b/analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go @@ -0,0 +1,44 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package detect_lang_analyzer + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "detect_lang" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + keywordTokenizer, err := cache.TokenizerNamed("single") + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + detectLangFilter, err := cache.TokenFilterNamed("detect_lang") + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: keywordTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + detectLangFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(Name, AnalyzerConstructor) +} diff --git a/analysis/analyzers/keyword_analyzer/keyword_analyzer.go b/analysis/analyzers/keyword_analyzer/keyword_analyzer.go new file mode 100644 index 00000000..fee66ef9 --- /dev/null +++ 
b/analysis/analyzers/keyword_analyzer/keyword_analyzer.go @@ -0,0 +1,32 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package keyword_analyzer + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/tokenizers/single_token" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "keyword" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + keywordTokenizer, err := cache.TokenizerNamed(single_token.Name) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: keywordTokenizer, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(Name, AnalyzerConstructor) +} diff --git a/analysis/analyzers/simple_analyzer/simple_analyzer.go b/analysis/analyzers/simple_analyzer/simple_analyzer.go new file mode 100644 index 00000000..b6125202 --- /dev/null +++ b/analysis/analyzers/simple_analyzer/simple_analyzer.go @@ -0,0 +1,40 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package simple_analyzer + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/whitespace_tokenizer" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "simple" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + keywordTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: keywordTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(Name, AnalyzerConstructor) +} diff --git a/analysis/analyzers/standard_analyzer/standard_analyzer.go b/analysis/analyzers/standard_analyzer/standard_analyzer.go new file mode 100644 index 00000000..b44d6a81 --- /dev/null +++ b/analysis/analyzers/standard_analyzer/standard_analyzer.go @@ -0,0 +1,46 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package standard_analyzer + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/language/en" + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/whitespace_tokenizer" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "standard" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + keywordTokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopEnFilter, err := cache.TokenFilterNamed(en.StopName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: keywordTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopEnFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(Name, AnalyzerConstructor) +} diff --git a/analysis/char_filters/html_char_filter/html_char_filter.go b/analysis/char_filters/html_char_filter/html_char_filter.go new file mode 100644 index 00000000..4466dd2a --- /dev/null +++ b/analysis/char_filters/html_char_filter/html_char_filter.go @@ -0,0 +1,30 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package html_char_filter + +import ( + "regexp" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "html" + +var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`) + +func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) { + replaceBytes := []byte(" ") + return regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, replaceBytes), nil +} + +func init() { + registry.RegisterCharFilter(Name, CharFilterConstructor) +} diff --git a/analysis/char_filters/regexp_char_filter/regexp_char_filter.go b/analysis/char_filters/regexp_char_filter/regexp_char_filter.go index 6f84a37c..2445bdc2 100644 --- a/analysis/char_filters/regexp_char_filter/regexp_char_filter.go +++ b/analysis/char_filters/regexp_char_filter/regexp_char_filter.go @@ -10,9 +10,15 @@ package regexp_char_filter import ( "bytes" + "fmt" "regexp" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) + +const Name = "regexp" + type RegexpCharFilter struct { r *regexp.Regexp replacement []byte @@ -28,3 +34,24 @@ func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter func (s *RegexpCharFilter) Filter(input []byte) []byte { return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) }) } + +func RegexpCharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) { 
+ regexpStr, ok := config["regexp"].(string) + if !ok { + return nil, fmt.Errorf("must specify regexp") + } + r, err := regexp.Compile(regexpStr) + if err != nil { + return nil, fmt.Errorf("unable to build regexp char filter: %v", err) + } + replaceBytes := []byte(" ") + replaceStr, ok := config["replace"].(string) + if ok { + replaceBytes = []byte(replaceStr) + } + return NewRegexpCharFilter(r, replaceBytes), nil +} + +func init() { + registry.RegisterCharFilter(Name, RegexpCharFilterConstructor) +} diff --git a/analysis/char_filters/zero_width_non_joiner/zero_width_non_joiner_char_filter.go b/analysis/char_filters/zero_width_non_joiner/zero_width_non_joiner_char_filter.go new file mode 100644 index 00000000..5282b0ae --- /dev/null +++ b/analysis/char_filters/zero_width_non_joiner/zero_width_non_joiner_char_filter.go @@ -0,0 +1,30 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package zero_width_non_joiner + +import ( + "regexp" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "zero_width_spaces" + +var zeroWidthNonJoinerRegexp = regexp.MustCompile(`\x{200C}`) + +func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) { + replaceBytes := []byte(" ") + return regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, replaceBytes), nil +} + +func init() { + registry.RegisterCharFilter(Name, CharFilterConstructor) +} diff --git a/analysis/datetime_parsers/datetime_optional/datetime_optional.go b/analysis/datetime_parsers/datetime_optional/datetime_optional.go new file mode 100644 index 00000000..2e9e2616 --- /dev/null +++ b/analysis/datetime_parsers/datetime_optional/datetime_optional.go @@ -0,0 +1,39 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package datetime_optional + +import ( + "time" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/datetime_parsers/flexible_go" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "dateTimeOptional" + +const rfc3339NoTimezone = "2006-01-02T15:04:05" +const rfc3339NoTimezoneNoT = "2006-01-02 15:04:05" +const rfc3339NoTime = "2006-01-02" + +var layouts = []string{ + time.RFC3339Nano, + time.RFC3339, + rfc3339NoTimezone, + rfc3339NoTimezoneNoT, + rfc3339NoTime, +} + +func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) { + return flexible_go.NewFlexibleGoDateTimeParser(layouts), nil +} + +func init() { + registry.RegisterDateTimeParser(Name, DateTimeParserConstructor) +} diff --git a/analysis/datetime_parsers/flexible_go/flexible_go.go b/analysis/datetime_parsers/flexible_go/flexible_go.go index 601d919c..7ef28eec 100644 --- a/analysis/datetime_parsers/flexible_go/flexible_go.go +++ b/analysis/datetime_parsers/flexible_go/flexible_go.go @@ -9,11 +9,15 @@ package flexible_go import ( + "fmt" "time" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) + +const Name = "flexiblego" + type FlexibleGoDateTimeParser struct { layouts []string } @@ -33,3 +37,22 @@ func (p *FlexibleGoDateTimeParser) ParseDateTime(input string) (time.Time, error } return time.Time{}, analysis.INVALID_DATETIME } + +func FlexibleGoDateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) { + layouts, ok := config["layouts"].([]interface{}) + if !ok { + return nil, fmt.Errorf("must specify layouts") + } + layoutStrs := make([]string, 0) + for _, layout := range layouts { + layoutStr, ok := layout.(string) + if ok { + layoutStrs = append(layoutStrs, layoutStr) + } + } + return NewFlexibleGoDateTimeParser(layoutStrs), nil +} + +func init() { + registry.RegisterDateTimeParser(Name, 
FlexibleGoDateTimeParserConstructor) +} diff --git a/analysis/freq_test.go b/analysis/freq_test.go index d9f3f805..6d67fddd 100644 --- a/analysis/freq_test.go +++ b/analysis/freq_test.go @@ -162,6 +162,5 @@ func TestTokenFrequenciesMergeAllLeftEmpty(t *testing.T) { result := tf1.MergeAll("tf2", tf2) if !reflect.DeepEqual(result, expectedResult) { t.Errorf("expected %#v, got %#v", expectedResult, result) - //t.Logf("%#v", tf1[0]) } } diff --git a/analysis/token_filters/arabic_normalize/arabic_normalize.go b/analysis/language/ar/arabic_normalize.go similarity index 85% rename from analysis/token_filters/arabic_normalize/arabic_normalize.go rename to analysis/language/ar/arabic_normalize.go index 93dec322..428bdbb0 100644 --- a/analysis/token_filters/arabic_normalize/arabic_normalize.go +++ b/analysis/language/ar/arabic_normalize.go @@ -6,14 +6,17 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. 
-package arabic_normalize +package ar import ( "bytes" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const NormalizeName = "normalize_ar" + const ( ALEF = '\u0627' ALEF_MADDA = '\u0622' @@ -70,3 +73,11 @@ func normalize(input []byte) []byte { } return analysis.BuildTermFromRunes(runes) } + +func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewArabicNormalizeFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor) +} diff --git a/analysis/token_filters/arabic_normalize/arabic_normalize_test.go b/analysis/language/ar/arabic_normalize_test.go similarity index 99% rename from analysis/token_filters/arabic_normalize/arabic_normalize_test.go rename to analysis/language/ar/arabic_normalize_test.go index 42c7715e..4e456113 100644 --- a/analysis/token_filters/arabic_normalize/arabic_normalize_test.go +++ b/analysis/language/ar/arabic_normalize_test.go @@ -6,7 +6,7 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package arabic_normalize +package ar import ( "reflect" diff --git a/analysis/token_filters/stop_words_filter/stop_words_filter.go b/analysis/language/ar/stop_filter_ar.go similarity index 57% rename from analysis/token_filters/stop_words_filter/stop_words_filter.go rename to analysis/language/ar/stop_filter_ar.go index 9d7b77b4..e5dc2c6a 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_filter.go +++ b/analysis/language/ar/stop_filter_ar.go @@ -6,32 +6,22 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. 
-package stop_words_filter +package ar import ( "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" ) -type StopWordsFilter struct { - stopWords analysis.WordMap -} - -func NewStopWordsFilter(stopWords analysis.WordMap) *StopWordsFilter { - return &StopWordsFilter{ - stopWords: stopWords, +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil } -func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream { - rv := make(analysis.TokenStream, 0) - - for _, token := range input { - word := string(token.Term) - _, isStopWord := f.stopWords[word] - if !isStopWord { - rv = append(rv, token) - } - } - - return rv +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) } diff --git a/analysis/token_filters/stop_words_filter/stop_words_ar.go b/analysis/language/ar/stop_words_ar.go similarity index 77% rename from analysis/token_filters/stop_words_filter/stop_words_ar.go rename to analysis/language/ar/stop_words_ar.go index 5e7eadb0..be6feee2 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_ar.go +++ b/analysis/language/ar/stop_words_ar.go @@ -1,4 +1,11 @@ -package stop_words_filter +package ar + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_ar" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis @@ -130,3 +137,13 @@ var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is di لدى جميع `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := 
rv.LoadBytes(ArabicStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/bg/stop_filter_bg.go b/analysis/language/bg/stop_filter_bg.go new file mode 100644 index 00000000..008f4b91 --- /dev/null +++ b/analysis/language/bg/stop_filter_bg.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package bg + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_bg.go b/analysis/language/bg/stop_words_bg.go similarity index 83% rename from analysis/token_filters/stop_words_filter/stop_words_bg.go rename to analysis/language/bg/stop_words_bg.go index af2ff8be..3fbe2cc9 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_bg.go +++ b/analysis/language/bg/stop_words_bg.go @@ -1,4 +1,11 @@ -package stop_words_filter +package bg + +import ( + "github.com/couchbaselabs/bleve/analysis" + 
"github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_bg" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -198,3 +205,13 @@ var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is щом я `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(BulgarianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/ca/articles_ca.go b/analysis/language/ca/articles_ca.go new file mode 100644 index 00000000..246bec38 --- /dev/null +++ b/analysis/language/ca/articles_ca.go @@ -0,0 +1,30 @@ +package ca + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const ArticlesName = "articles_ca" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis + +var CatalanArticles = []byte(` +d +l +m +n +s +t +`) + +func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(CatalanArticles) + return rv, err +} + +func init() { + registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor) +} diff --git a/analysis/language/ca/elision_ca.go b/analysis/language/ca/elision_ca.go new file mode 100644 index 00000000..2f6f6826 --- /dev/null +++ b/analysis/language/ca/elision_ca.go @@ -0,0 +1,31 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package ca + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const ElisionName = "elision_ca" + +func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + articlesTokenMap, err := cache.TokenMapNamed(ArticlesName) + if err != nil { + return nil, fmt.Errorf("error building elision filter: %v", err) + } + return elision_filter.NewElisionFilter(articlesTokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor) +} diff --git a/analysis/language/ca/elision_ca_test.go b/analysis/language/ca/elision_ca_test.go new file mode 100644 index 00000000..418f7eeb --- /dev/null +++ b/analysis/language/ca/elision_ca_test.go @@ -0,0 +1,55 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package ca + +import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +func TestFrenchElision(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("l'Institut"), + }, + &analysis.Token{ + Term: []byte("d'Estudis"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("Institut"), + }, + &analysis.Token{ + Term: []byte("Estudis"), + }, + }, + }, + } + + cache := registry.NewCache() + elisionFilter, err := cache.TokenFilterNamed(ElisionName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := elisionFilter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/language/ca/stop_filter_ca.go b/analysis/language/ca/stop_filter_ca.go new file mode 100644 index 00000000..f4625e1f --- /dev/null +++ b/analysis/language/ca/stop_filter_ca.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package ca + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_ca.go b/analysis/language/ca/stop_words_ca.go similarity index 78% rename from analysis/token_filters/stop_words_filter/stop_words_ca.go rename to analysis/language/ca/stop_words_ca.go index f9caedbb..3669efe0 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_ca.go +++ b/analysis/language/ca/stop_words_ca.go @@ -1,4 +1,11 @@ -package stop_words_filter +package ca + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_ca" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -225,3 +232,13 @@ vostra vostre vostres `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(CatalanStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/ckb/analyzer_ckb.go b/analysis/language/ckb/analyzer_ckb.go new file mode 100644 index 00000000..4270c8ec --- /dev/null +++ b/analysis/language/ckb/analyzer_ckb.go @@ -0,0 +1,55 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package ckb + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" + "github.com/couchbaselabs/bleve/registry" +) + +const AnalyzerName = "ckb" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + normCkbFilter, err := cache.TokenFilterNamed(NormalizeName) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopCkbFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + normCkbFilter, + toLowerFilter, + stopCkbFilter, + stemmerCkbFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/token_filters/sorani_normalize/sorani_normalize.go b/analysis/language/ckb/sorani_normalize.go similarity index 87% rename from analysis/token_filters/sorani_normalize/sorani_normalize.go rename to analysis/language/ckb/sorani_normalize.go index e7d55481..948f7b50 100644 --- a/analysis/token_filters/sorani_normalize/sorani_normalize.go +++ 
b/analysis/language/ckb/sorani_normalize.go @@ -6,15 +6,18 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package sorani_normalize +package ckb import ( "bytes" "unicode" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const NormalizeName = "normalize_ckb" + const ( YEH = '\u064A' DOTLESS_YEH = '\u0649' @@ -103,3 +106,11 @@ func normalize(input []byte) []byte { } return analysis.BuildTermFromRunes(runes) } + +func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewSoraniNormalizeFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor) +} diff --git a/analysis/token_filters/sorani_normalize/sorani_normalize_test.go b/analysis/language/ckb/sorani_normalize_test.go similarity index 99% rename from analysis/token_filters/sorani_normalize/sorani_normalize_test.go rename to analysis/language/ckb/sorani_normalize_test.go index 3d167aca..17ffb6b6 100644 --- a/analysis/token_filters/sorani_normalize/sorani_normalize_test.go +++ b/analysis/language/ckb/sorani_normalize_test.go @@ -6,7 +6,7 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. 
-package sorani_normalize +package ckb import ( "reflect" diff --git a/analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter.go b/analysis/language/ckb/sorani_stemmer_filter.go similarity index 93% rename from analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter.go rename to analysis/language/ckb/sorani_stemmer_filter.go index 7e7faf98..0aea0bb1 100644 --- a/analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter.go +++ b/analysis/language/ckb/sorani_stemmer_filter.go @@ -6,15 +6,18 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package sorani_stemmer_filter +package ckb import ( "bytes" "unicode/utf8" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const StemmerName = "stemmer_ckb" + type SoraniStemmerFilter struct { } @@ -133,3 +136,11 @@ func buildTermFromRunes(runes []rune) []byte { } return rv } + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewSoraniStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter_test.go b/analysis/language/ckb/sorani_stemmer_filter_test.go similarity index 97% rename from analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter_test.go rename to analysis/language/ckb/sorani_stemmer_filter_test.go index 1225cc14..aecea401 100644 --- a/analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter_test.go +++ b/analysis/language/ckb/sorani_stemmer_filter_test.go @@ -6,14 +6,13 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. 
See the License for the specific language governing permissions // and limitations under the License. -package sorani_stemmer_filter +package ckb import ( "reflect" "testing" "github.com/couchbaselabs/bleve/analysis" - "github.com/couchbaselabs/bleve/analysis/token_filters/sorani_normalize" "github.com/couchbaselabs/bleve/analysis/tokenizers/single_token" ) @@ -24,7 +23,7 @@ func TestSoraniStemmerFilter(t *testing.T) { analyzer := analysis.Analyzer{ Tokenizer: single_token.NewSingleTokenTokenizer(), TokenFilters: []analysis.TokenFilter{ - sorani_normalize.NewSoraniNormalizeFilter(), + NewSoraniNormalizeFilter(), NewSoraniStemmerFilter(), }, } diff --git a/analysis/language/ckb/stop_filter_ckb.go b/analysis/language/ckb/stop_filter_ckb.go new file mode 100644 index 00000000..97160bb3 --- /dev/null +++ b/analysis/language/ckb/stop_filter_ckb.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package ckb + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_ckb.go b/analysis/language/ckb/stop_words_ckb.go similarity index 82% rename from analysis/token_filters/stop_words_filter/stop_words_ckb.go rename to analysis/language/ckb/stop_words_ckb.go index f780a9e1..5618b7c1 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_ckb.go +++ b/analysis/language/ckb/stop_words_ckb.go @@ -1,4 +1,11 @@ -package stop_words_filter +package ckb + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_ckb" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -141,3 +148,13 @@ var SoraniStopWords = []byte(`# set of kurdish stopwords # like وەک `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(SoraniStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/cs/stop_filter_cs.go b/analysis/language/cs/stop_filter_cs.go new file mode 100644 index 00000000..423d63d2 --- /dev/null +++ b/analysis/language/cs/stop_filter_cs.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package cs + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_cs.go b/analysis/language/cs/stop_words_cs.go similarity index 72% rename from analysis/token_filters/stop_words_filter/stop_words_cs.go rename to analysis/language/cs/stop_words_cs.go index b091a475..d2369cae 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_cs.go +++ b/analysis/language/cs/stop_words_cs.go @@ -1,4 +1,11 @@ -package stop_words_filter +package cs + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_cs" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -177,3 +184,13 @@ jež jakož načež `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := 
rv.LoadBytes(CzechStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/da/analyzer_da.go b/analysis/language/da/analyzer_da.go new file mode 100644 index 00000000..1b343bd9 --- /dev/null +++ b/analysis/language/da/analyzer_da.go @@ -0,0 +1,50 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package da + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" + "github.com/couchbaselabs/bleve/registry" +) + +const AnalyzerName = "da" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopDaFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerDaFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopDaFilter, + stemmerDaFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, 
AnalyzerConstructor) +} diff --git a/analysis/tokenizers/rune_tokenizer/whitespace_classifier.go b/analysis/language/da/stemmer_da.go similarity index 57% rename from analysis/tokenizers/rune_tokenizer/whitespace_classifier.go rename to analysis/language/da/stemmer_da.go index 63e7d9ee..bc60d3bd 100644 --- a/analysis/tokenizers/rune_tokenizer/whitespace_classifier.go +++ b/analysis/language/da/stemmer_da.go @@ -6,18 +6,20 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package rune_tokenizer +package da import ( - "unicode" + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" ) -type WhitespaceClassifier struct{} +const StemmerName = "stemmer_da" -func NewWhitespaceClassifier() *WhitespaceClassifier { - return &WhitespaceClassifier{} +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("da") } -func (c *WhitespaceClassifier) InToken(r rune) bool { - return !unicode.IsSpace(r) +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) } diff --git a/analysis/language/da/stop_filter_da.go b/analysis/language/da/stop_filter_da.go new file mode 100644 index 00000000..2ecdabc7 --- /dev/null +++ b/analysis/language/da/stop_filter_da.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package da + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_da.go b/analysis/language/da/stop_words_da.go similarity index 90% rename from analysis/token_filters/stop_words_filter/stop_words_da.go rename to analysis/language/da/stop_words_da.go index 3e591e37..b163e6b0 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_da.go +++ b/analysis/language/da/stop_words_da.go @@ -1,4 +1,11 @@ -package stop_words_filter +package da + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_da" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -115,3 +122,13 @@ thi | for (conj) jer | you sådan | such, like this/like that `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(DanishStopWords) + return rv, err +} + +func init() { + 
registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/de/analyzer_de.go b/analysis/language/de/analyzer_de.go new file mode 100644 index 00000000..97b08680 --- /dev/null +++ b/analysis/language/de/analyzer_de.go @@ -0,0 +1,55 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package de + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" + "github.com/couchbaselabs/bleve/registry" +) + +const AnalyzerName = "de" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopDeFilter, err := cache.TokenFilterNamed(NormalizeName) + if err != nil { + return nil, err + } + normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName) + if err != nil { + return nil, err + } + stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopDeFilter, + normalizeDeFilter, + stemmerDeFilter, + }, + } + return &rv, nil +} + 
+func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/token_filters/german_normalize/german_normalize.go b/analysis/language/de/german_normalize.go similarity index 85% rename from analysis/token_filters/german_normalize/german_normalize.go rename to analysis/language/de/german_normalize.go index c8ca3e55..20d82e77 100644 --- a/analysis/token_filters/german_normalize/german_normalize.go +++ b/analysis/language/de/german_normalize.go @@ -6,14 +6,17 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package german_normalize +package de import ( "bytes" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const NormalizeName = "normalize_de" + const ( N = 0 /* ordinary state */ V = 1 /* stops 'u' from entering umlaut state */ @@ -84,3 +87,11 @@ func normalize(input []byte) []byte { } return analysis.BuildTermFromRunes(runes) } + +func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewGermanNormalizeFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor) +} diff --git a/analysis/token_filters/german_normalize/german_normalize_test.go b/analysis/language/de/german_normalize_test.go similarity index 98% rename from analysis/token_filters/german_normalize/german_normalize_test.go rename to analysis/language/de/german_normalize_test.go index d34c6ba0..424d0ea9 100644 --- a/analysis/token_filters/german_normalize/german_normalize_test.go +++ b/analysis/language/de/german_normalize_test.go @@ -6,7 +6,7 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. 
See the License for the specific language governing permissions // and limitations under the License. -package german_normalize +package de import ( "reflect" diff --git a/analysis/language/de/stemmer_de.go b/analysis/language/de/stemmer_de.go new file mode 100644 index 00000000..b5651896 --- /dev/null +++ b/analysis/language/de/stemmer_de.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package de + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_de" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("de") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/de/stop_filter_de.go b/analysis/language/de/stop_filter_de.go new file mode 100644 index 00000000..93bc4678 --- /dev/null +++ b/analysis/language/de/stop_filter_de.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package de + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_de.go b/analysis/language/de/stop_words_de.go similarity index 92% rename from analysis/token_filters/stop_words_filter/stop_words_de.go rename to analysis/language/de/stop_words_de.go index 6d9f3a3f..592e4a1b 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_de.go +++ b/analysis/language/de/stop_words_de.go @@ -1,4 +1,11 @@ -package stop_words_filter +package de + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_de" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -299,3 +306,13 @@ zwar | indeed zwischen | between `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(GermanStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} 
diff --git a/analysis/language/el/stop_filter_el.go b/analysis/language/el/stop_filter_el.go new file mode 100644 index 00000000..8afc73cb --- /dev/null +++ b/analysis/language/el/stop_filter_el.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package el + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_el.go b/analysis/language/el/stop_words_el.go similarity index 71% rename from analysis/token_filters/stop_words_filter/stop_words_el.go rename to analysis/language/el/stop_words_el.go index aa7ff855..f9011e43 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_el.go +++ b/analysis/language/el/stop_words_el.go @@ -1,4 +1,11 @@ -package stop_words_filter +package el + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_el" // this content was obtained from: // 
lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -83,3 +90,13 @@ var GreekStopWords = []byte(`# Lucene Greek Stopwords list οσο οτι `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(GreekStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/en/analyzer_en.go b/analysis/language/en/analyzer_en.go new file mode 100644 index 00000000..e3344557 --- /dev/null +++ b/analysis/language/en/analyzer_en.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package en + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "en" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopEnFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerEnFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopEnFilter, + stemmerEnFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/en/stemmer_en.go b/analysis/language/en/stemmer_en.go new file mode 100644 index 00000000..f8f30ed0 --- /dev/null +++ b/analysis/language/en/stemmer_en.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package en + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_en" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("en") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/en/stemmer_en_test.go b/analysis/language/en/stemmer_en_test.go new file mode 100644 index 00000000..abd1b8bc --- /dev/null +++ b/analysis/language/en/stemmer_en_test.go @@ -0,0 +1,69 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package en + +import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +func TestEnglishStemmer(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("walking"), + }, + &analysis.Token{ + Term: []byte("talked"), + }, + &analysis.Token{ + Term: []byte("business"), + }, + &analysis.Token{ + Term: []byte("protected"), + KeyWord: true, + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("walk"), + }, + &analysis.Token{ + Term: []byte("talk"), + }, + &analysis.Token{ + Term: []byte("busi"), + }, + &analysis.Token{ + Term: []byte("protected"), + KeyWord: true, + }, + }, + }, + } + + cache := registry.NewCache() + stemmerFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := stemmerFilter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/language/en/stop_filter_en.go b/analysis/language/en/stop_filter_en.go new file mode 100644 index 00000000..790a8bfa --- /dev/null +++ b/analysis/language/en/stop_filter_en.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package en + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_en.go b/analysis/language/en/stop_words_en.go similarity index 92% rename from analysis/token_filters/stop_words_filter/stop_words_en.go rename to analysis/language/en/stop_words_en.go index 9cab14ac..9b21a2d8 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_en.go +++ b/analysis/language/en/stop_words_en.go @@ -1,4 +1,11 @@ -package stop_words_filter +package en + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_en" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -324,3 +331,13 @@ very | high | long `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(EnglishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/es/analyzer_es.go b/analysis/language/es/analyzer_es.go new file mode 100644 index 00000000..b155225a --- /dev/null +++ b/analysis/language/es/analyzer_es.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package es + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "es" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopEsFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerEsFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopEsFilter, + stemmerEsFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/es/stemmer_es.go b/analysis/language/es/stemmer_es.go new file mode 100644 index 00000000..c8b5be70 --- /dev/null +++ b/analysis/language/es/stemmer_es.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package es + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_es" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("es") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/es/stop_filter_es.go b/analysis/language/es/stop_filter_es.go new file mode 100644 index 00000000..a129071c --- /dev/null +++ b/analysis/language/es/stop_filter_es.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package es + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_es.go b/analysis/language/es/stop_words_es.go similarity index 93% rename from analysis/token_filters/stop_words_filter/stop_words_es.go rename to analysis/language/es/stop_words_es.go index af428f1c..10457349 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_es.go +++ b/analysis/language/es/stop_words_es.go @@ -1,4 +1,11 @@ -package stop_words_filter +package es + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_es" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -361,3 +368,13 @@ tenidas tened `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(SpanishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/eu/stop_filter_eu.go b/analysis/language/eu/stop_filter_eu.go new file mode 100644 index 00000000..f69dfb63 --- /dev/null +++ b/analysis/language/eu/stop_filter_eu.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package eu + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_eu.go b/analysis/language/eu/stop_words_eu.go similarity index 66% rename from analysis/token_filters/stop_words_filter/stop_words_eu.go rename to analysis/language/eu/stop_words_eu.go index 98de8502..0cdcb632 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_eu.go +++ b/analysis/language/eu/stop_words_eu.go @@ -1,4 +1,11 @@ -package stop_words_filter +package eu + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_eu" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -104,3 +111,13 @@ zuek zuen zuten `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(BasqueStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git 
a/analysis/language/fa/analyzer_fa.go b/analysis/language/fa/analyzer_fa.go new file mode 100644 index 00000000..96d9f695 --- /dev/null +++ b/analysis/language/fa/analyzer_fa.go @@ -0,0 +1,65 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package fa + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/char_filters/zero_width_non_joiner" + "github.com/couchbaselabs/bleve/analysis/language/ar" + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "fa" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + zFilter, err := cache.CharFilterNamed(zero_width_non_joiner.Name) + if err != nil { + return nil, err + } + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + normArFilter, err := cache.TokenFilterNamed(ar.NormalizeName) + if err != nil { + return nil, err + } + normFaFilter, err := cache.TokenFilterNamed(NormalizeName) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopFaFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + 
CharFilters: []analysis.CharFilter{ + zFilter, + }, + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + normArFilter, + normFaFilter, + stopFaFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/token_filters/persian_normalize/persian_normalize.go b/analysis/language/fa/persian_normalize.go similarity index 82% rename from analysis/token_filters/persian_normalize/persian_normalize.go rename to analysis/language/fa/persian_normalize.go index 12401663..9ea3ac7d 100644 --- a/analysis/token_filters/persian_normalize/persian_normalize.go +++ b/analysis/language/fa/persian_normalize.go @@ -6,14 +6,17 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package persian_normalize +package fa import ( "bytes" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const NormalizeName = "normalize_fa" + const ( YEH = '\u064A' FARSI_YEH = '\u06CC' @@ -62,3 +65,11 @@ func normalize(input []byte) []byte { } return analysis.BuildTermFromRunes(runes) } + +func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewPersianNormalizeFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor) +} diff --git a/analysis/token_filters/persian_normalize/persian_normalize_test.go b/analysis/language/fa/persian_normalize_test.go similarity index 99% rename from analysis/token_filters/persian_normalize/persian_normalize_test.go rename to analysis/language/fa/persian_normalize_test.go index d17e65f4..b58d8795 100644 --- a/analysis/token_filters/persian_normalize/persian_normalize_test.go +++ b/analysis/language/fa/persian_normalize_test.go @@ 
-6,7 +6,7 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package persian_normalize +package fa import ( "reflect" diff --git a/analysis/language/fa/stop_filter_fa.go b/analysis/language/fa/stop_filter_fa.go new file mode 100644 index 00000000..a965a389 --- /dev/null +++ b/analysis/language/fa/stop_filter_fa.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package fa + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_fa.go b/analysis/language/fa/stop_words_fa.go similarity index 88% rename from analysis/token_filters/stop_words_filter/stop_words_fa.go rename to analysis/language/fa/stop_words_fa.go index 08342fd6..38216a7d 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_fa.go +++ b/analysis/language/fa/stop_words_fa.go @@ -1,4 +1,11 @@ -package stop_words_filter +package fa + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_fa" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -318,3 +325,13 @@ var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is d عنوان بود `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(PersianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/fi/analyzer_fi.go b/analysis/language/fi/analyzer_fi.go new file mode 100644 index 00000000..87333db9 --- /dev/null +++ b/analysis/language/fi/analyzer_fi.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package fi + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "fi" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopFiFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerFiFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopFiFilter, + stemmerFiFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/fi/stemmer_fi.go b/analysis/language/fi/stemmer_fi.go new file mode 100644 index 00000000..bc04128d --- /dev/null +++ b/analysis/language/fi/stemmer_fi.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package fi + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_fi" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("fi") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/fi/stop_filter_fi.go b/analysis/language/fi/stop_filter_fi.go new file mode 100644 index 00000000..5e660d79 --- /dev/null +++ b/analysis/language/fi/stop_filter_fi.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package fi + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_fi.go b/analysis/language/fi/stop_words_fi.go similarity index 88% rename from analysis/token_filters/stop_words_filter/stop_words_fi.go rename to analysis/language/fi/stop_words_fi.go index 210314d7..fd028876 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_fi.go +++ b/analysis/language/fi/stop_words_fi.go @@ -1,4 +1,11 @@ -package stop_words_filter +package fi + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_fi" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -102,3 +109,13 @@ nyt | now itse | self `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(FinnishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/fr/analyzer_fr.go b/analysis/language/fr/analyzer_fr.go new file mode 100644 index 00000000..1924e096 --- /dev/null +++ b/analysis/language/fr/analyzer_fr.go @@ -0,0 +1,56 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package fr + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "fr" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + elisionFilter, err := cache.TokenFilterNamed(ElisionName) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopFrFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerFrFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + elisionFilter, + toLowerFilter, + stopFrFilter, + stemmerFrFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/fr/articles_fr.go b/analysis/language/fr/articles_fr.go new file mode 100644 index 00000000..255254a8 --- /dev/null +++ b/analysis/language/fr/articles_fr.go @@ -0,0 +1,37 @@ +package fr + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const ArticlesName = "articles_fr" + +// this 
content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis + +var FrenchArticles = []byte(` +l +m +t +qu +n +s +j +d +c +jusqu +quoiqu +lorsqu +puisqu +`) + +func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(FrenchArticles) + return rv, err +} + +func init() { + registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor) +} diff --git a/analysis/language/fr/elision_fr.go b/analysis/language/fr/elision_fr.go new file mode 100644 index 00000000..c1d1a8e5 --- /dev/null +++ b/analysis/language/fr/elision_fr.go @@ -0,0 +1,31 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package fr + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const ElisionName = "elision_fr" + +func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + articlesTokenMap, err := cache.TokenMapNamed(ArticlesName) + if err != nil { + return nil, fmt.Errorf("error building elision filter: %v", err) + } + return elision_filter.NewElisionFilter(articlesTokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor) +} diff --git a/analysis/tokenizers/rune_tokenizer/rune_tokenizer_test.go b/analysis/language/fr/elision_fr_test.go similarity index 57% rename from analysis/tokenizers/rune_tokenizer/rune_tokenizer_test.go rename to analysis/language/fr/elision_fr_test.go index 6a2a2974..f2bc6186 100644 --- a/analysis/tokenizers/rune_tokenizer/rune_tokenizer_test.go +++ b/analysis/language/fr/elision_fr_test.go @@ -6,50 +6,44 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. 
-package rune_tokenizer +package fr import ( "reflect" "testing" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) -func TestWhitespaceTokenizer(t *testing.T) { - - classifier := NewWhitespaceClassifier() - +func TestFrenchElision(t *testing.T) { tests := []struct { - input []byte + input analysis.TokenStream output analysis.TokenStream }{ { - []byte("Hello World"), - analysis.TokenStream{ - { - Start: 0, - End: 5, - Term: []byte("Hello"), - Position: 1, - Type: analysis.AlphaNumeric, + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("l'avion"), }, - { - Start: 6, - End: 11, - Term: []byte("World"), - Position: 2, - Type: analysis.AlphaNumeric, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("avion"), }, }, }, } + cache := registry.NewCache() + elisionFilter, err := cache.TokenFilterNamed(ElisionName) + if err != nil { + t.Fatal(err) + } for _, test := range tests { - tokenizer := NewRuneTokenizer(classifier) - actual := tokenizer.Tokenize(test.input) - + actual := elisionFilter.Filter(test.input) if !reflect.DeepEqual(actual, test.output) { - t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) } } } diff --git a/analysis/language/fr/stemmer_fr.go b/analysis/language/fr/stemmer_fr.go new file mode 100644 index 00000000..ae391598 --- /dev/null +++ b/analysis/language/fr/stemmer_fr.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. 
See the License for the specific language governing permissions +// and limitations under the License. +package fr + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_fr" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("fr") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/fr/stop_filter_fr.go b/analysis/language/fr/stop_filter_fr.go new file mode 100644 index 00000000..bb76fb9b --- /dev/null +++ b/analysis/language/fr/stop_filter_fr.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package fr + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_fr.go b/analysis/language/fr/stop_words_fr.go similarity index 89% rename from analysis/token_filters/stop_words_filter/stop_words_fr.go rename to analysis/language/fr/stop_words_fr.go index 3e286d14..410b6f22 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_fr.go +++ b/analysis/language/fr/stop_words_fr.go @@ -1,4 +1,11 @@ -package stop_words_filter +package fr + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_fr" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -191,3 +198,13 @@ sans | without soi | oneself `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(FrenchStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/ga/articles_ga.go b/analysis/language/ga/articles_ga.go new file mode 100644 index 00000000..32da8a66 --- /dev/null +++ b/analysis/language/ga/articles_ga.go @@ -0,0 +1,27 @@ +package ga + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const ArticlesName = "articles_ga" + +// this content was obtained from: +// 
lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis + +var IrishArticles = []byte(` +d +m +b +`) + +func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(IrishArticles) + return rv, err +} + +func init() { + registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor) +} diff --git a/analysis/language/ga/elision_ga.go b/analysis/language/ga/elision_ga.go new file mode 100644 index 00000000..8c2b2529 --- /dev/null +++ b/analysis/language/ga/elision_ga.go @@ -0,0 +1,31 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package ga + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const ElisionName = "elision_ga" + +func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + articlesTokenMap, err := cache.TokenMapNamed(ArticlesName) + if err != nil { + return nil, fmt.Errorf("error building elision filter: %v", err) + } + return elision_filter.NewElisionFilter(articlesTokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor) +} diff --git a/analysis/language/ga/elision_ga_test.go b/analysis/language/ga/elision_ga_test.go new file mode 100644 index 00000000..e4fd6fcc --- /dev/null +++ b/analysis/language/ga/elision_ga_test.go @@ -0,0 +1,49 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package ga + import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +func TestIrishElision(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("b'fhearr"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("fhearr"), + }, + }, + }, + } + + cache := registry.NewCache() + elisionFilter, err := cache.TokenFilterNamed(ElisionName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := elisionFilter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/language/ga/stop_filter_ga.go b/analysis/language/ga/stop_filter_ga.go new file mode 100644 index 00000000..28b04210 --- /dev/null +++ b/analysis/language/ga/stop_filter_ga.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package ga + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_ga.go b/analysis/language/ga/stop_words_ga.go similarity index 65% rename from analysis/token_filters/stop_words_filter/stop_words_ga.go rename to analysis/language/ga/stop_words_ga.go index 483707d9..a3db4042 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_ga.go +++ b/analysis/language/ga/stop_words_ga.go @@ -1,4 +1,11 @@ -package stop_words_filter +package ga + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_ga" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -115,3 +122,13 @@ um óna ónár `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(IrishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/gl/stop_filter_gl.go b/analysis/language/gl/stop_filter_gl.go new file mode 100644 index 00000000..c9af35cd --- /dev/null +++ b/analysis/language/gl/stop_filter_gl.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package gl + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_gl.go b/analysis/language/gl/stop_words_gl.go similarity index 71% rename from analysis/token_filters/stop_words_filter/stop_words_gl.go rename to analysis/language/gl/stop_words_gl.go index 30f25d03..261e8232 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_gl.go +++ b/analysis/language/gl/stop_words_gl.go @@ -1,4 +1,11 @@ -package stop_words_filter +package gl + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_gl" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -166,3 +173,13 @@ voso vosos vós `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(GalicianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git 
a/analysis/token_filters/hindi_normalize/hindi_normalize.go b/analysis/language/hi/hindi_normalize.go similarity index 89% rename from analysis/token_filters/hindi_normalize/hindi_normalize.go rename to analysis/language/hi/hindi_normalize.go index 61ad2f81..5ca012bf 100644 --- a/analysis/token_filters/hindi_normalize/hindi_normalize.go +++ b/analysis/language/hi/hindi_normalize.go @@ -6,14 +6,17 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package hindi_normalize +package hi import ( "bytes" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const NormalizeName = "normalize_hi" + type HindiNormalizeFilter struct { } @@ -123,3 +126,11 @@ func normalize(input []byte) []byte { } return analysis.BuildTermFromRunes(runes) } + +func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewHindiNormalizeFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor) +} diff --git a/analysis/token_filters/hindi_normalize/hindi_normalize_test.go b/analysis/language/hi/hindi_normalize_test.go similarity index 99% rename from analysis/token_filters/hindi_normalize/hindi_normalize_test.go rename to analysis/language/hi/hindi_normalize_test.go index 6db06bcb..c64ab17a 100644 --- a/analysis/token_filters/hindi_normalize/hindi_normalize_test.go +++ b/analysis/language/hi/hindi_normalize_test.go @@ -6,7 +6,7 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. 
-package hindi_normalize +package hi import ( "reflect" diff --git a/analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter.go b/analysis/language/hi/hindi_stemmer_filter.go similarity index 93% rename from analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter.go rename to analysis/language/hi/hindi_stemmer_filter.go index ad5a26e7..e145fcb2 100644 --- a/analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter.go +++ b/analysis/language/hi/hindi_stemmer_filter.go @@ -6,15 +6,18 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package hindi_stemmer_filter +package hi import ( "bytes" "unicode/utf8" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const StemmerName = "stemmer_hi" + type HindiStemmerFilter struct { } @@ -134,3 +137,11 @@ func stem(input []byte) []byte { return input } + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewHindiStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter_test.go b/analysis/language/hi/hindi_stemmer_filter_test.go similarity index 99% rename from analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter_test.go rename to analysis/language/hi/hindi_stemmer_filter_test.go index 05e9a787..d2b1de35 100644 --- a/analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter_test.go +++ b/analysis/language/hi/hindi_stemmer_filter_test.go @@ -6,7 +6,7 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. 
See the License for the specific language governing permissions // and limitations under the License. -package hindi_stemmer_filter +package hi import ( "reflect" diff --git a/analysis/language/hi/stop_filter_hi.go b/analysis/language/hi/stop_filter_hi.go new file mode 100644 index 00000000..d3e6a2bd --- /dev/null +++ b/analysis/language/hi/stop_filter_hi.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package hi + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_hi.go b/analysis/language/hi/stop_words_hi.go similarity index 89% rename from analysis/token_filters/stop_words_filter/stop_words_hi.go rename to analysis/language/hi/stop_words_hi.go index 5e28b8f0..f1ead538 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_hi.go +++ b/analysis/language/hi/stop_words_hi.go @@ -1,4 +1,11 @@ -package stop_words_filter +package hi + +import ( + 
"github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_hi" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -240,3 +247,13 @@ var HindiStopWords = []byte(`# Also see http://www.opensource.org/licenses/bsd-l जेसा नहिं `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(HindiStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/hu/analyzer_hu.go b/analysis/language/hu/analyzer_hu.go new file mode 100644 index 00000000..f9c55152 --- /dev/null +++ b/analysis/language/hu/analyzer_hu.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package hu + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "hu" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopHuFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerHuFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopHuFilter, + stemmerHuFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/hu/stemmer_hu.go b/analysis/language/hu/stemmer_hu.go new file mode 100644 index 00000000..14dfe193 --- /dev/null +++ b/analysis/language/hu/stemmer_hu.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package hu + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_hu" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("hu") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/hu/stop_filter_hu.go b/analysis/language/hu/stop_filter_hu.go new file mode 100644 index 00000000..8cb13dff --- /dev/null +++ b/analysis/language/hu/stop_filter_hu.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package hu + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_hu.go b/analysis/language/hu/stop_words_hu.go similarity index 82% rename from analysis/token_filters/stop_words_filter/stop_words_hu.go rename to analysis/language/hu/stop_words_hu.go index 7dac646b..dc757238 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_hu.go +++ b/analysis/language/hu/stop_words_hu.go @@ -1,4 +1,11 @@ -package stop_words_filter +package hu + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_hu" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -216,3 +223,13 @@ vele viszont volna `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(HungarianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/hy/stop_filter_hy.go b/analysis/language/hy/stop_filter_hy.go new file mode 100644 index 00000000..ca461870 --- /dev/null +++ b/analysis/language/hy/stop_filter_hy.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package hy + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_hy.go b/analysis/language/hy/stop_words_hy.go similarity index 56% rename from analysis/token_filters/stop_words_filter/stop_words_hy.go rename to analysis/language/hy/stop_words_hy.go index df2d1324..07b00763 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_hy.go +++ b/analysis/language/hy/stop_words_hy.go @@ -1,4 +1,11 @@ -package stop_words_filter +package hy + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_hy" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -51,3 +58,13 @@ var ArmenianStopWords = []byte(`# example set of Armenian stopwords. 
վրա և `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(ArmenianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/id/stop_filter_id.go b/analysis/language/id/stop_filter_id.go new file mode 100644 index 00000000..e3199c24 --- /dev/null +++ b/analysis/language/id/stop_filter_id.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package id + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_id.go b/analysis/language/id/stop_words_id.go similarity index 87% rename from analysis/token_filters/stop_words_filter/stop_words_id.go rename to analysis/language/id/stop_words_id.go index c766c967..ab50d900 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_id.go +++ b/analysis/language/id/stop_words_id.go @@ -1,4 +1,11 @@ -package stop_words_filter +package id + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_id" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -364,3 +371,13 @@ yaitu yakni yang `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(IndonesianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/it/analyzer_it.go b/analysis/language/it/analyzer_it.go new file mode 100644 index 00000000..09a7794d --- /dev/null +++ b/analysis/language/it/analyzer_it.go @@ -0,0 +1,56 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package it + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "it" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + elisionFilter, err := cache.TokenFilterNamed(ElisionName) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopItFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerItFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + elisionFilter, + toLowerFilter, + stopItFilter, + stemmerItFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/it/articles_it.go b/analysis/language/it/articles_it.go new file mode 100644 index 00000000..8ff8a8ca --- /dev/null +++ b/analysis/language/it/articles_it.go @@ -0,0 +1,45 @@ +package it + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const ArticlesName = "articles_it" + +// this 
content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis + +var ItalianArticles = []byte(` +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d +`) + +func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(ItalianArticles) + return rv, err +} + +func init() { + registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor) +} diff --git a/analysis/language/it/elision_it.go b/analysis/language/it/elision_it.go new file mode 100644 index 00000000..a80ab1ca --- /dev/null +++ b/analysis/language/it/elision_it.go @@ -0,0 +1,31 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package it + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const ElisionName = "elision_it" + +func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + articlesTokenMap, err := cache.TokenMapNamed(ArticlesName) + if err != nil { + return nil, fmt.Errorf("error building elision filter: %v", err) + } + return elision_filter.NewElisionFilter(articlesTokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor) +} diff --git a/analysis/language/it/elision_it_test.go b/analysis/language/it/elision_it_test.go new file mode 100644 index 00000000..50627f7d --- /dev/null +++ b/analysis/language/it/elision_it_test.go @@ -0,0 +1,49 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package it + +import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +func TestItalianElision(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("dell'Italia"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("Italia"), + }, + }, + }, + } + + cache := registry.NewCache() + elisionFilter, err := cache.TokenFilterNamed(ElisionName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := elisionFilter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/language/it/stemmer_it.go b/analysis/language/it/stemmer_it.go new file mode 100644 index 00000000..64c26630 --- /dev/null +++ b/analysis/language/it/stemmer_it.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package it + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_it" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("it") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/it/stop_filter_it.go b/analysis/language/it/stop_filter_it.go new file mode 100644 index 00000000..1008ceee --- /dev/null +++ b/analysis/language/it/stop_filter_it.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package it + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_it.go b/analysis/language/it/stop_words_it.go similarity index 92% rename from analysis/token_filters/stop_words_filter/stop_words_it.go rename to analysis/language/it/stop_words_it.go index 3a91aba8..25c77508 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_it.go +++ b/analysis/language/it/stop_words_it.go @@ -1,4 +1,11 @@ -package stop_words_filter +package it + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_it" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -308,3 +315,13 @@ stessimo stessero stando `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(ItalianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/nl/analyzer_nl.go b/analysis/language/nl/analyzer_nl.go new file mode 100644 index 00000000..3ede75be --- /dev/null +++ b/analysis/language/nl/analyzer_nl.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package nl + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "nl" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopNlFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerNlFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopNlFilter, + stemmerNlFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/nl/stemmer_nl.go b/analysis/language/nl/stemmer_nl.go new file mode 100644 index 00000000..8dd8fab1 --- /dev/null +++ b/analysis/language/nl/stemmer_nl.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package nl + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_nl" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("nl") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/nl/stop_filter_nl.go b/analysis/language/nl/stop_filter_nl.go new file mode 100644 index 00000000..35d63113 --- /dev/null +++ b/analysis/language/nl/stop_filter_nl.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package nl + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_nl.go b/analysis/language/nl/stop_words_nl.go similarity index 92% rename from analysis/token_filters/stop_words_filter/stop_words_nl.go rename to analysis/language/nl/stop_words_nl.go index dd518c8c..b0b69bde 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_nl.go +++ b/analysis/language/nl/stop_words_nl.go @@ -1,4 +1,11 @@ -package stop_words_filter +package nl + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_nl" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -124,3 +131,13 @@ iemand | somebody geweest | been; past participle of 'be' andere | other `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(DutchStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/no/analyzer_no.go b/analysis/language/no/analyzer_no.go new file mode 100644 index 00000000..7da589fe --- /dev/null +++ b/analysis/language/no/analyzer_no.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package no + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "no" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopNoFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerNoFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopNoFilter, + stemmerNoFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/no/stemmer_no.go b/analysis/language/no/stemmer_no.go new file mode 100644 index 00000000..f639760e --- /dev/null +++ b/analysis/language/no/stemmer_no.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package no + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_no" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("no") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/no/stop_filter_no.go b/analysis/language/no/stop_filter_no.go new file mode 100644 index 00000000..8788c532 --- /dev/null +++ b/analysis/language/no/stop_filter_no.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package no + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_no.go b/analysis/language/no/stop_words_no.go similarity index 92% rename from analysis/token_filters/stop_words_filter/stop_words_no.go rename to analysis/language/no/stop_words_no.go index 10e7d4ef..e5bcd661 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_no.go +++ b/analysis/language/no/stop_words_no.go @@ -1,4 +1,11 @@ -package stop_words_filter +package no + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_no" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -199,3 +206,13 @@ varte | became * vart | became * `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(NorwegianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/porter/stemmer_porter.go b/analysis/language/porter/stemmer_porter.go new file mode 100644 index 00000000..b6a30f25 --- /dev/null +++ b/analysis/language/porter/stemmer_porter.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package porter + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_porter" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("porter") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/pt/analyzer_pt.go b/analysis/language/pt/analyzer_pt.go new file mode 100644 index 00000000..ec09bc56 --- /dev/null +++ b/analysis/language/pt/analyzer_pt.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package pt + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "pt" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopPtFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerPtFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopPtFilter, + stemmerPtFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/pt/stemmer_pt.go b/analysis/language/pt/stemmer_pt.go new file mode 100644 index 00000000..6881e9de --- /dev/null +++ b/analysis/language/pt/stemmer_pt.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package pt + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_pt" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("pt") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/pt/stop_filter_pt.go b/analysis/language/pt/stop_filter_pt.go new file mode 100644 index 00000000..eff2d5d8 --- /dev/null +++ b/analysis/language/pt/stop_filter_pt.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package pt + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_pt.go b/analysis/language/pt/stop_words_pt.go similarity index 92% rename from analysis/token_filters/stop_words_filter/stop_words_pt.go rename to analysis/language/pt/stop_words_pt.go index 1e191267..2249c49c 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_pt.go +++ b/analysis/language/pt/stop_words_pt.go @@ -1,4 +1,11 @@ -package stop_words_filter +package pt + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_pt" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -258,3 +265,13 @@ teria teríamos teriam `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(PortugueseStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/ro/analyzer_ro.go b/analysis/language/ro/analyzer_ro.go new file mode 100644 index 00000000..c4eb7e49 --- /dev/null +++ b/analysis/language/ro/analyzer_ro.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package ro + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "ro" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopRoFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerRoFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopRoFilter, + stemmerRoFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/ro/stemmer_ro.go b/analysis/language/ro/stemmer_ro.go new file mode 100644 index 00000000..97a970c6 --- /dev/null +++ b/analysis/language/ro/stemmer_ro.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package ro + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_ro" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("ro") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/ro/stop_filter_ro.go b/analysis/language/ro/stop_filter_ro.go new file mode 100644 index 00000000..56d5c846 --- /dev/null +++ b/analysis/language/ro/stop_filter_ro.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package ro + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_ro.go b/analysis/language/ro/stop_words_ro.go similarity index 80% rename from analysis/token_filters/stop_words_filter/stop_words_ro.go rename to analysis/language/ro/stop_words_ro.go index bfc35044..c05ec9a2 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_ro.go +++ b/analysis/language/ro/stop_words_ro.go @@ -1,4 +1,11 @@ -package stop_words_filter +package ro + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_ro" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ @@ -238,3 +245,13 @@ vouă vreo vreun `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(RomanianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/ru/analyzer_ru.go b/analysis/language/ru/analyzer_ru.go new file mode 100644 index 00000000..7a4cdf24 --- /dev/null +++ b/analysis/language/ru/analyzer_ru.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package ru + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "ru" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopRuFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerRuFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopRuFilter, + stemmerRuFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/ru/stemmer_ru.go b/analysis/language/ru/stemmer_ru.go new file mode 100644 index 00000000..688b5d37 --- /dev/null +++ b/analysis/language/ru/stemmer_ru.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package ru + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_ru" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("ru") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/ru/stop_filter_ru.go b/analysis/language/ru/stop_filter_ru.go new file mode 100644 index 00000000..85bfbd3b --- /dev/null +++ b/analysis/language/ru/stop_filter_ru.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package ru + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_ru.go b/analysis/language/ru/stop_words_ru.go similarity index 95% rename from analysis/token_filters/stop_words_filter/stop_words_ru.go rename to analysis/language/ru/stop_words_ru.go index b83410ea..985c30d3 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_ru.go +++ b/analysis/language/ru/stop_words_ru.go @@ -1,4 +1,11 @@ -package stop_words_filter +package ru + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_ru" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -248,3 +255,13 @@ var RussianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/a | нельзя `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(RussianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/sv/analyzer_sv.go b/analysis/language/sv/analyzer_sv.go new file mode 100644 index 00000000..8ff11c68 --- /dev/null +++ b/analysis/language/sv/analyzer_sv.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package sv + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "sv" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopSvFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerSvFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopSvFilter, + stemmerSvFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/sv/stemmer_sv.go b/analysis/language/sv/stemmer_sv.go new file mode 100644 index 00000000..b03fa6ab --- /dev/null +++ b/analysis/language/sv/stemmer_sv.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package sv + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_sv" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("sv") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/sv/stop_filter_sv.go b/analysis/language/sv/stop_filter_sv.go new file mode 100644 index 00000000..1d05c17c --- /dev/null +++ b/analysis/language/sv/stop_filter_sv.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package sv + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_sv.go b/analysis/language/sv/stop_words_sv.go similarity index 90% rename from analysis/token_filters/stop_words_filter/stop_words_sv.go rename to analysis/language/sv/stop_words_sv.go index 4b09843b..81b1901f 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_sv.go +++ b/analysis/language/sv/stop_words_sv.go @@ -1,4 +1,11 @@ -package stop_words_filter +package sv + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_sv" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -138,3 +145,13 @@ era | your vilkas | whose `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(SwedishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/th/analyzer_th.go b/analysis/language/th/analyzer_th.go new file mode 100644 index 00000000..02d12a32 --- /dev/null +++ b/analysis/language/th/analyzer_th.go @@ -0,0 +1,45 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package th + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" +) + +const AnalyzerName = "th" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(TokenizerName) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopThFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopThFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/th/stop_filter_th.go b/analysis/language/th/stop_filter_th.go new file mode 100644 index 00000000..5139c908 --- /dev/null +++ b/analysis/language/th/stop_filter_th.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. 
See the License for the specific language governing permissions +// and limitations under the License. +package th + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_th.go b/analysis/language/th/stop_words_th.go similarity index 81% rename from analysis/token_filters/stop_words_filter/stop_words_th.go rename to analysis/language/th/stop_words_th.go index 07e0edbf..ea1fba8f 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_th.go +++ b/analysis/language/th/stop_words_th.go @@ -1,4 +1,11 @@ -package stop_words_filter +package th + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_th" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -124,3 +131,13 @@ var ThaiStopWords = []byte(`# Thai stopwords from: กว่า กล่าว `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(ThaiStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/language/th/unicode_tokenizer_th.go b/analysis/language/th/unicode_tokenizer_th.go new file mode 100644 index 00000000..ea6f1b1c --- /dev/null +++ b/analysis/language/th/unicode_tokenizer_th.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package th + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" + "github.com/couchbaselabs/bleve/registry" +) + +const TokenizerName = "unicode_th" + +func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) { + return unicode_word_boundary.NewUnicodeWordBoundaryCustomLocaleTokenizer("th_TH"), nil +} + +func init() { + registry.RegisterTokenizer(TokenizerName, TokenizerConstructor) +} diff --git a/analysis/language/tr/analyzer_tr.go b/analysis/language/tr/analyzer_tr.go new file mode 100644 index 00000000..5a7287cd --- /dev/null +++ b/analysis/language/tr/analyzer_tr.go @@ -0,0 +1,57 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package tr + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" + + "github.com/couchbaselabs/bleve/analysis/token_filters/apostrophe_filter" + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +const AnalyzerName = "tr" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name) + if err != nil { + return nil, err + } + aposFilter, err := cache.TokenFilterNamed(apostrophe_filter.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name) + if err != nil { + return nil, err + } + stopTrFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerTrFilter, err := cache.TokenFilterNamed(StemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + aposFilter, + toLowerFilter, + stopTrFilter, + stemmerTrFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/language/tr/stemmer_tr.go b/analysis/language/tr/stemmer_tr.go new file mode 100644 index 00000000..a32634f7 --- /dev/null +++ b/analysis/language/tr/stemmer_tr.go @@ -0,0 +1,25 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. 
See the License for the specific language governing permissions +// and limitations under the License. +package tr + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + "github.com/couchbaselabs/bleve/registry" +) + +const StemmerName = "stemmer_tr" + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return stemmer_filter.NewStemmerFilter("tr") +} + +func init() { + registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor) +} diff --git a/analysis/language/tr/stop_filter_tr.go b/analysis/language/tr/stop_filter_tr.go new file mode 100644 index 00000000..b7a118ed --- /dev/null +++ b/analysis/language/tr/stop_filter_tr.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package tr + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + "github.com/couchbaselabs/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_tr.go b/analysis/language/tr/stop_words_tr.go similarity index 80% rename from analysis/token_filters/stop_words_filter/stop_words_tr.go rename to analysis/language/tr/stop_words_tr.go index e9b5183d..c8062fc6 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_tr.go +++ b/analysis/language/tr/stop_words_tr.go @@ -1,4 +1,11 @@ -package stop_words_filter +package tr + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const StopName = "stop_tr" // this content was obtained from: // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ @@ -217,3 +224,13 @@ yoksa yüz zaten `) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(TurkishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/stop_words_map.go b/analysis/stop_words_map.go deleted file mode 100644 index f7043a1b..00000000 --- a/analysis/stop_words_map.go +++ /dev/null @@ -1,57 +0,0 @@ -package analysis - -import ( - "bufio" - "bytes" - "io" - "io/ioutil" - "strings" -) - -type WordMap map[string]bool - -func NewWordMap() WordMap { - return make(WordMap, 0) -} - -func (s WordMap) LoadFile(filename string) error { - data, 
err := ioutil.ReadFile(filename) - if err != nil { - return err - } - return s.LoadBytes(data) -} - -func (s WordMap) LoadBytes(data []byte) error { - bytesReader := bytes.NewReader(data) - bufioReader := bufio.NewReader(bytesReader) - line, err := bufioReader.ReadString('\n') - for err == nil { - s.LoadLine(line) - line, err = bufioReader.ReadString('\n') - } - // if the err was EOF still need to process last value - if err == io.EOF { - s.LoadLine(line) - return nil - } - return err -} - -func (s WordMap) LoadLine(line string) error { - // find the start of comment, if any - startComment := strings.IndexAny(line, "#|") - if startComment >= 0 { - line = line[:startComment] - } - - words := strings.Fields(line) - for _, word := range words { - s.AddWord(word) - } - return nil -} - -func (s WordMap) AddWord(word string) { - s[word] = true -} diff --git a/analysis/stop_words_map_test.go b/analysis/stop_words_map_test.go deleted file mode 100644 index 88b173ae..00000000 --- a/analysis/stop_words_map_test.go +++ /dev/null @@ -1,26 +0,0 @@ -package analysis - -import ( - "reflect" - "testing" -) - -func TestWordMapLoadFile(t *testing.T) { - wordMap := make(WordMap, 0) - wordMap.LoadFile("test_stop_words.txt") - - expectedWords := make(WordMap, 0) - expectedWords.AddWord("marty") - expectedWords.AddWord("steve") - expectedWords.AddWord("dustin") - expectedWords.AddWord("siri") - expectedWords.AddWord("multiple") - expectedWords.AddWord("words") - expectedWords.AddWord("with") - expectedWords.AddWord("different") - expectedWords.AddWord("whitespace") - - if !reflect.DeepEqual(wordMap, expectedWords) { - t.Errorf("expected %#v, got %#v", expectedWords, wordMap) - } -} diff --git a/analysis/test_stop_words.txt b/analysis/test_words.txt similarity index 100% rename from analysis/test_stop_words.txt rename to analysis/test_words.txt diff --git a/analysis/token_filters/apostrophe_filter/apostrophe_filter.go b/analysis/token_filters/apostrophe_filter/apostrophe_filter.go index 
148390e6..0b24be20 100644 --- a/analysis/token_filters/apostrophe_filter/apostrophe_filter.go +++ b/analysis/token_filters/apostrophe_filter/apostrophe_filter.go @@ -12,8 +12,11 @@ import ( "bytes" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "apostrophe" + const RIGHT_SINGLE_QUOTATION_MARK = "’" const APOSTROPHE = "'" @@ -39,3 +42,11 @@ func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStre return rv } + +func ApostropheFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewApostropheFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(Name, ApostropheFilterConstructor) +} diff --git a/analysis/token_filters/cld2/cld2_filter.go b/analysis/token_filters/cld2/cld2_filter.go index 6aa957f7..4da3ebe1 100644 --- a/analysis/token_filters/cld2/cld2_filter.go +++ b/analysis/token_filters/cld2/cld2_filter.go @@ -17,8 +17,11 @@ import ( "unsafe" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "detect_lang" + type Cld2Filter struct { } @@ -51,3 +54,11 @@ func (f *Cld2Filter) detectLanguage(input []byte) ([]byte, error) { res := C.DetectLang(cstr) return C.GoBytes(unsafe.Pointer(res), C.int(C.strlen(res))), nil } + +func Cld2FilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewCld2Filter(), nil +} + +func init() { + registry.RegisterTokenFilter(Name, Cld2FilterConstructor) +} diff --git a/analysis/token_filters/edge_ngram_filter/edge_ngram_filter.go b/analysis/token_filters/edge_ngram_filter/edge_ngram_filter.go index 8221c35b..7496c636 100644 --- a/analysis/token_filters/edge_ngram_filter/edge_ngram_filter.go +++ b/analysis/token_filters/edge_ngram_filter/edge_ngram_filter.go @@ -13,8 +13,11 @@ import ( "unicode/utf8" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) 
+const Name = "edge_ngram" + type Side bool const BACK Side = true @@ -89,3 +92,27 @@ func buildTermFromRunes(runes []rune) []byte { } return rv } + +func EdgeNgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + side := FRONT + back, ok := config["back"].(bool) + if ok && back { + side = BACK + } + min := 1 + minVal, ok := config["min"].(float64) + if ok { + min = int(minVal) + } + max := 2 + maxVal, ok := config["max"].(float64) + if ok { + max = int(maxVal) + } + + return NewEdgeNgramFilter(side, min, max), nil +} + +func init() { + registry.RegisterTokenFilter(Name, EdgeNgramFilterConstructor) +} diff --git a/analysis/token_filters/elision_filter/articles_ca.go b/analysis/token_filters/elision_filter/articles_ca.go deleted file mode 100644 index 7563b2ff..00000000 --- a/analysis/token_filters/elision_filter/articles_ca.go +++ /dev/null @@ -1,13 +0,0 @@ -package elision_filter - -// this content was obtained from: -// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis - -var CatalanArticles = []byte(` -d -l -m -n -s -t -`) diff --git a/analysis/token_filters/elision_filter/articles_fr.go b/analysis/token_filters/elision_filter/articles_fr.go deleted file mode 100644 index 93a21ad1..00000000 --- a/analysis/token_filters/elision_filter/articles_fr.go +++ /dev/null @@ -1,20 +0,0 @@ -package elision_filter - -// this content was obtained from: -// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis - -var FrenchArticles = []byte(` -l -m -t -qu -n -s -j -d -c -jusqu -quoiqu -lorsqu -puisqu -`) diff --git a/analysis/token_filters/elision_filter/articles_ga.go b/analysis/token_filters/elision_filter/articles_ga.go deleted file mode 100644 index e545c988..00000000 --- a/analysis/token_filters/elision_filter/articles_ga.go +++ /dev/null @@ -1,10 +0,0 @@ -package elision_filter - -// this content was obtained from: -// 
lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis - -var IrishArticles = []byte(` -d -m -b -`) diff --git a/analysis/token_filters/elision_filter/articles_it.go b/analysis/token_filters/elision_filter/articles_it.go deleted file mode 100644 index 403ab810..00000000 --- a/analysis/token_filters/elision_filter/articles_it.go +++ /dev/null @@ -1,28 +0,0 @@ -package elision_filter - -// this content was obtained from: -// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis - -var ItalianArticles = []byte(` -c -l -all -dall -dell -nell -sull -coll -pell -gl -agl -dagl -degl -negl -sugl -un -m -t -s -v -d -`) diff --git a/analysis/token_filters/elision_filter/elision_filter.go b/analysis/token_filters/elision_filter/elision_filter.go index 9d6dfc2a..42c24bf9 100644 --- a/analysis/token_filters/elision_filter/elision_filter.go +++ b/analysis/token_filters/elision_filter/elision_filter.go @@ -10,20 +10,24 @@ package elision_filter import ( "bytes" + "fmt" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "elision" + const RIGHT_SINGLE_QUOTATION_MARK = "’" const APOSTROPHE = "'" const APOSTROPHES = APOSTROPHE + RIGHT_SINGLE_QUOTATION_MARK type ElisionFilter struct { - articles analysis.WordMap + articles analysis.TokenMap } -func NewElisionFilter(articles analysis.WordMap) *ElisionFilter { +func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter { return &ElisionFilter{ articles: articles, } @@ -48,3 +52,19 @@ func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream return rv } + +func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + articlesTokenMapName, ok := config["articles_token_map"].(string) + if !ok { + return nil, fmt.Errorf("must specify articles_token_map") + } + articlesTokenMap, err := cache.TokenMapNamed(articlesTokenMapName) + if err != nil { + return nil, fmt.Errorf("error 
building elision filter: %v", err) + } + return NewElisionFilter(articlesTokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(Name, ElisionFilterConstructor) +} diff --git a/analysis/token_filters/elision_filter/elision_filter_test.go b/analysis/token_filters/elision_filter/elision_filter_test.go index 12ef7a7b..5befa610 100644 --- a/analysis/token_filters/elision_filter/elision_filter_test.go +++ b/analysis/token_filters/elision_filter/elision_filter_test.go @@ -13,101 +13,50 @@ import ( "testing" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_map" + "github.com/couchbaselabs/bleve/registry" ) func TestElisionFilter(t *testing.T) { - frenchArticlesMap := analysis.NewWordMap() - err := frenchArticlesMap.LoadBytes(FrenchArticles) - if err != nil { - t.Fatal(err) - } - - italianArticlesMap := analysis.NewWordMap() - err = italianArticlesMap.LoadBytes(ItalianArticles) - if err != nil { - t.Fatal(err) - } - - catalanArticlesMap := analysis.NewWordMap() - err = catalanArticlesMap.LoadBytes(CatalanArticles) - if err != nil { - t.Fatal(err) - } - - irishArticlesMap := analysis.NewWordMap() - err = irishArticlesMap.LoadBytes(IrishArticles) - if err != nil { - t.Fatal(err) - } - tests := []struct { - articleMap analysis.WordMap - input analysis.TokenStream - output analysis.TokenStream + input analysis.TokenStream + output analysis.TokenStream }{ { - articleMap: frenchArticlesMap, input: analysis.TokenStream{ &analysis.Token{ - Term: []byte("l'avion"), + Term: []byte("ar'word"), }, }, output: analysis.TokenStream{ &analysis.Token{ - Term: []byte("avion"), - }, - }, - }, - { - articleMap: italianArticlesMap, - input: analysis.TokenStream{ - &analysis.Token{ - Term: []byte("dell'Italia"), - }, - }, - output: analysis.TokenStream{ - &analysis.Token{ - Term: []byte("Italia"), - }, - }, - }, - { - articleMap: catalanArticlesMap, - input: analysis.TokenStream{ - &analysis.Token{ - Term: []byte("l'Institut"), - }, - 
&analysis.Token{ - Term: []byte("d'Estudis"), - }, - }, - output: analysis.TokenStream{ - &analysis.Token{ - Term: []byte("Institut"), - }, - &analysis.Token{ - Term: []byte("Estudis"), - }, - }, - }, - { - articleMap: irishArticlesMap, - input: analysis.TokenStream{ - &analysis.Token{ - Term: []byte("b'fhearr"), - }, - }, - output: analysis.TokenStream{ - &analysis.Token{ - Term: []byte("fhearr"), + Term: []byte("word"), }, }, }, } + cache := registry.NewCache() + + articleListConfig := map[string]interface{}{ + "tokens": []interface{}{"ar"}, + } + _, err := cache.DefineTokenMap("articles_test", token_map.Name, articleListConfig) + if err != nil { + t.Fatal(err) + } + + elisionConfig := map[string]interface{}{ + "articles_token_map": "articles_test", + } + elisionFilter, err := cache.DefineTokenFilter("elision_test", "elision", elisionConfig) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { - elisionFilter := NewElisionFilter(test.articleMap) + actual := elisionFilter.Filter(test.input) if !reflect.DeepEqual(actual, test.output) { t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) diff --git a/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go index 2667fdfc..29052536 100644 --- a/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go +++ b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter.go @@ -9,14 +9,19 @@ package keyword_filter import ( + "fmt" + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "keyword_marker" + type KeyWordMarkerFilter struct { - keyWords analysis.WordMap + keyWords analysis.TokenMap } -func NewKeyWordMarkerFilter(keyWords analysis.WordMap) *KeyWordMarkerFilter { +func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter { return &KeyWordMarkerFilter{ keyWords: keyWords, } @@ -34,3 +39,19 @@ func (f 
*KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenS return input } + +func KeyWordMarkerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + keywordsTokenMapName, ok := config["keywords_token_map"].(string) + if !ok { + return nil, fmt.Errorf("must specify keywords_token_map") + } + keywordsTokenMap, err := cache.TokenMapNamed(keywordsTokenMapName) + if err != nil { + return nil, fmt.Errorf("error building keyword marker filter: %v", err) + } + return NewKeyWordMarkerFilter(keywordsTokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(Name, KeyWordMarkerFilterConstructor) +} diff --git a/analysis/token_filters/keyword_marker_filter/keyword_marker_filter_test.go b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter_test.go index 8ac79d54..8c1bb649 100644 --- a/analysis/token_filters/keyword_marker_filter/keyword_marker_filter_test.go +++ b/analysis/token_filters/keyword_marker_filter/keyword_marker_filter_test.go @@ -55,9 +55,9 @@ func TestKeyWordMarkerFilter(t *testing.T) { }, } - keyWordsMap := analysis.NewWordMap() - keyWordsMap.AddWord("walk") - keyWordsMap.AddWord("park") + keyWordsMap := analysis.NewTokenMap() + keyWordsMap.AddToken("walk") + keyWordsMap.AddToken("park") filter := NewKeyWordMarkerFilter(keyWordsMap) ouputTokenStream := filter.Filter(inputTokenStream) diff --git a/analysis/token_filters/length_filter/length_filter.go b/analysis/token_filters/length_filter/length_filter.go index 5f296bc5..4e788799 100644 --- a/analysis/token_filters/length_filter/length_filter.go +++ b/analysis/token_filters/length_filter/length_filter.go @@ -12,8 +12,11 @@ import ( "unicode/utf8" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "length" + type LengthFilter struct { min int max int @@ -42,3 +45,23 @@ func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream { return rv } + +func 
LengthFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + min := 0 + max := 0 + + minVal, ok := config["min"].(float64) + if ok { + min = int(minVal) + } + maxVal, ok := config["max"].(float64) + if ok { + max = int(maxVal) + } + + return NewLengthFilter(min, max), nil +} + +func init() { + registry.RegisterTokenFilter(Name, LengthFilterConstructor) +} diff --git a/analysis/token_filters/lower_case_filter/lower_case_filter.go b/analysis/token_filters/lower_case_filter/lower_case_filter.go index bc0e7c8e..7e7b5740 100644 --- a/analysis/token_filters/lower_case_filter/lower_case_filter.go +++ b/analysis/token_filters/lower_case_filter/lower_case_filter.go @@ -12,8 +12,11 @@ import ( "strings" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "to_lower" + type LowerCaseFilter struct { } @@ -33,3 +36,11 @@ func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStrea return rv } + +func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewLowerCaseFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor) +} diff --git a/analysis/token_filters/ngram_filter/ngram_filter.go b/analysis/token_filters/ngram_filter/ngram_filter.go index d24052d3..31dd0429 100644 --- a/analysis/token_filters/ngram_filter/ngram_filter.go +++ b/analysis/token_filters/ngram_filter/ngram_filter.go @@ -13,8 +13,11 @@ import ( "unicode/utf8" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "ngram" + type NgramFilter struct { minLength int maxLength int @@ -64,3 +67,22 @@ func buildTermFromRunes(runes []rune) []byte { } return rv } + +func NgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + min := 1 + minVal, ok := config["min"].(float64) + if ok { + min = 
int(minVal) + } + max := 2 + maxVal, ok := config["max"].(float64) + if ok { + max = int(maxVal) + } + + return NewNgramFilter(min, max), nil +} + +func init() { + registry.RegisterTokenFilter(Name, NgramFilterConstructor) +} diff --git a/analysis/token_filters/stemmer_filter/stemmer_filter.go b/analysis/token_filters/stemmer_filter/stemmer_filter.go index c89bc291..c3098a7b 100644 --- a/analysis/token_filters/stemmer_filter/stemmer_filter.go +++ b/analysis/token_filters/stemmer_filter/stemmer_filter.go @@ -11,8 +11,11 @@ package stemmer_filter import ( "bitbucket.org/tebeka/snowball" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "stem" + type StemmerFilter struct { lang string stemmer *snowball.Stemmer @@ -55,3 +58,16 @@ func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream return rv } + +func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + lang := "en" + langVal, ok := config["lang"].(string) + if ok { + lang = langVal + } + return NewStemmerFilter(lang) +} + +func init() { + registry.RegisterTokenFilter(Name, StemmerFilterConstructor) +} diff --git a/analysis/token_filters/stemmer_filter/stemmer_filter_test.go b/analysis/token_filters/stemmer_filter/stemmer_filter_test.go index 639e7d16..41156c4a 100644 --- a/analysis/token_filters/stemmer_filter/stemmer_filter_test.go +++ b/analysis/token_filters/stemmer_filter/stemmer_filter_test.go @@ -49,7 +49,7 @@ func TestStemmerFilter(t *testing.T) { }, } - filter, err := NewStemmerFilter("english") + filter, err := NewStemmerFilter("en") if err != nil { t.Fatal(err) } diff --git a/analysis/token_filters/stop_tokens_filter/stop_tokens_filter.go b/analysis/token_filters/stop_tokens_filter/stop_tokens_filter.go new file mode 100644 index 00000000..3543c4f3 --- /dev/null +++ b/analysis/token_filters/stop_tokens_filter/stop_tokens_filter.go @@ -0,0 +1,58 @@ +// Copyright (c) 2014 
Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package stop_tokens_filter + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "stop_tokens" + +type StopTokensFilter struct { + stopTokens analysis.TokenMap +} + +func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter { + return &StopTokensFilter{ + stopTokens: stopTokens, + } +} + +func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + rv := make(analysis.TokenStream, 0) + + for _, token := range input { + tokenTerm := string(token.Term) + _, isStopToken := f.stopTokens[tokenTerm] + if !isStopToken { + rv = append(rv, token) + } + } + + return rv +} + +func StopTokensFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + stopTokenMapName, ok := config["stop_token_map"].(string) + if !ok { + return nil, fmt.Errorf("must specify stop_token_map") + } + stopTokenMap, err := cache.TokenMapNamed(stopTokenMapName) + if err != nil { + return nil, fmt.Errorf("error building stop words filter: %v", err) + } + return NewStopTokensFilter(stopTokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(Name, StopTokensFilterConstructor) +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_filter_test.go b/analysis/token_filters/stop_tokens_filter/stop_tokens_filter_test.go similarity index 69% rename from 
analysis/token_filters/stop_words_filter/stop_words_filter_test.go rename to analysis/token_filters/stop_tokens_filter/stop_tokens_filter_test.go index 6b280fd2..03badb68 100644 --- a/analysis/token_filters/stop_words_filter/stop_words_filter_test.go +++ b/analysis/token_filters/stop_tokens_filter/stop_tokens_filter_test.go @@ -6,13 +6,15 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package stop_words_filter +package stop_tokens_filter import ( "reflect" "testing" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_map" + "github.com/couchbaselabs/bleve/registry" ) func TestStopWordsFilter(t *testing.T) { @@ -44,13 +46,24 @@ func TestStopWordsFilter(t *testing.T) { }, } - stopWordsMap := analysis.NewWordMap() - err := stopWordsMap.LoadBytes(EnglishStopWords) + cache := registry.NewCache() + stopListConfig := map[string]interface{}{ + "tokens": []interface{}{"a", "in", "the"}, + } + _, err := cache.DefineTokenMap("stop_test", token_map.Name, stopListConfig) if err != nil { t.Fatal(err) } - filter := NewStopWordsFilter(stopWordsMap) - ouputTokenStream := filter.Filter(inputTokenStream) + + stopConfig := map[string]interface{}{ + "stop_token_map": "stop_test", + } + stopFilter, err := cache.DefineTokenFilter("stop_test", "stop_tokens", stopConfig) + if err != nil { + t.Fatal(err) + } + + ouputTokenStream := stopFilter.Filter(inputTokenStream) if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) { t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream) } diff --git a/analysis/token_filters/truncate_token_filter/truncate_token_filter.go b/analysis/token_filters/truncate_token_filter/truncate_token_filter.go index 4e539550..b8f23eab 100644 --- a/analysis/token_filters/truncate_token_filter/truncate_token_filter.go +++ 
b/analysis/token_filters/truncate_token_filter/truncate_token_filter.go @@ -13,8 +13,11 @@ import ( "unicode/utf8" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "truncate_token" + type TruncateTokenFilter struct { length int } @@ -45,3 +48,18 @@ func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenS return rv } + +func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + length := 25 + + lenVal, ok := config["length"].(float64) + if ok { + length = int(lenVal) + } + + return NewTruncateTokenFilter(length), nil +} + +func init() { + registry.RegisterTokenFilter(Name, TruncateTokenFilterConstructor) +} diff --git a/analysis/token_filters/unicode_normalize/unicode_normalize.go b/analysis/token_filters/unicode_normalize/unicode_normalize.go index d61e2e1e..07208422 100644 --- a/analysis/token_filters/unicode_normalize/unicode_normalize.go +++ b/analysis/token_filters/unicode_normalize/unicode_normalize.go @@ -13,8 +13,11 @@ import ( "code.google.com/p/go.text/unicode/norm" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "normalize_unicode" + const NFC = "nfc" const NFD = "nfd" const NFKC = "nfkc" @@ -59,3 +62,16 @@ func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.Tok return rv } + +func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + form := NFKC + formVal, ok := config["form"].(string) + if ok { + form = formVal + } + return NewUnicodeNormalizeFilter(form) +} + +func init() { + registry.RegisterTokenFilter(Name, UnicodeNormalizeFilterConstructor) +} diff --git a/analysis/token_map.go b/analysis/token_map.go new file mode 100644 index 00000000..9a7c97ff --- /dev/null +++ b/analysis/token_map.go @@ -0,0 +1,65 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package analysis + +import ( + "bufio" + "bytes" + "io" + "io/ioutil" + "strings" +) + +type TokenMap map[string]bool + +func NewTokenMap() TokenMap { + return make(TokenMap, 0) +} + +func (s TokenMap) LoadFile(filename string) error { + data, err := ioutil.ReadFile(filename) + if err != nil { + return err + } + return s.LoadBytes(data) +} + +func (t TokenMap) LoadBytes(data []byte) error { + bytesReader := bytes.NewReader(data) + bufioReader := bufio.NewReader(bytesReader) + line, err := bufioReader.ReadString('\n') + for err == nil { + t.LoadLine(line) + line, err = bufioReader.ReadString('\n') + } + // if the err was EOF still need to process last value + if err == io.EOF { + t.LoadLine(line) + return nil + } + return err +} + +func (t TokenMap) LoadLine(line string) error { + // find the start of comment, if any + startComment := strings.IndexAny(line, "#|") + if startComment >= 0 { + line = line[:startComment] + } + + tokens := strings.Fields(line) + for _, token := range tokens { + t.AddToken(token) + } + return nil +} + +func (t TokenMap) AddToken(token string) { + t[token] = true +} diff --git a/analysis/token_map/standard.go b/analysis/token_map/standard.go new file mode 100644 index 00000000..721a23dc --- /dev/null +++ b/analysis/token_map/standard.go @@ -0,0 +1,45 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package token_map + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "standard" + +func GenericTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + + // first try to load by filename + filename, ok := config["filename"].(string) + if ok { + err := rv.LoadFile(filename) + return rv, err + } + // next look for an inline word list + tokens, ok := config["tokens"].([]interface{}) + if ok { + for _, token := range tokens { + tokenStr, ok := token.(string) + if ok { + rv.AddToken(tokenStr) + } + } + return rv, nil + } + return nil, fmt.Errorf("must specify filename or list of tokens for token map") +} + +func init() { + registry.RegisterTokenMap(Name, GenericTokenMapConstructor) +} diff --git a/analysis/token_map_test.go b/analysis/token_map_test.go new file mode 100644 index 00000000..4134c899 --- /dev/null +++ b/analysis/token_map_test.go @@ -0,0 +1,34 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package analysis + +import ( + "reflect" + "testing" +) + +func TestTokenMapLoadFile(t *testing.T) { + tokenMap := NewTokenMap() + tokenMap.LoadFile("test_words.txt") + + expectedTokens := NewTokenMap() + expectedTokens.AddToken("marty") + expectedTokens.AddToken("steve") + expectedTokens.AddToken("dustin") + expectedTokens.AddToken("siri") + expectedTokens.AddToken("multiple") + expectedTokens.AddToken("words") + expectedTokens.AddToken("with") + expectedTokens.AddToken("different") + expectedTokens.AddToken("whitespace") + + if !reflect.DeepEqual(tokenMap, expectedTokens) { + t.Errorf("expected %#v, got %#v", expectedTokens, tokenMap) + } +} diff --git a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go index 8e720024..f07d9fc2 100644 --- a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go +++ b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go @@ -9,11 +9,15 @@ package regexp_tokenizer import ( + "fmt" "regexp" "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "regexp" + type RegexpTokenizer struct { r *regexp.Regexp } @@ -39,3 +43,19 @@ func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream { } return rv } + +func RegexpTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) { + rval, ok := config["regexp"].(string) + if !ok { + return nil, fmt.Errorf("must specify regexp") + } + r, err := regexp.Compile(rval) + if err != nil { + return nil, fmt.Errorf("unable to 
build regexp tokenizer: %v", err) + } + return NewRegexpTokenizer(r), nil +} + +func init() { + registry.RegisterTokenizer(Name, RegexpTokenizerConstructor) +} diff --git a/analysis/tokenizers/rune_tokenizer/rune_tokenizer.go b/analysis/tokenizers/rune_tokenizer/rune_tokenizer.go deleted file mode 100644 index a5cc5e2a..00000000 --- a/analysis/tokenizers/rune_tokenizer/rune_tokenizer.go +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2014 Couchbase, Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file -// except in compliance with the License. You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software distributed under the -// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -// either express or implied. See the License for the specific language governing permissions -// and limitations under the License. 
-package rune_tokenizer - -import ( - "unicode/utf8" - - "github.com/couchbaselabs/bleve/analysis" -) - -type RuneTokenizer struct { - c RuneTokenClassifer -} - -func NewRuneTokenizer(c RuneTokenClassifer) *RuneTokenizer { - return &RuneTokenizer{ - c: c, - } -} - -func (rt *RuneTokenizer) Tokenize(input []byte) analysis.TokenStream { - // rv := make(analysis.TokenStream, 0) - // runes := bytes.Runes(input) - // nextTokenRunes := make([]rune, 0) - // for _, r := range runes { - - // } - // return rv - - rv := make(analysis.TokenStream, 0) - - currentInputPos := 0 - nextTokenRunes := make([]rune, 0) - tokenPos := 1 - tokenStart := 0 - - nextRune, nextRuneLen := utf8.DecodeRune(input[currentInputPos:]) - for nextRune != utf8.RuneError && currentInputPos < len(input) { - if rt.c.InToken(nextRune) { - nextTokenRunes = append(nextTokenRunes, nextRune) - } else { - // end the last loken, if one is building - if len(nextTokenRunes) > 0 { - nextToken := analysis.Token{ - Term: buildTermFromRunes(nextTokenRunes), - Position: tokenPos, - Start: tokenStart, - End: currentInputPos, - Type: analysis.AlphaNumeric, - } - rv = append(rv, &nextToken) - nextTokenRunes = make([]rune, 0) - tokenPos++ - } - tokenStart = currentInputPos + nextRuneLen - } - - currentInputPos += nextRuneLen - nextRune, nextRuneLen = utf8.DecodeRune(input[currentInputPos:]) - } - // build one last token if we didn't end on whitespace - if len(nextTokenRunes) > 0 { - nextToken := analysis.Token{ - Term: buildTermFromRunes(nextTokenRunes), - Position: tokenPos, - Start: tokenStart, - End: len(input), - Type: analysis.AlphaNumeric, - } - rv = append(rv, &nextToken) - nextTokenRunes = make([]rune, 0) - tokenPos++ - } - - return rv -} - -func buildTermFromRunes(runes []rune) []byte { - rv := make([]byte, 0, len(runes)*4) - for _, r := range runes { - runeBytes := make([]byte, utf8.RuneLen(r)) - utf8.EncodeRune(runeBytes, r) - rv = append(rv, runeBytes...) 
- } - return rv -} diff --git a/analysis/tokenizers/single_token/single_token.go b/analysis/tokenizers/single_token/single_token.go index 4b3dbf5f..f6966537 100644 --- a/analysis/tokenizers/single_token/single_token.go +++ b/analysis/tokenizers/single_token/single_token.go @@ -10,8 +10,11 @@ package single_token import ( "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) +const Name = "single" + type SingleTokenTokenizer struct { } @@ -30,3 +33,11 @@ func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream { }, } } + +func SingleTokenTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) { + return NewSingleTokenTokenizer(), nil +} + +func init() { + registry.RegisterTokenizer(Name, SingleTokenTokenizerConstructor) +} diff --git a/analysis/tokenizers/unicode_word_boundary/boundary.go b/analysis/tokenizers/unicode_word_boundary/boundary.go index f05e9f53..a6045110 100644 --- a/analysis/tokenizers/unicode_word_boundary/boundary.go +++ b/analysis/tokenizers/unicode_word_boundary/boundary.go @@ -17,9 +17,15 @@ package unicode_word_boundary // #include "unicode/ustring.h" import "C" -import "log" -import "unsafe" -import "github.com/couchbaselabs/bleve/analysis" +import ( + "log" + "unsafe" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "unicode" type UnicodeWordBoundaryTokenizer struct { locale *C.char @@ -36,9 +42,7 @@ func NewUnicodeWordBoundaryCustomLocaleTokenizer(locale string) *UnicodeWordBoun } func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStream { - // var bi *C.UBreakIterator rv := make(analysis.TokenStream, 0) - defer C.free(unsafe.Pointer(t.locale)) if len(input) < 1 { return rv @@ -51,13 +55,10 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStre var inlen C.int32_t = C.int32_t(len(input)) var buflen C.int32_t = C.int32_t(2*len(input) 
+ 1) // worse case each byte becomes 2 var stringToExamine []C.UChar = make([]C.UChar, buflen) - //log.Printf("new buff is: %v", stringToExamine) var myUnsafePointerToExamine = unsafe.Pointer(&(stringToExamine[0])) var myUCharPointer *C.UChar = (*C.UChar)(myUnsafePointerToExamine) C.u_uastrncpy(myUCharPointer, myCCharPointer, inlen) - //log.Printf("after copy new buff is: %v", stringToExamine) - var err C.UErrorCode = C.U_ZERO_ERROR bi := C.ubrk_open(C.UBRK_WORD, t.locale, myUCharPointer, -1, &err) @@ -113,3 +114,20 @@ func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStre return rv } + +func UnicodeWordBoundaryTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) { + locale := "" + localeVal, ok := config["locale"].(string) + if ok { + locale = localeVal + } + if locale == "" { + return NewUnicodeWordBoundaryTokenizer(), nil + } else { + return NewUnicodeWordBoundaryCustomLocaleTokenizer(locale), nil + } +} + +func init() { + registry.RegisterTokenizer(Name, UnicodeWordBoundaryTokenizerConstructor) +} diff --git a/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer.go b/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer.go new file mode 100644 index 00000000..9e07f825 --- /dev/null +++ b/analysis/tokenizers/whitespace_tokenizer/whitespace_tokenizer.go @@ -0,0 +1,29 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package whitespace_tokenizer + +import ( + "regexp" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer" + "github.com/couchbaselabs/bleve/registry" +) + +const Name = "whitespace" + +var whitespaceTokenizerRegexp = regexp.MustCompile(`\w+`) + +func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) { + return regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp), nil +} + +func init() { + registry.RegisterTokenizer(Name, TokenizerConstructor) +} diff --git a/config.go b/config.go index 982e6f98..1a0c4ef5 100644 --- a/config.go +++ b/config.go @@ -9,55 +9,86 @@ package bleve import ( - "fmt" - "regexp" + "expvar" "time" - "github.com/couchbaselabs/bleve/analysis" - - "github.com/couchbaselabs/bleve/analysis/datetime_parsers/flexible_go" - - "github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter" - - "github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer" - "github.com/couchbaselabs/bleve/analysis/tokenizers/single_token" - "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" - - "github.com/couchbaselabs/bleve/analysis/token_filters/apostrophe_filter" - "github.com/couchbaselabs/bleve/analysis/token_filters/arabic_normalize" - "github.com/couchbaselabs/bleve/analysis/token_filters/cld2" - "github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter" - "github.com/couchbaselabs/bleve/analysis/token_filters/german_normalize" - "github.com/couchbaselabs/bleve/analysis/token_filters/hindi_normalize" - "github.com/couchbaselabs/bleve/analysis/token_filters/hindi_stemmer_filter" - "github.com/couchbaselabs/bleve/analysis/token_filters/length_filter" - "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" - "github.com/couchbaselabs/bleve/analysis/token_filters/persian_normalize" - "github.com/couchbaselabs/bleve/analysis/token_filters/sorani_normalize" - 
"github.com/couchbaselabs/bleve/analysis/token_filters/sorani_stemmer_filter" - "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" - "github.com/couchbaselabs/bleve/analysis/token_filters/stop_words_filter" - "github.com/couchbaselabs/bleve/analysis/token_filters/truncate_token_filter" - "github.com/couchbaselabs/bleve/analysis/token_filters/unicode_normalize" - "github.com/couchbaselabs/bleve/search" + + // char filters + _ "github.com/couchbaselabs/bleve/analysis/char_filters/html_char_filter" + _ "github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter" + _ "github.com/couchbaselabs/bleve/analysis/char_filters/zero_width_non_joiner" + + // analyzers + _ "github.com/couchbaselabs/bleve/analysis/analyzers/detect_lang_analyzer" + _ "github.com/couchbaselabs/bleve/analysis/analyzers/keyword_analyzer" + _ "github.com/couchbaselabs/bleve/analysis/analyzers/simple_analyzer" + _ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer" + + // token filters + _ "github.com/couchbaselabs/bleve/analysis/token_filters/apostrophe_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/cld2" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/edge_ngram_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/keyword_marker_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/length_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/ngram_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/truncate_token_filter" + _ "github.com/couchbaselabs/bleve/analysis/token_filters/unicode_normalize" + + // tokenizers + _ 
"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer" + _ "github.com/couchbaselabs/bleve/analysis/tokenizers/single_token" + _ "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" + _ "github.com/couchbaselabs/bleve/analysis/tokenizers/whitespace_tokenizer" + + // date time parsers + _ "github.com/couchbaselabs/bleve/analysis/datetime_parsers/datetime_optional" + _ "github.com/couchbaselabs/bleve/analysis/datetime_parsers/flexible_go" + + // languages + _ "github.com/couchbaselabs/bleve/analysis/language/ar" + _ "github.com/couchbaselabs/bleve/analysis/language/bg" + _ "github.com/couchbaselabs/bleve/analysis/language/ca" + _ "github.com/couchbaselabs/bleve/analysis/language/ckb" + _ "github.com/couchbaselabs/bleve/analysis/language/cs" + _ "github.com/couchbaselabs/bleve/analysis/language/da" + _ "github.com/couchbaselabs/bleve/analysis/language/de" + _ "github.com/couchbaselabs/bleve/analysis/language/el" + _ "github.com/couchbaselabs/bleve/analysis/language/en" + _ "github.com/couchbaselabs/bleve/analysis/language/es" + _ "github.com/couchbaselabs/bleve/analysis/language/eu" + _ "github.com/couchbaselabs/bleve/analysis/language/fa" + _ "github.com/couchbaselabs/bleve/analysis/language/fi" + _ "github.com/couchbaselabs/bleve/analysis/language/fr" + _ "github.com/couchbaselabs/bleve/analysis/language/ga" + _ "github.com/couchbaselabs/bleve/analysis/language/gl" + _ "github.com/couchbaselabs/bleve/analysis/language/hi" + _ "github.com/couchbaselabs/bleve/analysis/language/hu" + _ "github.com/couchbaselabs/bleve/analysis/language/hy" + _ "github.com/couchbaselabs/bleve/analysis/language/id" + _ "github.com/couchbaselabs/bleve/analysis/language/it" + _ "github.com/couchbaselabs/bleve/analysis/language/nl" + _ "github.com/couchbaselabs/bleve/analysis/language/no" + _ "github.com/couchbaselabs/bleve/analysis/language/porter" + _ "github.com/couchbaselabs/bleve/analysis/language/pt" + _ 
"github.com/couchbaselabs/bleve/analysis/language/ro" + _ "github.com/couchbaselabs/bleve/analysis/language/ru" + _ "github.com/couchbaselabs/bleve/analysis/language/sv" + _ "github.com/couchbaselabs/bleve/analysis/language/th" + _ "github.com/couchbaselabs/bleve/analysis/language/tr" ) -type AnalysisConfig struct { - TokenMaps map[string]analysis.WordMap - CharFilters map[string]analysis.CharFilter - Tokenizers map[string]analysis.Tokenizer - TokenFilters map[string]analysis.TokenFilter - Analyzers map[string]*analysis.Analyzer - DateTimeParsers map[string]analysis.DateTimeParser -} +var bleveExpVar = expvar.NewMap("bleve") type HighlightConfig struct { Highlighters map[string]search.Highlighter } type Configuration struct { - Analysis *AnalysisConfig DefaultAnalyzer *string Highlight *HighlightConfig DefaultHighlighter *string @@ -67,62 +98,8 @@ type Configuration struct { ByteArrayConverters map[string]ByteArrayConverter } -func (c *Configuration) BuildNewAnalyzer(charFilterNames []string, tokenizerName string, tokenFilterNames []string) (*analysis.Analyzer, error) { - rv := analysis.Analyzer{} - if len(charFilterNames) > 0 { - rv.CharFilters = make([]analysis.CharFilter, len(charFilterNames)) - for i, charFilterName := range charFilterNames { - charFilter := c.Analysis.CharFilters[charFilterName] - if charFilter == nil { - return nil, fmt.Errorf("no character filter named `%s` registered", charFilterName) - } - rv.CharFilters[i] = charFilter - } - } - rv.Tokenizer = c.Analysis.Tokenizers[tokenizerName] - if rv.Tokenizer == nil { - return nil, fmt.Errorf("no tokenizer named `%s` registered", tokenizerName) - } - if len(tokenFilterNames) > 0 { - rv.TokenFilters = make([]analysis.TokenFilter, len(tokenFilterNames)) - for i, tokenFilterName := range tokenFilterNames { - tokenFilter := c.Analysis.TokenFilters[tokenFilterName] - if tokenFilter == nil { - return nil, fmt.Errorf("no token filter named `%s` registered", tokenFilterName) - } - rv.TokenFilters[i] = 
tokenFilter - } - } - return &rv, nil -} - -func (c *Configuration) MustBuildNewAnalyzer(charFilterNames []string, tokenizerName string, tokenFilterNames []string) *analysis.Analyzer { - analyzer, err := c.BuildNewAnalyzer(charFilterNames, tokenizerName, tokenFilterNames) - if err != nil { - panic(err) - } - return analyzer -} - -func (c *Configuration) MustLoadStopWords(stopWordsBytes []byte) analysis.WordMap { - rv := analysis.NewWordMap() - err := rv.LoadBytes(stopWordsBytes) - if err != nil { - panic(err) - } - return rv -} - func NewConfiguration() *Configuration { return &Configuration{ - Analysis: &AnalysisConfig{ - TokenMaps: make(map[string]analysis.WordMap), - CharFilters: make(map[string]analysis.CharFilter), - Tokenizers: make(map[string]analysis.Tokenizer), - TokenFilters: make(map[string]analysis.TokenFilter), - Analyzers: make(map[string]*analysis.Analyzer), - DateTimeParsers: make(map[string]analysis.DateTimeParser), - }, Highlight: &HighlightConfig{ Highlighters: make(map[string]search.Highlighter), }, @@ -133,6 +110,7 @@ func NewConfiguration() *Configuration { var Config *Configuration func init() { + bootStart := time.Now() // build the default configuration Config = NewConfiguration() @@ -142,215 +120,6 @@ func init() { Config.ByteArrayConverters["json"] = NewJSONByteArrayConverter() Config.ByteArrayConverters["ignore"] = NewIgnoreByteArrayConverter() - // register stop token maps - Config.Analysis.TokenMaps["da_stop"] = Config.MustLoadStopWords(stop_words_filter.DanishStopWords) - Config.Analysis.TokenMaps["nl_stop"] = Config.MustLoadStopWords(stop_words_filter.DutchStopWords) - Config.Analysis.TokenMaps["en_stop"] = Config.MustLoadStopWords(stop_words_filter.EnglishStopWords) - Config.Analysis.TokenMaps["fi_stop"] = Config.MustLoadStopWords(stop_words_filter.FinnishStopWords) - Config.Analysis.TokenMaps["fr_stop"] = Config.MustLoadStopWords(stop_words_filter.FrenchStopWords) - Config.Analysis.TokenMaps["de_stop"] = 
Config.MustLoadStopWords(stop_words_filter.GermanStopWords) - Config.Analysis.TokenMaps["hu_stop"] = Config.MustLoadStopWords(stop_words_filter.HungarianStopWords) - Config.Analysis.TokenMaps["it_stop"] = Config.MustLoadStopWords(stop_words_filter.ItalianStopWords) - Config.Analysis.TokenMaps["no_stop"] = Config.MustLoadStopWords(stop_words_filter.NorwegianStopWords) - Config.Analysis.TokenMaps["pt_stop"] = Config.MustLoadStopWords(stop_words_filter.PortugueseStopWords) - Config.Analysis.TokenMaps["ro_stop"] = Config.MustLoadStopWords(stop_words_filter.RomanianStopWords) - Config.Analysis.TokenMaps["ru_stop"] = Config.MustLoadStopWords(stop_words_filter.RussianStopWords) - Config.Analysis.TokenMaps["es_stop"] = Config.MustLoadStopWords(stop_words_filter.SpanishStopWords) - Config.Analysis.TokenMaps["sv_stop"] = Config.MustLoadStopWords(stop_words_filter.SwedishStopWords) - Config.Analysis.TokenMaps["tr_stop"] = Config.MustLoadStopWords(stop_words_filter.TurkishStopWords) - Config.Analysis.TokenMaps["ar_stop"] = Config.MustLoadStopWords(stop_words_filter.ArabicStopWords) - Config.Analysis.TokenMaps["hy_stop"] = Config.MustLoadStopWords(stop_words_filter.ArmenianStopWords) - Config.Analysis.TokenMaps["eu_stop"] = Config.MustLoadStopWords(stop_words_filter.BasqueStopWords) - Config.Analysis.TokenMaps["bg_stop"] = Config.MustLoadStopWords(stop_words_filter.BulgarianStopWords) - Config.Analysis.TokenMaps["ca_stop"] = Config.MustLoadStopWords(stop_words_filter.CatalanStopWords) - Config.Analysis.TokenMaps["gl_stop"] = Config.MustLoadStopWords(stop_words_filter.GalicianStopWords) - Config.Analysis.TokenMaps["el_stop"] = Config.MustLoadStopWords(stop_words_filter.GreekStopWords) - Config.Analysis.TokenMaps["hi_stop"] = Config.MustLoadStopWords(stop_words_filter.HindiStopWords) - Config.Analysis.TokenMaps["id_stop"] = Config.MustLoadStopWords(stop_words_filter.IndonesianStopWords) - Config.Analysis.TokenMaps["ga_stop"] = 
Config.MustLoadStopWords(stop_words_filter.IrishStopWords) - Config.Analysis.TokenMaps["fa_stop"] = Config.MustLoadStopWords(stop_words_filter.PersianStopWords) - Config.Analysis.TokenMaps["ckb_stop"] = Config.MustLoadStopWords(stop_words_filter.SoraniStopWords) - Config.Analysis.TokenMaps["th_stop"] = Config.MustLoadStopWords(stop_words_filter.ThaiStopWords) - Config.Analysis.TokenMaps["cs_stop"] = Config.MustLoadStopWords(stop_words_filter.CzechStopWords) - - // register article token maps for elision filters - Config.Analysis.TokenMaps["fr_articles"] = Config.MustLoadStopWords(elision_filter.FrenchArticles) - Config.Analysis.TokenMaps["it_articles"] = Config.MustLoadStopWords(elision_filter.ItalianArticles) - Config.Analysis.TokenMaps["ca_articles"] = Config.MustLoadStopWords(elision_filter.CatalanArticles) - Config.Analysis.TokenMaps["ga_articles"] = Config.MustLoadStopWords(elision_filter.IrishArticles) - - // register char filters - htmlCharFilterRegexp := regexp.MustCompile(`\s]+))?)+\s*|\s*)/?>`) - htmlCharFilter := regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, []byte{' '}) - Config.Analysis.CharFilters["html"] = htmlCharFilter - zeroWidthNonJoinerRegexp := regexp.MustCompile(`\x{200C}`) - zeroWidthNonJoinerCharFilter := regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, []byte{' '}) - Config.Analysis.CharFilters["zero_width_spaces"] = zeroWidthNonJoinerCharFilter - - // register tokenizers - whitespaceTokenizerRegexp := regexp.MustCompile(`\w+`) - Config.Analysis.Tokenizers["single"] = single_token.NewSingleTokenTokenizer() - Config.Analysis.Tokenizers["unicode"] = unicode_word_boundary.NewUnicodeWordBoundaryTokenizer() - Config.Analysis.Tokenizers["unicode_th"] = unicode_word_boundary.NewUnicodeWordBoundaryCustomLocaleTokenizer("th_TH") - Config.Analysis.Tokenizers["whitespace"] = regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp) - - // register token filters - Config.Analysis.TokenFilters["detect_lang"] = 
cld2.NewCld2Filter() - Config.Analysis.TokenFilters["short"] = length_filter.NewLengthFilter(3, -1) - Config.Analysis.TokenFilters["long"] = length_filter.NewLengthFilter(-1, 255) - Config.Analysis.TokenFilters["to_lower"] = lower_case_filter.NewLowerCaseFilter() - Config.Analysis.TokenFilters["apostrophe"] = apostrophe_filter.NewApostropheFilter() - Config.Analysis.TokenFilters["truncate_token"] = truncate_token_filter.NewTruncateTokenFilter(25) - - // register stemmer filters - Config.Analysis.TokenFilters["stemmer_da"] = stemmer_filter.MustNewStemmerFilter("danish") - Config.Analysis.TokenFilters["stemmer_nl"] = stemmer_filter.MustNewStemmerFilter("dutch") - Config.Analysis.TokenFilters["stemmer_en"] = stemmer_filter.MustNewStemmerFilter("english") - Config.Analysis.TokenFilters["stemmer_fi"] = stemmer_filter.MustNewStemmerFilter("finnish") - Config.Analysis.TokenFilters["stemmer_fr"] = stemmer_filter.MustNewStemmerFilter("french") - Config.Analysis.TokenFilters["stemmer_de"] = stemmer_filter.MustNewStemmerFilter("german") - Config.Analysis.TokenFilters["stemmer_hu"] = stemmer_filter.MustNewStemmerFilter("hungarian") - Config.Analysis.TokenFilters["stemmer_it"] = stemmer_filter.MustNewStemmerFilter("italian") - Config.Analysis.TokenFilters["stemmer_no"] = stemmer_filter.MustNewStemmerFilter("norwegian") - Config.Analysis.TokenFilters["stemmer_porter"] = stemmer_filter.MustNewStemmerFilter("porter") - Config.Analysis.TokenFilters["stemmer_pt"] = stemmer_filter.MustNewStemmerFilter("portuguese") - Config.Analysis.TokenFilters["stemmer_ro"] = stemmer_filter.MustNewStemmerFilter("romanian") - Config.Analysis.TokenFilters["stemmer_ru"] = stemmer_filter.MustNewStemmerFilter("russian") - Config.Analysis.TokenFilters["stemmer_es"] = stemmer_filter.MustNewStemmerFilter("spanish") - Config.Analysis.TokenFilters["stemmer_sv"] = stemmer_filter.MustNewStemmerFilter("swedish") - Config.Analysis.TokenFilters["stemmer_tr"] = stemmer_filter.MustNewStemmerFilter("turkish") - 
Config.Analysis.TokenFilters["stemmer_ckb"] = sorani_stemmer_filter.NewSoraniStemmerFilter() - Config.Analysis.TokenFilters["stemmer_hi"] = hindi_stemmer_filter.NewHindiStemmerFilter() - - // register stop token filters - Config.Analysis.TokenFilters["stop_token_da"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["da_stop"]) - Config.Analysis.TokenFilters["stop_token_nl"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["nl_stop"]) - Config.Analysis.TokenFilters["stop_token_en"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["en_stop"]) - Config.Analysis.TokenFilters["stop_token_fi"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["fi_stop"]) - Config.Analysis.TokenFilters["stop_token_fr"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["fr_stop"]) - Config.Analysis.TokenFilters["stop_token_de"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["de_stop"]) - Config.Analysis.TokenFilters["stop_token_hu"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["hu_stop"]) - Config.Analysis.TokenFilters["stop_token_it"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["it_stop"]) - Config.Analysis.TokenFilters["stop_token_no"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["no_stop"]) - Config.Analysis.TokenFilters["stop_token_pt"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["pt_stop"]) - Config.Analysis.TokenFilters["stop_token_ro"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["ro_stop"]) - Config.Analysis.TokenFilters["stop_token_ru"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["ru_stop"]) - Config.Analysis.TokenFilters["stop_token_es"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["es_stop"]) - Config.Analysis.TokenFilters["stop_token_sv"] = stop_words_filter.NewStopWordsFilter( - 
Config.Analysis.TokenMaps["sv_stop"]) - Config.Analysis.TokenFilters["stop_token_tr"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["tr_stop_stop"]) - Config.Analysis.TokenFilters["stop_token_ar"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["ar_stop"]) - Config.Analysis.TokenFilters["stop_token_hy"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["hy_stop"]) - Config.Analysis.TokenFilters["stop_token_eu"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["eu_stop"]) - Config.Analysis.TokenFilters["stop_token_bg"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["bg_stop"]) - Config.Analysis.TokenFilters["stop_token_ca"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["ca_stop"]) - Config.Analysis.TokenFilters["stop_token_gl"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["gl_stop"]) - Config.Analysis.TokenFilters["stop_token_el"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["el_stop"]) - Config.Analysis.TokenFilters["stop_token_hi"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["hi_stop"]) - Config.Analysis.TokenFilters["stop_token_id"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["id_stop"]) - Config.Analysis.TokenFilters["stop_token_ga"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["ga_stop"]) - Config.Analysis.TokenFilters["stop_token_fa"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["fa_stop"]) - Config.Analysis.TokenFilters["stop_token_ckb"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["ckb_stop"]) - Config.Analysis.TokenFilters["stop_token_th"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["th_stop"]) - Config.Analysis.TokenFilters["stop_token_cs"] = stop_words_filter.NewStopWordsFilter( - Config.Analysis.TokenMaps["cs_stop"]) - - // register elision filters - 
Config.Analysis.TokenFilters["elision_fr"] = elision_filter.NewElisionFilter( - Config.Analysis.TokenMaps["fr_articles"]) - Config.Analysis.TokenFilters["elision_it"] = elision_filter.NewElisionFilter( - Config.Analysis.TokenMaps["it_articles"]) - Config.Analysis.TokenFilters["elision_ca"] = elision_filter.NewElisionFilter( - Config.Analysis.TokenMaps["ca_articles"]) - Config.Analysis.TokenFilters["elision_ga"] = elision_filter.NewElisionFilter( - Config.Analysis.TokenMaps["ga_articles"]) - - // register unicode normalizers - Config.Analysis.TokenFilters["normalize_nfc"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFC) - Config.Analysis.TokenFilters["normalize_nfd"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFD) - Config.Analysis.TokenFilters["normalize_nfkc"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC) - Config.Analysis.TokenFilters["normalize_nfkd"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD) - Config.Analysis.TokenFilters["normalize_ckb"] = sorani_normalize.NewSoraniNormalizeFilter() - Config.Analysis.TokenFilters["normalize_fa"] = persian_normalize.NewPersianNormalizeFilter() - Config.Analysis.TokenFilters["normalize_ar"] = arabic_normalize.NewArabicNormalizeFilter() - Config.Analysis.TokenFilters["normalize_de"] = german_normalize.NewGermanNormalizeFilter() - Config.Analysis.TokenFilters["normalize_hi"] = hindi_normalize.NewHindiNormalizeFilter() - - // register analyzers - keywordAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "single", []string{}) - Config.Analysis.Analyzers["keyword"] = keywordAnalyzer - simpleAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "whitespace", []string{"to_lower"}) - Config.Analysis.Analyzers["simple"] = simpleAnalyzer - standardAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "whitespace", []string{"to_lower", "stop_token_en"}) - Config.Analysis.Analyzers["standard"] = standardAnalyzer - detectLangAnalyzer := 
Config.MustBuildNewAnalyzer([]string{}, "single", []string{"to_lower", "detect_lang"}) - Config.Analysis.Analyzers["detect_lang"] = detectLangAnalyzer - - // language specific analyzers - danishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_da", "stemmer_da"}) - Config.Analysis.Analyzers["da"] = danishAnalyzer - dutchAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_nl", "stemmer_nl"}) - Config.Analysis.Analyzers["nl"] = dutchAnalyzer - englishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_en", "stemmer_en"}) - Config.Analysis.Analyzers["en"] = englishAnalyzer - finnishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_fi", "stemmer_fi"}) - Config.Analysis.Analyzers["fi"] = finnishAnalyzer - frenchAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"elision_fr", "to_lower", "stop_token_fr", "stemmer_fr"}) - Config.Analysis.Analyzers["fr"] = frenchAnalyzer - germanAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_de", "normalize_de", "stemmer_de"}) - Config.Analysis.Analyzers["de"] = germanAnalyzer - hungarianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_hu", "stemmer_hu"}) - Config.Analysis.Analyzers["hu"] = hungarianAnalyzer - italianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"elision_it", "to_lower", "stop_token_it", "stemmer_it"}) - Config.Analysis.Analyzers["it"] = italianAnalyzer - norwegianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_no", "stemmer_no"}) - Config.Analysis.Analyzers["no"] = norwegianAnalyzer - portugueseAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_pt", "stemmer_pt"}) - Config.Analysis.Analyzers["pt"] = 
portugueseAnalyzer - romanianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_ro", "stemmer_ro"}) - Config.Analysis.Analyzers["ro"] = romanianAnalyzer - russianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_ru", "stemmer_ru"}) - Config.Analysis.Analyzers["ru"] = russianAnalyzer - spanishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_es", "stemmer_es"}) - Config.Analysis.Analyzers["es"] = spanishAnalyzer - swedishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_sv", "stemmer_sv"}) - Config.Analysis.Analyzers["sv"] = swedishAnalyzer - turkishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"apostrophe", "to_lower", "stop_token_tr", "stemmer_tr"}) - Config.Analysis.Analyzers["tr"] = turkishAnalyzer - thaiAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode_th", []string{"to_lower", "stop_token_th"}) - Config.Analysis.Analyzers["th"] = thaiAnalyzer - soraniAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"normalize_ckb", "to_lower", "stop_token_ckb", "stemmer_ckb"}) - Config.Analysis.Analyzers["ckb"] = soraniAnalyzer - persianAnalyzer := Config.MustBuildNewAnalyzer([]string{"zero_width_spaces"}, "unicode", []string{"to_lower", "normalize_ar", "normalize_fa", "stop_token_fa"}) - Config.Analysis.Analyzers["fa"] = persianAnalyzer - // register ansi highlighter Config.Highlight.Highlighters["ansi"] = search.NewSimpleHighlighter() @@ -360,10 +129,6 @@ func init() { htmlHighlighter.SetFragmentFormatter(htmlFormatter) Config.Highlight.Highlighters["html"] = htmlHighlighter - // set the default analyzer - simpleAnalyzerName := "simple" - Config.DefaultAnalyzer = &simpleAnalyzerName - // set the default highlighter htmlHighlighterName := "html" Config.DefaultHighlighter = &htmlHighlighterName @@ -371,23 +136,6 @@ func init() { // 
default CreateIfMissing to true Config.CreateIfMissing = true - // set up the built-in date time formats - - rfc3339NoTimezone := "2006-01-02T15:04:05" - rfc3339NoTimezoneNoT := "2006-01-02 15:04:05" - rfc3339NoTime := "2006-01-02" - - Config.Analysis.DateTimeParsers["dateTimeOptional"] = flexible_go.NewFlexibleGoDateTimeParser( - []string{ - time.RFC3339Nano, - time.RFC3339, - rfc3339NoTimezone, - rfc3339NoTimezoneNoT, - rfc3339NoTime, - }) - dateTimeOptionalName := "dateTimeOptional" - Config.DefaultDateTimeFormat = &dateTimeOptionalName - - defaultField := "_all" - Config.DefaultField = &defaultField + bootDuration := time.Since(bootStart) + bleveExpVar.Add("bootDuration", int64(bootDuration)) } diff --git a/error.go b/error.go index bbad9448..f607646a 100644 --- a/error.go +++ b/error.go @@ -9,8 +9,8 @@ package bleve const ( - ERROR_NO_ID Error = iota - ERROR_NO_TYPE + ERROR_INDEX_EXISTS Error = iota + ERROR_INDEX_DOES_NOT_EXIST ) type Error int @@ -20,5 +20,6 @@ func (e Error) Error() string { } var errorMessages = map[int]string{ - 0: "unable to determine document id", + int(ERROR_INDEX_EXISTS): "cannot create new index, it already exists", + int(ERROR_INDEX_DOES_NOT_EXIST): "cannot open index, it does not exist", } diff --git a/examples/beer-search/main.go b/examples/beer-search/main.go index 90d514d1..28f0667c 100644 --- a/examples/beer-search/main.go +++ b/examples/beer-search/main.go @@ -9,17 +9,22 @@ package main import ( + _ "expvar" "flag" "io/ioutil" "log" "net/http" + "os" "path/filepath" + "runtime/pprof" "time" "github.com/couchbaselabs/bleve" bleveHttp "github.com/couchbaselabs/bleve/http" ) +var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file") +var memprofile = flag.String("memprofile", "", "write mem profile to file") var batchSize = flag.Int("batchSize", 100, "batch size for indexing") var bindAddr = flag.String("addr", ":8094", "http listen address") var jsonDir = flag.String("jsonDir", "../../samples/beer-sample/", 
"json directory") @@ -31,6 +36,15 @@ func main() { flag.Parse() + // create cpu profile if requested + if *cpuprofile != "" { + f, err := os.Create(*cpuprofile) + if err != nil { + log.Fatal(err) + } + pprof.StartCPUProfile(f) + } + // create a mapping indexMapping := buildIndexMapping() @@ -46,6 +60,18 @@ func main() { if err != nil { log.Fatal(err) } + if *cpuprofile != "" { + pprof.StopCPUProfile() + log.Printf("closing cpu profile") + } + if *memprofile != "" { + f, err := os.Create(*memprofile) + if err != nil { + log.Fatal(err) + } + pprof.WriteHeapProfile(f) + log.Printf("mem profile written") + } }() // create a router to serve static files diff --git a/index/index.go b/index/index.go index 4419059b..1bb6ec91 100644 --- a/index/index.go +++ b/index/index.go @@ -32,6 +32,10 @@ type Index interface { Fields() ([]string, error) + SetInternal(key, val []byte) error + GetInternal(key []byte) ([]byte, error) + DeleteInternal(key []byte) error + Dump() DumpDoc(id string) ([]interface{}, error) DumpFields() diff --git a/index/upside_down/row.go b/index/upside_down/row.go index ed232266..84ebd3d2 100644 --- a/index/upside_down/row.go +++ b/index/upside_down/row.go @@ -38,6 +38,8 @@ func ParseFromKeyValue(key, value []byte) (UpsideDownCouchRow, error) { return NewBackIndexRowKV(key, value) case 's': return NewStoredRowKV(key, value) + case 'i': + return NewInternalRowKV(key, value) } return nil, fmt.Errorf("Unknown field type '%s'", string(key[0])) } @@ -78,6 +80,42 @@ func NewVersionRowKV(key, value []byte) (*VersionRow, error) { return &rv, nil } +// INTERNAL STORAGE + +type InternalRow struct { + key []byte + val []byte +} + +func (i *InternalRow) Key() []byte { + buf := make([]byte, len(i.key)+1) + buf[0] = 'i' + copy(buf[1:], i.key) + return buf +} + +func (i *InternalRow) Value() []byte { + return i.val +} + +func (i *InternalRow) String() string { + return fmt.Sprintf("InternalStore - Key: %s (% x) Val: %s (% x)", i.key, i.key, i.val, i.val) +} + +func 
NewInternalRow(key, val []byte) *InternalRow { + return &InternalRow{ + key: key, + val: val, + } +} + +func NewInternalRowKV(key, value []byte) (*InternalRow, error) { + rv := InternalRow{} + rv.key = key[1:] + rv.val = value + return &rv, nil +} + // FIELD definition type FieldRow struct { diff --git a/index/upside_down/row_test.go b/index/upside_down/row_test.go index 40f09555..589c1aeb 100644 --- a/index/upside_down/row_test.go +++ b/index/upside_down/row_test.go @@ -74,6 +74,11 @@ func TestRows(t *testing.T) { []byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', BYTE_SEPARATOR, 0, 0}, []byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'}, }, + { + NewInternalRow([]byte("mapping"), []byte(`{"mapping":"json content"}`)), + []byte{'i', 'm', 'a', 'p', 'p', 'i', 'n', 'g'}, + []byte{'{', '"', 'm', 'a', 'p', 'p', 'i', 'n', 'g', '"', ':', '"', 'j', 's', 'o', 'n', ' ', 'c', 'o', 'n', 't', 'e', 'n', 't', '"', '}'}, + }, } // test going from struct to k/v bytes diff --git a/index/upside_down/upside_down.go b/index/upside_down/upside_down.go index 33e64c5c..6ab7bd07 100644 --- a/index/upside_down/upside_down.go +++ b/index/upside_down/upside_down.go @@ -786,3 +786,24 @@ func (udc *UpsideDownCouch) Batch(batch index.Batch) error { } return err } + +func (udc *UpsideDownCouch) SetInternal(key, val []byte) error { + internalRow := NewInternalRow(key, val) + return udc.store.Set(internalRow.Key(), internalRow.Value()) +} + +func (udc *UpsideDownCouch) GetInternal(key []byte) ([]byte, error) { + internalRow, err := NewInternalRowKV(key, nil) + if err != nil { + return nil, err + } + return udc.store.Get(internalRow.Key()) +} + +func (udc *UpsideDownCouch) DeleteInternal(key []byte) error { + internalRow, err := NewInternalRowKV(key, nil) + if err != nil { + return err + } + return udc.store.Delete(internalRow.Key()) +} diff --git a/index_impl.go b/index_impl.go index b4b00e10..1e2b3520 100644 --- a/index_impl.go +++ b/index_impl.go @@ 
-27,6 +27,12 @@ type indexImpl struct { } func newIndex(path string, mapping *IndexMapping) (*indexImpl, error) { + // start by validating the index mapping + err := mapping.Validate() + if err != nil { + return nil, err + } + store, err := leveldb.Open(path, Config.CreateIfMissing) if err != nil { return nil, err @@ -109,7 +115,9 @@ func (i *indexImpl) Search(req *SearchRequest) (*SearchResult, error) { } else if facetRequest.DateTimeRanges != nil { // build date range facet facetBuilder := search.NewDateTimeFacetBuilder(facetRequest.Field, facetRequest.Size) + dateTimeParser := i.m.DateTimeParserNamed(i.m.DefaultDateTimeParser) for _, dr := range facetRequest.DateTimeRanges { + dr.ParseDates(dateTimeParser) facetBuilder.AddRange(dr.Name, dr.Start, dr.End) } facetsBuilder.Add(facetName, facetBuilder) diff --git a/mapping_document.go b/mapping_document.go index 280930ed..3dec3f71 100644 --- a/mapping_document.go +++ b/mapping_document.go @@ -12,19 +12,57 @@ import ( "encoding/json" "fmt" - "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/registry" ) type DocumentMapping struct { - Enabled *bool `json:"enabled"` - Dynamic *bool `json:"dynamic"` + Enabled bool `json:"enabled"` + Dynamic bool `json:"dynamic"` Properties map[string]*DocumentMapping `json:"properties"` Fields []*FieldMapping `json:"fields"` - DefaultAnalyzer *string `json:"default_analyzer"` + DefaultAnalyzer string `json:"default_analyzer"` +} + +func (dm *DocumentMapping) Validate(cache *registry.Cache) error { + var err error + if dm.DefaultAnalyzer != "" { + _, err := cache.AnalyzerNamed(dm.DefaultAnalyzer) + if err != nil { + return err + } + } + for _, property := range dm.Properties { + err = property.Validate(cache) + if err != nil { + return err + } + } + for _, field := range dm.Fields { + if field.Analyzer != nil { + _, err = cache.AnalyzerNamed(*field.Analyzer) + if err != nil { + return err + } + } + if field.DateFormat != nil { + _, err = 
cache.DateTimeParserNamed(*field.DateFormat) + if err != nil { + return err + } + } + if field.Type != nil { + switch *field.Type { + case "text", "datetime", "number": + default: + return fmt.Errorf("unknown field type: '%s'", *field.Type) + } + } + } + return nil } func (dm *DocumentMapping) GoString() string { - return fmt.Sprintf(" &bleve.DocumentMapping{Enabled:%t, Dynamic:%t, Properties:%#v, Fields:%#v}", *dm.Enabled, *dm.Dynamic, dm.Properties, dm.Fields) + return fmt.Sprintf(" &bleve.DocumentMapping{Enabled:%t, Dynamic:%t, Properties:%#v, Fields:%#v}", dm.Enabled, dm.Dynamic, dm.Properties, dm.Fields) } func (dm *DocumentMapping) DocumentMappingForPath(path string) *DocumentMapping { @@ -42,22 +80,19 @@ func (dm *DocumentMapping) DocumentMappingForPath(path string) *DocumentMapping func NewDocumentMapping() *DocumentMapping { return &DocumentMapping{ - Enabled: &tRUE, - Dynamic: &tRUE, + Enabled: true, + Dynamic: true, } } func NewDocumentStaticMapping() *DocumentMapping { return &DocumentMapping{ - Enabled: &tRUE, - Dynamic: &fALSE, + Enabled: true, } } func NewDocumentDisabledMapping() *DocumentMapping { - return &DocumentMapping{ - Enabled: &fALSE, - } + return &DocumentMapping{} } func (dm *DocumentMapping) AddSubDocumentMapping(property string, sdm *DocumentMapping) *DocumentMapping { @@ -82,23 +117,25 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error { Dynamic *bool `json:"dynamic"` Properties map[string]*DocumentMapping `json:"properties"` Fields []*FieldMapping `json:"fields"` - DefaultAnalyzer *string `json:"default_analyzer"` + DefaultAnalyzer string `json:"default_analyzer"` } err := json.Unmarshal(data, &tmp) if err != nil { return err } - dm.Enabled = &tRUE + + dm.Enabled = true if tmp.Enabled != nil { - dm.Enabled = tmp.Enabled + dm.Enabled = *tmp.Enabled } - dm.Dynamic = &tRUE + + dm.Dynamic = true if tmp.Dynamic != nil { - dm.Dynamic = tmp.Dynamic - } - if tmp.DefaultAnalyzer != nil { - dm.DefaultAnalyzer = tmp.DefaultAnalyzer + 
dm.Dynamic = *tmp.Dynamic } + + dm.DefaultAnalyzer = tmp.DefaultAnalyzer + if tmp.Properties != nil { dm.Properties = make(map[string]*DocumentMapping, len(tmp.Properties)) } @@ -114,8 +151,8 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error { return nil } -func (dm *DocumentMapping) defaultAnalyzer(path []string) *analysis.Analyzer { - var rv *analysis.Analyzer +func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { + rv := "" current := dm for _, pathElement := range path { var ok bool @@ -123,8 +160,8 @@ func (dm *DocumentMapping) defaultAnalyzer(path []string) *analysis.Analyzer { if !ok { break } - if current.DefaultAnalyzer != nil { - rv = Config.Analysis.Analyzers[*current.DefaultAnalyzer] + if current.DefaultAnalyzer != "" { + rv = current.DefaultAnalyzer } } return rv diff --git a/mapping_index.go b/mapping_index.go index a47723c1..ff5796cc 100644 --- a/mapping_index.go +++ b/mapping_index.go @@ -11,67 +11,114 @@ package bleve import ( "encoding/json" "fmt" + "log" "reflect" "time" "github.com/couchbaselabs/bleve/analysis" "github.com/couchbaselabs/bleve/document" + "github.com/couchbaselabs/bleve/registry" ) var tRUE = true var fALSE = false -var DEFAULT_ID_FIELD = "_id" -var DEFAULT_TYPE_FIELD = "_type" -var DEFAULT_TYPE = "_default" -var DEFAULT_FIELD = "_all" -var DEFAULT_TOP_LEVEL_BYTE_ARRAY_CONVERTER = "json" +const DefaultTypeField = "_type" +const DefaultType = "_default" +const DefaultField = "_all" +const DefaultAnalyzer = "standard" +const DefaultDateTimeParser = "dateTimeOptional" +const DefaultByteArrayConverter = "json" type IndexMapping struct { - TypeMapping map[string]*DocumentMapping `json:"types"` - DefaultMapping *DocumentMapping `json:"default_mapping"` - IdField *string `json:"id_field"` - TypeField *string `json:"type_field"` - DefaultType *string `json:"default_type"` - DefaultAnalyzer *string `json:"default_analyzer"` - DefaultField *string `json:"default_field"` - ByteArrayConverter *string 
`json:"byte_array_converter"` + TypeMapping map[string]*DocumentMapping `json:"types"` + DefaultMapping *DocumentMapping `json:"default_mapping"` + TypeField string `json:"type_field"` + DefaultType string `json:"default_type"` + DefaultAnalyzer string `json:"default_analyzer"` + DefaultDateTimeParser string `json:"default_datetime_parser"` + DefaultField string `json:"default_field"` + ByteArrayConverter string `json:"byte_array_converter"` + cache *registry.Cache `json:"_"` } func (im *IndexMapping) GoString() string { - return fmt.Sprintf("&bleve.IndexMapping{TypeMapping:%#v, TypeField:%s, DefaultType:%s}", im.TypeMapping, *im.TypeField, *im.DefaultType) + return fmt.Sprintf("&bleve.IndexMapping{TypeMapping:%#v, TypeField:%s, DefaultType:%s}", im.TypeMapping, im.TypeField, im.DefaultType) } func NewIndexMapping() *IndexMapping { return &IndexMapping{ - TypeMapping: make(map[string]*DocumentMapping), - DefaultMapping: NewDocumentMapping(), - IdField: &DEFAULT_ID_FIELD, - TypeField: &DEFAULT_TYPE_FIELD, - DefaultType: &DEFAULT_TYPE, - DefaultField: &DEFAULT_FIELD, - ByteArrayConverter: &DEFAULT_TOP_LEVEL_BYTE_ARRAY_CONVERTER, + TypeMapping: make(map[string]*DocumentMapping), + DefaultMapping: NewDocumentMapping(), + TypeField: DefaultTypeField, + DefaultType: DefaultType, + DefaultAnalyzer: DefaultAnalyzer, + DefaultDateTimeParser: DefaultDateTimeParser, + DefaultField: DefaultField, + ByteArrayConverter: DefaultByteArrayConverter, + cache: registry.NewCache(), } } +// Validate will walk the entire structure ensuring the following +// explicitly named and default analyzers can be built +// explicitly named and default date parsers can be built +// field type names are valid +func (im *IndexMapping) Validate() error { + _, err := im.cache.AnalyzerNamed(im.DefaultAnalyzer) + if err != nil { + return err + } + _, err = im.cache.DateTimeParserNamed(im.DefaultDateTimeParser) + if err != nil { + return err + } + err = im.DefaultMapping.Validate(im.cache) + if err != nil 
{ + return err + } + for _, docMapping := range im.TypeMapping { + err = docMapping.Validate(im.cache) + if err != nil { + return err + } + } + return nil +} + func (im *IndexMapping) AddDocumentMapping(doctype string, dm *DocumentMapping) *IndexMapping { im.TypeMapping[doctype] = dm return im } +func (im *IndexMapping) SetDefaultMapping(defaultMapping *DocumentMapping) *IndexMapping { + im.DefaultMapping = defaultMapping + return im +} + func (im *IndexMapping) SetTypeField(typeField string) *IndexMapping { - im.TypeField = &typeField + im.TypeField = typeField + return im +} + +func (im *IndexMapping) SetDefaultType(defaultType string) *IndexMapping { + im.DefaultType = defaultType return im } func (im *IndexMapping) SetDefaultAnalyzer(analyzer string) *IndexMapping { - im.DefaultAnalyzer = &analyzer + im.DefaultAnalyzer = analyzer return im } func (im *IndexMapping) SetDefaultField(field string) *IndexMapping { - im.DefaultField = &field + im.DefaultField = field + return im +} + +func (im *IndexMapping) SetByteArrayConverter(byteArrayConverter string) *IndexMapping { + im.ByteArrayConverter = byteArrayConverter return im } @@ -85,53 +132,57 @@ func (im *IndexMapping) MappingForType(docType string) *DocumentMapping { func (im *IndexMapping) UnmarshalJSON(data []byte) error { var tmp struct { - TypeMapping map[string]*DocumentMapping `json:"types"` - DefaultMapping *DocumentMapping `json:"default_mapping"` - IdField *string `json:"id_field"` - TypeField *string `json:"type_field"` - DefaultType *string `json:"default_type"` - DefaultAnalyzer *string `json:"default_analyzer"` - DefaultField *string `json:"default_field"` - ByteArrayConverter *string `json:"byte_array_converter"` + TypeMapping map[string]*DocumentMapping `json:"types"` + DefaultMapping *DocumentMapping `json:"default_mapping"` + TypeField string `json:"type_field"` + DefaultType string `json:"default_type"` + DefaultAnalyzer string `json:"default_analyzer"` + DefaultDateTimeParser string 
`json:"default_datetime_parser"` + DefaultField string `json:"default_field"` + ByteArrayConverter string `json:"byte_array_converter"` } err := json.Unmarshal(data, &tmp) if err != nil { return err } - im.IdField = &DEFAULT_ID_FIELD - if tmp.IdField != nil { - im.IdField = tmp.IdField - } + im.cache = registry.NewCache() - im.TypeField = &DEFAULT_TYPE_FIELD - if tmp.TypeField != nil { + im.TypeField = DefaultTypeField + if tmp.TypeField != "" { im.TypeField = tmp.TypeField } - im.DefaultType = &DEFAULT_TYPE - if tmp.DefaultType != nil { + im.DefaultType = DefaultType + if tmp.DefaultType != "" { im.DefaultType = tmp.DefaultType } + im.DefaultAnalyzer = DefaultAnalyzer + if tmp.DefaultAnalyzer != "" { + im.DefaultAnalyzer = tmp.DefaultAnalyzer + } + + im.DefaultDateTimeParser = DefaultDateTimeParser + if tmp.DefaultDateTimeParser != "" { + im.DefaultDateTimeParser = tmp.DefaultDateTimeParser + } + + im.DefaultField = DefaultField + if tmp.DefaultField != "" { + im.DefaultField = tmp.DefaultField + } + + im.ByteArrayConverter = DefaultByteArrayConverter + if tmp.ByteArrayConverter != "" { + im.ByteArrayConverter = tmp.ByteArrayConverter + } + im.DefaultMapping = NewDocumentMapping() if tmp.DefaultMapping != nil { im.DefaultMapping = tmp.DefaultMapping } - if tmp.DefaultAnalyzer != nil { - im.DefaultAnalyzer = tmp.DefaultAnalyzer - } - - im.DefaultField = &DEFAULT_FIELD - if tmp.DefaultField != nil { - im.DefaultField = tmp.DefaultField - } - im.ByteArrayConverter = &DEFAULT_TOP_LEVEL_BYTE_ARRAY_CONVERTER - if tmp.ByteArrayConverter != nil { - im.ByteArrayConverter = tmp.ByteArrayConverter - } - im.TypeMapping = make(map[string]*DocumentMapping, len(tmp.TypeMapping)) for typeName, typeDocMapping := range tmp.TypeMapping { im.TypeMapping[typeName] = typeDocMapping @@ -139,34 +190,27 @@ func (im *IndexMapping) UnmarshalJSON(data []byte) error { return nil } -func (im *IndexMapping) determineType(data interface{}) (string, bool) { +func (im *IndexMapping) 
determineType(data interface{}) string { // first see if the object implements Identifier classifier, ok := data.(Classifier) if ok { - return classifier.Type(), true + return classifier.Type() } // now see if we can find type using the mapping - if im.TypeField != nil { - typ, ok := mustString(lookupPropertyPath(data, *im.TypeField)) - if ok { - return typ, true - } + typ, ok := mustString(lookupPropertyPath(data, im.TypeField)) + if ok { + return typ } - // fall back to default type if there was one - if im.DefaultType != nil { - return *im.DefaultType, true - } - - return "", false + return im.DefaultType } func (im *IndexMapping) MapDocument(doc *document.Document, data interface{}) error { // see if the top level object is a byte array, and possibly run through conveter byteArrayData, ok := data.([]byte) - if ok && im.ByteArrayConverter != nil { - byteArrayConverter, valid := Config.ByteArrayConverters[*im.ByteArrayConverter] + if ok { + byteArrayConverter, valid := Config.ByteArrayConverters[im.ByteArrayConverter] if valid { convertedData, err := byteArrayConverter.Convert(byteArrayData) if err != nil { @@ -176,17 +220,14 @@ func (im *IndexMapping) MapDocument(doc *document.Document, data interface{}) er } } - docType, ok := im.determineType(data) - if !ok { - return ERROR_NO_TYPE - } + docType := im.determineType(data) docMapping := im.MappingForType(docType) walkContext := newWalkContext(doc, docMapping) im.walkDocument(data, []string{}, walkContext) // see if the _all field was disabled allMapping := docMapping.DocumentMappingForPath("_all") - if allMapping == nil || (allMapping.Enabled != nil && *allMapping.Enabled != false) { + if allMapping == nil || (allMapping.Enabled != false) { field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, walkContext.excludedFromAll, document.INDEX_FIELD|document.INCLUDE_TERM_VECTORS) doc.AddField(field) } @@ -259,7 +300,7 @@ func (im *IndexMapping) processProperty(property interface{}, path 
[]string, con subDocMapping := context.dm.DocumentMappingForPath(pathString) // check tos see if we even need to do further processing - if subDocMapping != nil && subDocMapping.Enabled != nil && !*subDocMapping.Enabled { + if subDocMapping != nil && !subDocMapping.Enabled { return } @@ -272,28 +313,27 @@ func (im *IndexMapping) processProperty(property interface{}, path []string, con // index by explicit mapping for _, fieldMapping := range subDocMapping.Fields { fieldName := getFieldName(pathString, path, fieldMapping) + options := fieldMapping.Options() if *fieldMapping.Type == "text" { - options := fieldMapping.Options() - analyzer := Config.Analysis.Analyzers[*fieldMapping.Analyzer] - if analyzer != nil { - field := document.NewTextFieldCustom(fieldName, []byte(propertyValueString), options, analyzer) - context.doc.AddField(field) + analyzer := im.AnalyzerNamed(*fieldMapping.Analyzer) + field := document.NewTextFieldCustom(fieldName, []byte(propertyValueString), options, analyzer) + context.doc.AddField(field) - if fieldMapping.IncludeInAll != nil && !*fieldMapping.IncludeInAll { - context.excludedFromAll = append(context.excludedFromAll, fieldName) - } + if fieldMapping.IncludeInAll != nil && !*fieldMapping.IncludeInAll { + context.excludedFromAll = append(context.excludedFromAll, fieldName) } } else if *fieldMapping.Type == "datetime" { - options := fieldMapping.Options() - dateTimeFormat := *Config.DefaultDateTimeFormat + dateTimeFormat := im.DefaultDateTimeParser if fieldMapping.DateFormat != nil { dateTimeFormat = *fieldMapping.DateFormat } - dateTimeParser := Config.Analysis.DateTimeParsers[dateTimeFormat] - parsedDateTime, err := dateTimeParser.ParseDateTime(propertyValueString) - if err != nil { - field := document.NewDateTimeFieldWithIndexingOptions(fieldName, parsedDateTime, options) - context.doc.AddField(field) + dateTimeParser := im.DateTimeParserNamed(dateTimeFormat) + if dateTimeParser != nil { + parsedDateTime, err := 
dateTimeParser.ParseDateTime(propertyValueString) + if err != nil { + field := document.NewDateTimeFieldWithIndexingOptions(fieldName, parsedDateTime, options) + context.doc.AddField(field) + } } } } @@ -301,19 +341,24 @@ func (im *IndexMapping) processProperty(property interface{}, path []string, con // automatic indexing behavior // first see if it can be parsed by the default date parser - // FIXME add support for index mapping overriding defaults - dateTimeParser := Config.Analysis.DateTimeParsers[*Config.DefaultDateTimeFormat] - parsedDateTime, err := dateTimeParser.ParseDateTime(propertyValueString) - if err != nil { - // index as plain text - options := document.STORE_FIELD | document.INDEX_FIELD | document.INCLUDE_TERM_VECTORS - analyzer := im.defaultAnalyzer(context.dm, path) - field := document.NewTextFieldCustom(pathString, []byte(propertyValueString), options, analyzer) - context.doc.AddField(field) - } else { - // index as datetime - field := document.NewDateTimeField(pathString, parsedDateTime) - context.doc.AddField(field) + dateTimeParser := im.DateTimeParserNamed(im.DefaultDateTimeParser) + if dateTimeParser != nil { + parsedDateTime, err := dateTimeParser.ParseDateTime(propertyValueString) + if err != nil { + // index as plain text + options := document.STORE_FIELD | document.INDEX_FIELD | document.INCLUDE_TERM_VECTORS + analyzerName := context.dm.defaultAnalyzerName(path) + if analyzerName == "" { + analyzerName = im.DefaultAnalyzer + } + analyzer := im.AnalyzerNamed(analyzerName) + field := document.NewTextFieldCustom(pathString, []byte(propertyValueString), options, analyzer) + context.doc.AddField(field) + } else { + // index as datetime + field := document.NewDateTimeField(pathString, parsedDateTime) + context.doc.AddField(field) + } } } case reflect.Float64: @@ -361,25 +406,12 @@ func (im *IndexMapping) processProperty(property interface{}, path []string, con } } -func (im *IndexMapping) defaultAnalyzer(dm *DocumentMapping, path []string) 
*analysis.Analyzer { - // first see if the document mapping has an analyzer - rv := dm.defaultAnalyzer(path) - if rv == nil { - if im.DefaultAnalyzer != nil { - rv = Config.Analysis.Analyzers[*im.DefaultAnalyzer] - } else if Config.DefaultAnalyzer != nil { - rv = Config.Analysis.Analyzers[*Config.DefaultAnalyzer] - } - } - return rv -} - // attempts to find the best analyzer to use with only a field name // will walk all the document types, look for field mappings at the // provided path, if one exists and it has an explicit analyzer // that is returned // nil should be an acceptable return value meaning we don't know -func (im *IndexMapping) analyzerForPath(path string) *analysis.Analyzer { +func (im *IndexMapping) analyzerNameForPath(path string) string { // first we look for explicit mapping on the field for _, docMapping := range im.TypeMapping { @@ -387,25 +419,43 @@ func (im *IndexMapping) analyzerForPath(path string) *analysis.Analyzer { if pathMapping != nil { if len(pathMapping.Fields) > 0 { if pathMapping.Fields[0].Analyzer != nil { - return Config.Analysis.Analyzers[*pathMapping.Fields[0].Analyzer] + return *pathMapping.Fields[0].Analyzer } } } } // next we will try default analyzers for the path + pathDecoded := decodePath(path) for _, docMapping := range im.TypeMapping { - rv := im.defaultAnalyzer(docMapping, decodePath(path)) - if rv != nil { + rv := docMapping.defaultAnalyzerName(pathDecoded) + if rv != "" { return rv } } - // finally just return the system-wide default analyzer - return Config.Analysis.Analyzers[*Config.DefaultAnalyzer] + return im.DefaultAnalyzer } -func (im *IndexMapping) datetimeParserForPath(path string) analysis.DateTimeParser { +func (im *IndexMapping) AnalyzerNamed(name string) *analysis.Analyzer { + analyzer, err := im.cache.AnalyzerNamed(name) + if err != nil { + log.Printf("error using analyzer named: %s", name) + return nil + } + return analyzer +} + +func (im *IndexMapping) DateTimeParserNamed(name string) 
analysis.DateTimeParser { + dateTimeParser, err := im.cache.DateTimeParserNamed(name) + if err != nil { + log.Printf("error using datetime parser named: %s", name) + return nil + } + return dateTimeParser +} + +func (im *IndexMapping) datetimeParserNameForPath(path string) string { // first we look for explicit mapping on the field for _, docMapping := range im.TypeMapping { @@ -413,26 +463,13 @@ func (im *IndexMapping) datetimeParserForPath(path string) analysis.DateTimePars if pathMapping != nil { if len(pathMapping.Fields) > 0 { if pathMapping.Fields[0].Analyzer != nil { - return Config.Analysis.DateTimeParsers[*pathMapping.Fields[0].DateFormat] + return *pathMapping.Fields[0].Analyzer } } } } - // next we will try default analyzers for the path - // FIXME introduce default date time parsers at mapping leves - - // finally just return the system-wide default analyzer - return Config.Analysis.DateTimeParsers[*Config.DefaultDateTimeFormat] -} - -func (im *IndexMapping) defaultField() string { - if im.DefaultField != nil { - return *im.DefaultField - } else if Config.DefaultField != nil { - return *Config.DefaultField - } - return "" + return im.DefaultDateTimeParser } func getFieldName(pathString string, path []string, fieldMapping *FieldMapping) string { diff --git a/query_date_range.go b/query_date_range.go index ede9c499..2fd4683b 100644 --- a/query_date_range.go +++ b/query_date_range.go @@ -12,7 +12,6 @@ import ( "fmt" "math" - "github.com/couchbaselabs/bleve/analysis" "github.com/couchbaselabs/bleve/numeric_util" "github.com/couchbaselabs/bleve/search" ) @@ -53,19 +52,20 @@ func (q *DateRangeQuery) SetField(f string) *DateRangeQuery { func (q *DateRangeQuery) Searcher(i *indexImpl, explain bool) (search.Searcher, error) { - var dateTimeParser analysis.DateTimeParser + dateTimeParserName := "" if q.DateTimeParser != nil { - dateTimeParser = Config.Analysis.DateTimeParsers[*q.DateTimeParser] + dateTimeParserName = *q.DateTimeParser } else { - dateTimeParser = 
i.m.datetimeParserForPath(q.FieldVal) + dateTimeParserName = i.m.datetimeParserNameForPath(q.FieldVal) } + dateTimeParser := i.m.DateTimeParserNamed(dateTimeParserName) if dateTimeParser == nil { return nil, fmt.Errorf("no datetime parser named '%s' registered", *q.DateTimeParser) } field := q.FieldVal if q.FieldVal == "" { - field = i.m.defaultField() + field = i.m.DefaultField } // now parse the endpoints diff --git a/query_match.go b/query_match.go index 1852d091..742e0f0b 100644 --- a/query_match.go +++ b/query_match.go @@ -11,7 +11,6 @@ package bleve import ( "fmt" - "github.com/couchbaselabs/bleve/analysis" "github.com/couchbaselabs/bleve/search" ) @@ -49,19 +48,21 @@ func (q *MatchQuery) SetField(f string) *MatchQuery { func (q *MatchQuery) Searcher(i *indexImpl, explain bool) (search.Searcher, error) { - var analyzer *analysis.Analyzer + analyzerName := "" if q.Analyzer != "" { - analyzer = Config.Analysis.Analyzers[q.Analyzer] + analyzerName = q.Analyzer } else { - analyzer = i.m.analyzerForPath(q.FieldVal) + analyzerName = i.m.analyzerNameForPath(q.FieldVal) } + analyzer := i.m.AnalyzerNamed(analyzerName) + if analyzer == nil { return nil, fmt.Errorf("no analyzer named '%s' registered", q.Analyzer) } field := q.FieldVal if q.FieldVal == "" { - field = i.m.defaultField() + field = i.m.DefaultField } tokens := analyzer.Analyze([]byte(q.Match)) diff --git a/query_match_phrase.go b/query_match_phrase.go index 409ed49a..4647f877 100644 --- a/query_match_phrase.go +++ b/query_match_phrase.go @@ -11,7 +11,6 @@ package bleve import ( "fmt" - "github.com/couchbaselabs/bleve/analysis" "github.com/couchbaselabs/bleve/search" ) @@ -49,19 +48,20 @@ func (q *MatchPhraseQuery) SetField(f string) *MatchPhraseQuery { func (q *MatchPhraseQuery) Searcher(i *indexImpl, explain bool) (search.Searcher, error) { - var analyzer *analysis.Analyzer + analyzerName := "" if q.Analyzer != "" { - analyzer = Config.Analysis.Analyzers[q.Analyzer] + analyzerName = q.Analyzer } else { - 
analyzer = i.m.analyzerForPath(q.FieldVal) + analyzerName = i.m.analyzerNameForPath(q.FieldVal) } + analyzer := i.m.AnalyzerNamed(analyzerName) if analyzer == nil { return nil, fmt.Errorf("no analyzer named '%s' registered", q.Analyzer) } field := q.FieldVal if q.FieldVal == "" { - field = i.m.defaultField() + field = i.m.DefaultField } tokens := analyzer.Analyze([]byte(q.MatchPhrase)) diff --git a/query_numeric_range.go b/query_numeric_range.go index 980cc3cc..98507245 100644 --- a/query_numeric_range.go +++ b/query_numeric_range.go @@ -50,7 +50,7 @@ func (q *NumericRangeQuery) SetField(f string) *NumericRangeQuery { func (q *NumericRangeQuery) Searcher(i *indexImpl, explain bool) (search.Searcher, error) { field := q.FieldVal if q.FieldVal == "" { - field = i.m.defaultField() + field = i.m.DefaultField } return search.NewNumericRangeSearcher(i.i, q.Min, q.Max, field, q.BoostVal, explain) } diff --git a/query_prefix.go b/query_prefix.go index 973260be..e2023238 100644 --- a/query_prefix.go +++ b/query_prefix.go @@ -46,7 +46,7 @@ func (q *PrefixQuery) SetField(f string) *PrefixQuery { func (q *PrefixQuery) Searcher(i *indexImpl, explain bool) (search.Searcher, error) { field := q.FieldVal if q.FieldVal == "" { - field = i.m.defaultField() + field = i.m.DefaultField } return search.NewTermPrefixSearcher(i.i, q.Prefix, field, q.BoostVal, explain) } diff --git a/query_term.go b/query_term.go index f3ee99c9..9855c240 100644 --- a/query_term.go +++ b/query_term.go @@ -46,7 +46,7 @@ func (q *TermQuery) SetField(f string) *TermQuery { func (q *TermQuery) Searcher(i *indexImpl, explain bool) (search.Searcher, error) { field := q.FieldVal if q.FieldVal == "" { - field = i.m.defaultField() + field = i.m.DefaultField } return search.NewTermSearcher(i.i, q.Term, field, q.BoostVal, explain) } diff --git a/registry/analyzer.go b/registry/analyzer.go new file mode 100644 index 00000000..e5d72036 --- /dev/null +++ b/registry/analyzer.go @@ -0,0 +1,61 @@ +// Copyright (c) 2014 
Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package registry + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" +) + +func RegisterAnalyzer(name string, constructor AnalyzerConstructor) { + _, exists := analyzers[name] + if exists { + panic(fmt.Errorf("attempted to register duplicate analyzer named '%s'", name)) + } + analyzers[name] = constructor +} + +type AnalyzerConstructor func(config map[string]interface{}, cache *Cache) (*analysis.Analyzer, error) +type AnalyzerRegistry map[string]AnalyzerConstructor +type AnalyzerCache map[string]*analysis.Analyzer + +func (c AnalyzerCache) AnalyzerNamed(name string, cache *Cache) (*analysis.Analyzer, error) { + analyzer, cached := c[name] + if cached { + return analyzer, nil + } + analyzerConstructor, registered := analyzers[name] + if !registered { + return nil, fmt.Errorf("no analyzer with name or type '%s' registered", name) + } + analyzer, err := analyzerConstructor(nil, cache) + if err != nil { + return nil, fmt.Errorf("error building analyzer: %v", err) + } + c[name] = analyzer + return analyzer, nil +} + +func (c AnalyzerCache) DefineAnalyzer(name string, typ string, config map[string]interface{}, cache *Cache) (*analysis.Analyzer, error) { + _, cached := c[name] + if cached { + return nil, fmt.Errorf("analyzer named '%s' already defined", name) + } + analyzerConstructor, registered := analyzers[typ] + if !registered { + return nil, fmt.Errorf("no analyzer type '%s' registered", typ) + } + 
analyzer, err := analyzerConstructor(config, cache) + if err != nil { + return nil, fmt.Errorf("error building analyzer: %v", err) + } + c[name] = analyzer + return analyzer, nil +} diff --git a/registry/char_filter.go b/registry/char_filter.go new file mode 100644 index 00000000..537dbd10 --- /dev/null +++ b/registry/char_filter.go @@ -0,0 +1,61 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package registry + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" +) + +func RegisterCharFilter(name string, constructor CharFilterConstructor) { + _, exists := charFilters[name] + if exists { + panic(fmt.Errorf("attempted to register duplicate char filter named '%s'", name)) + } + charFilters[name] = constructor +} + +type CharFilterConstructor func(config map[string]interface{}, cache *Cache) (analysis.CharFilter, error) +type CharFilterRegistry map[string]CharFilterConstructor +type CharFilterCache map[string]analysis.CharFilter + +func (c CharFilterCache) CharFilterNamed(name string, cache *Cache) (analysis.CharFilter, error) { + charFilter, cached := c[name] + if cached { + return charFilter, nil + } + charFilterConstructor, registered := charFilters[name] + if !registered { + return nil, fmt.Errorf("no char filter with name or type '%s' registered", name) + } + charFilter, err := charFilterConstructor(nil, cache) + if err != nil { + return nil, fmt.Errorf("error building char filter: %v", err) + } + c[name] = charFilter + 
return charFilter, nil +} + +func (c CharFilterCache) DefineCharFilter(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.CharFilter, error) { + _, cached := c[name] + if cached { + return nil, fmt.Errorf("char filter named '%s' already defined", name) + } + charFilterConstructor, registered := charFilters[typ] + if !registered { + return nil, fmt.Errorf("no char filter type '%s' registered", typ) + } + charFilter, err := charFilterConstructor(config, cache) + if err != nil { + return nil, fmt.Errorf("error building char filter: %v", err) + } + c[name] = charFilter + return charFilter, nil +} diff --git a/registry/datetime_parser.go b/registry/datetime_parser.go new file mode 100644 index 00000000..ea0bd96d --- /dev/null +++ b/registry/datetime_parser.go @@ -0,0 +1,61 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package registry + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" +) + +func RegisterDateTimeParser(name string, constructor DateTimeParserConstructor) { + _, exists := dateTimeParsers[name] + if exists { + panic(fmt.Errorf("attempted to register duplicate date time parser named '%s'", name)) + } + dateTimeParsers[name] = constructor +} + +type DateTimeParserConstructor func(config map[string]interface{}, cache *Cache) (analysis.DateTimeParser, error) +type DateTimeParserRegistry map[string]DateTimeParserConstructor +type DateTimeParserCache map[string]analysis.DateTimeParser + +func (c DateTimeParserCache) DateTimeParserNamed(name string, cache *Cache) (analysis.DateTimeParser, error) { + dateTimeParser, cached := c[name] + if cached { + return dateTimeParser, nil + } + dateTimeParserConstructor, registered := dateTimeParsers[name] + if !registered { + return nil, fmt.Errorf("no date time parser with name or type '%s' registered", name) + } + dateTimeParser, err := dateTimeParserConstructor(nil, cache) + if err != nil { + return nil, fmt.Errorf("error building date time parse: %v", err) + } + c[name] = dateTimeParser + return dateTimeParser, nil +} + +func (c DateTimeParserCache) DefineDateTimeParser(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.DateTimeParser, error) { + _, cached := c[name] + if cached { + return nil, fmt.Errorf("date time parser named '%s' already defined", name) + } + dateTimeParserConstructor, registered := dateTimeParsers[typ] + if !registered { + return nil, fmt.Errorf("no date time parser type '%s' registered", typ) + } + dateTimeParser, err := dateTimeParserConstructor(config, cache) + if err != nil { + return nil, fmt.Errorf("error building date time parser: %v", err) + } + c[name] = dateTimeParser + return dateTimeParser, nil +} diff --git a/registry/registry.go b/registry/registry.go new file mode 100644 index 00000000..a33197ef --- /dev/null +++ b/registry/registry.go @@ -0,0 +1,159 
@@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package registry + +import ( + "fmt" + "sort" + + "github.com/couchbaselabs/bleve/analysis" +) + +var charFilters = make(CharFilterRegistry, 0) +var tokenizers = make(TokenizerRegistry, 0) +var tokenMaps = make(TokenMapRegistry, 0) +var tokenFilters = make(TokenFilterRegistry, 0) +var analyzers = make(AnalyzerRegistry, 0) +var dateTimeParsers = make(DateTimeParserRegistry, 0) + +type Cache struct { + CharFilters CharFilterCache + Tokenizers TokenizerCache + TokenMaps TokenMapCache + TokenFilters TokenFilterCache + Analyzers AnalyzerCache + DateTimeParsers DateTimeParserCache +} + +func NewCache() *Cache { + return &Cache{ + CharFilters: make(CharFilterCache, 0), + Tokenizers: make(TokenizerCache, 0), + TokenMaps: make(TokenMapCache, 0), + TokenFilters: make(TokenFilterCache, 0), + Analyzers: make(AnalyzerCache, 0), + DateTimeParsers: make(DateTimeParserCache, 0), + } +} + +func (c *Cache) CharFilterNamed(name string) (analysis.CharFilter, error) { + return c.CharFilters.CharFilterNamed(name, c) +} + +func (c *Cache) DefineCharFilter(name string, typ string, config map[string]interface{}) (analysis.CharFilter, error) { + return c.CharFilters.DefineCharFilter(name, typ, config, c) +} + +func (c *Cache) TokenizerNamed(name string) (analysis.Tokenizer, error) { + return c.Tokenizers.TokenizerNamed(name, c) +} + +func (c *Cache) DefineTokenizer(name string, typ string, config 
map[string]interface{}) (analysis.Tokenizer, error) { + return c.Tokenizers.DefineTokenizer(name, typ, config, c) +} + +func (c *Cache) TokenMapNamed(name string) (analysis.TokenMap, error) { + return c.TokenMaps.TokenMapNamed(name, c) +} + +func (c *Cache) DefineTokenMap(name string, typ string, config map[string]interface{}) (analysis.TokenMap, error) { + return c.TokenMaps.DefineTokenMap(name, typ, config, c) +} + +func (c *Cache) TokenFilterNamed(name string) (analysis.TokenFilter, error) { + return c.TokenFilters.TokenFilterNamed(name, c) +} + +func (c *Cache) DefineTokenFilter(name string, typ string, config map[string]interface{}) (analysis.TokenFilter, error) { + return c.TokenFilters.DefineTokenFilter(name, typ, config, c) +} + +func (c *Cache) AnalyzerNamed(name string) (*analysis.Analyzer, error) { + return c.Analyzers.AnalyzerNamed(name, c) +} + +func (c *Cache) DefineAnalyzer(name string, typ string, config map[string]interface{}) (*analysis.Analyzer, error) { + return c.Analyzers.DefineAnalyzer(name, typ, config, c) +} + +func (c *Cache) DateTimeParserNamed(name string) (analysis.DateTimeParser, error) { + return c.DateTimeParsers.DateTimeParserNamed(name, c) +} + +func (c *Cache) DefineDateTimeParser(name string, typ string, config map[string]interface{}) (analysis.DateTimeParser, error) { + return c.DateTimeParsers.DefineDateTimeParser(name, typ, config, c) +} + +func PrintRegistry() { + sorted := make(sort.StringSlice, 0, len(charFilters)) + for name, _ := range charFilters { + sorted = append(sorted, name) + } + sorted.Sort() + fmt.Printf("Char Filters:\n") + for _, name := range sorted { + fmt.Printf("\t%s\n", name) + } + fmt.Println() + + sorted = make(sort.StringSlice, 0, len(tokenizers)) + for name, _ := range tokenizers { + sorted = append(sorted, name) + } + sorted.Sort() + fmt.Printf("Tokenizers:\n") + for _, name := range sorted { + fmt.Printf("\t%s\n", name) + } + fmt.Println() + + sorted = make(sort.StringSlice, 0, len(tokenMaps)) + for 
name, _ := range tokenMaps { + sorted = append(sorted, name) + } + sorted.Sort() + fmt.Printf("Token Maps:\n") + for _, name := range sorted { + fmt.Printf("\t%s\n", name) + } + fmt.Println() + + sorted = make(sort.StringSlice, 0, len(tokenFilters)) + for name, _ := range tokenFilters { + sorted = append(sorted, name) + } + sorted.Sort() + fmt.Printf("Token Filters:\n") + for _, name := range sorted { + fmt.Printf("\t%s\n", name) + } + fmt.Println() + + sorted = make(sort.StringSlice, 0, len(analyzers)) + for name, _ := range analyzers { + sorted = append(sorted, name) + } + sorted.Sort() + fmt.Printf("Analyzers:\n") + for _, name := range sorted { + fmt.Printf("\t%s\n", name) + } + fmt.Println() + + sorted = make(sort.StringSlice, 0, len(dateTimeParsers)) + for name, _ := range dateTimeParsers { + sorted = append(sorted, name) + } + sorted.Sort() + fmt.Printf("DateTime Parsers:\n") + for _, name := range sorted { + fmt.Printf("\t%s\n", name) + } + fmt.Println() +} diff --git a/registry/token_filter.go b/registry/token_filter.go new file mode 100644 index 00000000..701161d0 --- /dev/null +++ b/registry/token_filter.go @@ -0,0 +1,61 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package registry + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" +) + +func RegisterTokenFilter(name string, constructor TokenFilterConstructor) { + _, exists := tokenFilters[name] + if exists { + panic(fmt.Errorf("attempted to register duplicate token filter named '%s'", name)) + } + tokenFilters[name] = constructor +} + +type TokenFilterConstructor func(config map[string]interface{}, cache *Cache) (analysis.TokenFilter, error) +type TokenFilterRegistry map[string]TokenFilterConstructor +type TokenFilterCache map[string]analysis.TokenFilter + +func (c TokenFilterCache) TokenFilterNamed(name string, cache *Cache) (analysis.TokenFilter, error) { + tokenFilter, cached := c[name] + if cached { + return tokenFilter, nil + } + tokenFilterConstructor, registered := tokenFilters[name] + if !registered { + return nil, fmt.Errorf("no token filter with name or type '%s' registered", name) + } + tokenFilter, err := tokenFilterConstructor(nil, cache) + if err != nil { + return nil, fmt.Errorf("error building token filter: %v", err) + } + c[name] = tokenFilter + return tokenFilter, nil +} + +func (c TokenFilterCache) DefineTokenFilter(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.TokenFilter, error) { + _, cached := c[name] + if cached { + return nil, fmt.Errorf("token filter named '%s' already defined", name) + } + tokenFilterConstructor, registered := tokenFilters[typ] + if !registered { + return nil, fmt.Errorf("no token filter type '%s' registered", typ) + } + tokenFilter, err := tokenFilterConstructor(config, cache) + if err != nil { + return nil, fmt.Errorf("error building token filter: %v", err) + } + c[name] = tokenFilter + return tokenFilter, nil +} diff --git a/registry/token_maps.go b/registry/token_maps.go new file mode 100644 index 00000000..8d0ae51f --- /dev/null +++ b/registry/token_maps.go @@ -0,0 +1,61 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package registry + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" +) + +func RegisterTokenMap(name string, constructor TokenMapConstructor) { + _, exists := tokenMaps[name] + if exists { + panic(fmt.Errorf("attempted to register duplicate token map named '%s'", name)) + } + tokenMaps[name] = constructor +} + +type TokenMapConstructor func(config map[string]interface{}, cache *Cache) (analysis.TokenMap, error) +type TokenMapRegistry map[string]TokenMapConstructor +type TokenMapCache map[string]analysis.TokenMap + +func (c TokenMapCache) TokenMapNamed(name string, cache *Cache) (analysis.TokenMap, error) { + tokenMap, cached := c[name] + if cached { + return tokenMap, nil + } + tokenMapConstructor, registered := tokenMaps[name] + if !registered { + return nil, fmt.Errorf("no token map with name or type '%s' registered", name) + } + tokenMap, err := tokenMapConstructor(nil, cache) + if err != nil { + return nil, fmt.Errorf("error building token map: %v", err) + } + c[name] = tokenMap + return tokenMap, nil +} + +func (c TokenMapCache) DefineTokenMap(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.TokenMap, error) { + _, cached := c[name] + if cached { + return nil, fmt.Errorf("token map named '%s' already defined", name) + } + tokenMapConstructor, registered := tokenMaps[typ] + if !registered { + return nil, fmt.Errorf("no token map type '%s' registered", typ) + } + tokenMap, err := 
tokenMapConstructor(config, cache) + if err != nil { + return nil, fmt.Errorf("error building token map: %v", err) + } + c[name] = tokenMap + return tokenMap, nil +} diff --git a/registry/tokenizer.go b/registry/tokenizer.go new file mode 100644 index 00000000..fdeabff3 --- /dev/null +++ b/registry/tokenizer.go @@ -0,0 +1,61 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package registry + +import ( + "fmt" + + "github.com/couchbaselabs/bleve/analysis" +) + +func RegisterTokenizer(name string, constructor TokenizerConstructor) { + _, exists := tokenizers[name] + if exists { + panic(fmt.Errorf("attempted to register duplicate tokenizer named '%s'", name)) + } + tokenizers[name] = constructor +} + +type TokenizerConstructor func(config map[string]interface{}, cache *Cache) (analysis.Tokenizer, error) +type TokenizerRegistry map[string]TokenizerConstructor +type TokenizerCache map[string]analysis.Tokenizer + +func (c TokenizerCache) TokenizerNamed(name string, cache *Cache) (analysis.Tokenizer, error) { + tokenizer, cached := c[name] + if cached { + return tokenizer, nil + } + tokenizerConstructor, registered := tokenizers[name] + if !registered { + return nil, fmt.Errorf("no tokenizer with name or type '%s' registered", name) + } + tokenizer, err := tokenizerConstructor(nil, cache) + if err != nil { + return nil, fmt.Errorf("error building tokenizer: %v", err) + } + c[name] = tokenizer + return tokenizer, nil +} + +func (c 
TokenizerCache) DefineTokenizer(name string, typ string, config map[string]interface{}, cache *Cache) (analysis.Tokenizer, error) { + _, cached := c[name] + if cached { + return nil, fmt.Errorf("tokenizer named '%s' already defined", name) + } + tokenizerConstructor, registered := tokenizers[typ] + if !registered { + return nil, fmt.Errorf("no tokenizer type '%s' registered", typ) + } + tokenizer, err := tokenizerConstructor(config, cache) + if err != nil { + return nil, fmt.Errorf("error building tokenizer: %v", err) + } + c[name] = tokenizer + return tokenizer, nil +} diff --git a/search.go b/search.go index 0fe7e298..59ae2f1f 100644 --- a/search.go +++ b/search.go @@ -13,6 +13,7 @@ import ( "fmt" "time" + "github.com/couchbaselabs/bleve/analysis" "github.com/couchbaselabs/bleve/search" ) @@ -23,9 +24,26 @@ type NumericRange struct { } type DateTimeRange struct { - Name string `json:"name,omitempty"` - Start time.Time `json:"start,omitempty"` - End time.Time `json:"end,omitempty"` + Name string `json:"name,omitempty"` + Start time.Time `json:"start,omitempty"` + End time.Time `json:"end,omitempty"` + startString *string + endString *string +} + +func (dr *DateTimeRange) ParseDates(dateTimeParser analysis.DateTimeParser) { + if dr.Start.IsZero() && dr.startString != nil { + start, err := dateTimeParser.ParseDateTime(*dr.startString) + if err == nil { + dr.Start = start + } + } + if dr.End.IsZero() && dr.endString != nil { + end, err := dateTimeParser.ParseDateTime(*dr.endString) + if err == nil { + dr.End = end + } + } } func (dr *DateTimeRange) UnmarshalJSON(input []byte) error { @@ -40,21 +58,12 @@ func (dr *DateTimeRange) UnmarshalJSON(input []byte) error { return err } - // FIXME allow alternate date parsers - dateTimeParser := Config.Analysis.DateTimeParsers[*Config.DefaultDateTimeFormat] - dr.Name = temp.Name if temp.Start != nil { - start, err := dateTimeParser.ParseDateTime(*temp.Start) - if err == nil { - dr.Start = start - } + dr.startString = temp.Start 
} if temp.End != nil { - end, err := dateTimeParser.ParseDateTime(*temp.End) - if err == nil { - dr.End = end - } + dr.endString = temp.End } return nil diff --git a/utils/bleve_registry/bleve_registry b/utils/bleve_registry/bleve_registry new file mode 100755 index 00000000..2472cfd6 Binary files /dev/null and b/utils/bleve_registry/bleve_registry differ diff --git a/analysis/tokenizers/rune_tokenizer/rune_token_classifier.go b/utils/bleve_registry/main.go similarity index 76% rename from analysis/tokenizers/rune_tokenizer/rune_token_classifier.go rename to utils/bleve_registry/main.go index 704314ea..2f2b62d6 100644 --- a/analysis/tokenizers/rune_tokenizer/rune_token_classifier.go +++ b/utils/bleve_registry/main.go @@ -6,8 +6,16 @@ // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, // either express or implied. See the License for the specific language governing permissions // and limitations under the License. -package rune_tokenizer +package main -type RuneTokenClassifer interface { - InToken(r rune) bool +import ( + "fmt" + + _ "github.com/couchbaselabs/bleve" + "github.com/couchbaselabs/bleve/registry" +) + +func main() { + fmt.Printf("Bleve Registry:\n") + registry.PrintRegistry() }