2014-07-30 18:30:38 +02:00
|
|
|
// Copyright (c) 2014 Couchbase, Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
|
|
|
// except in compliance with the License. You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
|
|
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
|
|
|
// either express or implied. See the License for the specific language governing permissions
|
|
|
|
// and limitations under the License.
|
|
|
|
package bleve
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"regexp"
|
2014-08-03 23:19:04 +02:00
|
|
|
"time"
|
2014-07-30 18:30:38 +02:00
|
|
|
|
|
|
|
"github.com/couchbaselabs/bleve/analysis"
|
|
|
|
|
2014-08-03 23:19:04 +02:00
|
|
|
"github.com/couchbaselabs/bleve/analysis/datetime_parsers/flexible_go"
|
|
|
|
|
2014-07-30 18:30:38 +02:00
|
|
|
"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
|
|
|
|
|
|
|
|
"github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer"
|
|
|
|
"github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
|
|
|
|
"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
|
|
|
|
|
|
|
|
"github.com/couchbaselabs/bleve/analysis/token_filters/cld2"
|
2014-08-04 01:17:35 +02:00
|
|
|
"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
|
2014-07-30 18:30:38 +02:00
|
|
|
"github.com/couchbaselabs/bleve/analysis/token_filters/length_filter"
|
|
|
|
"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
|
|
|
|
"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
|
|
|
|
"github.com/couchbaselabs/bleve/analysis/token_filters/stop_words_filter"
|
2014-08-05 03:59:57 +02:00
|
|
|
"github.com/couchbaselabs/bleve/analysis/token_filters/unicode_normalize"
|
2014-07-30 18:30:38 +02:00
|
|
|
|
|
|
|
"github.com/couchbaselabs/bleve/search"
|
|
|
|
)
|
|
|
|
|
|
|
|
// AnalysisConfig holds the registries of named text-analysis components.
// Components are registered under string names (e.g. "to_lower",
// "stop_token_en") and looked up by name when building analyzers.
type AnalysisConfig struct {
	// TokenMaps holds named word lists (stop words, elision articles).
	TokenMaps map[string]analysis.WordMap
	// CharFilters holds named character-level input filters (e.g. "html").
	CharFilters map[string]analysis.CharFilter
	// Tokenizers holds named tokenizers (e.g. "single", "unicode", "whitespace").
	Tokenizers map[string]analysis.Tokenizer
	// TokenFilters holds named token stream filters (stemmers, stop filters, ...).
	TokenFilters map[string]analysis.TokenFilter
	// Analyzers holds fully assembled analyzers (char filters + tokenizer + token filters).
	Analyzers map[string]*analysis.Analyzer
	// DateTimeParsers holds named date/time parsers used for date fields.
	DateTimeParsers map[string]analysis.DateTimeParser
}
|
|
|
|
|
|
|
|
// HighlightConfig holds the registry of named search-result highlighters
// (e.g. "ansi", "html").
type HighlightConfig struct {
	// Highlighters maps a highlighter name to its implementation.
	Highlighters map[string]search.Highlighter
}
|
|
|
|
|
2014-07-30 20:29:26 +02:00
|
|
|
// Configuration aggregates all runtime configuration for the package:
// the analysis component registries, highlighter registry, and the
// defaults applied when a caller does not specify one explicitly.
type Configuration struct {
	// Analysis holds the registries of analysis components.
	Analysis *AnalysisConfig
	// DefaultAnalyzer names the analyzer used when none is specified.
	DefaultAnalyzer *string
	// Highlight holds the registry of highlighters.
	Highlight *HighlightConfig
	// DefaultHighlighter names the highlighter used when none is specified.
	DefaultHighlighter *string
	// CreateIfMissing controls whether opening a missing index creates it.
	CreateIfMissing bool
	// DefaultDateTimeFormat names the date/time parser used by default.
	DefaultDateTimeFormat *string
	// DefaultField names the field searched when none is specified.
	DefaultField *string
}
|
|
|
|
|
2014-07-30 20:29:26 +02:00
|
|
|
func (c *Configuration) BuildNewAnalyzer(charFilterNames []string, tokenizerName string, tokenFilterNames []string) (*analysis.Analyzer, error) {
|
2014-07-30 18:30:38 +02:00
|
|
|
rv := analysis.Analyzer{}
|
|
|
|
if len(charFilterNames) > 0 {
|
|
|
|
rv.CharFilters = make([]analysis.CharFilter, len(charFilterNames))
|
|
|
|
for i, charFilterName := range charFilterNames {
|
|
|
|
charFilter := c.Analysis.CharFilters[charFilterName]
|
|
|
|
if charFilter == nil {
|
|
|
|
return nil, fmt.Errorf("no character filter named `%s` registered", charFilterName)
|
|
|
|
}
|
|
|
|
rv.CharFilters[i] = charFilter
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rv.Tokenizer = c.Analysis.Tokenizers[tokenizerName]
|
|
|
|
if rv.Tokenizer == nil {
|
|
|
|
return nil, fmt.Errorf("no tokenizer named `%s` registered", tokenizerName)
|
|
|
|
}
|
|
|
|
if len(tokenFilterNames) > 0 {
|
|
|
|
rv.TokenFilters = make([]analysis.TokenFilter, len(tokenFilterNames))
|
|
|
|
for i, tokenFilterName := range tokenFilterNames {
|
|
|
|
tokenFilter := c.Analysis.TokenFilters[tokenFilterName]
|
|
|
|
if tokenFilter == nil {
|
|
|
|
return nil, fmt.Errorf("no token filter named `%s` registered", tokenFilterName)
|
|
|
|
}
|
|
|
|
rv.TokenFilters[i] = tokenFilter
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return &rv, nil
|
|
|
|
}
|
|
|
|
|
2014-07-30 20:29:26 +02:00
|
|
|
func (c *Configuration) MustBuildNewAnalyzer(charFilterNames []string, tokenizerName string, tokenFilterNames []string) *analysis.Analyzer {
|
2014-07-30 18:30:38 +02:00
|
|
|
analyzer, err := c.BuildNewAnalyzer(charFilterNames, tokenizerName, tokenFilterNames)
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return analyzer
|
|
|
|
}
|
|
|
|
|
2014-08-03 23:46:35 +02:00
|
|
|
func (c *Configuration) MustLoadStopWords(stopWordsBytes []byte) analysis.WordMap {
|
|
|
|
rv := analysis.NewWordMap()
|
2014-07-31 01:29:52 +02:00
|
|
|
err := rv.LoadBytes(stopWordsBytes)
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
2014-07-30 20:29:26 +02:00
|
|
|
func NewConfiguration() *Configuration {
|
|
|
|
return &Configuration{
|
2014-07-30 18:30:38 +02:00
|
|
|
Analysis: &AnalysisConfig{
|
2014-08-03 23:46:35 +02:00
|
|
|
TokenMaps: make(map[string]analysis.WordMap),
|
2014-08-03 23:19:04 +02:00
|
|
|
CharFilters: make(map[string]analysis.CharFilter),
|
|
|
|
Tokenizers: make(map[string]analysis.Tokenizer),
|
|
|
|
TokenFilters: make(map[string]analysis.TokenFilter),
|
|
|
|
Analyzers: make(map[string]*analysis.Analyzer),
|
|
|
|
DateTimeParsers: make(map[string]analysis.DateTimeParser),
|
2014-07-30 18:30:38 +02:00
|
|
|
},
|
|
|
|
Highlight: &HighlightConfig{
|
|
|
|
Highlighters: make(map[string]search.Highlighter),
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-30 20:29:26 +02:00
|
|
|
// Config is the process-wide configuration, populated with the default
// analysis components, highlighters, and defaults in init().
var Config *Configuration
|
2014-07-30 18:30:38 +02:00
|
|
|
|
|
|
|
func init() {
|
|
|
|
|
|
|
|
// build the default configuration
|
2014-07-30 20:29:26 +02:00
|
|
|
Config = NewConfiguration()
|
2014-07-30 18:30:38 +02:00
|
|
|
|
2014-07-31 01:29:52 +02:00
|
|
|
// register stop token maps
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["da_stop"] = Config.MustLoadStopWords(stop_words_filter.DanishStopWords)
|
|
|
|
Config.Analysis.TokenMaps["nl_stop"] = Config.MustLoadStopWords(stop_words_filter.DutchStopWords)
|
|
|
|
Config.Analysis.TokenMaps["en_stop"] = Config.MustLoadStopWords(stop_words_filter.EnglishStopWords)
|
|
|
|
Config.Analysis.TokenMaps["fi_stop"] = Config.MustLoadStopWords(stop_words_filter.FinnishStopWords)
|
|
|
|
Config.Analysis.TokenMaps["fr_stop"] = Config.MustLoadStopWords(stop_words_filter.FrenchStopWords)
|
|
|
|
Config.Analysis.TokenMaps["de_stop"] = Config.MustLoadStopWords(stop_words_filter.GermanStopWords)
|
|
|
|
Config.Analysis.TokenMaps["hu_stop"] = Config.MustLoadStopWords(stop_words_filter.HungarianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["it_stop"] = Config.MustLoadStopWords(stop_words_filter.ItalianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["no_stop"] = Config.MustLoadStopWords(stop_words_filter.NorwegianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["pt_stop"] = Config.MustLoadStopWords(stop_words_filter.PortugueseStopWords)
|
|
|
|
Config.Analysis.TokenMaps["ro_stop"] = Config.MustLoadStopWords(stop_words_filter.RomanianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["ru_stop"] = Config.MustLoadStopWords(stop_words_filter.RussianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["es_stop"] = Config.MustLoadStopWords(stop_words_filter.SpanishStopWords)
|
|
|
|
Config.Analysis.TokenMaps["sv_stop"] = Config.MustLoadStopWords(stop_words_filter.SwedishStopWords)
|
|
|
|
Config.Analysis.TokenMaps["tr_stop"] = Config.MustLoadStopWords(stop_words_filter.TurkishStopWords)
|
|
|
|
Config.Analysis.TokenMaps["ar_stop"] = Config.MustLoadStopWords(stop_words_filter.ArabicStopWords)
|
|
|
|
Config.Analysis.TokenMaps["hy_stop"] = Config.MustLoadStopWords(stop_words_filter.ArmenianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["eu_stop"] = Config.MustLoadStopWords(stop_words_filter.BasqueStopWords)
|
|
|
|
Config.Analysis.TokenMaps["bg_stop"] = Config.MustLoadStopWords(stop_words_filter.BulgarianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["ca_stop"] = Config.MustLoadStopWords(stop_words_filter.CatalanStopWords)
|
|
|
|
Config.Analysis.TokenMaps["gl_stop"] = Config.MustLoadStopWords(stop_words_filter.GalicianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["el_stop"] = Config.MustLoadStopWords(stop_words_filter.GreekStopWords)
|
|
|
|
Config.Analysis.TokenMaps["hi_stop"] = Config.MustLoadStopWords(stop_words_filter.HindiStopWords)
|
|
|
|
Config.Analysis.TokenMaps["id_stop"] = Config.MustLoadStopWords(stop_words_filter.IndonesianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["ga_stop"] = Config.MustLoadStopWords(stop_words_filter.IrishStopWords)
|
|
|
|
Config.Analysis.TokenMaps["fa_stop"] = Config.MustLoadStopWords(stop_words_filter.PersianStopWords)
|
|
|
|
Config.Analysis.TokenMaps["ckb_stop"] = Config.MustLoadStopWords(stop_words_filter.SoraniStopWords)
|
|
|
|
Config.Analysis.TokenMaps["th_stop"] = Config.MustLoadStopWords(stop_words_filter.ThaiStopWords)
|
2014-07-31 01:29:52 +02:00
|
|
|
|
2014-08-04 01:17:35 +02:00
|
|
|
// register article token maps for elision filters
|
|
|
|
Config.Analysis.TokenMaps["fr_articles"] = Config.MustLoadStopWords(elision_filter.FrenchArticles)
|
|
|
|
Config.Analysis.TokenMaps["it_articles"] = Config.MustLoadStopWords(elision_filter.ItalianArticles)
|
|
|
|
Config.Analysis.TokenMaps["ca_articles"] = Config.MustLoadStopWords(elision_filter.CatalanArticles)
|
|
|
|
Config.Analysis.TokenMaps["ga_articles"] = Config.MustLoadStopWords(elision_filter.IrishArticles)
|
|
|
|
|
2014-07-30 18:30:38 +02:00
|
|
|
// register char filters
|
|
|
|
htmlCharFilterRegexp := regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
|
|
|
|
htmlCharFilter := regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, []byte{' '})
|
2014-07-30 20:29:26 +02:00
|
|
|
Config.Analysis.CharFilters["html"] = htmlCharFilter
|
2014-07-30 18:30:38 +02:00
|
|
|
|
|
|
|
// register tokenizers
|
|
|
|
whitespaceTokenizerRegexp := regexp.MustCompile(`\w+`)
|
2014-07-30 20:29:26 +02:00
|
|
|
Config.Analysis.Tokenizers["single"] = single_token.NewSingleTokenTokenizer()
|
|
|
|
Config.Analysis.Tokenizers["unicode"] = unicode_word_boundary.NewUnicodeWordBoundaryTokenizer()
|
|
|
|
Config.Analysis.Tokenizers["unicode_th"] = unicode_word_boundary.NewUnicodeWordBoundaryCustomLocaleTokenizer("th_TH")
|
|
|
|
Config.Analysis.Tokenizers["whitespace"] = regexp_tokenizer.NewRegexpTokenizer(whitespaceTokenizerRegexp)
|
2014-07-30 18:30:38 +02:00
|
|
|
|
|
|
|
// register token filters
|
2014-07-30 20:29:26 +02:00
|
|
|
Config.Analysis.TokenFilters["detect_lang"] = cld2.NewCld2Filter()
|
|
|
|
Config.Analysis.TokenFilters["short"] = length_filter.NewLengthFilter(3, -1)
|
|
|
|
Config.Analysis.TokenFilters["long"] = length_filter.NewLengthFilter(-1, 255)
|
|
|
|
Config.Analysis.TokenFilters["to_lower"] = lower_case_filter.NewLowerCaseFilter()
|
|
|
|
Config.Analysis.TokenFilters["stemmer_da"] = stemmer_filter.MustNewStemmerFilter("danish")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_nl"] = stemmer_filter.MustNewStemmerFilter("dutch")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_en"] = stemmer_filter.MustNewStemmerFilter("english")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_fi"] = stemmer_filter.MustNewStemmerFilter("finnish")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_fr"] = stemmer_filter.MustNewStemmerFilter("french")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_de"] = stemmer_filter.MustNewStemmerFilter("german")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_hu"] = stemmer_filter.MustNewStemmerFilter("hungarian")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_it"] = stemmer_filter.MustNewStemmerFilter("italian")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_no"] = stemmer_filter.MustNewStemmerFilter("norwegian")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_porter"] = stemmer_filter.MustNewStemmerFilter("porter")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_pt"] = stemmer_filter.MustNewStemmerFilter("portuguese")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_ro"] = stemmer_filter.MustNewStemmerFilter("romanian")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_ru"] = stemmer_filter.MustNewStemmerFilter("russian")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_es"] = stemmer_filter.MustNewStemmerFilter("spanish")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_sv"] = stemmer_filter.MustNewStemmerFilter("swedish")
|
|
|
|
Config.Analysis.TokenFilters["stemmer_tr"] = stemmer_filter.MustNewStemmerFilter("turkish")
|
2014-07-31 01:29:52 +02:00
|
|
|
|
2014-08-04 01:17:35 +02:00
|
|
|
// register stop token filters
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_da"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["da_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_nl"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["nl_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_en"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["en_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_fi"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["fi_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_fr"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["fr_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_de"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["de_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_hu"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["hu_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_it"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["it_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_no"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["no_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_pt"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["pt_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_ro"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["ro_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_ru"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["ru_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_es"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["es_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_sv"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["sv_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_tr"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["tr_stop_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_ar"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["ar_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_hy"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["hy_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_eu"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["eu_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_bg"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["bg_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_ca"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["ca_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_gl"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["gl_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_el"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["el_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_hi"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["hi_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_id"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["id_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_ga"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["ga_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_fa"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["fa_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_ckb"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["ckb_stop"])
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.TokenFilters["stop_token_th"] = stop_words_filter.NewStopWordsFilter(
|
2014-08-03 23:46:35 +02:00
|
|
|
Config.Analysis.TokenMaps["th_stop"])
|
2014-07-30 18:30:38 +02:00
|
|
|
|
2014-08-04 01:17:35 +02:00
|
|
|
// register elision filters
|
|
|
|
Config.Analysis.TokenFilters["elision_fr"] = elision_filter.NewElisionFilter(
|
|
|
|
Config.Analysis.TokenMaps["fr_articles"])
|
|
|
|
Config.Analysis.TokenFilters["elision_it"] = elision_filter.NewElisionFilter(
|
|
|
|
Config.Analysis.TokenMaps["it_articles"])
|
|
|
|
Config.Analysis.TokenFilters["elision_ca"] = elision_filter.NewElisionFilter(
|
|
|
|
Config.Analysis.TokenMaps["ca_articles"])
|
|
|
|
Config.Analysis.TokenFilters["elision_ga"] = elision_filter.NewElisionFilter(
|
|
|
|
Config.Analysis.TokenMaps["ga_articles"])
|
|
|
|
|
2014-08-05 03:59:57 +02:00
|
|
|
// register unicode normalizers
|
|
|
|
Config.Analysis.TokenFilters["normalize_nfc"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFC)
|
|
|
|
Config.Analysis.TokenFilters["normalize_nfd"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFD)
|
|
|
|
Config.Analysis.TokenFilters["normalize_nfkc"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKC)
|
|
|
|
Config.Analysis.TokenFilters["normalize_nfkd"] = unicode_normalize.MustNewUnicodeNormalizeFilter(unicode_normalize.NFKD)
|
|
|
|
|
2014-07-30 18:30:38 +02:00
|
|
|
// register analyzers
|
2014-07-30 20:29:26 +02:00
|
|
|
keywordAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "single", []string{})
|
|
|
|
Config.Analysis.Analyzers["keyword"] = keywordAnalyzer
|
|
|
|
simpleAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "whitespace", []string{"to_lower"})
|
|
|
|
Config.Analysis.Analyzers["simple"] = simpleAnalyzer
|
2014-07-31 01:29:52 +02:00
|
|
|
standardAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "whitespace", []string{"to_lower", "stop_token_en"})
|
2014-07-30 20:29:26 +02:00
|
|
|
Config.Analysis.Analyzers["standard"] = standardAnalyzer
|
|
|
|
detectLangAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "single", []string{"to_lower", "detect_lang"})
|
|
|
|
Config.Analysis.Analyzers["detect_lang"] = detectLangAnalyzer
|
2014-07-30 18:30:38 +02:00
|
|
|
|
2014-07-31 01:29:52 +02:00
|
|
|
// language specific analyzers
|
|
|
|
danishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_da", "stemmer_da"})
|
|
|
|
Config.Analysis.Analyzers["da"] = danishAnalyzer
|
|
|
|
dutchAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_nl", "stemmer_nl"})
|
|
|
|
Config.Analysis.Analyzers["nl"] = dutchAnalyzer
|
|
|
|
englishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_en", "stemmer_en"})
|
|
|
|
Config.Analysis.Analyzers["en"] = englishAnalyzer
|
|
|
|
finnishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_fi", "stemmer_fi"})
|
|
|
|
Config.Analysis.Analyzers["fi"] = finnishAnalyzer
|
2014-08-04 01:17:35 +02:00
|
|
|
frenchAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"elision_fr", "to_lower", "stop_token_fr", "stemmer_fr"})
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.Analyzers["fr"] = frenchAnalyzer
|
|
|
|
germanAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_de", "stemmer_de"})
|
|
|
|
Config.Analysis.Analyzers["de"] = germanAnalyzer
|
|
|
|
hungarianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_hu", "stemmer_hu"})
|
|
|
|
Config.Analysis.Analyzers["hu"] = hungarianAnalyzer
|
2014-08-04 01:17:35 +02:00
|
|
|
italianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"elision_it", "to_lower", "stop_token_it", "stemmer_it"})
|
2014-07-31 01:29:52 +02:00
|
|
|
Config.Analysis.Analyzers["it"] = italianAnalyzer
|
|
|
|
norwegianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_no", "stemmer_no"})
|
|
|
|
Config.Analysis.Analyzers["no"] = norwegianAnalyzer
|
|
|
|
portugueseAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_pt", "stemmer_pt"})
|
|
|
|
Config.Analysis.Analyzers["pt"] = portugueseAnalyzer
|
|
|
|
romanianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_ro", "stemmer_ro"})
|
|
|
|
Config.Analysis.Analyzers["ro"] = romanianAnalyzer
|
|
|
|
russianAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_ru", "stemmer_ru"})
|
|
|
|
Config.Analysis.Analyzers["ru"] = russianAnalyzer
|
|
|
|
spanishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_es", "stemmer_es"})
|
|
|
|
Config.Analysis.Analyzers["es"] = spanishAnalyzer
|
|
|
|
swedishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_sv", "stemmer_sv"})
|
|
|
|
Config.Analysis.Analyzers["sv"] = swedishAnalyzer
|
|
|
|
turkishAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode", []string{"to_lower", "stop_token_tr", "stemmer_tr"})
|
|
|
|
Config.Analysis.Analyzers["tr"] = turkishAnalyzer
|
|
|
|
thaiAnalyzer := Config.MustBuildNewAnalyzer([]string{}, "unicode_th", []string{"to_lower", "stop_token_th"})
|
|
|
|
Config.Analysis.Analyzers["th"] = thaiAnalyzer
|
|
|
|
|
2014-07-30 18:30:38 +02:00
|
|
|
// register ansi highlighter
|
2014-07-30 20:29:26 +02:00
|
|
|
Config.Highlight.Highlighters["ansi"] = search.NewSimpleHighlighter()
|
2014-07-30 18:30:38 +02:00
|
|
|
|
|
|
|
// register html highlighter
|
|
|
|
htmlFormatter := search.NewHTMLFragmentFormatterCustom(`<span class="highlight">`, `</span>`)
|
|
|
|
htmlHighlighter := search.NewSimpleHighlighter()
|
|
|
|
htmlHighlighter.SetFragmentFormatter(htmlFormatter)
|
2014-07-30 20:29:26 +02:00
|
|
|
Config.Highlight.Highlighters["html"] = htmlHighlighter
|
2014-07-30 18:30:38 +02:00
|
|
|
|
|
|
|
// set the default analyzer
|
|
|
|
simpleAnalyzerName := "simple"
|
2014-07-30 20:29:26 +02:00
|
|
|
Config.DefaultAnalyzer = &simpleAnalyzerName
|
2014-07-30 18:30:38 +02:00
|
|
|
|
|
|
|
// set the default highlighter
|
|
|
|
htmlHighlighterName := "html"
|
2014-07-30 20:29:26 +02:00
|
|
|
Config.DefaultHighlighter = &htmlHighlighterName
|
2014-07-30 18:30:38 +02:00
|
|
|
|
2014-07-30 20:29:26 +02:00
|
|
|
// default CreateIfMissing to true
|
|
|
|
Config.CreateIfMissing = true
|
2014-08-03 23:19:04 +02:00
|
|
|
|
|
|
|
// set up the built-in date time formats
|
|
|
|
|
|
|
|
rfc3339NoTimezone := "2006-01-02T15:04:05"
|
|
|
|
rfc3339NoTimezoneNoT := "2006-01-02 15:04:05"
|
|
|
|
rfc3339NoTime := "2006-01-02"
|
|
|
|
|
|
|
|
Config.Analysis.DateTimeParsers["dateTimeOptional"] = flexible_go.NewFlexibleGoDateTimeParser(
|
|
|
|
[]string{
|
|
|
|
time.RFC3339Nano,
|
|
|
|
time.RFC3339,
|
|
|
|
rfc3339NoTimezone,
|
|
|
|
rfc3339NoTimezoneNoT,
|
|
|
|
rfc3339NoTime,
|
|
|
|
})
|
|
|
|
dateTimeOptionalName := "dateTimeOptional"
|
|
|
|
Config.DefaultDateTimeFormat = &dateTimeOptionalName
|
2014-08-06 14:23:29 +02:00
|
|
|
|
|
|
|
defaultField := "_all"
|
|
|
|
Config.DefaultField = &defaultField
|
2014-07-30 18:30:38 +02:00
|
|
|
}
|