0
0
Fork 0
bleve/mapping_index.go

510 lines
16 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package bleve
import (
"encoding/json"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
"github.com/blevesearch/bleve/analysis/byte_array_converters/json"
"github.com/blevesearch/bleve/analysis/datetime_parsers/datetime_optional"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/registry"
)
const defaultTypeField = "_type"
const defaultType = "_default"
const defaultField = "_all"
const defaultAnalyzer = standard_analyzer.Name
const defaultDateTimeParser = datetime_optional.Name
const defaultByteArrayConverter = json_byte_array_converter.Name
type customAnalysis struct {
CharFilters map[string]map[string]interface{} `json:"char_filters,omitempty"`
Tokenizers map[string]map[string]interface{} `json:"tokenizers,omitempty"`
TokenMaps map[string]map[string]interface{} `json:"token_maps,omitempty"`
TokenFilters map[string]map[string]interface{} `json:"token_filters,omitempty"`
Analyzers map[string]map[string]interface{} `json:"analyzers,omitempty"`
DateTimeParsers map[string]map[string]interface{} `json:"date_time_parsers,omitempty"`
}
func (c *customAnalysis) registerAll(i *IndexMapping) error {
for name, config := range c.CharFilters {
_, err := i.cache.DefineCharFilter(name, config)
if err != nil {
return err
}
}
if len(c.Tokenizers) > 0 {
// put all the names in map tracking work to do
todo := map[string]struct{}{}
for name, _ := range c.Tokenizers {
todo[name] = struct{}{}
}
registered := 1
errs := []error{}
// as long as we keep making progress, keep going
for len(todo) > 0 && registered > 0 {
registered = 0
errs = []error{}
for name, _ := range todo {
config := c.Tokenizers[name]
_, err := i.cache.DefineTokenizer(name, config)
if err != nil {
errs = append(errs, err)
} else {
delete(todo, name)
registered++
}
}
}
if len(errs) > 0 {
return errs[0]
}
}
for name, config := range c.TokenMaps {
_, err := i.cache.DefineTokenMap(name, config)
if err != nil {
return err
}
}
for name, config := range c.TokenFilters {
_, err := i.cache.DefineTokenFilter(name, config)
if err != nil {
return err
}
}
for name, config := range c.Analyzers {
_, err := i.cache.DefineAnalyzer(name, config)
if err != nil {
return err
}
}
for name, config := range c.DateTimeParsers {
_, err := i.cache.DefineDateTimeParser(name, config)
if err != nil {
return err
}
}
return nil
}
func newCustomAnalysis() *customAnalysis {
rv := customAnalysis{
CharFilters: make(map[string]map[string]interface{}),
Tokenizers: make(map[string]map[string]interface{}),
TokenMaps: make(map[string]map[string]interface{}),
TokenFilters: make(map[string]map[string]interface{}),
Analyzers: make(map[string]map[string]interface{}),
DateTimeParsers: make(map[string]map[string]interface{}),
}
return &rv
}
// An IndexMapping controls how objects are placed
// into an index.
// First the type of the object is determined.
// Once the type is know, the appropriate
// DocumentMapping is selected by the type.
// If no mapping was determined for that type,
// a DefaultMapping will be used.
type IndexMapping struct {
TypeMapping map[string]*DocumentMapping `json:"types,omitempty"`
DefaultMapping *DocumentMapping `json:"default_mapping"`
TypeField string `json:"type_field"`
DefaultType string `json:"default_type"`
DefaultAnalyzer string `json:"default_analyzer"`
DefaultDateTimeParser string `json:"default_datetime_parser"`
DefaultField string `json:"default_field"`
ByteArrayConverter string `json:"byte_array_converter"`
CustomAnalysis *customAnalysis `json:"analysis,omitempty"`
cache *registry.Cache
}
// AddCustomCharFilter defines a custom char filter for use in this mapping
func (im *IndexMapping) AddCustomCharFilter(name string, config map[string]interface{}) error {
_, err := im.cache.DefineCharFilter(name, config)
if err != nil {
return err
}
im.CustomAnalysis.CharFilters[name] = config
return nil
}
// AddCustomTokenizer defines a custom tokenizer for use in this mapping
func (im *IndexMapping) AddCustomTokenizer(name string, config map[string]interface{}) error {
_, err := im.cache.DefineTokenizer(name, config)
if err != nil {
return err
}
im.CustomAnalysis.Tokenizers[name] = config
return nil
}
// AddCustomTokenMap defines a custom token map for use in this mapping
func (im *IndexMapping) AddCustomTokenMap(name string, config map[string]interface{}) error {
_, err := im.cache.DefineTokenMap(name, config)
if err != nil {
return err
}
im.CustomAnalysis.TokenMaps[name] = config
return nil
}
// AddCustomTokenFilter defines a custom token filter for use in this mapping
func (im *IndexMapping) AddCustomTokenFilter(name string, config map[string]interface{}) error {
_, err := im.cache.DefineTokenFilter(name, config)
if err != nil {
return err
}
im.CustomAnalysis.TokenFilters[name] = config
return nil
}
// AddCustomAnalyzer defines a custom analyzer for use in this mapping. The
// config map must have a "type" string entry to resolve the analyzer
// constructor. The constructor is invoked with the remaining entries and
// returned analyzer is registered in the IndexMapping.
//
// bleve comes with predefined analyzers, like
// github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer. They are
// available only if their package is imported by client code. To achieve this,
// use their metadata to fill configuration entries:
//
// import (
// "github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer"
// "github.com/blevesearch/bleve/analysis/char_filters/html_char_filter"
// "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
// "github.com/blevesearch/bleve/analysis/tokenizers/unicode"
// )
//
// m := bleve.NewIndexMapping()
// err := m.AddCustomAnalyzer("html", map[string]interface{}{
// "type": custom_analyzer.Name,
// "char_filters": []string{
// html_char_filter.Name,
// },
// "tokenizer": unicode.Name,
// "token_filters": []string{
// lower_case_filter.Name,
// ...
// },
// })
func (im *IndexMapping) AddCustomAnalyzer(name string, config map[string]interface{}) error {
_, err := im.cache.DefineAnalyzer(name, config)
if err != nil {
return err
}
im.CustomAnalysis.Analyzers[name] = config
return nil
}
// AddCustomDateTimeParser defines a custom date time parser for use in this mapping
func (im *IndexMapping) AddCustomDateTimeParser(name string, config map[string]interface{}) error {
_, err := im.cache.DefineDateTimeParser(name, config)
if err != nil {
return err
}
im.CustomAnalysis.DateTimeParsers[name] = config
return nil
}
// NewIndexMapping creates a new IndexMapping that will use all the default indexing rules
func NewIndexMapping() *IndexMapping {
return &IndexMapping{
TypeMapping: make(map[string]*DocumentMapping),
DefaultMapping: NewDocumentMapping(),
TypeField: defaultTypeField,
DefaultType: defaultType,
DefaultAnalyzer: defaultAnalyzer,
DefaultDateTimeParser: defaultDateTimeParser,
DefaultField: defaultField,
ByteArrayConverter: defaultByteArrayConverter,
CustomAnalysis: newCustomAnalysis(),
cache: registry.NewCache(),
}
}
// Validate will walk the entire structure ensuring the following
// explicitly named and default analyzers can be built
func (im *IndexMapping) validate() error {
_, err := im.cache.AnalyzerNamed(im.DefaultAnalyzer)
if err != nil {
return err
}
_, err = im.cache.DateTimeParserNamed(im.DefaultDateTimeParser)
if err != nil {
return err
}
err = im.DefaultMapping.validate(im.cache)
if err != nil {
return err
}
for _, docMapping := range im.TypeMapping {
err = docMapping.validate(im.cache)
if err != nil {
return err
}
}
return nil
}
// AddDocumentMapping sets a custom document mapping for the specified type
func (im *IndexMapping) AddDocumentMapping(doctype string, dm *DocumentMapping) {
im.TypeMapping[doctype] = dm
}
func (im *IndexMapping) mappingForType(docType string) *DocumentMapping {
docMapping := im.TypeMapping[docType]
if docMapping == nil {
docMapping = im.DefaultMapping
}
return docMapping
}
// UnmarshalJSON deserializes a JSON representation of the IndexMapping
func (im *IndexMapping) UnmarshalJSON(data []byte) error {
var tmp struct {
TypeMapping map[string]*DocumentMapping `json:"types"`
DefaultMapping *DocumentMapping `json:"default_mapping"`
TypeField string `json:"type_field"`
DefaultType string `json:"default_type"`
DefaultAnalyzer string `json:"default_analyzer"`
DefaultDateTimeParser string `json:"default_datetime_parser"`
DefaultField string `json:"default_field"`
ByteArrayConverter string `json:"byte_array_converter"`
CustomAnalysis *customAnalysis `json:"analysis"`
}
err := json.Unmarshal(data, &tmp)
if err != nil {
return err
}
im.cache = registry.NewCache()
im.CustomAnalysis = newCustomAnalysis()
if tmp.CustomAnalysis != nil {
if tmp.CustomAnalysis.CharFilters != nil {
im.CustomAnalysis.CharFilters = tmp.CustomAnalysis.CharFilters
}
if tmp.CustomAnalysis.Tokenizers != nil {
im.CustomAnalysis.Tokenizers = tmp.CustomAnalysis.Tokenizers
}
if tmp.CustomAnalysis.TokenMaps != nil {
im.CustomAnalysis.TokenMaps = tmp.CustomAnalysis.TokenMaps
}
if tmp.CustomAnalysis.TokenFilters != nil {
im.CustomAnalysis.TokenFilters = tmp.CustomAnalysis.TokenFilters
}
if tmp.CustomAnalysis.Analyzers != nil {
im.CustomAnalysis.Analyzers = tmp.CustomAnalysis.Analyzers
}
if tmp.CustomAnalysis.DateTimeParsers != nil {
im.CustomAnalysis.DateTimeParsers = tmp.CustomAnalysis.DateTimeParsers
}
}
im.TypeField = defaultTypeField
if tmp.TypeField != "" {
im.TypeField = tmp.TypeField
}
im.DefaultType = defaultType
if tmp.DefaultType != "" {
im.DefaultType = tmp.DefaultType
}
im.DefaultAnalyzer = defaultAnalyzer
if tmp.DefaultAnalyzer != "" {
im.DefaultAnalyzer = tmp.DefaultAnalyzer
}
im.DefaultDateTimeParser = defaultDateTimeParser
if tmp.DefaultDateTimeParser != "" {
im.DefaultDateTimeParser = tmp.DefaultDateTimeParser
}
im.DefaultField = defaultField
if tmp.DefaultField != "" {
im.DefaultField = tmp.DefaultField
}
im.ByteArrayConverter = defaultByteArrayConverter
if tmp.ByteArrayConverter != "" {
im.ByteArrayConverter = tmp.ByteArrayConverter
}
im.DefaultMapping = NewDocumentMapping()
if tmp.DefaultMapping != nil {
im.DefaultMapping = tmp.DefaultMapping
}
im.TypeMapping = make(map[string]*DocumentMapping, len(tmp.TypeMapping))
for typeName, typeDocMapping := range tmp.TypeMapping {
im.TypeMapping[typeName] = typeDocMapping
}
err = im.CustomAnalysis.registerAll(im)
if err != nil {
return err
}
return nil
}
func (im *IndexMapping) determineType(data interface{}) string {
// first see if the object implements Identifier
classifier, ok := data.(Classifier)
if ok {
return classifier.Type()
}
// now see if we can find a type using the mapping
typ, ok := mustString(lookupPropertyPath(data, im.TypeField))
if ok {
return typ
}
return im.DefaultType
}
func (im *IndexMapping) mapDocument(doc *document.Document, data interface{}) error {
// see if the top level object is a byte array, and possibly run through a converter
byteArrayData, ok := data.([]byte)
if ok {
byteArrayConverterConstructor := registry.ByteArrayConverterByName(im.ByteArrayConverter)
if byteArrayConverterConstructor != nil {
byteArrayConverter, err := byteArrayConverterConstructor(nil, nil)
if err == nil {
convertedData, err := byteArrayConverter.Convert(byteArrayData)
if err != nil {
return err
}
data = convertedData
} else {
logger.Printf("error creating byte array converter: %v", err)
}
} else {
logger.Printf("no byte array converter named: %s", im.ByteArrayConverter)
}
}
docType := im.determineType(data)
docMapping := im.mappingForType(docType)
walkContext := im.newWalkContext(doc, docMapping)
docMapping.walkDocument(data, []string{}, []uint64{}, walkContext)
// see if the _all field was disabled
allMapping := docMapping.documentMappingForPath("_all")
if allMapping == nil || (allMapping.Enabled != false) {
field := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, walkContext.excludedFromAll, document.IndexField|document.IncludeTermVectors)
doc.AddField(field)
}
return nil
}
type walkContext struct {
doc *document.Document
im *IndexMapping
dm *DocumentMapping
excludedFromAll []string
}
func (im *IndexMapping) newWalkContext(doc *document.Document, dm *DocumentMapping) *walkContext {
return &walkContext{
doc: doc,
im: im,
dm: dm,
excludedFromAll: []string{},
}
}
// attempts to find the best analyzer to use with only a field name
// will walk all the document types, look for field mappings at the
// provided path, if one exists and it has an explicit analyzer
// that is returned
// nil should be an acceptable return value meaning we don't know
func (im *IndexMapping) analyzerNameForPath(path string) string {
// first we look for explicit mapping on the field
for _, docMapping := range im.TypeMapping {
analyzerName := docMapping.analyzerNameForPath(path)
if analyzerName != "" {
return analyzerName
}
}
// now try the default mapping
pathMapping := im.DefaultMapping.documentMappingForPath(path)
if pathMapping != nil {
if len(pathMapping.Fields) > 0 {
if pathMapping.Fields[0].Analyzer != "" {
return pathMapping.Fields[0].Analyzer
}
}
}
// next we will try default analyzers for the path
pathDecoded := decodePath(path)
for _, docMapping := range im.TypeMapping {
rv := docMapping.defaultAnalyzerName(pathDecoded)
if rv != "" {
return rv
}
}
return im.DefaultAnalyzer
}
func (im *IndexMapping) analyzerNamed(name string) *analysis.Analyzer {
analyzer, err := im.cache.AnalyzerNamed(name)
if err != nil {
logger.Printf("error using analyzer named: %s", name)
return nil
}
return analyzer
}
func (im *IndexMapping) dateTimeParserNamed(name string) analysis.DateTimeParser {
dateTimeParser, err := im.cache.DateTimeParserNamed(name)
if err != nil {
logger.Printf("error using datetime parser named: %s", name)
return nil
}
return dateTimeParser
}
func (im *IndexMapping) datetimeParserNameForPath(path string) string {
// first we look for explicit mapping on the field
for _, docMapping := range im.TypeMapping {
pathMapping := docMapping.documentMappingForPath(path)
if pathMapping != nil {
if len(pathMapping.Fields) > 0 {
if pathMapping.Fields[0].Analyzer != "" {
return pathMapping.Fields[0].Analyzer
}
}
}
}
return im.DefaultDateTimeParser
}
func (im *IndexMapping) AnalyzeText(analyzerName string, text []byte) (analysis.TokenStream, error) {
analyzer, err := im.cache.AnalyzerNamed(analyzerName)
if err != nil {
return nil, err
}
return analyzer.Analyze(text), nil
}