bleve/index/upsidedown/analysis.go

//  Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package upsidedown

import (
	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
)

// Analyze turns document d into the index rows that describe it and returns
// them in an index.AnalysisResult carrying the document ID.
//
// Fields that resolve to the same field index (for example the elements of
// an array) are collated first, so each such field is passed to
// udc.indexField once, with its merged term frequencies and summed length.
// Fields marked as stored are handed to udc.storeField as they are visited.
// Composite fields are analyzed after the regular fields and are never
// passed to udc.storeField. The final row appended is the back index row
// built from the collected term and stored entries.
func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult {
	docIDBytes := []byte(d.ID)

	rv := &index.AnalysisResult{
		DocID: d.ID,
		Rows:  make([]index.IndexRow, 0, 100),
	}

	// stored-field entries destined for the back index row
	storedEntries := []*BackIndexStoreEntry{}

	// per-field-index state, accumulated across every field that maps to
	// the same index
	termFreqsByField := make(map[uint16]analysis.TokenFrequencies)
	lengthByField := make(map[uint16]int)
	termVectorsByField := make(map[uint16]bool)
	nameByField := make(map[uint16]string)

	// collate folds a single field into the per-field state above. When
	// allowStore is true and the field is marked as stored, the field is
	// also handed to storeField.
	collate := func(field document.Field, allowStore bool) {
		name := field.Name()
		opts := field.Options()

		idx, fieldRow := udc.fieldIndexOrNewRow(name)
		if fieldRow != nil {
			rv.Rows = append(rv.Rows, fieldRow)
		}
		nameByField[idx] = name

		if opts.IsIndexed() {
			length, freqs := field.Analyze()
			if merged := termFreqsByField[idx]; merged != nil {
				// a field with this index was seen before: merge into it
				merged.MergeAll(name, freqs)
				termFreqsByField[idx] = merged
			} else {
				termFreqsByField[idx] = freqs
			}
			lengthByField[idx] += length
			// the most recently visited field decides the term vector setting
			termVectorsByField[idx] = opts.IncludeTermVectors()
		}

		if !allowStore || !opts.IsStored() {
			return
		}
		rv.Rows, storedEntries = udc.storeField(docIDBytes, field, idx, rv.Rows, storedEntries)
	}

	// first pass: visit every regular field, storing where requested and
	// collating the indexed information by field index
	for _, field := range d.Fields {
		collate(field, true)
	}

	// the guard avoids walking the collated map when there is nothing to compose
	if len(d.CompositeFields) != 0 {
		// offer every collated field to every composite field
		for idx, freqs := range termFreqsByField {
			name, length := nameByField[idx], lengthByField[idx]
			for _, composite := range d.CompositeFields {
				composite.Compose(name, length, freqs)
			}
		}

		// composite fields are collated like regular fields, minus storing
		for _, composite := range d.CompositeFields {
			collate(composite, false)
		}
	}

	// re-home the rows gathered so far in a slice with room for one slot
	// per term frequency entry of every field, plus the back index row
	capNeeded := 1 + len(rv.Rows)
	for idx := range termFreqsByField {
		capNeeded += len(termFreqsByField[idx])
	}
	resized := make([]index.IndexRow, len(rv.Rows), capNeeded)
	copy(resized, rv.Rows)
	rv.Rows = resized

	// second pass: encode each collated field exactly once
	termsEntries := make([]*BackIndexTermsEntry, 0, len(termFreqsByField))
	for idx, freqs := range termFreqsByField {
		rv.Rows, termsEntries = udc.indexField(docIDBytes, termVectorsByField[idx], idx, lengthByField[idx], freqs, rv.Rows, termsEntries)
	}

	// the back index row always comes last
	rv.Rows = append(rv.Rows, NewBackIndexRow(docIDBytes, termsEntries, storedEntries))

	return rv
}
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`// Copyright (c) 2015 Couchbase, Inc.`
nicer formatting of license header 2016-10-02 16:13:14 +02:00			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00
BREAKING CHANGE - rename upside_down to upsidedown 2016-09-30 17:30:17 +02:00			`package upsidedown`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00
			`import (`
fix indexing bug with data coming from arrays fixes #295 2015-12-21 20:59:32 +01:00			`"github.com/blevesearch/bleve/analysis"`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`"github.com/blevesearch/bleve/document"`
			`"github.com/blevesearch/bleve/index"`
			`)`

			`func (udc UpsideDownCouch) Analyze(d document.Document) *index.AnalysisResult {`
			`rv := &index.AnalysisResult{`
			`DocID: d.ID,`
			`Rows: make([]index.IndexRow, 0, 100),`
			`}`

upside_down analysis converts to docIDBytes once 2016-01-07 08:38:02 +01:00			`docIDBytes := []byte(d.ID)`

made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`// track our back index entries`
			`backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)`

fix indexing bug with data coming from arrays fixes #295 2015-12-21 20:59:32 +01:00			`// information we collate as we merge fields with same name`
			`fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)`
			`fieldLengths := make(map[uint16]int)`
			`fieldIncludeTermVectors := make(map[uint16]bool)`
			`fieldNames := make(map[uint16]string)`

upside_down gets analysis perf rows optimizations from firestorm 2016-01-07 08:53:13 +01:00			`analyzeField := func(field document.Field, storable bool) {`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())`
			`if newFieldRow != nil {`
			`rv.Rows = append(rv.Rows, newFieldRow)`
			`}`
fix indexing bug with data coming from arrays fixes #295 2015-12-21 20:59:32 +01:00			`fieldNames[fieldIndex] = field.Name()`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00
			`if field.Options().IsIndexed() {`
			`fieldLength, tokenFreqs := field.Analyze()`
fix indexing bug with data coming from arrays fixes #295 2015-12-21 20:59:32 +01:00			`existingFreqs := fieldTermFreqs[fieldIndex]`
			`if existingFreqs == nil {`
			`fieldTermFreqs[fieldIndex] = tokenFreqs`
			`} else {`
			`existingFreqs.MergeAll(field.Name(), tokenFreqs)`
			`fieldTermFreqs[fieldIndex] = existingFreqs`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`}`
fix indexing bug with data coming from arrays fixes #295 2015-12-21 20:59:32 +01:00			`fieldLengths[fieldIndex] += fieldLength`
			`fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`}`

upside_down gets analysis perf rows optimizations from firestorm 2016-01-07 08:53:13 +01:00			`if storable && field.Options().IsStored() {`
upside_down storeField/indexField append to provided arrays Taking another optimization from firestorm, upside_down's storeField()/indexField() funcs now also append() to passed-in arrays rather than always allocating their own arrays. 2016-01-07 09:13:38 +01:00			`rv.Rows, backIndexStoredEntries = udc.storeField(docIDBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`}`
upside_down gets analysis perf rows optimizations from firestorm 2016-01-07 08:53:13 +01:00			`}`

			`// walk all the fields, record stored fields now`
			`// place information about indexed fields into map`
			`// this collates information across fields with`
			`// same names (arrays)`
			`for _, field := range d.Fields {`
			`analyzeField(field, true)`
			`}`

avoid fieldTermFreqs loop if no composite fields 2016-01-13 18:08:19 +01:00			`if len(d.CompositeFields) > 0 {`
			`for fieldIndex, tokenFreqs := range fieldTermFreqs {`
			`// see if any of the composite fields need this`
			`for _, compositeField := range d.CompositeFields {`
			`compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)`
			`}`
upside_down gets analysis perf rows optimizations from firestorm 2016-01-07 08:53:13 +01:00			`}`

avoid fieldTermFreqs loop if no composite fields 2016-01-13 18:08:19 +01:00			`for _, compositeField := range d.CompositeFields {`
			`analyzeField(compositeField, false)`
			`}`
upside_down gets analysis perf rows optimizations from firestorm 2016-01-07 08:53:13 +01:00			`}`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00
upside_down gets analysis perf rows optimizations from firestorm 2016-01-07 08:53:13 +01:00			`rowsCapNeeded := len(rv.Rows) + 1`
			`for _, tokenFreqs := range fieldTermFreqs {`
			`rowsCapNeeded += len(tokenFreqs)`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`}`

upside_down backIndexTermEntries precalloc'ed capacity 2016-01-07 09:23:25 +01:00			`rv.Rows = append(make([]index.IndexRow, 0, rowsCapNeeded), rv.Rows...)`

INDEX FORMAT CHANGE: change back index row value Previously term entries were encoded pairwise (field/term), so you'd have data like: F1/T1 F1/T2 F1/T3 F2/T4 F3/T5 As you can see, even though field 1 has 3 terms, we repeat the F1 part in the encoded data. This is a bit wasteful. In the new format we encode it as a list of terms for each field: F1/T1,T2,T3 F2/T4 F3/T5 When fields have multiple terms, this saves space. In unit tests there is no additional waste even in the case that a field has only a single value. Here are the results of an indexing test case (beer-search): $ benchcmp indexing-before.txt indexing-after.txt benchmark old ns/op new ns/op delta BenchmarkIndexing-4 11275835988 10745514321 -4.70% benchmark old allocs new allocs delta BenchmarkIndexing-4 25230685 22480494 -10.90% benchmark old bytes new bytes delta BenchmarkIndexing-4 4802816224 4741641856 -1.27% And here are the results of a MatchAll search building a facet on the "abv" field: $ benchcmp facet-before.txt facet-after.txt benchmark old ns/op new ns/op delta BenchmarkFacets-4 439762100 228064575 -48.14% benchmark old allocs new allocs delta BenchmarkFacets-4 9460208 3723286 -60.64% benchmark old bytes new bytes delta BenchmarkFacets-4 260784261 151746483 -41.81% Although we expect the index to be smaller in many cases, the beer-search index is about the same in this case. However, this may be due to the underlying storage (boltdb) in this case. Finally, the index version was bumped from 5 to 7, since smolder also used version 6, which could lead to some confusion. 2017-01-24 21:33:54 +01:00			`backIndexTermsEntries := make([]*BackIndexTermsEntry, 0, len(fieldTermFreqs))`
upside_down gets analysis perf rows optimizations from firestorm 2016-01-07 08:53:13 +01:00
fix typos 2016-04-03 03:59:30 +02:00			`// walk through the collated information and process`
fix indexing bug with data coming from arrays fixes #295 2015-12-21 20:59:32 +01:00			`// once for each indexed field (unique name)`
			`for fieldIndex, tokenFreqs := range fieldTermFreqs {`
			`fieldLength := fieldLengths[fieldIndex]`
			`includeTermVectors := fieldIncludeTermVectors[fieldIndex]`

			`// encode this field`
INDEX FORMAT CHANGE: change back index row value Previously term entries were encoded pairwise (field/term), so you'd have data like: F1/T1 F1/T2 F1/T3 F2/T4 F3/T5 As you can see, even though field 1 has 3 terms, we repeat the F1 part in the encoded data. This is a bit wasteful. In the new format we encode it as a list of terms for each field: F1/T1,T2,T3 F2/T4 F3/T5 When fields have multiple terms, this saves space. In unit tests there is no additional waste even in the case that a field has only a single value. Here are the results of an indexing test case (beer-search): $ benchcmp indexing-before.txt indexing-after.txt benchmark old ns/op new ns/op delta BenchmarkIndexing-4 11275835988 10745514321 -4.70% benchmark old allocs new allocs delta BenchmarkIndexing-4 25230685 22480494 -10.90% benchmark old bytes new bytes delta BenchmarkIndexing-4 4802816224 4741641856 -1.27% And here are the results of a MatchAll search building a facet on the "abv" field: $ benchcmp facet-before.txt facet-after.txt benchmark old ns/op new ns/op delta BenchmarkFacets-4 439762100 228064575 -48.14% benchmark old allocs new allocs delta BenchmarkFacets-4 9460208 3723286 -60.64% benchmark old bytes new bytes delta BenchmarkFacets-4 260784261 151746483 -41.81% Although we expect the index to be smaller in many cases, the beer-search index is about the same in this case. However, this may be due to the underlying storage (boltdb) in this case. Finally, the index version was bumped from 5 to 7, since smolder also used version 6, which could lead to some confusion. 2017-01-24 21:33:54 +01:00			`rv.Rows, backIndexTermsEntries = udc.indexField(docIDBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)`
fix indexing bug with data coming from arrays fixes #295 2015-12-21 20:59:32 +01:00			`}`

made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`// build the back index row`
INDEX FORMAT CHANGE: change back index row value Previously term entries were encoded pairwise (field/term), so you'd have data like: F1/T1 F1/T2 F1/T3 F2/T4 F3/T5 As you can see, even though field 1 has 3 terms, we repeat the F1 part in the encoded data. This is a bit wasteful. In the new format we encode it as a list of terms for each field: F1/T1,T2,T3 F2/T4 F3/T5 When fields have multiple terms, this saves space. In unit tests there is no additional waste even in the case that a field has only a single value. Here are the results of an indexing test case (beer-search): $ benchcmp indexing-before.txt indexing-after.txt benchmark old ns/op new ns/op delta BenchmarkIndexing-4 11275835988 10745514321 -4.70% benchmark old allocs new allocs delta BenchmarkIndexing-4 25230685 22480494 -10.90% benchmark old bytes new bytes delta BenchmarkIndexing-4 4802816224 4741641856 -1.27% And here are the results of a MatchAll search building a facet on the "abv" field: $ benchcmp facet-before.txt facet-after.txt benchmark old ns/op new ns/op delta BenchmarkFacets-4 439762100 228064575 -48.14% benchmark old allocs new allocs delta BenchmarkFacets-4 9460208 3723286 -60.64% benchmark old bytes new bytes delta BenchmarkFacets-4 260784261 151746483 -41.81% Although we expect the index to be smaller in many cases, the beer-search index is about the same in this case. However, this may be due to the underlying storage (boltdb) in this case. Finally, the index version was bumped from 5 to 7, since smolder also used version 6, which could lead to some confusion. 2017-01-24 21:33:54 +01:00			`backIndexRow := NewBackIndexRow(docIDBytes, backIndexTermsEntries, backIndexStoredEntries)`
made index type configurable + first version of firestorm 2015-08-25 20:52:42 +02:00			`rv.Rows = append(rv.Rows, backIndexRow)`

			`return rv`
			`}`