2015-08-25 20:52:42 +02:00
|
|
|
// Copyright (c) 2015 Couchbase, Inc.
|
2016-10-02 16:13:14 +02:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
2015-08-25 20:52:42 +02:00
|
|
|
|
2016-09-30 17:30:17 +02:00
|
|
|
package upsidedown
|
2015-08-25 20:52:42 +02:00
|
|
|
|
|
|
|
import (
|
2015-12-21 20:59:32 +01:00
|
|
|
"github.com/blevesearch/bleve/analysis"
|
2015-08-25 20:52:42 +02:00
|
|
|
"github.com/blevesearch/bleve/document"
|
|
|
|
"github.com/blevesearch/bleve/index"
|
|
|
|
)
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult {
|
|
|
|
rv := &index.AnalysisResult{
|
|
|
|
DocID: d.ID,
|
|
|
|
Rows: make([]index.IndexRow, 0, 100),
|
|
|
|
}
|
|
|
|
|
2016-01-07 08:38:02 +01:00
|
|
|
docIDBytes := []byte(d.ID)
|
|
|
|
|
2015-08-25 20:52:42 +02:00
|
|
|
// track our back index entries
|
|
|
|
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
|
|
|
|
|
2015-12-21 20:59:32 +01:00
|
|
|
// information we collate as we merge fields with same name
|
|
|
|
fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
|
|
|
|
fieldLengths := make(map[uint16]int)
|
|
|
|
fieldIncludeTermVectors := make(map[uint16]bool)
|
|
|
|
fieldNames := make(map[uint16]string)
|
|
|
|
|
2016-01-07 08:53:13 +01:00
|
|
|
analyzeField := func(field document.Field, storable bool) {
|
2015-08-25 20:52:42 +02:00
|
|
|
fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
|
|
|
|
if newFieldRow != nil {
|
|
|
|
rv.Rows = append(rv.Rows, newFieldRow)
|
|
|
|
}
|
2015-12-21 20:59:32 +01:00
|
|
|
fieldNames[fieldIndex] = field.Name()
|
2015-08-25 20:52:42 +02:00
|
|
|
|
|
|
|
if field.Options().IsIndexed() {
|
|
|
|
fieldLength, tokenFreqs := field.Analyze()
|
2015-12-21 20:59:32 +01:00
|
|
|
existingFreqs := fieldTermFreqs[fieldIndex]
|
|
|
|
if existingFreqs == nil {
|
|
|
|
fieldTermFreqs[fieldIndex] = tokenFreqs
|
|
|
|
} else {
|
|
|
|
existingFreqs.MergeAll(field.Name(), tokenFreqs)
|
|
|
|
fieldTermFreqs[fieldIndex] = existingFreqs
|
2015-08-25 20:52:42 +02:00
|
|
|
}
|
2015-12-21 20:59:32 +01:00
|
|
|
fieldLengths[fieldIndex] += fieldLength
|
|
|
|
fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
|
2015-08-25 20:52:42 +02:00
|
|
|
}
|
|
|
|
|
2016-01-07 08:53:13 +01:00
|
|
|
if storable && field.Options().IsStored() {
|
2016-01-07 09:13:38 +01:00
|
|
|
rv.Rows, backIndexStoredEntries = udc.storeField(docIDBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)
|
2015-08-25 20:52:42 +02:00
|
|
|
}
|
2016-01-07 08:53:13 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// walk all the fields, record stored fields now
|
|
|
|
// place information about indexed fields into map
|
|
|
|
// this collates information across fields with
|
|
|
|
// same names (arrays)
|
|
|
|
for _, field := range d.Fields {
|
|
|
|
analyzeField(field, true)
|
|
|
|
}
|
|
|
|
|
2016-01-13 18:08:19 +01:00
|
|
|
if len(d.CompositeFields) > 0 {
|
|
|
|
for fieldIndex, tokenFreqs := range fieldTermFreqs {
|
|
|
|
// see if any of the composite fields need this
|
|
|
|
for _, compositeField := range d.CompositeFields {
|
|
|
|
compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
|
|
|
|
}
|
2016-01-07 08:53:13 +01:00
|
|
|
}
|
|
|
|
|
2016-01-13 18:08:19 +01:00
|
|
|
for _, compositeField := range d.CompositeFields {
|
|
|
|
analyzeField(compositeField, false)
|
|
|
|
}
|
2016-01-07 08:53:13 +01:00
|
|
|
}
|
2015-08-25 20:52:42 +02:00
|
|
|
|
2016-01-07 08:53:13 +01:00
|
|
|
rowsCapNeeded := len(rv.Rows) + 1
|
|
|
|
for _, tokenFreqs := range fieldTermFreqs {
|
|
|
|
rowsCapNeeded += len(tokenFreqs)
|
2015-08-25 20:52:42 +02:00
|
|
|
}
|
|
|
|
|
2016-01-07 09:23:25 +01:00
|
|
|
rv.Rows = append(make([]index.IndexRow, 0, rowsCapNeeded), rv.Rows...)
|
|
|
|
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
backIndexTermsEntries := make([]*BackIndexTermsEntry, 0, len(fieldTermFreqs))
|
2016-01-07 08:53:13 +01:00
|
|
|
|
2016-04-03 03:59:30 +02:00
|
|
|
// walk through the collated information and process
|
2015-12-21 20:59:32 +01:00
|
|
|
// once for each indexed field (unique name)
|
|
|
|
for fieldIndex, tokenFreqs := range fieldTermFreqs {
|
|
|
|
fieldLength := fieldLengths[fieldIndex]
|
|
|
|
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
|
|
|
|
|
|
|
|
// encode this field
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
rv.Rows, backIndexTermsEntries = udc.indexField(docIDBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)
|
2015-12-21 20:59:32 +01:00
|
|
|
}
|
|
|
|
|
2015-08-25 20:52:42 +02:00
|
|
|
// build the back index row
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
backIndexRow := NewBackIndexRow(docIDBytes, backIndexTermsEntries, backIndexStoredEntries)
|
2015-08-25 20:52:42 +02:00
|
|
|
rv.Rows = append(rv.Rows, backIndexRow)
|
|
|
|
|
|
|
|
return rv
|
|
|
|
}
|