// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package upsidedown

import (
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/store"
)
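
// IndexReader is a point-in-time, read-only view of the index, backed by a
// single isolated KVReader.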
type IndexReader struct {
	index    *UpsideDownCouch
	kvreader store.KVReader
	docCount uint64
}
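
// TermFieldReader returns a reader over the postings of term in the named
// field. For an unknown field it falls back to a sentinel term/field pair
// that should match nothing.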
func (i *IndexReader) TermFieldReader(term []byte, fieldName string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
	fieldIndex, fieldExists := i.index.fieldCache.FieldNamed(fieldName, false)
	if fieldExists {
		return newUpsideDownCouchTermFieldReader(i, term, uint16(fieldIndex), includeFreq, includeNorm, includeTermVectors)
	}
	return newUpsideDownCouchTermFieldReader(i, []byte{ByteSeparator}, ^uint16(0), includeFreq, includeNorm, includeTermVectors)
}
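
// FieldDict returns a dictionary over every term in the named field.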
func (i *IndexReader) FieldDict(fieldName string) (index.FieldDict, error) {
	return i.FieldDictRange(fieldName, nil, nil)
}
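
// FieldDictRange returns a dictionary over the terms in the named field
// between startTerm and endTerm; nil bounds leave that end unbounded.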
func (i *IndexReader) FieldDictRange(fieldName string, startTerm []byte, endTerm []byte) (index.FieldDict, error) {
	fieldIndex, fieldExists := i.index.fieldCache.FieldNamed(fieldName, false)
	if fieldExists {
		return newUpsideDownCouchFieldDict(i, uint16(fieldIndex), startTerm, endTerm)
	}
	return newUpsideDownCouchFieldDict(i, ^uint16(0), []byte{ByteSeparator}, []byte{})
}
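
// FieldDictPrefix returns a dictionary over the terms in the named field
// that begin with termPrefix.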
func (i *IndexReader) FieldDictPrefix(fieldName string, termPrefix []byte) (index.FieldDict, error) {
	return i.FieldDictRange(fieldName, termPrefix, termPrefix)
}
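
// DocIDReaderAll returns a reader over the IDs of all documents in the index.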
func (i *IndexReader) DocIDReaderAll() (index.DocIDReader, error) {
	return newUpsideDownCouchDocIDReader(i)
}
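
// DocIDReaderOnly returns a reader restricted to the given set of document IDs.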
func (i *IndexReader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
	return newUpsideDownCouchDocIDReaderOnly(i, ids)
}
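
// Document reconstructs the stored fields of a document by scanning its
// stored rows. It returns a nil document (and nil error) if the id is not
// in the index.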
func (i *IndexReader) Document(id string) (doc *document.Document, err error) {
	// first hit the back index to confirm doc exists
	var backIndexRow *BackIndexRow
	backIndexRow, err = backIndexRowForDoc(i.kvreader, []byte(id))
	if err != nil {
		return
	}
	if backIndexRow == nil {
		return
	}
	doc = document.NewDocument(id)
	storedRow := NewStoredRow([]byte(id), 0, []uint64{}, 'x', nil)
	storedRowScanPrefix := storedRow.ScanPrefixForDoc()
	it := i.kvreader.PrefixIterator(storedRowScanPrefix)
	defer func() {
		if cerr := it.Close(); err == nil && cerr != nil {
			err = cerr
		}
	}()
	key, val, valid := it.Current()
	for valid {
		// copy the value; the iterator may reuse its buffer on Next()
		safeVal := make([]byte, len(val))
		copy(safeVal, val)
		var row *StoredRow
		row, err = NewStoredRowKV(key, safeVal)
		if err != nil {
			doc = nil
			return
		}
		if row != nil {
			fieldName := i.index.fieldCache.FieldIndexed(row.field)
			field := decodeFieldType(row.typ, fieldName, row.arrayPositions, row.value)
			if field != nil {
				doc.AddField(field)
			}
		}

		it.Next()
		key, val, valid = it.Current()
	}
	return
}
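
// DocumentFieldTerms reports, for each requested field, the terms recorded
// for the document in its back index row.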
func (i *IndexReader) DocumentFieldTerms(id index.IndexInternalID, fields []string) (index.FieldTerms, error) {
	back, err := backIndexRowForDoc(i.kvreader, id)
	if err != nil {
		return nil, err
	}
	if back == nil {
		return nil, nil
	}
	rv := make(index.FieldTerms, len(fields))
	fieldsMap := make(map[uint16]string, len(fields))
	for _, f := range fields {
		id, ok := i.index.fieldCache.FieldNamed(f, false)
		if ok {
			fieldsMap[id] = f
		}
	}
	for _, entry := range back.termsEntries {
		if field, ok := fieldsMap[uint16(*entry.Field)]; ok {
			rv[field] = entry.Terms
		}
	}
	return rv, nil
}
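
// Fields returns the names of all fields known to the index.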
func (i *IndexReader) Fields() (fields []string, err error) {
	fields = make([]string, 0)
	// field rows are stored under the 'f' prefix
	it := i.kvreader.PrefixIterator([]byte{'f'})
	defer func() {
		if cerr := it.Close(); err == nil && cerr != nil {
			err = cerr
		}
	}()
	key, val, valid := it.Current()
	for valid {
		var row UpsideDownCouchRow
		row, err = ParseFromKeyValue(key, val)
		if err != nil {
			fields = nil
			return
		}
		if row != nil {
			fieldRow, ok := row.(*FieldRow)
			if ok {
				fields = append(fields, fieldRow.name)
			}
		}

		it.Next()
		key, val, valid = it.Current()
	}
	return
}
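
// GetInternal fetches a value previously stored under key in the index's
// internal (application-reserved) row namespace.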
func (i *IndexReader) GetInternal(key []byte) ([]byte, error) {
	internalRow := NewInternalRow(key, nil)
	return i.kvreader.Get(internalRow.Key())
}
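
// DocCount returns the document count captured when this reader was opened.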
func (i *IndexReader) DocCount() (uint64, error) {
	return i.docCount, nil
}
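
// Close releases the underlying KVReader.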
func (i *IndexReader) Close() error {
	return i.kvreader.Close()
}
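
// ExternalID converts an internal document ID back to its external string
// form; in upsidedown the two share the same bytes.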
func (i *IndexReader) ExternalID(id index.IndexInternalID) (string, error) {
	return string(id), nil
}
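
// InternalID converts an external document ID to its internal form, again a
// direct byte-for-byte conversion.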
func (i *IndexReader) InternalID(id string) (index.IndexInternalID, error) {
	return index.IndexInternalID(id), nil
}
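
// incrementBytes returns a copy of in treated as a big-endian unsigned
// integer and incremented by one; for example {0x12, 0xff} becomes
// {0x13, 0x00}. If every byte overflows, the result wraps to all zeros.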
func incrementBytes(in []byte) []byte {
	rv := make([]byte, len(in))
	copy(rv, in)
	for i := len(rv) - 1; i >= 0; i-- {
		rv[i] = rv[i] + 1
		if rv[i] != 0 {
			// didn't overflow, so stop
			break
		}
	}
	return rv
}