2014-08-15 19:12:55 +02:00
|
|
|
// Copyright (c) 2014 Couchbase, Inc.
|
2016-10-02 16:13:14 +02:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
2014-09-02 16:54:50 +02:00
|
|
|
|
2016-09-30 17:30:17 +02:00
|
|
|
package upsidedown
|
2014-08-15 19:12:55 +02:00
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"sort"
|
2014-09-12 23:21:35 +02:00
|
|
|
|
|
|
|
"github.com/blevesearch/bleve/index/store"
|
2014-08-15 19:12:55 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
// The functions in this file are only intended to be used by
// the bleve_dump utility and the debug HTTP handlers.
// If your application relies on them, you're doing something wrong:
// they may change or be removed at any time.
|
|
|
|
|
2016-09-13 18:40:01 +02:00
|
|
|
func dumpPrefix(kvreader store.KVReader, rv chan interface{}, prefix []byte) {
|
2014-08-15 19:12:55 +02:00
|
|
|
start := prefix
|
|
|
|
if start == nil {
|
|
|
|
start = []byte{0}
|
|
|
|
}
|
2015-09-23 20:25:47 +02:00
|
|
|
it := kvreader.PrefixIterator(start)
|
2015-04-07 18:04:59 +02:00
|
|
|
defer func() {
|
|
|
|
cerr := it.Close()
|
|
|
|
if cerr != nil {
|
|
|
|
rv <- cerr
|
|
|
|
}
|
|
|
|
}()
|
2014-08-15 19:12:55 +02:00
|
|
|
key, val, valid := it.Current()
|
|
|
|
for valid {
|
2015-10-28 17:06:44 +01:00
|
|
|
ck := make([]byte, len(key))
|
|
|
|
copy(ck, key)
|
|
|
|
cv := make([]byte, len(val))
|
|
|
|
copy(cv, val)
|
|
|
|
row, err := ParseFromKeyValue(ck, cv)
|
2015-09-23 20:25:47 +02:00
|
|
|
if err != nil {
|
|
|
|
rv <- err
|
|
|
|
return
|
2014-08-15 19:12:55 +02:00
|
|
|
}
|
2015-09-23 20:25:47 +02:00
|
|
|
rv <- row
|
|
|
|
|
|
|
|
it.Next()
|
|
|
|
key, val, valid = it.Current()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-09-13 18:40:01 +02:00
|
|
|
func dumpRange(kvreader store.KVReader, rv chan interface{}, start, end []byte) {
|
2015-09-23 20:25:47 +02:00
|
|
|
it := kvreader.RangeIterator(start, end)
|
|
|
|
defer func() {
|
|
|
|
cerr := it.Close()
|
|
|
|
if cerr != nil {
|
|
|
|
rv <- cerr
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
key, val, valid := it.Current()
|
|
|
|
for valid {
|
2015-10-28 17:06:44 +01:00
|
|
|
ck := make([]byte, len(key))
|
|
|
|
copy(ck, key)
|
|
|
|
cv := make([]byte, len(val))
|
|
|
|
copy(cv, val)
|
|
|
|
row, err := ParseFromKeyValue(ck, cv)
|
2014-08-15 19:12:55 +02:00
|
|
|
if err != nil {
|
|
|
|
rv <- err
|
|
|
|
return
|
|
|
|
}
|
|
|
|
rv <- row
|
|
|
|
|
|
|
|
it.Next()
|
|
|
|
key, val, valid = it.Current()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-09-13 18:40:01 +02:00
|
|
|
func (i *IndexReader) DumpAll() chan interface{} {
|
2014-08-15 19:12:55 +02:00
|
|
|
rv := make(chan interface{})
|
|
|
|
go func() {
|
|
|
|
defer close(rv)
|
2016-09-13 18:40:01 +02:00
|
|
|
dumpRange(i.kvreader, rv, nil, nil)
|
2014-08-15 19:12:55 +02:00
|
|
|
}()
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
2016-09-13 18:40:01 +02:00
|
|
|
func (i *IndexReader) DumpFields() chan interface{} {
|
2014-08-15 19:12:55 +02:00
|
|
|
rv := make(chan interface{})
|
|
|
|
go func() {
|
|
|
|
defer close(rv)
|
2016-09-13 18:40:01 +02:00
|
|
|
dumpPrefix(i.kvreader, rv, []byte{'f'})
|
2014-08-15 19:12:55 +02:00
|
|
|
}()
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
|
|
|
// keyset is a list of raw keys that implements sort.Interface, ordering the
// keys by ascending byte-wise comparison.
type keyset [][]byte

// Len reports the number of keys in the set.
func (ks keyset) Len() int {
	return len(ks)
}

// Swap exchanges the keys at positions i and j.
func (ks keyset) Swap(i, j int) {
	tmp := ks[i]
	ks[i] = ks[j]
	ks[j] = tmp
}

// Less reports whether the key at position i sorts strictly before the key
// at position j.
func (ks keyset) Less(i, j int) bool {
	// bytes.Compare yields -1 only when the first argument is smaller.
	return bytes.Compare(ks[i], ks[j]) == -1
}
|
|
|
|
|
|
|
|
// DumpDoc returns all rows in the index related to this doc id
|
2016-09-13 18:40:01 +02:00
|
|
|
func (i *IndexReader) DumpDoc(id string) chan interface{} {
|
2016-01-07 08:38:02 +01:00
|
|
|
idBytes := []byte(id)
|
|
|
|
|
2014-08-15 19:12:55 +02:00
|
|
|
rv := make(chan interface{})
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
defer close(rv)
|
|
|
|
|
2016-09-13 18:40:01 +02:00
|
|
|
back, err := backIndexRowForDoc(i.kvreader, []byte(id))
|
2014-08-15 19:12:55 +02:00
|
|
|
if err != nil {
|
|
|
|
rv <- err
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// no such doc
|
|
|
|
if back == nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// build sorted list of term keys
|
|
|
|
keys := make(keyset, 0)
|
INDEX FORMAT CHANGE: change back index row value
Previously term entries were encoded pairwise (field/term), so
you'd have data like:
F1/T1 F1/T2 F1/T3 F2/T4 F3/T5
As you can see, even though field 1 has 3 terms, we repeat the F1
part in the encoded data. This is a bit wasteful.
In the new format we encode it as a list of terms for each field:
F1/T1,T2,T3 F2/T4 F3/T5
When fields have multiple terms, this saves space. In unit
tests there is no additional waste even in the case that a field
has only a single value.
Here are the results of an indexing test case (beer-search):
$ benchcmp indexing-before.txt indexing-after.txt
benchmark old ns/op new ns/op delta
BenchmarkIndexing-4 11275835988 10745514321 -4.70%
benchmark old allocs new allocs delta
BenchmarkIndexing-4 25230685 22480494 -10.90%
benchmark old bytes new bytes delta
BenchmarkIndexing-4 4802816224 4741641856 -1.27%
And here are the results of a MatchAll search building a facet
on the "abv" field:
$ benchcmp facet-before.txt facet-after.txt
benchmark old ns/op new ns/op delta
BenchmarkFacets-4 439762100 228064575 -48.14%
benchmark old allocs new allocs delta
BenchmarkFacets-4 9460208 3723286 -60.64%
benchmark old bytes new bytes delta
BenchmarkFacets-4 260784261 151746483 -41.81%
Although we expect the index to be smaller in many cases, the
beer-search index is about the same in this case. However,
this may be due to the underlying storage (boltdb) in this case.
Finally, the index version was bumped from 5 to 7, since smolder
also used version 6, which could lead to some confusion.
2017-01-24 21:33:54 +01:00
|
|
|
for _, entry := range back.termsEntries {
|
|
|
|
for i := range entry.Terms {
|
|
|
|
tfr := NewTermFrequencyRow([]byte(entry.Terms[i]), uint16(*entry.Field), idBytes, 0, 0)
|
|
|
|
key := tfr.Key()
|
|
|
|
keys = append(keys, key)
|
|
|
|
}
|
2014-08-15 19:12:55 +02:00
|
|
|
}
|
|
|
|
sort.Sort(keys)
|
|
|
|
|
|
|
|
// first add all the stored rows
|
2016-01-07 08:38:02 +01:00
|
|
|
storedRowPrefix := NewStoredRow(idBytes, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
|
2016-09-13 18:40:01 +02:00
|
|
|
dumpPrefix(i.kvreader, rv, storedRowPrefix)
|
2014-08-15 19:12:55 +02:00
|
|
|
|
|
|
|
// now walk term keys in order and add them as well
|
|
|
|
if len(keys) > 0 {
|
2016-09-13 18:40:01 +02:00
|
|
|
it := i.kvreader.RangeIterator(keys[0], nil)
|
2015-04-07 18:04:59 +02:00
|
|
|
defer func() {
|
|
|
|
cerr := it.Close()
|
|
|
|
if cerr != nil {
|
|
|
|
rv <- cerr
|
|
|
|
}
|
|
|
|
}()
|
2014-08-15 19:12:55 +02:00
|
|
|
|
|
|
|
for _, key := range keys {
|
|
|
|
it.Seek(key)
|
|
|
|
rkey, rval, valid := it.Current()
|
|
|
|
if !valid {
|
|
|
|
break
|
|
|
|
}
|
2015-10-28 17:06:44 +01:00
|
|
|
rck := make([]byte, len(rkey))
|
|
|
|
copy(rck, key)
|
|
|
|
rcv := make([]byte, len(rval))
|
|
|
|
copy(rcv, rval)
|
|
|
|
row, err := ParseFromKeyValue(rck, rcv)
|
2014-08-15 19:12:55 +02:00
|
|
|
if err != nil {
|
|
|
|
rv <- err
|
|
|
|
return
|
|
|
|
}
|
|
|
|
rv <- row
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
return rv
|
|
|
|
}
|