bleve/index/upsidedown/dump.go

//  Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package upsidedown

import (
	"bytes"
	"sort"

	"github.com/blevesearch/bleve/index/store"
)

// The functions in this file are intended to be used only by the bleve_dump
// utility and the debug HTTP handlers. If your application relies on them,
// you are doing something wrong: they may change or be removed at any time.

// dumpPrefix walks every key/value pair produced by a prefix iterator over
// prefix, parses each pair with ParseFromKeyValue and sends the resulting row
// on rv. A nil prefix is replaced by the single byte 0x00 before the iterator
// is created.
//
// Iteration stops at the first parse error, which is sent on rv in place of a
// row. When the function returns, the iterator is closed and a non-nil close
// error is sent on rv as well. rv itself is never closed here.
func dumpPrefix(kvreader store.KVReader, rv chan interface{}, prefix []byte) {
	if prefix == nil {
		prefix = []byte{0}
	}

	iter := kvreader.PrefixIterator(prefix)
	defer func() {
		if closeErr := iter.Close(); closeErr != nil {
			rv <- closeErr
		}
	}()

	for {
		k, v, ok := iter.Current()
		if !ok {
			return
		}

		// Parse private copies of the key and value rather than the slices
		// handed out by the iterator (presumably the iterator may reuse its
		// buffers after Next — confirm against the store contract).
		keyCopy := make([]byte, len(k))
		copy(keyCopy, k)
		valCopy := make([]byte, len(v))
		copy(valCopy, v)

		parsed, parseErr := ParseFromKeyValue(keyCopy, valCopy)
		if parseErr != nil {
			rv <- parseErr
			return
		}
		rv <- parsed

		iter.Next()
	}
}

// dumpRange walks every key/value pair produced by a range iterator created
// with the given start and end keys, parses each pair with ParseFromKeyValue
// and sends the resulting row on rv.
//
// The first parse error is sent on rv and ends the walk. On return the
// iterator is closed; a non-nil close error is also sent on rv. rv is left
// open for the caller to close.
func dumpRange(kvreader store.KVReader, rv chan interface{}, start, end []byte) {
	iter := kvreader.RangeIterator(start, end)
	defer func() {
		if closeErr := iter.Close(); closeErr != nil {
			rv <- closeErr
		}
	}()

	for k, v, ok := iter.Current(); ok; k, v, ok = iter.Current() {
		// Hand ParseFromKeyValue its own copies of the key and value
		// (presumably because the iterator's slices are only valid until
		// the next advance — confirm against the store contract).
		keyCopy := make([]byte, len(k))
		valCopy := make([]byte, len(v))
		copy(keyCopy, k)
		copy(valCopy, v)

		parsed, parseErr := ParseFromKeyValue(keyCopy, valCopy)
		if parseErr != nil {
			rv <- parseErr
			return
		}
		rv <- parsed

		iter.Next()
	}
}

// DumpAll starts a goroutine that runs dumpRange over the reader's key/value
// store with nil start and end keys, and returns the channel it writes to.
//
// The channel is unbuffered, so the goroutine blocks until each value is
// received. It carries both parsed rows and errors, and is closed when the
// walk finishes.
func (i *IndexReader) DumpAll() chan interface{} {
	rows := make(chan interface{})

	go func() {
		defer close(rows)
		dumpRange(i.kvreader, rows, nil, nil)
	}()

	return rows
}

// DumpFields starts a goroutine that runs dumpPrefix over the reader's
// key/value store for keys beginning with the byte 'f', and returns the
// channel it writes to.
//
// The channel is unbuffered, so the goroutine blocks until each value is
// received. It carries both parsed rows and errors, and is closed when the
// walk finishes.
func (i *IndexReader) DumpFields() chan interface{} {
	rows := make(chan interface{})

	go func() {
		defer close(rows)
		fieldPrefix := []byte{'f'}
		dumpPrefix(i.kvreader, rows, fieldPrefix)
	}()

	return rows
}

// keyset is a list of raw keys that can be sorted with the sort package;
// keys are ordered bytewise via bytes.Compare.
type keyset [][]byte

// Len reports how many keys the set holds.
func (k keyset) Len() int {
	return len(k)
}

// Swap exchanges the keys at positions i and j.
func (k keyset) Swap(i, j int) {
	a, b := k[i], k[j]
	k[i], k[j] = b, a
}

// Less reports whether the key at position i sorts strictly before the key
// at position j.
func (k keyset) Less(i, j int) bool {
	return bytes.Compare(k[i], k[j]) == -1
}

// DumpDoc returns all rows in the index related to this doc id.
//
// The work is done in a goroutine that writes to the returned channel. The
// channel is unbuffered, carries both parsed rows and errors, and is closed
// when the dump finishes. Rows are produced in this order:
//
//  1. every row found under the document's stored-row scan prefix;
//  2. one row per term key derived from the document's back index entries,
//     visited in sorted key order.
//
// If looking up the back index row fails, the error is sent and nothing else
// is produced. If there is no back index row for the id, the channel is
// closed without producing anything.
func (i *IndexReader) DumpDoc(id string) chan interface{} {
	idBytes := []byte(id)

	rv := make(chan interface{})

	go func() {
		defer close(rv)

		back, err := backIndexRowForDoc(i.kvreader, []byte(id))
		if err != nil {
			rv <- err
			return
		}

		// no such doc
		if back == nil {
			return
		}

		// build sorted list of term keys
		keys := make(keyset, 0)
		for _, entry := range back.termsEntries {
			for _, term := range entry.Terms {
				tfr := NewTermFrequencyRow([]byte(term), uint16(*entry.Field), idBytes, 0, 0)
				keys = append(keys, tfr.Key())
			}
		}
		sort.Sort(keys)

		// first add all the stored rows
		storedRowPrefix := NewStoredRow(idBytes, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
		dumpPrefix(i.kvreader, rv, storedRowPrefix)

		// nothing more to do when the back index lists no terms
		if len(keys) == 0 {
			return
		}

		// now walk term keys in order and add them as well; the keys are
		// sorted, so one iterator opened at the smallest key can be moved
		// forward with Seek for each subsequent key
		it := i.kvreader.RangeIterator(keys[0], nil)
		// registered after the deferred close(rv), so it runs first and the
		// close error can still be delivered on the open channel
		defer func() {
			if cerr := it.Close(); cerr != nil {
				rv <- cerr
			}
		}()

		for _, key := range keys {
			it.Seek(key)
			rkey, rval, valid := it.Current()
			if !valid {
				break
			}
			// Copy the key the iterator actually returned. The buffer is
			// sized from rkey, so it must also be filled from rkey; filling
			// it from the seek target would truncate or zero-pad the key
			// whenever the two differ.
			// NOTE(review): if Seek lands on a key other than the one
			// requested (e.g. the term row is missing), that row is emitted
			// as-is — confirm whether such rows should be skipped instead.
			rck := make([]byte, len(rkey))
			copy(rck, rkey)
			rcv := make([]byte, len(rval))
			copy(rcv, rval)
			row, err := ParseFromKeyValue(rck, rcv)
			if err != nil {
				rv <- err
				return
			}
			rv <- row
		}
	}()

	return rv
}
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`// Copyright (c) 2014 Couchbase, Inc.`
nicer formatting of license header 2016-10-02 16:13:14 +02:00			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
add newline between license and package this avoids cluttering godocs with the license 2014-09-02 16:54:50 +02:00
BREAKING CHANGE - rename upside_down to upsidedown 2016-09-30 17:30:17 +02:00			`package upsidedown`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00
			`import (`
			`"bytes"`
			`"sort"`
major refactor of kvstore/index internals, see below In the index/store package introduce KVReader creates snapshot all read operations consistent from this snapshot must close to release introduce KVWriter only one writer active access to all operations allows for consisten read-modify-write must close to release introduce AssociativeMerge operation on batch allows efficient read-modify-write for associative operations used to consolidate updates to the term summary rows saves 1 set and 1 get op per shared instance of term in field In the index package introduced an IndexReader exposes a consisten snapshot of the index for searching At top level All searches now operate on a consisten snapshot of the index 2014-09-12 23:21:35 +02:00
			`"github.com/blevesearch/bleve/index/store"`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`)`

			`// the functions in this file are only intended to be used by`
			`// the bleve_dump utility and the debug http handlers`
Fix typos in comments and strings 2014-12-18 18:43:12 +01:00			`// if your application relies on them, you're doing something wrong`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`// they may change or be removed at any time`

BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`func dumpPrefix(kvreader store.KVReader, rv chan interface{}, prefix []byte) {`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`start := prefix`
			`if start == nil {`
			`start = []byte{0}`
			`}`
major kvstore interface and impl overhaul clarified the interface contract 2015-09-23 20:25:47 +02:00			`it := kvreader.PrefixIterator(start)`
fix issues identified by errcheck part of #169 2015-04-07 18:04:59 +02:00			`defer func() {`
			`cerr := it.Close()`
			`if cerr != nil {`
			`rv <- cerr`
			`}`
			`}()`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`key, val, valid := it.Current()`
			`for valid {`
fix dump methods to properly copy keys and values 2015-10-28 17:06:44 +01:00			`ck := make([]byte, len(key))`
			`copy(ck, key)`
			`cv := make([]byte, len(val))`
			`copy(cv, val)`
			`row, err := ParseFromKeyValue(ck, cv)`
major kvstore interface and impl overhaul clarified the interface contract 2015-09-23 20:25:47 +02:00			`if err != nil {`
			`rv <- err`
			`return`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`}`
major kvstore interface and impl overhaul clarified the interface contract 2015-09-23 20:25:47 +02:00			`rv <- row`

			`it.Next()`
			`key, val, valid = it.Current()`
			`}`
			`}`

BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`func dumpRange(kvreader store.KVReader, rv chan interface{}, start, end []byte) {`
major kvstore interface and impl overhaul clarified the interface contract 2015-09-23 20:25:47 +02:00			`it := kvreader.RangeIterator(start, end)`
			`defer func() {`
			`cerr := it.Close()`
			`if cerr != nil {`
			`rv <- cerr`
			`}`
			`}()`
			`key, val, valid := it.Current()`
			`for valid {`
fix dump methods to properly copy keys and values 2015-10-28 17:06:44 +01:00			`ck := make([]byte, len(key))`
			`copy(ck, key)`
			`cv := make([]byte, len(val))`
			`copy(cv, val)`
			`row, err := ParseFromKeyValue(ck, cv)`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`if err != nil {`
			`rv <- err`
			`return`
			`}`
			`rv <- row`

			`it.Next()`
			`key, val, valid = it.Current()`
			`}`
			`}`

BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`func (i *IndexReader) DumpAll() chan interface{} {`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`rv := make(chan interface{})`
			`go func() {`
			`defer close(rv)`
BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`dumpRange(i.kvreader, rv, nil, nil)`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`}()`
			`return rv`
			`}`

BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`func (i *IndexReader) DumpFields() chan interface{} {`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`rv := make(chan interface{})`
			`go func() {`
			`defer close(rv)`
BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`dumpPrefix(i.kvreader, rv, []byte{'f'})`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`}()`
			`return rv`
			`}`

			`type keyset [][]byte`

			`func (k keyset) Len() int { return len(k) }`
			`func (k keyset) Swap(i, j int) { k[i], k[j] = k[j], k[i] }`
			`func (k keyset) Less(i, j int) bool { return bytes.Compare(k[i], k[j]) < 0 }`

			`// DumpDoc returns all rows in the index related to this doc id`
BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`func (i *IndexReader) DumpDoc(id string) chan interface{} {`
upside_down analysis converts to docIDBytes once 2016-01-07 08:38:02 +01:00			`idBytes := []byte(id)`

refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`rv := make(chan interface{})`

			`go func() {`
			`defer close(rv)`

BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`back, err := backIndexRowForDoc(i.kvreader, []byte(id))`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`if err != nil {`
			`rv <- err`
			`return`
			`}`

			`// no such doc`
			`if back == nil {`
			`return`
			`}`
			`// build sorted list of term keys`
			`keys := make(keyset, 0)`
INDEX FORMAT CHANGE: change back index row value Previously term entries were encoded pairwise (field/term), so you'd have data like: F1/T1 F1/T2 F1/T3 F2/T4 F3/T5 As you can see, even though field 1 has 3 terms, we repeat the F1 part in the encoded data. This is a bit wasteful. In the new format we encode it as a list of terms for each field: F1/T1,T2,T3 F2/T4 F3/T5 When fields have multiple terms, this saves space. In unit tests there is no additional waste even in the case that a field has only a single value. Here are the results of an indexing test case (beer-search): $ benchcmp indexing-before.txt indexing-after.txt benchmark old ns/op new ns/op delta BenchmarkIndexing-4 11275835988 10745514321 -4.70% benchmark old allocs new allocs delta BenchmarkIndexing-4 25230685 22480494 -10.90% benchmark old bytes new bytes delta BenchmarkIndexing-4 4802816224 4741641856 -1.27% And here are the results of a MatchAll search building a facet on the "abv" field: $ benchcmp facet-before.txt facet-after.txt benchmark old ns/op new ns/op delta BenchmarkFacets-4 439762100 228064575 -48.14% benchmark old allocs new allocs delta BenchmarkFacets-4 9460208 3723286 -60.64% benchmark old bytes new bytes delta BenchmarkFacets-4 260784261 151746483 -41.81% Although we expect the index to be smaller in many cases, the beer-search index is about the same in this case. However, this may be due to the underlying storage (boltdb) in this case. Finally, the index version was bumped from 5 to 7, since smolder also used version 6, which could lead to some confusion. 2017-01-24 21:33:54 +01:00			`for _, entry := range back.termsEntries {`
			`for i := range entry.Terms {`
			`tfr := NewTermFrequencyRow([]byte(entry.Terms[i]), uint16(*entry.Field), idBytes, 0, 0)`
			`key := tfr.Key()`
			`keys = append(keys, key)`
			`}`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`}`
			`sort.Sort(keys)`

			`// first add all the stored rows`
upside_down analysis converts to docIDBytes once 2016-01-07 08:38:02 +01:00			`storedRowPrefix := NewStoredRow(idBytes, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()`
BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`dumpPrefix(i.kvreader, rv, storedRowPrefix)`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00
			`// now walk term keys in order and add them as well`
			`if len(keys) > 0 {`
BREAKING CHANGE - removed DumpXXX() methods from bleve.Index The DumpXXX() methods were always documented as internal and unsupported. However, now they are being removed from the public top-level API. They are still available on the internal IndexReader, which can be accessed using the Advanced() method. The DocCount() and DumpXXX() methods on the internal index have moved to the internal index reader, since they logically operate on a snapshot of an index. 2016-09-13 18:40:01 +02:00			`it := i.kvreader.RangeIterator(keys[0], nil)`
fix issues identified by errcheck part of #169 2015-04-07 18:04:59 +02:00			`defer func() {`
			`cerr := it.Close()`
			`if cerr != nil {`
			`rv <- cerr`
			`}`
			`}()`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00
			`for _, key := range keys {`
			`it.Seek(key)`
			`rkey, rval, valid := it.Current()`
			`if !valid {`
			`break`
			`}`
fix dump methods to properly copy keys and values 2015-10-28 17:06:44 +01:00			`rck := make([]byte, len(rkey))`
			`copy(rck, key)`
			`rcv := make([]byte, len(rval))`
			`copy(rcv, rval)`
			`row, err := ParseFromKeyValue(rck, rcv)`
refactor dump methods improved test coverage 2014-08-15 19:12:55 +02:00			`if err != nil {`
			`rv <- err`
			`return`
			`}`
			`rv <- row`
			`}`
			`}`
			`}()`

			`return rv`
			`}`