7e36109b3c
This API (unexported) will estimate the amount of memory needed to execute a search query over an index before the collector begins data collection. Sample estimates for certain queries: {Size: 10, BenchmarkUpsidedownSearchOverhead} ESTIMATE BENCHMEM TermQuery 4616 4796 MatchQuery 5210 5405 DisjunctionQuery (Match queries) 7700 8447 DisjunctionQuery (Term queries) 6514 6591 ConjunctionQuery (Match queries) 7524 8175 Nested disjunction query (disjunction of disjunctions) 10306 10708 …
1142 lines
27 KiB
Go
1142 lines
27 KiB
Go
// Copyright (c) 2014 Couchbase, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package upsidedown
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/binary"
|
|
"fmt"
|
|
"io"
|
|
"math"
|
|
"reflect"
|
|
|
|
"github.com/blevesearch/bleve/size"
|
|
"github.com/golang/protobuf/proto"
|
|
)
|
|
|
|
// Shallow in-memory sizes, in bytes, of TermFrequencyRow and TermVector.
// Both are assigned once in init and read by the Size methods.
var (
	reflectStaticSizeTermFrequencyRow int
	reflectStaticSizeTermVector       int
)
|
|
|
|
func init() {
|
|
var tfr TermFrequencyRow
|
|
reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size())
|
|
var tv TermVector
|
|
reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size())
|
|
}
|
|
|
|
// ByteSeparator is the byte written after a variable-length component
// (field name, term, document id) inside encoded keys and values.
const ByteSeparator byte = 0xff
|
|
|
|
// UpsideDownCouchRowStream is a channel carrying UpsideDownCouchRow values.
type UpsideDownCouchRowStream chan UpsideDownCouchRow
|
|
|
|
// UpsideDownCouchRow is implemented by every row type in this file. A row can
// serialize its key and its value either into a fresh slice or into a buffer
// supplied by the caller.
type UpsideDownCouchRow interface {
	// Key returns the encoded key.
	Key() []byte
	// KeySize returns the buffer size KeyTo needs; the encoded key may be
	// shorter than this.
	KeySize() int
	// KeyTo encodes the key into the given buffer and reports how many
	// bytes were written.
	KeyTo([]byte) (int, error)

	// Value returns the encoded value.
	Value() []byte
	// ValueSize returns the buffer size ValueTo needs; the encoded value
	// may be shorter than this.
	ValueSize() int
	// ValueTo encodes the value into the given buffer and reports how many
	// bytes were written.
	ValueTo([]byte) (int, error)
}
|
|
|
|
func ParseFromKeyValue(key, value []byte) (UpsideDownCouchRow, error) {
|
|
if len(key) > 0 {
|
|
switch key[0] {
|
|
case 'v':
|
|
return NewVersionRowKV(key, value)
|
|
case 'f':
|
|
return NewFieldRowKV(key, value)
|
|
case 'd':
|
|
return NewDictionaryRowKV(key, value)
|
|
case 't':
|
|
return NewTermFrequencyRowKV(key, value)
|
|
case 'b':
|
|
return NewBackIndexRowKV(key, value)
|
|
case 's':
|
|
return NewStoredRowKV(key, value)
|
|
case 'i':
|
|
return NewInternalRowKV(key, value)
|
|
}
|
|
return nil, fmt.Errorf("Unknown field type '%s'", string(key[0]))
|
|
}
|
|
return nil, fmt.Errorf("Invalid empty key")
|
|
}
|
|
|
|
// VERSION
|
|
|
|
// VersionRow stores the index format version. It is kept under the
// single-byte key 'v' with a single-byte value.
type VersionRow struct {
	version uint8 // format version number
}
|
|
|
|
func (v *VersionRow) Key() []byte {
|
|
return []byte{'v'}
|
|
}
|
|
|
|
func (v *VersionRow) KeySize() int {
|
|
return 1
|
|
}
|
|
|
|
func (v *VersionRow) KeyTo(buf []byte) (int, error) {
|
|
buf[0] = 'v'
|
|
return 1, nil
|
|
}
|
|
|
|
func (v *VersionRow) Value() []byte {
|
|
return []byte{byte(v.version)}
|
|
}
|
|
|
|
func (v *VersionRow) ValueSize() int {
|
|
return 1
|
|
}
|
|
|
|
func (v *VersionRow) ValueTo(buf []byte) (int, error) {
|
|
buf[0] = v.version
|
|
return 1, nil
|
|
}
|
|
|
|
func (v *VersionRow) String() string {
|
|
return fmt.Sprintf("Version: %d", v.version)
|
|
}
|
|
|
|
func NewVersionRow(version uint8) *VersionRow {
|
|
return &VersionRow{
|
|
version: version,
|
|
}
|
|
}
|
|
|
|
func NewVersionRowKV(key, value []byte) (*VersionRow, error) {
|
|
rv := VersionRow{}
|
|
buf := bytes.NewBuffer(value)
|
|
err := binary.Read(buf, binary.LittleEndian, &rv.version)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &rv, nil
|
|
}
|
|
|
|
// INTERNAL STORAGE
|
|
|
|
// InternalRow is an opaque key/value pair. Its encoded key is the byte 'i'
// followed by key; its encoded value is val as-is.
type InternalRow struct {
	key []byte // key without the 'i' prefix
	val []byte // raw value bytes
}
|
|
|
|
func (i *InternalRow) Key() []byte {
|
|
buf := make([]byte, i.KeySize())
|
|
size, _ := i.KeyTo(buf)
|
|
return buf[:size]
|
|
}
|
|
|
|
func (i *InternalRow) KeySize() int {
|
|
return len(i.key) + 1
|
|
}
|
|
|
|
func (i *InternalRow) KeyTo(buf []byte) (int, error) {
|
|
buf[0] = 'i'
|
|
actual := copy(buf[1:], i.key)
|
|
return 1 + actual, nil
|
|
}
|
|
|
|
func (i *InternalRow) Value() []byte {
|
|
return i.val
|
|
}
|
|
|
|
func (i *InternalRow) ValueSize() int {
|
|
return len(i.val)
|
|
}
|
|
|
|
func (i *InternalRow) ValueTo(buf []byte) (int, error) {
|
|
actual := copy(buf, i.val)
|
|
return actual, nil
|
|
}
|
|
|
|
func (i *InternalRow) String() string {
|
|
return fmt.Sprintf("InternalStore - Key: %s (% x) Val: %s (% x)", i.key, i.key, i.val, i.val)
|
|
}
|
|
|
|
func NewInternalRow(key, val []byte) *InternalRow {
|
|
return &InternalRow{
|
|
key: key,
|
|
val: val,
|
|
}
|
|
}
|
|
|
|
func NewInternalRowKV(key, value []byte) (*InternalRow, error) {
|
|
rv := InternalRow{}
|
|
rv.key = key[1:]
|
|
rv.val = value
|
|
return &rv, nil
|
|
}
|
|
|
|
// FIELD definition
|
|
|
|
// FieldRow maps a numeric field index to its name. The key is 'f' followed
// by the little-endian index; the value is the name followed by
// ByteSeparator.
type FieldRow struct {
	index uint16 // numeric field identifier
	name  string // field name
}
|
|
|
|
func (f *FieldRow) Key() []byte {
|
|
buf := make([]byte, f.KeySize())
|
|
size, _ := f.KeyTo(buf)
|
|
return buf[:size]
|
|
}
|
|
|
|
func (f *FieldRow) KeySize() int {
|
|
return 3
|
|
}
|
|
|
|
func (f *FieldRow) KeyTo(buf []byte) (int, error) {
|
|
buf[0] = 'f'
|
|
binary.LittleEndian.PutUint16(buf[1:3], f.index)
|
|
return 3, nil
|
|
}
|
|
|
|
func (f *FieldRow) Value() []byte {
|
|
return append([]byte(f.name), ByteSeparator)
|
|
}
|
|
|
|
func (f *FieldRow) ValueSize() int {
|
|
return len(f.name) + 1
|
|
}
|
|
|
|
func (f *FieldRow) ValueTo(buf []byte) (int, error) {
|
|
size := copy(buf, f.name)
|
|
buf[size] = ByteSeparator
|
|
return size + 1, nil
|
|
}
|
|
|
|
func (f *FieldRow) String() string {
|
|
return fmt.Sprintf("Field: %d Name: %s", f.index, f.name)
|
|
}
|
|
|
|
func NewFieldRow(index uint16, name string) *FieldRow {
|
|
return &FieldRow{
|
|
index: index,
|
|
name: name,
|
|
}
|
|
}
|
|
|
|
func NewFieldRowKV(key, value []byte) (*FieldRow, error) {
|
|
rv := FieldRow{}
|
|
|
|
buf := bytes.NewBuffer(key)
|
|
_, err := buf.ReadByte() // type
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
err = binary.Read(buf, binary.LittleEndian, &rv.index)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
buf = bytes.NewBuffer(value)
|
|
rv.name, err = buf.ReadString(ByteSeparator)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
rv.name = rv.name[:len(rv.name)-1] // trim off separator byte
|
|
|
|
return &rv, nil
|
|
}
|
|
|
|
// DICTIONARY
|
|
|
|
// DictionaryRowMaxValueSize is the buffer size needed to encode a
// DictionaryRow value, which is a single uvarint count.
const DictionaryRowMaxValueSize = binary.MaxVarintLen64
|
|
|
|
// DictionaryRow records a count for a term within a field. The key is 'd',
// the little-endian field index and the term bytes; the value is the count
// encoded as a uvarint.
type DictionaryRow struct {
	term  []byte // term bytes
	count uint64 // count stored in the row's value
	field uint16 // numeric field identifier
}
|
|
|
|
func (dr *DictionaryRow) Key() []byte {
|
|
buf := make([]byte, dr.KeySize())
|
|
size, _ := dr.KeyTo(buf)
|
|
return buf[:size]
|
|
}
|
|
|
|
func (dr *DictionaryRow) KeySize() int {
|
|
return dictionaryRowKeySize(dr.term)
|
|
}
|
|
|
|
// dictionaryRowKeySize returns the encoded dictionary key length for term:
// one prefix byte, two field-index bytes, then the term itself.
func dictionaryRowKeySize(term []byte) int {
	const headerLen = 1 + 2
	return headerLen + len(term)
}
|
|
|
|
func (dr *DictionaryRow) KeyTo(buf []byte) (int, error) {
|
|
return dictionaryRowKeyTo(buf, dr.field, dr.term), nil
|
|
}
|
|
|
|
// dictionaryRowKeyTo writes a dictionary key ('d', little-endian field, term)
// into buf and returns the number of bytes written. buf must hold at least
// three bytes; a term longer than the remaining space is truncated by copy.
func dictionaryRowKeyTo(buf []byte, field uint16, term []byte) int {
	const headerLen = 3
	buf[0] = 'd'
	binary.LittleEndian.PutUint16(buf[1:headerLen], field)
	return headerLen + copy(buf[headerLen:], term)
}
|
|
|
|
func (dr *DictionaryRow) Value() []byte {
|
|
buf := make([]byte, dr.ValueSize())
|
|
size, _ := dr.ValueTo(buf)
|
|
return buf[:size]
|
|
}
|
|
|
|
func (dr *DictionaryRow) ValueSize() int {
|
|
return DictionaryRowMaxValueSize
|
|
}
|
|
|
|
func (dr *DictionaryRow) ValueTo(buf []byte) (int, error) {
|
|
used := binary.PutUvarint(buf, dr.count)
|
|
return used, nil
|
|
}
|
|
|
|
func (dr *DictionaryRow) String() string {
|
|
return fmt.Sprintf("Dictionary Term: `%s` Field: %d Count: %d ", string(dr.term), dr.field, dr.count)
|
|
}
|
|
|
|
func NewDictionaryRow(term []byte, field uint16, count uint64) *DictionaryRow {
|
|
return &DictionaryRow{
|
|
term: term,
|
|
field: field,
|
|
count: count,
|
|
}
|
|
}
|
|
|
|
func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) {
|
|
rv, err := NewDictionaryRowK(key)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
err = rv.parseDictionaryV(value)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
func NewDictionaryRowK(key []byte) (*DictionaryRow, error) {
|
|
rv := &DictionaryRow{}
|
|
err := rv.parseDictionaryK(key)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return rv, nil
|
|
}
|
|
|
|
func (dr *DictionaryRow) parseDictionaryK(key []byte) error {
|
|
dr.field = binary.LittleEndian.Uint16(key[1:3])
|
|
if dr.term != nil {
|
|
dr.term = dr.term[:0]
|
|
}
|
|
dr.term = append(dr.term, key[3:]...)
|
|
return nil
|
|
}
|
|
|
|
func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
|
|
count, err := dictionaryRowParseV(value)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
dr.count = count
|
|
return nil
|
|
}
|
|
|
|
// dictionaryRowParseV decodes the uvarint at the start of value and returns
// it. An error is returned when value is empty, truncated, or the encoded
// number overflows 64 bits.
func dictionaryRowParseV(value []byte) (uint64, error) {
	count, nread := binary.Uvarint(value)
	if nread > 0 {
		return count, nil
	}
	return 0, fmt.Errorf("DictionaryRow parse Uvarint error, nread: %d", nread)
}
|
|
|
|
// TERM FIELD FREQUENCY
|
|
|
|
// TermVector holds the fields stored for one term vector inside a
// TermFrequencyRow value.
type TermVector struct {
	field          uint16   // numeric field identifier
	arrayPositions []uint64 // array positions stored with the vector
	pos            uint64   // position
	start          uint64   // start offset
	end            uint64   // end offset
}
|
|
|
|
func (tv *TermVector) Size() int {
|
|
return reflectStaticSizeTermVector + size.SizeOfPtr +
|
|
len(tv.arrayPositions)*size.SizeOfUint64
|
|
}
|
|
|
|
func (tv *TermVector) String() string {
|
|
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
|
|
}
|
|
|
|
type TermFrequencyRow struct {
|
|
term []byte
|
|
doc []byte
|
|
freq uint64
|
|
vectors []*TermVector
|
|
norm float32
|
|
field uint16
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) Size() int {
|
|
sizeInBytes := reflectStaticSizeTermFrequencyRow +
|
|
len(tfr.term) +
|
|
len(tfr.doc)
|
|
|
|
for _, entry := range tfr.vectors {
|
|
sizeInBytes += entry.Size()
|
|
}
|
|
|
|
return sizeInBytes
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) Term() []byte {
|
|
return tfr.term
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) Freq() uint64 {
|
|
return tfr.freq
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) ScanPrefixForField() []byte {
|
|
buf := make([]byte, 3)
|
|
buf[0] = 't'
|
|
binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
|
|
return buf
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) ScanPrefixForFieldTermPrefix() []byte {
|
|
buf := make([]byte, 3+len(tfr.term))
|
|
buf[0] = 't'
|
|
binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
|
|
copy(buf[3:], tfr.term)
|
|
return buf
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) ScanPrefixForFieldTerm() []byte {
|
|
buf := make([]byte, 3+len(tfr.term)+1)
|
|
buf[0] = 't'
|
|
binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
|
|
termLen := copy(buf[3:], tfr.term)
|
|
buf[3+termLen] = ByteSeparator
|
|
return buf
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) Key() []byte {
|
|
buf := make([]byte, tfr.KeySize())
|
|
size, _ := tfr.KeyTo(buf)
|
|
return buf[:size]
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) KeySize() int {
|
|
return termFrequencyRowKeySize(tfr.term, tfr.doc)
|
|
}
|
|
|
|
// termFrequencyRowKeySize returns the encoded term frequency key length:
// three header bytes (prefix and field index), the term, one separator byte,
// and the doc id.
func termFrequencyRowKeySize(term, doc []byte) int {
	const (
		headerLen    = 3
		separatorLen = 1
	)
	return headerLen + separatorLen + len(term) + len(doc)
}
|
|
|
|
func (tfr *TermFrequencyRow) KeyTo(buf []byte) (int, error) {
|
|
return termFrequencyRowKeyTo(buf, tfr.field, tfr.term, tfr.doc), nil
|
|
}
|
|
|
|
func termFrequencyRowKeyTo(buf []byte, field uint16, term, doc []byte) int {
|
|
buf[0] = 't'
|
|
binary.LittleEndian.PutUint16(buf[1:3], field)
|
|
termLen := copy(buf[3:], term)
|
|
buf[3+termLen] = ByteSeparator
|
|
docLen := copy(buf[3+termLen+1:], doc)
|
|
return 3 + termLen + 1 + docLen
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) KeyAppendTo(buf []byte) ([]byte, error) {
|
|
keySize := tfr.KeySize()
|
|
if cap(buf) < keySize {
|
|
buf = make([]byte, keySize)
|
|
}
|
|
actualSize, err := tfr.KeyTo(buf[0:keySize])
|
|
return buf[0:actualSize], err
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) DictionaryRowKey() []byte {
|
|
dr := NewDictionaryRow(tfr.term, tfr.field, 0)
|
|
return dr.Key()
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) DictionaryRowKeySize() int {
|
|
dr := NewDictionaryRow(tfr.term, tfr.field, 0)
|
|
return dr.KeySize()
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) DictionaryRowKeyTo(buf []byte) (int, error) {
|
|
dr := NewDictionaryRow(tfr.term, tfr.field, 0)
|
|
return dr.KeyTo(buf)
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) Value() []byte {
|
|
buf := make([]byte, tfr.ValueSize())
|
|
size, _ := tfr.ValueTo(buf)
|
|
return buf[:size]
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) ValueSize() int {
|
|
bufLen := binary.MaxVarintLen64 + binary.MaxVarintLen64
|
|
for _, vector := range tfr.vectors {
|
|
bufLen += (binary.MaxVarintLen64 * 4) + (1+len(vector.arrayPositions))*binary.MaxVarintLen64
|
|
}
|
|
return bufLen
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) ValueTo(buf []byte) (int, error) {
|
|
used := binary.PutUvarint(buf[:binary.MaxVarintLen64], tfr.freq)
|
|
|
|
normuint32 := math.Float32bits(tfr.norm)
|
|
newbuf := buf[used : used+binary.MaxVarintLen64]
|
|
used += binary.PutUvarint(newbuf, uint64(normuint32))
|
|
|
|
for _, vector := range tfr.vectors {
|
|
used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(vector.field))
|
|
used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.pos)
|
|
used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.start)
|
|
used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.end)
|
|
used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(len(vector.arrayPositions)))
|
|
for _, arrayPosition := range vector.arrayPositions {
|
|
used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], arrayPosition)
|
|
}
|
|
}
|
|
return used, nil
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) String() string {
|
|
return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
|
|
}
|
|
|
|
func InitTermFrequencyRow(tfr *TermFrequencyRow, term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
|
|
tfr.term = term
|
|
tfr.field = field
|
|
tfr.doc = docID
|
|
tfr.freq = freq
|
|
tfr.norm = norm
|
|
return tfr
|
|
}
|
|
|
|
func NewTermFrequencyRow(term []byte, field uint16, docID []byte, freq uint64, norm float32) *TermFrequencyRow {
|
|
return &TermFrequencyRow{
|
|
term: term,
|
|
field: field,
|
|
doc: docID,
|
|
freq: freq,
|
|
norm: norm,
|
|
}
|
|
}
|
|
|
|
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docID []byte, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
|
|
return &TermFrequencyRow{
|
|
term: term,
|
|
field: field,
|
|
doc: docID,
|
|
freq: freq,
|
|
norm: norm,
|
|
vectors: vectors,
|
|
}
|
|
}
|
|
|
|
func NewTermFrequencyRowK(key []byte) (*TermFrequencyRow, error) {
|
|
rv := &TermFrequencyRow{}
|
|
err := rv.parseK(key)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return rv, nil
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) parseK(key []byte) error {
|
|
keyLen := len(key)
|
|
if keyLen < 3 {
|
|
return fmt.Errorf("invalid term frequency key, no valid field")
|
|
}
|
|
tfr.field = binary.LittleEndian.Uint16(key[1:3])
|
|
|
|
termEndPos := bytes.IndexByte(key[3:], ByteSeparator)
|
|
if termEndPos < 0 {
|
|
return fmt.Errorf("invalid term frequency key, no byte separator terminating term")
|
|
}
|
|
tfr.term = key[3 : 3+termEndPos]
|
|
|
|
docLen := keyLen - (3 + termEndPos + 1)
|
|
if docLen < 1 {
|
|
return fmt.Errorf("invalid term frequency key, empty docid")
|
|
}
|
|
tfr.doc = key[3+termEndPos+1:]
|
|
|
|
return nil
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error {
|
|
tfr.doc = key[3+len(term)+1:]
|
|
if len(tfr.doc) <= 0 {
|
|
return fmt.Errorf("invalid term frequency key, empty docid")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (tfr *TermFrequencyRow) parseV(value []byte, includeTermVectors bool) error {
|
|
var bytesRead int
|
|
tfr.freq, bytesRead = binary.Uvarint(value)
|
|
if bytesRead <= 0 {
|
|
return fmt.Errorf("invalid term frequency value, invalid frequency")
|
|
}
|
|
currOffset := bytesRead
|
|
|
|
var norm uint64
|
|
norm, bytesRead = binary.Uvarint(value[currOffset:])
|
|
if bytesRead <= 0 {
|
|
return fmt.Errorf("invalid term frequency value, no norm")
|
|
}
|
|
currOffset += bytesRead
|
|
|
|
tfr.norm = math.Float32frombits(uint32(norm))
|
|
|
|
tfr.vectors = nil
|
|
if !includeTermVectors {
|
|
return nil
|
|
}
|
|
|
|
var field uint64
|
|
field, bytesRead = binary.Uvarint(value[currOffset:])
|
|
for bytesRead > 0 {
|
|
currOffset += bytesRead
|
|
tv := TermVector{}
|
|
tv.field = uint16(field)
|
|
// at this point we expect at least one term vector
|
|
if tfr.vectors == nil {
|
|
tfr.vectors = make([]*TermVector, 0)
|
|
}
|
|
|
|
tv.pos, bytesRead = binary.Uvarint(value[currOffset:])
|
|
if bytesRead <= 0 {
|
|
return fmt.Errorf("invalid term frequency value, vector contains no position")
|
|
}
|
|
currOffset += bytesRead
|
|
|
|
tv.start, bytesRead = binary.Uvarint(value[currOffset:])
|
|
if bytesRead <= 0 {
|
|
return fmt.Errorf("invalid term frequency value, vector contains no start")
|
|
}
|
|
currOffset += bytesRead
|
|
|
|
tv.end, bytesRead = binary.Uvarint(value[currOffset:])
|
|
if bytesRead <= 0 {
|
|
return fmt.Errorf("invalid term frequency value, vector contains no end")
|
|
}
|
|
currOffset += bytesRead
|
|
|
|
var arrayPositionsLen uint64 = 0
|
|
arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:])
|
|
if bytesRead <= 0 {
|
|
return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen")
|
|
}
|
|
currOffset += bytesRead
|
|
|
|
if arrayPositionsLen > 0 {
|
|
tv.arrayPositions = make([]uint64, arrayPositionsLen)
|
|
for i := 0; uint64(i) < arrayPositionsLen; i++ {
|
|
tv.arrayPositions[i], bytesRead = binary.Uvarint(value[currOffset:])
|
|
if bytesRead <= 0 {
|
|
return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i)
|
|
}
|
|
currOffset += bytesRead
|
|
}
|
|
}
|
|
|
|
tfr.vectors = append(tfr.vectors, &tv)
|
|
// try to read next record (may not exist)
|
|
field, bytesRead = binary.Uvarint(value[currOffset:])
|
|
}
|
|
if len(value[currOffset:]) > 0 && bytesRead <= 0 {
|
|
return fmt.Errorf("invalid term frequency value, vector field invalid")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func NewTermFrequencyRowKV(key, value []byte) (*TermFrequencyRow, error) {
|
|
rv, err := NewTermFrequencyRowK(key)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
err = rv.parseV(value, true)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
type BackIndexRow struct {
|
|
doc []byte
|
|
termsEntries []*BackIndexTermsEntry
|
|
storedEntries []*BackIndexStoreEntry
|
|
}
|
|
|
|
func (br *BackIndexRow) AllTermKeys() [][]byte {
|
|
if br == nil {
|
|
return nil
|
|
}
|
|
rv := make([][]byte, 0, len(br.termsEntries)) // FIXME this underestimates severely
|
|
for _, termsEntry := range br.termsEntries {
|
|
for i := range termsEntry.Terms {
|
|
termRow := NewTermFrequencyRow([]byte(termsEntry.Terms[i]), uint16(termsEntry.GetField()), br.doc, 0, 0)
|
|
rv = append(rv, termRow.Key())
|
|
}
|
|
}
|
|
return rv
|
|
}
|
|
|
|
func (br *BackIndexRow) AllStoredKeys() [][]byte {
|
|
if br == nil {
|
|
return nil
|
|
}
|
|
rv := make([][]byte, len(br.storedEntries))
|
|
for i, storedEntry := range br.storedEntries {
|
|
storedRow := NewStoredRow(br.doc, uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
|
|
rv[i] = storedRow.Key()
|
|
}
|
|
return rv
|
|
}
|
|
|
|
func (br *BackIndexRow) Key() []byte {
|
|
buf := make([]byte, br.KeySize())
|
|
size, _ := br.KeyTo(buf)
|
|
return buf[:size]
|
|
}
|
|
|
|
func (br *BackIndexRow) KeySize() int {
|
|
return len(br.doc) + 1
|
|
}
|
|
|
|
func (br *BackIndexRow) KeyTo(buf []byte) (int, error) {
|
|
buf[0] = 'b'
|
|
used := copy(buf[1:], br.doc)
|
|
return used + 1, nil
|
|
}
|
|
|
|
func (br *BackIndexRow) Value() []byte {
|
|
buf := make([]byte, br.ValueSize())
|
|
size, _ := br.ValueTo(buf)
|
|
return buf[:size]
|
|
}
|
|
|
|
func (br *BackIndexRow) ValueSize() int {
|
|
birv := &BackIndexRowValue{
|
|
TermsEntries: br.termsEntries,
|
|
StoredEntries: br.storedEntries,
|
|
}
|
|
return birv.Size()
|
|
}
|
|
|
|
func (br *BackIndexRow) ValueTo(buf []byte) (int, error) {
|
|
birv := &BackIndexRowValue{
|
|
TermsEntries: br.termsEntries,
|
|
StoredEntries: br.storedEntries,
|
|
}
|
|
return birv.MarshalTo(buf)
|
|
}
|
|
|
|
func (br *BackIndexRow) String() string {
|
|
return fmt.Sprintf("Backindex DocId: `%s` Terms Entries: %v, Stored Entries: %v", string(br.doc), br.termsEntries, br.storedEntries)
|
|
}
|
|
|
|
func NewBackIndexRow(docID []byte, entries []*BackIndexTermsEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
|
|
return &BackIndexRow{
|
|
doc: docID,
|
|
termsEntries: entries,
|
|
storedEntries: storedFields,
|
|
}
|
|
}
|
|
|
|
func NewBackIndexRowKV(key, value []byte) (*BackIndexRow, error) {
|
|
rv := BackIndexRow{}
|
|
|
|
buf := bytes.NewBuffer(key)
|
|
_, err := buf.ReadByte() // type
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
rv.doc, err = buf.ReadBytes(ByteSeparator)
|
|
if err == io.EOF && len(rv.doc) < 1 {
|
|
err = fmt.Errorf("invalid doc length 0 - % x", key)
|
|
}
|
|
if err != nil && err != io.EOF {
|
|
return nil, err
|
|
} else if err == nil {
|
|
rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte
|
|
}
|
|
|
|
var birv BackIndexRowValue
|
|
err = proto.Unmarshal(value, &birv)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
rv.termsEntries = birv.TermsEntries
|
|
rv.storedEntries = birv.StoredEntries
|
|
|
|
return &rv, nil
|
|
}
|
|
|
|
// STORED
|
|
|
|
// StoredRow holds the stored value of one field of one document. The key is
// 's', the doc id, ByteSeparator, the little-endian field index and the array
// positions as uvarints; the value is the type byte followed by the raw
// value bytes.
type StoredRow struct {
	doc            []byte   // document id
	field          uint16   // numeric field identifier
	arrayPositions []uint64 // array positions encoded into the key
	typ            byte     // type byte, first byte of the encoded value
	value          []byte   // raw value bytes following the type byte
}
|
|
|
|
func (s *StoredRow) Key() []byte {
|
|
buf := make([]byte, s.KeySize())
|
|
size, _ := s.KeyTo(buf)
|
|
return buf[0:size]
|
|
}
|
|
|
|
func (s *StoredRow) KeySize() int {
|
|
return 1 + len(s.doc) + 1 + 2 + (binary.MaxVarintLen64 * len(s.arrayPositions))
|
|
}
|
|
|
|
func (s *StoredRow) KeyTo(buf []byte) (int, error) {
|
|
docLen := len(s.doc)
|
|
buf[0] = 's'
|
|
copy(buf[1:], s.doc)
|
|
buf[1+docLen] = ByteSeparator
|
|
binary.LittleEndian.PutUint16(buf[1+docLen+1:], s.field)
|
|
bytesUsed := 1 + docLen + 1 + 2
|
|
for _, arrayPosition := range s.arrayPositions {
|
|
varbytes := binary.PutUvarint(buf[bytesUsed:], arrayPosition)
|
|
bytesUsed += varbytes
|
|
}
|
|
return bytesUsed, nil
|
|
}
|
|
|
|
func (s *StoredRow) Value() []byte {
|
|
buf := make([]byte, s.ValueSize())
|
|
size, _ := s.ValueTo(buf)
|
|
return buf[:size]
|
|
}
|
|
|
|
func (s *StoredRow) ValueSize() int {
|
|
return len(s.value) + 1
|
|
}
|
|
|
|
func (s *StoredRow) ValueTo(buf []byte) (int, error) {
|
|
buf[0] = s.typ
|
|
used := copy(buf[1:], s.value)
|
|
return used + 1, nil
|
|
}
|
|
|
|
func (s *StoredRow) String() string {
|
|
return fmt.Sprintf("Document: %s Field %d, Array Positions: %v, Type: %s Value: %s", s.doc, s.field, s.arrayPositions, string(s.typ), s.value)
|
|
}
|
|
|
|
func (s *StoredRow) ScanPrefixForDoc() []byte {
|
|
docLen := len(s.doc)
|
|
buf := make([]byte, 1+docLen+1)
|
|
buf[0] = 's'
|
|
copy(buf[1:], s.doc)
|
|
buf[1+docLen] = ByteSeparator
|
|
return buf
|
|
}
|
|
|
|
func NewStoredRow(docID []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
|
|
return &StoredRow{
|
|
doc: docID,
|
|
field: field,
|
|
arrayPositions: arrayPositions,
|
|
typ: typ,
|
|
value: value,
|
|
}
|
|
}
|
|
|
|
func NewStoredRowK(key []byte) (*StoredRow, error) {
|
|
rv := StoredRow{}
|
|
|
|
buf := bytes.NewBuffer(key)
|
|
_, err := buf.ReadByte() // type
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
rv.doc, err = buf.ReadBytes(ByteSeparator)
|
|
if len(rv.doc) < 2 { // 1 for min doc id length, 1 for separator
|
|
err = fmt.Errorf("invalid doc length 0")
|
|
return nil, err
|
|
}
|
|
|
|
rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte
|
|
|
|
err = binary.Read(buf, binary.LittleEndian, &rv.field)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
rv.arrayPositions = make([]uint64, 0)
|
|
nextArrayPos, err := binary.ReadUvarint(buf)
|
|
for err == nil {
|
|
rv.arrayPositions = append(rv.arrayPositions, nextArrayPos)
|
|
nextArrayPos, err = binary.ReadUvarint(buf)
|
|
}
|
|
return &rv, nil
|
|
}
|
|
|
|
func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
|
|
rv, err := NewStoredRowK(key)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
rv.typ = value[0]
|
|
rv.value = value[1:]
|
|
return rv, nil
|
|
}
|
|
|
|
// backIndexFieldTermVisitor is the callback invoked for each term found while
// scanning an encoded back index value. term is a sub-slice of the scanned
// data, not a copy.
type backIndexFieldTermVisitor func(field uint32, term []byte)
|
|
|
|
// visitBackIndexRow is designed to process a protobuf encoded
|
|
// value, without creating unnecessary garbage. Instead values are passed
|
|
// to a callback, inspected first, and only copied if necessary.
|
|
// Due to the fact that this borrows from generated code, it must be marnually
|
|
// updated if the protobuf definition changes.
|
|
//
|
|
// This code originates from:
|
|
// func (m *BackIndexRowValue) Unmarshal(data []byte) error
|
|
// the sections which create garbage or parse unintersting sections
|
|
// have been commented out. This was done by design to allow for easier
|
|
// merging in the future if that original function is regenerated
|
|
func visitBackIndexRow(data []byte, callback backIndexFieldTermVisitor) error {
|
|
l := len(data)
|
|
iNdEx := 0
|
|
for iNdEx < l {
|
|
var wire uint64
|
|
for shift := uint(0); ; shift += 7 {
|
|
if iNdEx >= l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
b := data[iNdEx]
|
|
iNdEx++
|
|
wire |= (uint64(b) & 0x7F) << shift
|
|
if b < 0x80 {
|
|
break
|
|
}
|
|
}
|
|
fieldNum := int32(wire >> 3)
|
|
wireType := int(wire & 0x7)
|
|
switch fieldNum {
|
|
case 1:
|
|
if wireType != 2 {
|
|
return fmt.Errorf("proto: wrong wireType = %d for field TermsEntries", wireType)
|
|
}
|
|
var msglen int
|
|
for shift := uint(0); ; shift += 7 {
|
|
if iNdEx >= l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
b := data[iNdEx]
|
|
iNdEx++
|
|
msglen |= (int(b) & 0x7F) << shift
|
|
if b < 0x80 {
|
|
break
|
|
}
|
|
}
|
|
postIndex := iNdEx + msglen
|
|
if msglen < 0 {
|
|
return ErrInvalidLengthUpsidedown
|
|
}
|
|
if postIndex > l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
// dont parse term entries
|
|
// m.TermsEntries = append(m.TermsEntries, &BackIndexTermsEntry{})
|
|
// if err := m.TermsEntries[len(m.TermsEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
|
|
// return err
|
|
// }
|
|
// instead, inspect them
|
|
if err := visitBackIndexRowFieldTerms(data[iNdEx:postIndex], callback); err != nil {
|
|
return err
|
|
}
|
|
iNdEx = postIndex
|
|
case 2:
|
|
if wireType != 2 {
|
|
return fmt.Errorf("proto: wrong wireType = %d for field StoredEntries", wireType)
|
|
}
|
|
var msglen int
|
|
for shift := uint(0); ; shift += 7 {
|
|
if iNdEx >= l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
b := data[iNdEx]
|
|
iNdEx++
|
|
msglen |= (int(b) & 0x7F) << shift
|
|
if b < 0x80 {
|
|
break
|
|
}
|
|
}
|
|
postIndex := iNdEx + msglen
|
|
if msglen < 0 {
|
|
return ErrInvalidLengthUpsidedown
|
|
}
|
|
if postIndex > l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
// don't parse stored entries
|
|
// m.StoredEntries = append(m.StoredEntries, &BackIndexStoreEntry{})
|
|
// if err := m.StoredEntries[len(m.StoredEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil {
|
|
// return err
|
|
// }
|
|
iNdEx = postIndex
|
|
default:
|
|
var sizeOfWire int
|
|
for {
|
|
sizeOfWire++
|
|
wire >>= 7
|
|
if wire == 0 {
|
|
break
|
|
}
|
|
}
|
|
iNdEx -= sizeOfWire
|
|
skippy, err := skipUpsidedown(data[iNdEx:])
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if skippy < 0 {
|
|
return ErrInvalidLengthUpsidedown
|
|
}
|
|
if (iNdEx + skippy) > l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
// don't track unrecognized data
|
|
//m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
|
|
iNdEx += skippy
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// visitBackIndexRowFieldTerms is designed to process a protobuf encoded
|
|
// sub-value within the BackIndexRowValue, without creating unnecessary garbage.
|
|
// Instead values are passed to a callback, inspected first, and only copied if
|
|
// necessary. Due to the fact that this borrows from generated code, it must
|
|
// be marnually updated if the protobuf definition changes.
|
|
//
|
|
// This code originates from:
|
|
// func (m *BackIndexTermsEntry) Unmarshal(data []byte) error {
|
|
// the sections which create garbage or parse uninteresting sections
|
|
// have been commented out. This was done by design to allow for easier
|
|
// merging in the future if that original function is regenerated
|
|
func visitBackIndexRowFieldTerms(data []byte, callback backIndexFieldTermVisitor) error {
|
|
var theField uint32
|
|
|
|
var hasFields [1]uint64
|
|
l := len(data)
|
|
iNdEx := 0
|
|
for iNdEx < l {
|
|
var wire uint64
|
|
for shift := uint(0); ; shift += 7 {
|
|
if iNdEx >= l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
b := data[iNdEx]
|
|
iNdEx++
|
|
wire |= (uint64(b) & 0x7F) << shift
|
|
if b < 0x80 {
|
|
break
|
|
}
|
|
}
|
|
fieldNum := int32(wire >> 3)
|
|
wireType := int(wire & 0x7)
|
|
switch fieldNum {
|
|
case 1:
|
|
if wireType != 0 {
|
|
return fmt.Errorf("proto: wrong wireType = %d for field Field", wireType)
|
|
}
|
|
var v uint32
|
|
for shift := uint(0); ; shift += 7 {
|
|
if iNdEx >= l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
b := data[iNdEx]
|
|
iNdEx++
|
|
v |= (uint32(b) & 0x7F) << shift
|
|
if b < 0x80 {
|
|
break
|
|
}
|
|
}
|
|
// m.Field = &v
|
|
theField = v
|
|
hasFields[0] |= uint64(0x00000001)
|
|
case 2:
|
|
if wireType != 2 {
|
|
return fmt.Errorf("proto: wrong wireType = %d for field Terms", wireType)
|
|
}
|
|
var stringLen uint64
|
|
for shift := uint(0); ; shift += 7 {
|
|
if iNdEx >= l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
b := data[iNdEx]
|
|
iNdEx++
|
|
stringLen |= (uint64(b) & 0x7F) << shift
|
|
if b < 0x80 {
|
|
break
|
|
}
|
|
}
|
|
postIndex := iNdEx + int(stringLen)
|
|
if postIndex > l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
//m.Terms = append(m.Terms, string(data[iNdEx:postIndex]))
|
|
callback(theField, data[iNdEx:postIndex])
|
|
iNdEx = postIndex
|
|
default:
|
|
var sizeOfWire int
|
|
for {
|
|
sizeOfWire++
|
|
wire >>= 7
|
|
if wire == 0 {
|
|
break
|
|
}
|
|
}
|
|
iNdEx -= sizeOfWire
|
|
skippy, err := skipUpsidedown(data[iNdEx:])
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if skippy < 0 {
|
|
return ErrInvalidLengthUpsidedown
|
|
}
|
|
if (iNdEx + skippy) > l {
|
|
return io.ErrUnexpectedEOF
|
|
}
|
|
//m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...)
|
|
iNdEx += skippy
|
|
}
|
|
}
|
|
// if hasFields[0]&uint64(0x00000001) == 0 {
|
|
// return new(github_com_golang_protobuf_proto.RequiredNotSetError)
|
|
// }
|
|
|
|
return nil
|
|
}
|