0
0
Fork 0
bleve/index/upside_down/row.go

641 lines
15 KiB
Go
Raw Normal View History

2014-04-17 22:55:53 +02:00
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
2014-04-17 22:55:53 +02:00
package upside_down
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"math"
2014-12-09 23:24:59 +01:00
"github.com/golang/protobuf/proto"
2014-04-17 22:55:53 +02:00
)
const ByteSeparator byte = 0xff
2014-04-17 22:55:53 +02:00
type UpsideDownCouchRowStream chan UpsideDownCouchRow
type UpsideDownCouchRow interface {
Key() []byte
Value() []byte
2014-04-17 22:55:53 +02:00
}
func ParseFromKeyValue(key, value []byte) (UpsideDownCouchRow, error) {
2014-04-19 04:31:13 +02:00
if len(key) > 0 {
switch key[0] {
case 'v':
return NewVersionRowKV(key, value)
case 'f':
return NewFieldRowKV(key, value)
case 'd':
return NewDictionaryRowKV(key, value)
2014-04-19 04:31:13 +02:00
case 't':
return NewTermFrequencyRowKV(key, value)
case 'b':
return NewBackIndexRowKV(key, value)
case 's':
return NewStoredRowKV(key, value)
case 'i':
return NewInternalRowKV(key, value)
2014-04-19 04:31:13 +02:00
}
return nil, fmt.Errorf("Unknown field type '%s'", string(key[0]))
2014-04-17 22:55:53 +02:00
}
2014-04-19 04:31:13 +02:00
return nil, fmt.Errorf("Invalid empty key")
2014-04-17 22:55:53 +02:00
}
// VERSION
type VersionRow struct {
version uint8
}
func (v *VersionRow) Key() []byte {
return []byte{'v'}
2014-04-17 22:55:53 +02:00
}
func (v *VersionRow) Value() []byte {
return []byte{byte(v.version)}
2014-04-17 22:55:53 +02:00
}
func (v *VersionRow) String() string {
return fmt.Sprintf("Version: %d", v.version)
}
func NewVersionRow(version uint8) *VersionRow {
return &VersionRow{
version: version,
}
}
func NewVersionRowKV(key, value []byte) (*VersionRow, error) {
2014-04-17 22:55:53 +02:00
rv := VersionRow{}
buf := bytes.NewBuffer(value)
err := binary.Read(buf, binary.LittleEndian, &rv.version)
if err != nil {
return nil, err
2014-04-17 22:55:53 +02:00
}
return &rv, nil
2014-04-17 22:55:53 +02:00
}
// INTERNAL STORAGE
type InternalRow struct {
key []byte
val []byte
}
func (i *InternalRow) Key() []byte {
buf := make([]byte, len(i.key)+1)
buf[0] = 'i'
copy(buf[1:], i.key)
return buf
}
func (i *InternalRow) Value() []byte {
return i.val
}
func (i *InternalRow) String() string {
return fmt.Sprintf("InternalStore - Key: %s (% x) Val: %s (% x)", i.key, i.key, i.val, i.val)
}
func NewInternalRow(key, val []byte) *InternalRow {
return &InternalRow{
key: key,
val: val,
}
}
func NewInternalRowKV(key, value []byte) (*InternalRow, error) {
rv := InternalRow{}
rv.key = key[1:]
rv.val = value
return &rv, nil
}
2014-04-17 22:55:53 +02:00
// FIELD definition
type FieldRow struct {
index uint16
name string
}
func (f *FieldRow) Key() []byte {
buf := make([]byte, 3)
buf[0] = 'f'
binary.LittleEndian.PutUint16(buf[1:3], f.index)
return buf
2014-04-17 22:55:53 +02:00
}
func (f *FieldRow) Value() []byte {
return append([]byte(f.name), ByteSeparator)
2014-04-17 22:55:53 +02:00
}
func (f *FieldRow) String() string {
return fmt.Sprintf("Field: %d Name: %s", f.index, f.name)
}
func NewFieldRow(index uint16, name string) *FieldRow {
return &FieldRow{
index: index,
name: name,
}
}
func NewFieldRowKV(key, value []byte) (*FieldRow, error) {
2014-04-17 22:55:53 +02:00
rv := FieldRow{}
buf := bytes.NewBuffer(key)
_, err := buf.ReadByte() // type
if err != nil {
return nil, err
}
err = binary.Read(buf, binary.LittleEndian, &rv.index)
2014-04-17 22:55:53 +02:00
if err != nil {
return nil, err
2014-04-17 22:55:53 +02:00
}
buf = bytes.NewBuffer(value)
rv.name, err = buf.ReadString(ByteSeparator)
2014-04-17 22:55:53 +02:00
if err != nil {
return nil, err
2014-04-17 22:55:53 +02:00
}
rv.name = rv.name[:len(rv.name)-1] // trim off separator byte
return &rv, nil
2014-04-17 22:55:53 +02:00
}
// DICTIONARY
type DictionaryRow struct {
field uint16
term []byte
count uint64
}
func (dr *DictionaryRow) Key() []byte {
buf := make([]byte, 3+len(dr.term))
buf[0] = 'd'
binary.LittleEndian.PutUint16(buf[1:3], dr.field)
copy(buf[3:], dr.term)
return buf
}
func (dr *DictionaryRow) Value() []byte {
used := 0
buf := make([]byte, binary.MaxVarintLen64)
used += binary.PutUvarint(buf, dr.count)
return buf[0:used]
}
func (dr *DictionaryRow) String() string {
return fmt.Sprintf("Dictionary Term: `%s` Field: %d Count: %d ", string(dr.term), dr.field, dr.count)
}
func NewDictionaryRow(term []byte, field uint16, count uint64) *DictionaryRow {
return &DictionaryRow{
term: term,
field: field,
count: count,
}
}
func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) {
rv, err := NewDictionaryRowK(key)
if err != nil {
return nil, err
}
err = rv.parseDictionaryV(value)
if err != nil {
return nil, err
}
return rv, nil
}
func NewDictionaryRowK(key []byte) (*DictionaryRow, error) {
rv := DictionaryRow{}
buf := bytes.NewBuffer(key)
_, err := buf.ReadByte() // type
if err != nil {
return nil, err
}
err = binary.Read(buf, binary.LittleEndian, &rv.field)
if err != nil {
return nil, err
}
rv.term, err = buf.ReadBytes(ByteSeparator)
// there is no separator expected here, should get EOF
if err != io.EOF {
return nil, err
}
return &rv, nil
}
func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
buf := bytes.NewBuffer((value))
count, err := binary.ReadUvarint(buf)
if err != nil {
return err
}
dr.count = count
return nil
}
2014-04-17 22:55:53 +02:00
// TERM FIELD FREQUENCY
type TermVector struct {
field uint16
pos uint64
start uint64
end uint64
}
func (tv *TermVector) String() string {
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end)
}
type TermFrequencyRow struct {
term []byte
field uint16
doc []byte
freq uint64
norm float32
vectors []*TermVector
}
2014-08-07 19:45:39 +02:00
func (tfr *TermFrequencyRow) ScanPrefixForField() []byte {
buf := make([]byte, 3)
buf[0] = 't'
binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
return buf
}
func (tfr *TermFrequencyRow) ScanPrefixForFieldTermPrefix() []byte {
buf := make([]byte, 3+len(tfr.term))
buf[0] = 't'
binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
copy(buf[3:], tfr.term)
return buf
}
func (tfr *TermFrequencyRow) ScanPrefixForFieldTerm() []byte {
buf := make([]byte, 3+len(tfr.term)+1)
buf[0] = 't'
binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
termLen := copy(buf[3:], tfr.term)
buf[3+termLen] = ByteSeparator
2014-08-07 19:45:39 +02:00
return buf
}
func (tfr *TermFrequencyRow) Key() []byte {
buf := make([]byte, 3+len(tfr.term)+1+len(tfr.doc))
buf[0] = 't'
binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
termLen := copy(buf[3:], tfr.term)
buf[3+termLen] = ByteSeparator
copy(buf[3+termLen+1:], tfr.doc)
return buf
2014-04-17 22:55:53 +02:00
}
func (tfr *TermFrequencyRow) DictionaryRowKey() []byte {
dr := NewDictionaryRow(tfr.term, tfr.field, 0)
return dr.Key()
}
func (tfr *TermFrequencyRow) Value() []byte {
used := 0
buf := make([]byte, 8+8+(len(tfr.vectors)*(8+8+8+8)))
used += binary.PutUvarint(buf[used:used+8], tfr.freq)
normuint32 := math.Float32bits(tfr.norm)
newbuf := buf[used : used+8]
used += binary.PutUvarint(newbuf, uint64(normuint32))
2014-04-17 22:55:53 +02:00
for _, vector := range tfr.vectors {
used += binary.PutUvarint(buf[used:used+8], uint64(vector.field))
used += binary.PutUvarint(buf[used:used+8], vector.pos)
used += binary.PutUvarint(buf[used:used+8], vector.start)
used += binary.PutUvarint(buf[used:used+8], vector.end)
2014-04-17 22:55:53 +02:00
}
return buf[0:used]
2014-04-17 22:55:53 +02:00
}
func (tfr *TermFrequencyRow) String() string {
return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
}
func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow {
return &TermFrequencyRow{
term: term,
field: field,
doc: []byte(doc),
freq: freq,
norm: norm,
}
}
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
return &TermFrequencyRow{
term: term,
field: field,
doc: []byte(doc),
freq: freq,
norm: norm,
vectors: vectors,
}
}
func NewTermFrequencyRowK(key []byte) (*TermFrequencyRow, error) {
rv := TermFrequencyRow{}
keyLen := len(key)
if keyLen < 3 {
return nil, fmt.Errorf("invalid term frequency key, no valid field")
}
rv.field = binary.LittleEndian.Uint16(key[1:3])
2014-04-17 22:55:53 +02:00
termEndPos := bytes.IndexByte(key[3:], ByteSeparator)
if termEndPos < 0 {
return nil, fmt.Errorf("invalid term frequency key, no byte separator terminating term")
2014-04-17 22:55:53 +02:00
}
rv.term = key[3 : 3+termEndPos]
2014-04-17 22:55:53 +02:00
docLen := len(key) - (3 + termEndPos + 1)
if docLen < 1 {
return nil, fmt.Errorf("invalid term frequency key, empty docid")
2014-04-17 22:55:53 +02:00
}
rv.doc = key[3+termEndPos+1:]
2014-04-17 22:55:53 +02:00
return &rv, nil
}
func (tfr *TermFrequencyRow) parseV(value []byte) error {
currOffset := 0
bytesRead := 0
tfr.freq, bytesRead = binary.Uvarint(value[currOffset:])
if bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, invalid frequency")
2014-04-17 22:55:53 +02:00
}
currOffset += bytesRead
var norm uint64
norm, bytesRead = binary.Uvarint(value[currOffset:])
if bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, no norm")
2014-04-17 22:55:53 +02:00
}
currOffset += bytesRead
2014-04-17 22:55:53 +02:00
tfr.norm = math.Float32frombits(uint32(norm))
var field uint64
field, bytesRead = binary.Uvarint(value[currOffset:])
for bytesRead > 0 {
currOffset += bytesRead
2014-04-17 22:55:53 +02:00
tv := TermVector{}
tv.field = uint16(field)
2014-04-17 22:55:53 +02:00
// at this point we expect at least one term vector
if tfr.vectors == nil {
tfr.vectors = make([]*TermVector, 0)
2014-04-17 22:55:53 +02:00
}
tv.pos, bytesRead = binary.Uvarint(value[currOffset:])
if bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, vector contains no position")
2014-04-17 22:55:53 +02:00
}
currOffset += bytesRead
tv.start, bytesRead = binary.Uvarint(value[currOffset:])
if bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, vector contains no start")
2014-04-17 22:55:53 +02:00
}
currOffset += bytesRead
tv.end, bytesRead = binary.Uvarint(value[currOffset:])
if bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, vector contains no end")
2014-04-17 22:55:53 +02:00
}
currOffset += bytesRead
tfr.vectors = append(tfr.vectors, &tv)
2014-04-17 22:55:53 +02:00
// try to read next record (may not exist)
field, bytesRead = binary.Uvarint(value[currOffset:])
}
if len(value[currOffset:]) > 0 && bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, vector field invalid")
2014-04-17 22:55:53 +02:00
}
return nil
}
2014-04-17 22:55:53 +02:00
func NewTermFrequencyRowKV(key, value []byte) (*TermFrequencyRow, error) {
rv, err := NewTermFrequencyRowK(key)
if err != nil {
return nil, err
}
err = rv.parseV(value)
if err != nil {
return nil, err
}
return rv, nil
2014-04-17 22:55:53 +02:00
}
type BackIndexRow struct {
doc []byte
termEntries []*BackIndexTermEntry
storedEntries []*BackIndexStoreEntry
2014-04-17 22:55:53 +02:00
}
func (br *BackIndexRow) AllTermKeys() [][]byte {
if br == nil {
return nil
}
rv := make([][]byte, len(br.termEntries))
for i, termEntry := range br.termEntries {
termRow := NewTermFrequencyRow([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), string(br.doc), 0, 0)
rv[i] = termRow.Key()
}
return rv
2014-04-17 22:55:53 +02:00
}
func (br *BackIndexRow) AllStoredKeys() [][]byte {
if br == nil {
return nil
}
rv := make([][]byte, len(br.storedEntries))
for i, storedEntry := range br.storedEntries {
storedRow := NewStoredRow(string(br.doc), uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{})
rv[i] = storedRow.Key()
}
return rv
2014-04-17 22:55:53 +02:00
}
func (br *BackIndexRow) Key() []byte {
buf := make([]byte, len(br.doc)+1)
buf[0] = 'b'
copy(buf[1:], br.doc)
return buf
2014-04-17 22:55:53 +02:00
}
func (br *BackIndexRow) Value() []byte {
birv := &BackIndexRowValue{
TermEntries: br.termEntries,
StoredEntries: br.storedEntries,
}
bytes, _ := proto.Marshal(birv)
return bytes
2014-04-17 22:55:53 +02:00
}
func (br *BackIndexRow) String() string {
return fmt.Sprintf("Backindex DocId: `%s` Term Entries: %v, Stored Entries: %v", string(br.doc), br.termEntries, br.storedEntries)
2014-04-17 22:55:53 +02:00
}
func NewBackIndexRow(doc string, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
2014-04-17 22:55:53 +02:00
return &BackIndexRow{
doc: []byte(doc),
termEntries: entries,
storedEntries: storedFields,
2014-04-17 22:55:53 +02:00
}
}
func NewBackIndexRowKV(key, value []byte) (*BackIndexRow, error) {
2014-04-17 22:55:53 +02:00
rv := BackIndexRow{}
buf := bytes.NewBuffer(key)
_, err := buf.ReadByte() // type
if err != nil {
return nil, err
}
2014-04-17 22:55:53 +02:00
rv.doc, err = buf.ReadBytes(ByteSeparator)
2014-04-19 04:31:13 +02:00
if err == io.EOF && len(rv.doc) < 1 {
err = fmt.Errorf("invalid doc length 0")
}
if err != nil && err != io.EOF {
return nil, err
2014-04-17 22:55:53 +02:00
}
rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte
2014-04-17 22:55:53 +02:00
var birv BackIndexRowValue
err = proto.Unmarshal(value, &birv)
if err != nil {
return nil, err
2014-04-17 22:55:53 +02:00
}
rv.termEntries = birv.TermEntries
rv.storedEntries = birv.StoredEntries
2014-04-17 22:55:53 +02:00
return &rv, nil
2014-04-17 22:55:53 +02:00
}
// STORED
type StoredRow struct {
doc []byte
field uint16
arrayPositions []uint64
typ byte
value []byte
}
func (s *StoredRow) Key() []byte {
docLen := len(s.doc)
buf := make([]byte, 1+docLen+1+2+(binary.MaxVarintLen64*len(s.arrayPositions)))
buf[0] = 's'
copy(buf[1:], s.doc)
buf[1+docLen] = ByteSeparator
binary.LittleEndian.PutUint16(buf[1+docLen+1:], s.field)
bytesUsed := 1 + docLen + 1 + 2
for _, arrayPosition := range s.arrayPositions {
varbytes := binary.PutUvarint(buf[bytesUsed:], arrayPosition)
bytesUsed += varbytes
}
return buf[0:bytesUsed]
}
func (s *StoredRow) Value() []byte {
rv := make([]byte, len(s.value)+1)
rv[0] = s.typ
copy(rv[1:], s.value)
return rv
}
func (s *StoredRow) String() string {
return fmt.Sprintf("Document: %s Field %d, Array Positions: %v, Type: %s Value: %s", s.doc, s.field, s.arrayPositions, string(s.typ), s.value)
}
func (s *StoredRow) ScanPrefixForDoc() []byte {
docLen := len(s.doc)
buf := make([]byte, 1+docLen+1)
buf[0] = 's'
copy(buf[1:], s.doc)
buf[1+docLen] = ByteSeparator
return buf
}
func NewStoredRow(doc string, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
return &StoredRow{
doc: []byte(doc),
field: field,
arrayPositions: arrayPositions,
typ: typ,
value: value,
}
}
func NewStoredRowK(key []byte) (*StoredRow, error) {
rv := StoredRow{}
buf := bytes.NewBuffer(key)
_, err := buf.ReadByte() // type
if err != nil {
return nil, err
}
rv.doc, err = buf.ReadBytes(ByteSeparator)
if len(rv.doc) < 2 { // 1 for min doc id length, 1 for separator
err = fmt.Errorf("invalid doc length 0")
return nil, err
}
rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte
err = binary.Read(buf, binary.LittleEndian, &rv.field)
if err != nil {
return nil, err
}
rv.arrayPositions = make([]uint64, 0)
nextArrayPos, err := binary.ReadUvarint(buf)
for err == nil {
rv.arrayPositions = append(rv.arrayPositions, nextArrayPos)
nextArrayPos, err = binary.ReadUvarint(buf)
}
return &rv, nil
}
func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
rv, err := NewStoredRowK(key)
if err != nil {
return nil, err
}
rv.typ = value[0]
rv.value = value[1:]
return rv, nil
}