0
0
bleve/index/upside_down/row.go
Patrick Mezard e85c9c542e row: expose TermFrequencyRow term and freq fields
Row contents are an implementation detail of the bleve index and may change
in the future. That said, they also contain information valuable to
assess the quality of the index or understand its performance. So, as
long as we agree that type asserting rows should only be done if you
know what you are doing and are ready to deal with future changes, I see
no reason to hide the row fields from external packages.

Fix #268
2015-11-17 17:21:26 +01:00

819 lines
19 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package upside_down
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"math"
"github.com/golang/protobuf/proto"
)
// ByteSeparator is the byte written after a variable-length component
// (field name, term, document id) in the encoded keys and values of this
// file, and searched for when those keys and values are parsed back.
const ByteSeparator byte = 0xff
// UpsideDownCouchRowStream is a channel of UpsideDownCouchRow values.
type UpsideDownCouchRowStream chan UpsideDownCouchRow
// UpsideDownCouchRow is the interface shared by every row type in this file.
// A row can serialize itself as a key and a value, either into a freshly
// allocated slice (Key/Value) or into a caller-supplied buffer (KeyTo/ValueTo).
type UpsideDownCouchRow interface {
// KeySize returns a buffer size large enough for KeyTo; for some row types
// it is an upper bound rather than the exact encoded length.
KeySize() int
// KeyTo writes the encoded key into the buffer and returns the number of
// bytes written.
KeyTo([]byte) (int, error)
// Key returns the encoded key.
Key() []byte
// Value returns the encoded value.
Value() []byte
// ValueSize returns a buffer size large enough for ValueTo; for some row
// types it is an upper bound rather than the exact encoded length.
ValueSize() int
// ValueTo writes the encoded value into the buffer and returns the number
// of bytes written.
ValueTo([]byte) (int, error)
}
// ParseFromKeyValue decodes a raw key/value pair into the row type selected
// by the first byte of key:
//
//	'v' VersionRow        'f' FieldRow      'd' DictionaryRow
//	't' TermFrequencyRow  'b' BackIndexRow  's' StoredRow
//	'i' InternalRow
//
// It returns an error when key is empty, when the type byte is unknown, or
// when the selected constructor fails. On any error the returned row is a
// true nil interface value: the constructors return concrete pointer types,
// and passing a nil *T straight through would produce a non-nil interface
// wrapping a nil pointer.
func ParseFromKeyValue(key, value []byte) (UpsideDownCouchRow, error) {
	if len(key) == 0 {
		return nil, fmt.Errorf("Invalid empty key")
	}

	var (
		row UpsideDownCouchRow
		err error
	)
	switch key[0] {
	case 'v':
		row, err = NewVersionRowKV(key, value)
	case 'f':
		row, err = NewFieldRowKV(key, value)
	case 'd':
		row, err = NewDictionaryRowKV(key, value)
	case 't':
		row, err = NewTermFrequencyRowKV(key, value)
	case 'b':
		row, err = NewBackIndexRowKV(key, value)
	case 's':
		row, err = NewStoredRowKV(key, value)
	case 'i':
		row, err = NewInternalRowKV(key, value)
	default:
		return nil, fmt.Errorf("Unknown field type '%s'", string(key[0]))
	}
	if err != nil {
		// Do not leak a typed nil pointer inside the interface.
		return nil, err
	}
	return row, nil
}
// VERSION

// VersionRow stores a single version number. Its key is the one byte 'v'
// and its value is the version byte.
type VersionRow struct {
	version uint8
}

// Key returns the fixed one-byte key 'v'.
func (v *VersionRow) Key() []byte {
	key := [1]byte{'v'}
	return key[:]
}

// KeySize returns the encoded key length, which is always 1.
func (v *VersionRow) KeySize() int {
	return 1
}

// KeyTo writes the key byte 'v' into buf[0] and reports one byte written.
func (v *VersionRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 'v'
	return 1, nil
}

// Value returns a one-byte slice holding the version.
func (v *VersionRow) Value() []byte {
	out := make([]byte, 1)
	out[0] = v.version
	return out
}

// ValueSize returns the encoded value length, which is always 1.
func (v *VersionRow) ValueSize() int {
	return 1
}

// ValueTo writes the version into buf[0] and reports one byte written.
func (v *VersionRow) ValueTo(buf []byte) (int, error) {
	buf[0] = v.version
	return 1, nil
}

// String renders the row for debugging output.
func (v *VersionRow) String() string {
	return fmt.Sprintf("Version: %d", v.version)
}

// NewVersionRow builds a VersionRow holding the given version.
func NewVersionRow(version uint8) *VersionRow {
	row := new(VersionRow)
	row.version = version
	return row
}

// NewVersionRowKV decodes a VersionRow from its stored form. Only the first
// byte of value is read; key is not inspected. An error is returned when
// value is empty.
func NewVersionRowKV(key, value []byte) (*VersionRow, error) {
	var version uint8
	if err := binary.Read(bytes.NewReader(value), binary.LittleEndian, &version); err != nil {
		return nil, err
	}
	return &VersionRow{version: version}, nil
}
// INTERNAL STORAGE

// InternalRow holds an arbitrary key/value pair. The encoded key is the
// type byte 'i' followed by the raw key bytes; the encoded value is the raw
// value bytes.
type InternalRow struct {
	key []byte
	val []byte
}

// Key returns the encoded key: 'i' followed by the row's key bytes.
func (i *InternalRow) Key() []byte {
	buf := make([]byte, i.KeySize())
	size, _ := i.KeyTo(buf) // KeyTo never returns an error
	return buf[:size]
}

// KeySize returns the encoded key length (key bytes plus the type byte).
func (i *InternalRow) KeySize() int {
	return len(i.key) + 1
}

// KeyTo writes the encoded key into buf and returns the number of bytes
// written. buf must hold at least one byte; the key bytes are truncated to
// whatever space remains after the type byte.
func (i *InternalRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 'i'
	actual := copy(buf[1:], i.key)
	return 1 + actual, nil
}

// Value returns the row's value slice itself (not a copy).
func (i *InternalRow) Value() []byte {
	return i.val
}

// ValueSize returns the length of the value.
func (i *InternalRow) ValueSize() int {
	return len(i.val)
}

// ValueTo copies the value into buf and returns the number of bytes copied.
func (i *InternalRow) ValueTo(buf []byte) (int, error) {
	actual := copy(buf, i.val)
	return actual, nil
}

// String renders the key and value both as text and as hex, for debugging.
func (i *InternalRow) String() string {
	return fmt.Sprintf("InternalStore - Key: %s (% x) Val: %s (% x)", i.key, i.key, i.val, i.val)
}

// NewInternalRow builds an InternalRow from an unprefixed key and a value.
// Both slices are retained, not copied.
func NewInternalRow(key, val []byte) *InternalRow {
	return &InternalRow{
		key: key,
		val: val,
	}
}

// NewInternalRowKV decodes an InternalRow from its stored form. The first
// byte of key (the type byte) is dropped; the remainder of key and the whole
// of value are retained without copying. An empty key is rejected with an
// error instead of panicking on the slice expression.
func NewInternalRowKV(key, value []byte) (*InternalRow, error) {
	if len(key) < 1 {
		return nil, fmt.Errorf("invalid internal row key, missing type byte")
	}
	return &InternalRow{
		key: key[1:],
		val: value,
	}, nil
}
// FIELD definition

// FieldRow associates a numeric field index with a field name. The key is
// 'f' followed by the index as a little-endian uint16; the value is the name
// followed by ByteSeparator.
type FieldRow struct {
	index uint16
	name  string
}

// Key returns the three-byte encoded key.
func (f *FieldRow) Key() []byte {
	key := []byte{'f', 0, 0}
	binary.LittleEndian.PutUint16(key[1:], f.index)
	return key
}

// KeySize returns the encoded key length: type byte plus uint16 index.
func (f *FieldRow) KeySize() int {
	return 1 + 2
}

// KeyTo writes the encoded key into the first three bytes of buf.
func (f *FieldRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 'f'
	binary.LittleEndian.PutUint16(buf[1:3], f.index)
	return 3, nil
}

// Value returns the field name terminated by ByteSeparator.
func (f *FieldRow) Value() []byte {
	out := make([]byte, f.ValueSize())
	n, _ := f.ValueTo(out) // ValueTo never returns an error
	return out[:n]
}

// ValueSize returns the encoded value length: the name plus its separator.
func (f *FieldRow) ValueSize() int {
	return 1 + len(f.name)
}

// ValueTo writes the name and the trailing separator into buf and returns
// the number of bytes written.
func (f *FieldRow) ValueTo(buf []byte) (int, error) {
	written := copy(buf, f.name)
	buf[written] = ByteSeparator
	return written + 1, nil
}

// String renders the row for debugging output.
func (f *FieldRow) String() string {
	return fmt.Sprintf("Field: %d Name: %s", f.index, f.name)
}

// NewFieldRow builds a FieldRow from a field index and name.
func NewFieldRow(index uint16, name string) *FieldRow {
	row := new(FieldRow)
	row.index = index
	row.name = name
	return row
}

// NewFieldRowKV decodes a FieldRow from its stored form. It fails when key
// is shorter than three bytes or when value has no ByteSeparator.
func NewFieldRowKV(key, value []byte) (*FieldRow, error) {
	keyReader := bytes.NewReader(key)
	if _, err := keyReader.ReadByte(); err != nil { // skip the type byte
		return nil, err
	}
	var index uint16
	if err := binary.Read(keyReader, binary.LittleEndian, &index); err != nil {
		return nil, err
	}

	name, err := bytes.NewBuffer(value).ReadString(ByteSeparator)
	if err != nil {
		return nil, err
	}
	// ReadString includes the delimiter; drop it.
	return &FieldRow{index: index, name: name[:len(name)-1]}, nil
}
// DICTIONARY

// DictionaryRow holds a count for a (field, term) pair. The key is 'd',
// the field as a little-endian uint16, then the raw term bytes; the value is
// the count encoded as a uvarint.
type DictionaryRow struct {
	field uint16
	term  []byte
	count uint64
}

// Key returns the encoded key.
func (dr *DictionaryRow) Key() []byte {
	buf := make([]byte, dr.KeySize())
	size, _ := dr.KeyTo(buf) // KeyTo never returns an error
	return buf[:size]
}

// KeySize returns the encoded key length: type byte, uint16 field, term.
func (dr *DictionaryRow) KeySize() int {
	return len(dr.term) + 3
}

// KeyTo writes the encoded key into buf and returns the number of bytes
// written. buf must have room for at least the three-byte header.
func (dr *DictionaryRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 'd'
	binary.LittleEndian.PutUint16(buf[1:3], dr.field)
	size := copy(buf[3:], dr.term)
	return size + 3, nil
}

// Value returns the count encoded as a uvarint.
func (dr *DictionaryRow) Value() []byte {
	buf := make([]byte, dr.ValueSize())
	size, _ := dr.ValueTo(buf) // ValueTo never returns an error
	return buf[:size]
}

// ValueSize returns an upper bound on the encoded value length (the maximum
// size of a 64-bit varint); ValueTo reports the exact number of bytes used.
func (dr *DictionaryRow) ValueSize() int {
	return binary.MaxVarintLen64
}

// ValueTo writes the uvarint-encoded count into buf and returns the number
// of bytes written.
func (dr *DictionaryRow) ValueTo(buf []byte) (int, error) {
	used := binary.PutUvarint(buf, dr.count)
	return used, nil
}

// String renders the row for debugging output.
func (dr *DictionaryRow) String() string {
	return fmt.Sprintf("Dictionary Term: `%s` Field: %d Count: %d ", string(dr.term), dr.field, dr.count)
}

// NewDictionaryRow builds a DictionaryRow. The term slice is retained, not
// copied.
func NewDictionaryRow(term []byte, field uint16, count uint64) *DictionaryRow {
	return &DictionaryRow{
		term:  term,
		field: field,
		count: count,
	}
}

// NewDictionaryRowKV decodes a DictionaryRow from its stored key and value.
func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) {
	rv, err := NewDictionaryRowK(key)
	if err != nil {
		return nil, err
	}

	if err = rv.parseDictionaryV(value); err != nil {
		return nil, err
	}
	return rv, nil
}

// NewDictionaryRowK decodes the field and term from a stored key; the count
// is left at zero. It fails when key is shorter than the three-byte header
// or when the term contains ByteSeparator. The latter case used to yield a
// nil row together with a nil error, which made NewDictionaryRowKV
// dereference a nil pointer.
func NewDictionaryRowK(key []byte) (*DictionaryRow, error) {
	rv := DictionaryRow{}
	buf := bytes.NewBuffer(key)
	_, err := buf.ReadByte() // type
	if err != nil {
		return nil, err
	}

	err = binary.Read(buf, binary.LittleEndian, &rv.field)
	if err != nil {
		return nil, err
	}

	// The term runs to the end of the key, so the read is expected to stop
	// at EOF rather than at a separator.
	rv.term, err = buf.ReadBytes(ByteSeparator)
	if err == nil {
		return nil, fmt.Errorf("invalid dictionary key, unexpected byte separator in term")
	}
	if err != io.EOF {
		return nil, err
	}

	return &rv, nil
}

// parseDictionaryV decodes the uvarint count from value into dr.count.
func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
	buf := bytes.NewBuffer(value)
	count, err := binary.ReadUvarint(buf)
	if err != nil {
		return err
	}
	dr.count = count
	return nil
}
// TERM FIELD FREQUENCY

// TermVector is one vector entry of a TermFrequencyRow: a field, a position,
// start and end values, and an optional list of array positions.
type TermVector struct {
	field          uint16
	arrayPositions []uint64
	pos            uint64
	start          uint64
	end            uint64
}

// String renders the vector for debugging output.
func (tv *TermVector) String() string {
	return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
}

// TermFrequencyRow holds the frequency, norm and term vectors of a term in
// one field of one document.
//
// Key layout:   't' | field (uint16 LE) | term | ByteSeparator | doc
// Value layout: freq | norm bits | per vector: field, pos, start, end,
// number of array positions, array positions... (all uvarints).
type TermFrequencyRow struct {
	term    []byte
	field   uint16
	doc     []byte
	freq    uint64
	norm    float32
	vectors []*TermVector
}

// Term returns the row's term slice (not a copy).
func (tfr *TermFrequencyRow) Term() []byte {
	return tfr.term
}

// Freq returns the row's frequency.
func (tfr *TermFrequencyRow) Freq() uint64 {
	return tfr.freq
}

// ScanPrefixForField returns the key prefix shared by all term frequency
// rows of this row's field: 't' plus the field.
func (tfr *TermFrequencyRow) ScanPrefixForField() []byte {
	buf := make([]byte, 3)
	buf[0] = 't'
	binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
	return buf
}

// ScanPrefixForFieldTermPrefix returns 't', the field and the term without
// a terminating separator, so it also prefixes keys of longer terms.
func (tfr *TermFrequencyRow) ScanPrefixForFieldTermPrefix() []byte {
	buf := make([]byte, 3+len(tfr.term))
	buf[0] = 't'
	binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
	copy(buf[3:], tfr.term)
	return buf
}

// ScanPrefixForFieldTerm returns 't', the field, the term and the
// terminating separator, i.e. the full key minus the document id.
func (tfr *TermFrequencyRow) ScanPrefixForFieldTerm() []byte {
	buf := make([]byte, 3+len(tfr.term)+1)
	buf[0] = 't'
	binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
	termLen := copy(buf[3:], tfr.term)
	buf[3+termLen] = ByteSeparator
	return buf
}

// Key returns the encoded key.
func (tfr *TermFrequencyRow) Key() []byte {
	buf := make([]byte, tfr.KeySize())
	size, _ := tfr.KeyTo(buf) // KeyTo never returns an error
	return buf[:size]
}

// KeySize returns the encoded key length.
func (tfr *TermFrequencyRow) KeySize() int {
	return 3 + len(tfr.term) + 1 + len(tfr.doc)
}

// KeyTo writes the encoded key into buf and returns the number of bytes
// written. buf should be at least KeySize() bytes long.
func (tfr *TermFrequencyRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 't'
	binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
	termLen := copy(buf[3:], tfr.term)
	buf[3+termLen] = ByteSeparator
	docLen := copy(buf[3+termLen+1:], tfr.doc)
	return 3 + termLen + 1 + docLen, nil
}

// DictionaryRowKey returns the key of the dictionary row for this row's
// term and field.
func (tfr *TermFrequencyRow) DictionaryRowKey() []byte {
	dr := NewDictionaryRow(tfr.term, tfr.field, 0)
	return dr.Key()
}

// DictionaryRowKeySize returns the size of the key DictionaryRowKey builds.
func (tfr *TermFrequencyRow) DictionaryRowKeySize() int {
	dr := NewDictionaryRow(tfr.term, tfr.field, 0)
	return dr.KeySize()
}

// DictionaryRowKeyTo writes the dictionary row key for this row's term and
// field into buf and returns the number of bytes written.
func (tfr *TermFrequencyRow) DictionaryRowKeyTo(buf []byte) (int, error) {
	dr := NewDictionaryRow(tfr.term, tfr.field, 0)
	return dr.KeyTo(buf)
}

// Value returns the encoded value.
func (tfr *TermFrequencyRow) Value() []byte {
	buf := make([]byte, tfr.ValueSize())
	size, _ := tfr.ValueTo(buf) // ValueTo never returns an error
	return buf[:size]
}

// ValueSize returns an upper bound on the encoded value length, assuming
// every uvarint takes its maximum size. ValueTo reports the exact length.
func (tfr *TermFrequencyRow) ValueSize() int {
	bufLen := binary.MaxVarintLen64 + binary.MaxVarintLen64
	for _, vector := range tfr.vectors {
		bufLen += (binary.MaxVarintLen64 * 4) + (1+len(vector.arrayPositions))*binary.MaxVarintLen64
	}
	return bufLen
}

// ValueTo writes the encoded value into buf and returns the number of bytes
// written. Each write slices a full MaxVarintLen64 window out of buf, so buf
// should be sized with ValueSize().
func (tfr *TermFrequencyRow) ValueTo(buf []byte) (int, error) {
	used := binary.PutUvarint(buf[:binary.MaxVarintLen64], tfr.freq)

	// The norm is stored as the uvarint of its IEEE 754 bit pattern.
	normuint32 := math.Float32bits(tfr.norm)
	newbuf := buf[used : used+binary.MaxVarintLen64]
	used += binary.PutUvarint(newbuf, uint64(normuint32))

	for _, vector := range tfr.vectors {
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(vector.field))
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.pos)
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.start)
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.end)
		used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(len(vector.arrayPositions)))
		for _, arrayPosition := range vector.arrayPositions {
			used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], arrayPosition)
		}
	}
	return used, nil
}

// String renders the row for debugging output.
func (tfr *TermFrequencyRow) String() string {
	return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors)
}

// NewTermFrequencyRow builds a row without term vectors. The term slice is
// retained, not copied.
func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow {
	return &TermFrequencyRow{
		term:  term,
		field: field,
		doc:   []byte(doc),
		freq:  freq,
		norm:  norm,
	}
}

// NewTermFrequencyRowWithTermVectors builds a row carrying term vectors. The
// term and vectors slices are retained, not copied.
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
	return &TermFrequencyRow{
		term:    term,
		field:   field,
		doc:     []byte(doc),
		freq:    freq,
		norm:    norm,
		vectors: vectors,
	}
}

// NewTermFrequencyRowK decodes the field, term and document id from a stored
// key. The returned row's term and doc are sub-slices of key, not copies.
// It fails when key is shorter than the header, has no separator after the
// term, or has an empty document id.
func NewTermFrequencyRowK(key []byte) (*TermFrequencyRow, error) {
	rv := TermFrequencyRow{}
	keyLen := len(key)
	if keyLen < 3 {
		return nil, fmt.Errorf("invalid term frequency key, no valid field")
	}
	rv.field = binary.LittleEndian.Uint16(key[1:3])

	termEndPos := bytes.IndexByte(key[3:], ByteSeparator)
	if termEndPos < 0 {
		return nil, fmt.Errorf("invalid term frequency key, no byte separator terminating term")
	}
	rv.term = key[3 : 3+termEndPos]

	docLen := keyLen - (3 + termEndPos + 1)
	if docLen < 1 {
		return nil, fmt.Errorf("invalid term frequency key, empty docid")
	}
	rv.doc = key[3+termEndPos+1:]

	return &rv, nil
}

// parseV decodes freq, norm and any term vectors from value into tfr.
// Decoded vectors are appended to tfr.vectors. It returns an error when
// value is truncated, contains an overflowing uvarint, or declares more
// array positions than the remaining bytes could possibly encode.
func (tfr *TermFrequencyRow) parseV(value []byte) error {
	offset := 0
	// next decodes the uvarint at offset and advances past it. It reports
	// false when the bytes there are truncated or overflow 64 bits.
	next := func() (uint64, bool) {
		v, n := binary.Uvarint(value[offset:])
		if n <= 0 {
			return 0, false
		}
		offset += n
		return v, true
	}

	var ok bool
	if tfr.freq, ok = next(); !ok {
		return fmt.Errorf("invalid term frequency value, invalid frequency")
	}

	norm, ok := next()
	if !ok {
		return fmt.Errorf("invalid term frequency value, no norm")
	}
	tfr.norm = math.Float32frombits(uint32(norm))

	// Whatever follows is a sequence of term vectors running to the end.
	for offset < len(value) {
		field, ok := next()
		if !ok {
			return fmt.Errorf("invalid term frequency value, vector field invalid")
		}
		tv := &TermVector{field: uint16(field)}

		if tv.pos, ok = next(); !ok {
			return fmt.Errorf("invalid term frequency value, vector contains no position")
		}
		if tv.start, ok = next(); !ok {
			return fmt.Errorf("invalid term frequency value, vector contains no start")
		}
		if tv.end, ok = next(); !ok {
			return fmt.Errorf("invalid term frequency value, vector contains no end")
		}

		arrayPositionsLen, ok := next()
		if !ok {
			return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen")
		}
		// Every array position occupies at least one byte, so a count larger
		// than the remaining input is corrupt. Checking before allocating
		// keeps bad data from triggering a huge allocation or a panic.
		if remaining := len(value) - offset; arrayPositionsLen > uint64(remaining) {
			return fmt.Errorf("invalid term frequency value, vector arrayPositionLen %d exceeds remaining %d bytes", arrayPositionsLen, remaining)
		}

		if arrayPositionsLen > 0 {
			tv.arrayPositions = make([]uint64, arrayPositionsLen)
			for i := range tv.arrayPositions {
				if tv.arrayPositions[i], ok = next(); !ok {
					return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i)
				}
			}
		}

		tfr.vectors = append(tfr.vectors, tv)
	}

	return nil
}

// NewTermFrequencyRowKV decodes a TermFrequencyRow from its stored key and
// value.
func NewTermFrequencyRowKV(key, value []byte) (*TermFrequencyRow, error) {
	rv, err := NewTermFrequencyRowK(key)
	if err != nil {
		return nil, err
	}

	if err = rv.parseV(value); err != nil {
		return nil, err
	}
	return rv, nil
}
// BackIndexRow holds, for one document, the term entries and stored-field
// entries from which the document's term frequency and stored row keys can
// be rebuilt. The key is 'b' followed by the document id; the value is the
// serialized BackIndexRowValue.
type BackIndexRow struct {
	doc           []byte
	termEntries   []*BackIndexTermEntry
	storedEntries []*BackIndexStoreEntry
}

// AllTermKeys returns the term frequency row key for every term entry of
// this document. A nil receiver yields nil.
func (br *BackIndexRow) AllTermKeys() [][]byte {
	if br == nil {
		return nil
	}
	docID := string(br.doc)
	keys := make([][]byte, 0, len(br.termEntries))
	for _, entry := range br.termEntries {
		row := NewTermFrequencyRow([]byte(entry.GetTerm()), uint16(entry.GetField()), docID, 0, 0)
		keys = append(keys, row.Key())
	}
	return keys
}

// AllStoredKeys returns the stored row key for every stored entry of this
// document. A nil receiver yields nil.
func (br *BackIndexRow) AllStoredKeys() [][]byte {
	if br == nil {
		return nil
	}
	docID := string(br.doc)
	keys := make([][]byte, 0, len(br.storedEntries))
	for _, entry := range br.storedEntries {
		// The type and value do not take part in the key; placeholders are
		// passed for them.
		row := NewStoredRow(docID, uint16(entry.GetField()), entry.GetArrayPositions(), 'x', []byte{})
		keys = append(keys, row.Key())
	}
	return keys
}

// Key returns the encoded key: 'b' followed by the document id.
func (br *BackIndexRow) Key() []byte {
	key := make([]byte, br.KeySize())
	n, _ := br.KeyTo(key) // KeyTo never returns an error
	return key[:n]
}

// KeySize returns the encoded key length.
func (br *BackIndexRow) KeySize() int {
	return 1 + len(br.doc)
}

// KeyTo writes the encoded key into buf and returns the number of bytes
// written.
func (br *BackIndexRow) KeyTo(buf []byte) (int, error) {
	buf[0] = 'b'
	return 1 + copy(buf[1:], br.doc), nil
}

// Value returns the serialized entries.
func (br *BackIndexRow) Value() []byte {
	out := make([]byte, br.ValueSize())
	n, _ := br.ValueTo(out)
	return out[:n]
}

// ValueSize returns the size reported by BackIndexRowValue.Size for this
// row's entries.
func (br *BackIndexRow) ValueSize() int {
	value := BackIndexRowValue{
		TermEntries:   br.termEntries,
		StoredEntries: br.storedEntries,
	}
	return value.Size()
}

// ValueTo marshals this row's entries into buf via
// BackIndexRowValue.MarshalTo and returns its result.
func (br *BackIndexRow) ValueTo(buf []byte) (int, error) {
	value := BackIndexRowValue{
		TermEntries:   br.termEntries,
		StoredEntries: br.storedEntries,
	}
	return value.MarshalTo(buf)
}

// String renders the row for debugging output.
func (br *BackIndexRow) String() string {
	return fmt.Sprintf("Backindex DocId: `%s` Term Entries: %v, Stored Entries: %v", string(br.doc), br.termEntries, br.storedEntries)
}

// NewBackIndexRow builds a BackIndexRow for doc with the given entries. The
// entry slices are retained, not copied.
func NewBackIndexRow(doc string, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
	row := new(BackIndexRow)
	row.doc = []byte(doc)
	row.termEntries = entries
	row.storedEntries = storedFields
	return row
}

// NewBackIndexRowKV decodes a BackIndexRow from its stored key and value.
// The document id is everything after the type byte, up to a ByteSeparator
// if one is present. It fails when key is empty, when the key ends right
// after the type byte, or when value cannot be unmarshalled.
func NewBackIndexRowKV(key, value []byte) (*BackIndexRow, error) {
	keyBuf := bytes.NewBuffer(key)
	if _, err := keyBuf.ReadByte(); err != nil { // skip the type byte
		return nil, err
	}

	doc, err := keyBuf.ReadBytes(ByteSeparator)
	switch {
	case err == nil:
		// A separator was found and is included in doc; drop it.
		doc = doc[:len(doc)-1]
	case err != io.EOF:
		return nil, err
	case len(doc) < 1:
		return nil, fmt.Errorf("invalid doc length 0 - % x", key)
	}

	var parsed BackIndexRowValue
	if err := proto.Unmarshal(value, &parsed); err != nil {
		return nil, err
	}

	return &BackIndexRow{
		doc:           doc,
		termEntries:   parsed.TermEntries,
		storedEntries: parsed.StoredEntries,
	}, nil
}
// STORED

// StoredRow holds the stored value of one field of one document.
//
// Key layout:   's' | doc | ByteSeparator | field (uint16 LE) | array
// positions as uvarints.
// Value layout: type byte | raw value bytes.
type StoredRow struct {
	doc            []byte
	field          uint16
	arrayPositions []uint64
	typ            byte
	value          []byte
}

// Key returns the encoded key.
func (s *StoredRow) Key() []byte {
	buf := make([]byte, s.KeySize())
	size, _ := s.KeyTo(buf) // KeyTo never returns an error
	return buf[0:size]
}

// KeySize returns an upper bound on the encoded key length, assuming every
// array position takes the maximum uvarint size. KeyTo reports the exact
// length.
func (s *StoredRow) KeySize() int {
	return 1 + len(s.doc) + 1 + 2 + (binary.MaxVarintLen64 * len(s.arrayPositions))
}

// KeyTo writes the encoded key into buf and returns the number of bytes
// written. buf should be at least KeySize() bytes long.
func (s *StoredRow) KeyTo(buf []byte) (int, error) {
	docLen := len(s.doc)
	buf[0] = 's'
	copy(buf[1:], s.doc)
	buf[1+docLen] = ByteSeparator
	binary.LittleEndian.PutUint16(buf[1+docLen+1:], s.field)
	bytesUsed := 1 + docLen + 1 + 2
	for _, arrayPosition := range s.arrayPositions {
		bytesUsed += binary.PutUvarint(buf[bytesUsed:], arrayPosition)
	}
	return bytesUsed, nil
}

// Value returns the encoded value: the type byte followed by the raw value.
func (s *StoredRow) Value() []byte {
	buf := make([]byte, s.ValueSize())
	size, _ := s.ValueTo(buf) // ValueTo never returns an error
	return buf[:size]
}

// ValueSize returns the encoded value length.
func (s *StoredRow) ValueSize() int {
	return len(s.value) + 1
}

// ValueTo writes the encoded value into buf and returns the number of bytes
// written.
func (s *StoredRow) ValueTo(buf []byte) (int, error) {
	buf[0] = s.typ
	used := copy(buf[1:], s.value)
	return used + 1, nil
}

// String renders the row for debugging output.
func (s *StoredRow) String() string {
	return fmt.Sprintf("Document: %s Field %d, Array Positions: %v, Type: %s Value: %s", s.doc, s.field, s.arrayPositions, string(s.typ), s.value)
}

// ScanPrefixForDoc returns the key prefix shared by all stored rows of this
// row's document: 's', the document id and the separator.
func (s *StoredRow) ScanPrefixForDoc() []byte {
	docLen := len(s.doc)
	buf := make([]byte, 1+docLen+1)
	buf[0] = 's'
	copy(buf[1:], s.doc)
	buf[1+docLen] = ByteSeparator
	return buf
}

// NewStoredRow builds a StoredRow. The arrayPositions and value slices are
// retained, not copied.
func NewStoredRow(doc string, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
	return &StoredRow{
		doc:            []byte(doc),
		field:          field,
		arrayPositions: arrayPositions,
		typ:            typ,
		value:          value,
	}
}

// NewStoredRowK decodes the document id, field and array positions from a
// stored key; the type and value are left unset. It fails when the key is
// empty, the document id is empty or unterminated, the field is missing, or
// an array position is not a valid uvarint.
func NewStoredRowK(key []byte) (*StoredRow, error) {
	rv := StoredRow{}

	buf := bytes.NewBuffer(key)
	_, err := buf.ReadByte() // type
	if err != nil {
		return nil, err
	}

	rv.doc, err = buf.ReadBytes(ByteSeparator)
	if len(rv.doc) < 2 { // 1 for min doc id length, 1 for separator
		return nil, fmt.Errorf("invalid doc length 0")
	}
	if err != nil {
		// No separator was found, so the last byte read is not one and
		// nothing remains for the field.
		return nil, err
	}
	rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte

	err = binary.Read(buf, binary.LittleEndian, &rv.field)
	if err != nil {
		return nil, err
	}

	// Array positions run to the end of the key. A clean io.EOF ends the
	// list; any other decode error is reported instead of being dropped.
	rv.arrayPositions = make([]uint64, 0)
	for {
		arrayPos, err := binary.ReadUvarint(buf)
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}
		rv.arrayPositions = append(rv.arrayPositions, arrayPos)
	}

	return &rv, nil
}

// NewStoredRowKV decodes a StoredRow from its stored key and value. The
// returned row's value is a sub-slice of the value argument, not a copy. An
// empty value is rejected with an error instead of panicking on value[0].
func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
	rv, err := NewStoredRowK(key)
	if err != nil {
		return nil, err
	}
	if len(value) < 1 {
		return nil, fmt.Errorf("invalid stored row value, missing type byte")
	}
	rv.typ = value[0]
	rv.value = value[1:]
	return rv, nil
}