0
0
bleve/index/upside_down/row.go
Marty Schoch c526a38369 major refactor of analysis files, now wired up to registry
ultimately this is to make it more convenient for us to wire up
different elements of the analysis pipeline, without having to
preload everything into memory before we need it

separately the index layer now has a mechanism for storing
internal key/value pairs.  this is expected to be used to
store the mapping, and possibly other pieces of data by the
top layer, but not exposed to the user at the top.
2014-08-13 21:14:47 -04:00

522 lines
12 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package upside_down
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"math"
)
// BYTE_SEPARATOR is the delimiter byte the row encoders in this file write
// after variable-length components (field names, terms, document ids) so the
// decoders can find where each component ends.
const BYTE_SEPARATOR byte = 0xff
// UpsideDownCouchRowStream is a channel that carries UpsideDownCouchRow values.
type UpsideDownCouchRowStream chan UpsideDownCouchRow
// UpsideDownCouchRow is the interface shared by the row types in this file:
// every row can serialize itself into a key and a value.
type UpsideDownCouchRow interface {
// Key returns the encoded key bytes of the row.
Key() []byte
// Value returns the encoded value bytes of the row.
Value() []byte
}
// ParseFromKeyValue decodes a raw key/value pair into the row type selected
// by the first byte of key:
//
//	'v' VersionRow        'f' FieldRow      't' TermFrequencyRow
//	'b' BackIndexRow      's' StoredRow     'i' InternalRow
//
// It returns an error when key is empty, when the type byte is not one of the
// above, or when the selected decoder fails. On error the returned row is a
// true nil interface value, so callers may compare it against nil.
func ParseFromKeyValue(key, value []byte) (UpsideDownCouchRow, error) {
	if len(key) == 0 {
		return nil, fmt.Errorf("Invalid empty key")
	}

	// Each decoder returns a concrete pointer type. Returning that pointer
	// directly would wrap a nil pointer in a non-nil interface when decoding
	// fails, so the result is only handed back after the error check below.
	var (
		row UpsideDownCouchRow
		err error
	)
	switch key[0] {
	case 'v':
		row, err = NewVersionRowKV(key, value)
	case 'f':
		row, err = NewFieldRowKV(key, value)
	case 't':
		row, err = NewTermFrequencyRowKV(key, value)
	case 'b':
		row, err = NewBackIndexRowKV(key, value)
	case 's':
		row, err = NewStoredRowKV(key, value)
	case 'i':
		row, err = NewInternalRowKV(key, value)
	default:
		return nil, fmt.Errorf("Unknown field type '%s'", string(key[0]))
	}
	if err != nil {
		return nil, err
	}
	return row, nil
}
// VERSION

// VersionRow stores a single version number under the fixed key 'v'.
type VersionRow struct {
	version uint8
}

// Key returns the one-byte key {'v'}.
func (v *VersionRow) Key() []byte {
	key := make([]byte, 1)
	key[0] = 'v'
	return key
}

// Value returns the version number as a single byte.
func (v *VersionRow) Value() []byte {
	val := make([]byte, 1)
	val[0] = v.version
	return val
}

// String returns a human-readable description of the row.
func (v *VersionRow) String() string {
	return fmt.Sprintf("Version: %d", v.version)
}

// NewVersionRow creates a VersionRow holding version.
func NewVersionRow(version uint8) *VersionRow {
	row := new(VersionRow)
	row.version = version
	return row
}
// NewVersionRowKV decodes a VersionRow from a stored key/value pair. Only the
// first byte of value is consumed; key is not inspected. An error is returned
// when value is empty.
func NewVersionRowKV(key, value []byte) (*VersionRow, error) {
	row := new(VersionRow)
	if err := binary.Read(bytes.NewReader(value), binary.LittleEndian, &row.version); err != nil {
		return nil, err
	}
	return row, nil
}
// INTERNAL STORAGE

// InternalRow holds an arbitrary key/value pair; its encoded key is the
// caller-supplied key prefixed with the type byte 'i'.
type InternalRow struct {
	key []byte
	val []byte
}

// Key returns 'i' followed by the row's key bytes, in a freshly allocated
// slice.
func (i *InternalRow) Key() []byte {
	encoded := make([]byte, 0, len(i.key)+1)
	encoded = append(encoded, 'i')
	return append(encoded, i.key...)
}

// Value returns the stored value slice as-is (no copy is made).
func (i *InternalRow) Value() []byte {
	return i.val
}

// String returns a description showing key and value both as text and as
// space-separated hex.
func (i *InternalRow) String() string {
	const layout = "InternalStore - Key: %s (% x) Val: %s (% x)"
	return fmt.Sprintf(layout, i.key, i.key, i.val, i.val)
}

// NewInternalRow creates an InternalRow that references key and val directly.
func NewInternalRow(key, val []byte) *InternalRow {
	row := new(InternalRow)
	row.key, row.val = key, val
	return row
}
// NewInternalRowKV decodes an InternalRow from a stored key/value pair. The
// first byte of key (the row type) is dropped and the remainder becomes the
// row's key; value is used unchanged. Both slices are referenced, not copied.
//
// An error is returned when key is empty, since there is then no type byte to
// strip.
func NewInternalRowKV(key, value []byte) (*InternalRow, error) {
	if len(key) < 1 {
		return nil, fmt.Errorf("invalid internal row key: empty")
	}
	return &InternalRow{
		key: key[1:],
		val: value,
	}, nil
}
// FIELD definition

// FieldRow associates a numeric field index with a field name.
type FieldRow struct {
	index uint16
	name  string
}

// Key returns 'f' followed by the field index as a little-endian uint16.
func (f *FieldRow) Key() []byte {
	key := []byte{'f', 0, 0}
	binary.LittleEndian.PutUint16(key[1:], f.index)
	return key
}

// Value returns the field name followed by BYTE_SEPARATOR.
func (f *FieldRow) Value() []byte {
	val := make([]byte, len(f.name)+1)
	n := copy(val, f.name)
	val[n] = BYTE_SEPARATOR
	return val
}

// String returns a human-readable description of the row.
func (f *FieldRow) String() string {
	return fmt.Sprintf("Field: %d Name: %s", f.index, f.name)
}

// NewFieldRow creates a FieldRow for the given index and name.
func NewFieldRow(index uint16, name string) *FieldRow {
	row := new(FieldRow)
	row.index = index
	row.name = name
	return row
}
// NewFieldRowKV decodes a FieldRow from a stored key/value pair. The key is
// expected to be the type byte followed by a little-endian uint16 index; the
// value is the field name terminated by BYTE_SEPARATOR. An error is returned
// when the key is too short to hold the index or the value has no separator.
func NewFieldRowKV(key, value []byte) (*FieldRow, error) {
	row := new(FieldRow)

	keyBuf := bytes.NewBuffer(key)
	keyBuf.Next(1) // skip the row type byte (a no-op on an empty key)
	if err := binary.Read(keyBuf, binary.LittleEndian, &row.index); err != nil {
		return nil, err
	}

	name, err := bytes.NewBuffer(value).ReadString(BYTE_SEPARATOR)
	if err != nil {
		return nil, err
	}
	// ReadString succeeded, so name ends with the separator; drop it.
	row.name = name[:len(name)-1]
	return row, nil
}
// TERM FIELD FREQUENCY

// TermVector holds the four numbers stored per term vector record: a field
// index plus pos, start and end values.
type TermVector struct {
	field uint16
	pos   uint64
	start uint64
	end   uint64
}

// String returns a human-readable description of the term vector.
func (tv *TermVector) String() string {
	return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end)
}

// TermFrequencyRow describes a term within one field of one document: its
// frequency, norm, and optional term vectors.
type TermFrequencyRow struct {
	term    []byte
	field   uint16
	doc     []byte
	freq    uint64
	norm    float32
	vectors []*TermVector
}

// ScanPrefixForField returns 't' followed by the little-endian field index,
// the common prefix of all term frequency keys for this row's field.
func (tfr *TermFrequencyRow) ScanPrefixForField() []byte {
	prefix := []byte{'t', 0, 0}
	binary.LittleEndian.PutUint16(prefix[1:], tfr.field)
	return prefix
}

// ScanPrefixForFieldTermPrefix returns the field prefix followed by the term
// bytes, without a trailing separator.
func (tfr *TermFrequencyRow) ScanPrefixForFieldTermPrefix() []byte {
	prefix := make([]byte, 3, 3+len(tfr.term))
	prefix[0] = 't'
	binary.LittleEndian.PutUint16(prefix[1:3], tfr.field)
	return append(prefix, tfr.term...)
}

// ScanPrefixForFieldTerm returns the field prefix, the term bytes and a
// terminating BYTE_SEPARATOR.
func (tfr *TermFrequencyRow) ScanPrefixForFieldTerm() []byte {
	prefix := make([]byte, 3, 3+len(tfr.term)+1)
	prefix[0] = 't'
	binary.LittleEndian.PutUint16(prefix[1:3], tfr.field)
	prefix = append(prefix, tfr.term...)
	return append(prefix, BYTE_SEPARATOR)
}

// Key returns 't' | field (uint16 LE) | term | BYTE_SEPARATOR | doc.
func (tfr *TermFrequencyRow) Key() []byte {
	key := make([]byte, 3, 3+len(tfr.term)+1+len(tfr.doc))
	key[0] = 't'
	binary.LittleEndian.PutUint16(key[1:3], tfr.field)
	key = append(key, tfr.term...)
	key = append(key, BYTE_SEPARATOR)
	return append(key, tfr.doc...)
}

// Value returns freq (uint64 LE) | norm (float32 bits, LE) followed by one
// fixed-size record per term vector: field (uint16) | pos | start | end
// (uint64 each), all little-endian.
func (tfr *TermFrequencyRow) Value() []byte {
	const (
		headerSize = 8 + 4
		vectorSize = 2 + 8 + 8 + 8
	)
	out := make([]byte, headerSize+len(tfr.vectors)*vectorSize)
	binary.LittleEndian.PutUint64(out[:8], tfr.freq)
	binary.LittleEndian.PutUint32(out[8:headerSize], math.Float32bits(tfr.norm))
	for i, tv := range tfr.vectors {
		rec := out[headerSize+i*vectorSize : headerSize+(i+1)*vectorSize]
		binary.LittleEndian.PutUint16(rec[0:2], tv.field)
		binary.LittleEndian.PutUint64(rec[2:10], tv.pos)
		binary.LittleEndian.PutUint64(rec[10:18], tv.start)
		binary.LittleEndian.PutUint64(rec[18:26], tv.end)
	}
	return out
}

// String returns a human-readable description of the row.
func (tfr *TermFrequencyRow) String() string {
	term, doc := string(tfr.term), string(tfr.doc)
	return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", term, tfr.field, doc, tfr.freq, tfr.norm, tfr.vectors)
}

// NewTermFrequencyRow creates a row without term vectors.
func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow {
	row := new(TermFrequencyRow)
	row.term = term
	row.field = field
	row.doc = []byte(doc)
	row.freq = freq
	row.norm = norm
	return row
}

// NewTermFrequencyRowWithTermVectors creates a row that also carries the
// supplied term vectors.
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
	row := new(TermFrequencyRow)
	row.term = term
	row.field = field
	row.doc = []byte(doc)
	row.freq = freq
	row.norm = norm
	row.vectors = vectors
	return row
}
// NewTermFrequencyRowKV decodes a TermFrequencyRow from a stored key/value
// pair.
//
// Key layout:   type byte | field (uint16 LE) | term | BYTE_SEPARATOR | doc id
// Value layout: freq (uint64 LE) | norm (float32 LE) | zero or more term
// vector records, each field (uint16 LE) | pos | start | end (uint64 LE each).
//
// An error is returned when the key is too short to hold the field, when the
// term separator is missing, when the doc id portion contains a further
// BYTE_SEPARATOR, or when the value is truncated. The returned row is nil
// whenever the error is non-nil.
func NewTermFrequencyRowKV(key, value []byte) (*TermFrequencyRow, error) {
	rv := TermFrequencyRow{
		doc: []byte(""),
	}

	keyBuf := bytes.NewBuffer(key)
	keyBuf.Next(1) // skip the row type byte
	if err := binary.Read(keyBuf, binary.LittleEndian, &rv.field); err != nil {
		return nil, err
	}

	term, err := keyBuf.ReadBytes(BYTE_SEPARATOR)
	if err != nil {
		return nil, err
	}
	rv.term = term[:len(term)-1] // trim off separator byte

	// The rest of the key is the doc id, so this read must run into the end
	// of the key. Finding another separator (err == nil) means the key is
	// malformed; it must not be reported as success with a nil row.
	doc, err := keyBuf.ReadBytes(BYTE_SEPARATOR)
	switch {
	case err == nil:
		return nil, fmt.Errorf("invalid term frequency key: unexpected separator in doc id")
	case err != io.EOF:
		return nil, err
	}
	if doc != nil {
		rv.doc = doc
	}

	valBuf := bytes.NewBuffer(value)
	if err := binary.Read(valBuf, binary.LittleEndian, &rv.freq); err != nil {
		return nil, err
	}
	if err := binary.Read(valBuf, binary.LittleEndian, &rv.norm); err != nil {
		return nil, err
	}

	// Any remaining bytes must form whole term vector records.
	for valBuf.Len() > 0 {
		tv := TermVector{}
		for _, dst := range []interface{}{&tv.field, &tv.pos, &tv.start, &tv.end} {
			if err := binary.Read(valBuf, binary.LittleEndian, dst); err != nil {
				if err == io.EOF {
					// the record was cut exactly at a field boundary
					err = io.ErrUnexpectedEOF
				}
				return nil, err
			}
		}
		rv.vectors = append(rv.vectors, &tv)
	}
	return &rv, nil
}
// BackIndexEntry is one (term, field) pair recorded in a BackIndexRow.
type BackIndexEntry struct {
	term  []byte
	field uint16
}

// String returns a human-readable description of the entry.
func (bie *BackIndexEntry) String() string {
	return fmt.Sprintf("Term: `%s` Field: %d", string(bie.term), bie.field)
}

// BackIndexRow lists, for one document, its (term, field) entries and the
// indexes of its stored fields.
type BackIndexRow struct {
	doc          []byte
	entries      []*BackIndexEntry
	storedFields []uint16
}

// Key returns 'b' followed by the document id bytes.
func (br *BackIndexRow) Key() []byte {
	key := make([]byte, 0, 1+len(br.doc))
	key = append(key, 'b')
	return append(key, br.doc...)
}

// Value encodes every entry as term | BYTE_SEPARATOR | field (uint16 LE),
// followed by every stored field as BYTE_SEPARATOR | field (uint16 LE).
func (br *BackIndexRow) Value() []byte {
	var (
		out   bytes.Buffer
		field [2]byte // scratch space reused for each little-endian field index
	)
	for _, entry := range br.entries {
		out.Write(entry.term)
		out.WriteByte(BYTE_SEPARATOR)
		binary.LittleEndian.PutUint16(field[:], entry.field)
		out.Write(field[:])
	}
	for _, stored := range br.storedFields {
		out.WriteByte(BYTE_SEPARATOR)
		binary.LittleEndian.PutUint16(field[:], stored)
		out.Write(field[:])
	}
	return out.Bytes()
}

// String returns a human-readable description of the row.
func (br *BackIndexRow) String() string {
	docID := string(br.doc)
	return fmt.Sprintf("Backindex DocId: `%s` Entries: %v, Stored Fields: %v", docID, br.entries, br.storedFields)
}

// NewBackIndexRow creates a BackIndexRow for doc with the given entries and
// stored field indexes.
func NewBackIndexRow(doc string, entries []*BackIndexEntry, storedFields []uint16) *BackIndexRow {
	row := new(BackIndexRow)
	row.doc = []byte(doc)
	row.entries = entries
	row.storedFields = storedFields
	return row
}
// NewBackIndexRowKV decodes a BackIndexRow from a stored key/value pair.
//
// Key layout:   type byte | doc id (must not contain BYTE_SEPARATOR)
// Value layout: a sequence of records, each either
//
//	term | BYTE_SEPARATOR | field (uint16 LE)   -- a back index entry
//	       BYTE_SEPARATOR | field (uint16 LE)   -- a stored field
//
// A record is treated as an entry when at least one term byte precedes the
// separator; a record with an empty term is indistinguishable from a stored
// field and is decoded as one.
//
// An error is returned when the doc id is empty or contains a separator, when
// the value is empty, or when a record's field index is truncated. Trailing
// value bytes that are not followed by a separator are ignored. The returned
// row is nil whenever the error is non-nil.
func NewBackIndexRowKV(key, value []byte) (*BackIndexRow, error) {
	rv := BackIndexRow{}

	buf := bytes.NewBuffer(key)
	buf.ReadByte() // type

	// The doc id runs to the end of the key, so the read is expected to stop
	// at EOF. A nil error means a separator was found inside the doc id; that
	// must be reported rather than returned as (nil, nil).
	doc, err := buf.ReadBytes(BYTE_SEPARATOR)
	if err == nil {
		return nil, fmt.Errorf("invalid back index key: unexpected separator in doc id")
	}
	if err != io.EOF {
		return nil, err
	}
	if len(doc) < 1 {
		return nil, fmt.Errorf("invalid doc length 0")
	}
	rv.doc = doc

	buf = bytes.NewBuffer(value)
	rv.entries = make([]*BackIndexEntry, 0)
	rv.storedFields = make([]uint16, 0)

	term, err := buf.ReadBytes(BYTE_SEPARATOR)
	if err == io.EOF && len(term) < 1 {
		return nil, fmt.Errorf("invalid term length 0")
	}
	// err == nil means term holds a separator-terminated chunk to decode.
	for err == nil {
		// term still includes the separator, so a length above 1 means at
		// least one real term byte is present (single-byte terms included).
		if len(term) > 1 {
			ent := BackIndexEntry{}
			ent.term = term[:len(term)-1] // trim off separator byte
			if rerr := binary.Read(buf, binary.LittleEndian, &ent.field); rerr != nil {
				return nil, rerr
			}
			rv.entries = append(rv.entries, &ent)
		} else {
			var sf uint16
			if rerr := binary.Read(buf, binary.LittleEndian, &sf); rerr != nil {
				return nil, rerr
			}
			rv.storedFields = append(rv.storedFields, sf)
		}
		term, err = buf.ReadBytes(BYTE_SEPARATOR)
	}
	if err != io.EOF {
		return nil, err
	}
	return &rv, nil
}
// STORED

// StoredRow holds the stored value of one field of one document, together
// with a one-byte type tag.
type StoredRow struct {
	doc   []byte
	field uint16
	typ   byte
	value []byte
}

// Key returns 's' | doc | BYTE_SEPARATOR | field (uint16 LE).
func (s *StoredRow) Key() []byte {
	var field [2]byte
	binary.LittleEndian.PutUint16(field[:], s.field)
	return bytes.Join([][]byte{s.ScanPrefixForDoc(), field[:]}, nil)
}

// Value returns the type byte followed by the stored value bytes, in a
// freshly allocated slice.
func (s *StoredRow) Value() []byte {
	out := make([]byte, 1, 1+len(s.value))
	out[0] = s.typ
	return append(out, s.value...)
}

// String returns a human-readable description of the row.
func (s *StoredRow) String() string {
	typ := string(s.typ)
	return fmt.Sprintf("Document: %s Field %d, Type: %s Value: %s", s.doc, s.field, typ, s.value)
}

// ScanPrefixForDoc returns 's' | doc | BYTE_SEPARATOR, the common prefix of
// all stored row keys for this row's document.
func (s *StoredRow) ScanPrefixForDoc() []byte {
	prefix := make([]byte, 0, len(s.doc)+2)
	prefix = append(prefix, 's')
	prefix = append(prefix, s.doc...)
	return append(prefix, BYTE_SEPARATOR)
}

// NewStoredRow creates a StoredRow for the given document, field index, type
// tag and value. value is referenced, not copied.
func NewStoredRow(doc string, field uint16, typ byte, value []byte) *StoredRow {
	row := new(StoredRow)
	row.doc = []byte(doc)
	row.field = field
	row.typ = typ
	row.value = value
	return row
}
// NewStoredRowKV decodes a StoredRow from a stored key/value pair.
//
// Key layout:   type byte | doc id | BYTE_SEPARATOR | field (uint16 LE)
// Value layout: type tag (1 byte) | stored value bytes
//
// An error is returned when the doc id is empty, when the separator after the
// doc id is missing, when the field index is truncated, or when value is
// empty and therefore has no type tag. The returned row is nil whenever the
// error is non-nil. The row's value aliases the supplied value slice.
func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
	rv := StoredRow{}

	buf := bytes.NewBuffer(key)
	buf.ReadByte() // type

	doc, err := buf.ReadBytes(BYTE_SEPARATOR)
	if len(doc) < 2 { // 1 for min doc id length, 1 for separator
		return nil, fmt.Errorf("invalid doc length 0")
	}
	if err != nil {
		// No separator was found, so the last byte of doc is doc id data and
		// must not be trimmed as if it were the separator.
		return nil, err
	}
	rv.doc = doc[:len(doc)-1] // trim off separator byte

	if err := binary.Read(buf, binary.LittleEndian, &rv.field); err != nil {
		return nil, err
	}

	// The value always starts with the type tag; an empty value is invalid
	// and would otherwise panic on the index below.
	if len(value) < 1 {
		return nil, fmt.Errorf("invalid stored row value: missing type byte")
	}
	rv.typ = value[0]
	rv.value = value[1:]
	return &rv, nil
}