0
0
bleve/index/upside_down/upside_down.go

603 lines
17 KiB
Go
Raw Normal View History

2014-04-17 22:55:53 +02:00
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
2014-04-17 22:55:53 +02:00
package upside_down
import (
"bytes"
"math"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
"code.google.com/p/goprotobuf/proto"
2014-04-17 22:55:53 +02:00
)
2014-09-04 00:17:26 +02:00
// VersionKey is the KV store key under which the index format version
// marker is persisted.
var VersionKey = []byte{'v'}

// Version is the index format version written by this package.
const Version uint8 = 1
2014-04-17 22:55:53 +02:00
// UpsideDownCouch is an index implementation backed by a key/value
// store. It maintains a mapping from field names to numeric field
// indexes and caches the current document count.
type UpsideDownCouch struct {
	version uint8
	path    string
	store   store.KVStore

	fieldIndexes   map[string]uint16 // field name -> assigned numeric index
	lastFieldIndex int               // highest field index assigned so far
	analyzer       map[string]*analysis.Analyzer
	docCount       uint64 // cached number of documents in the index
}
func NewUpsideDownCouch(s store.KVStore) *UpsideDownCouch {
2014-04-17 22:55:53 +02:00
return &UpsideDownCouch{
version: Version,
2014-04-17 22:55:53 +02:00
analyzer: make(map[string]*analysis.Analyzer),
fieldIndexes: make(map[string]uint16),
store: s,
2014-04-17 22:55:53 +02:00
}
}
// init writes the initial rows for a brand new index — currently just
// the version marker row — using the supplied writer.
func (udc *UpsideDownCouch) init(kvwriter store.KVWriter) (err error) {
	// prepare a list of rows
	rows := make([]UpsideDownCouchRow, 0)

	// version marker
	rows = append(rows, NewVersionRow(udc.version))

	return udc.batchRows(kvwriter, nil, rows, nil)
}
func (udc *UpsideDownCouch) loadSchema(kvreader store.KVReader) (err error) {
2014-04-17 22:55:53 +02:00
keyPrefix := []byte{'f'}
it := kvreader.Iterator(keyPrefix)
2014-04-17 22:55:53 +02:00
defer it.Close()
it.Seek(keyPrefix)
key, val, valid := it.Current()
for valid {
2014-04-17 22:55:53 +02:00
// stop when
if !bytes.HasPrefix(key, keyPrefix) {
2014-04-17 22:55:53 +02:00
break
}
fieldRow, err := NewFieldRowKV(key, val)
if err != nil {
return err
}
2014-04-17 22:55:53 +02:00
udc.fieldIndexes[fieldRow.name] = fieldRow.index
if int(fieldRow.index) > udc.lastFieldIndex {
udc.lastFieldIndex = int(fieldRow.index)
}
it.Next()
key, val, valid = it.Current()
2014-04-17 22:55:53 +02:00
}
return
}
func (udc *UpsideDownCouch) batchRows(writer store.KVWriter, addRows []UpsideDownCouchRow, updateRows []UpsideDownCouchRow, deleteRows []UpsideDownCouchRow) (err error) {
2014-04-17 22:55:53 +02:00
// prepare batch
wb := writer.NewBatch()
2014-04-17 22:55:53 +02:00
// add
for _, row := range addRows {
tfr, ok := row.(*TermFrequencyRow)
if ok {
// need to increment counter
summaryKey := tfr.SummaryKey()
wb.Merge(summaryKey, newTermSummaryIncr())
2014-04-17 22:55:53 +02:00
}
wb.Set(row.Key(), row.Value())
2014-04-17 22:55:53 +02:00
}
// update
for _, row := range updateRows {
wb.Set(row.Key(), row.Value())
2014-04-17 22:55:53 +02:00
}
// delete
for _, row := range deleteRows {
tfr, ok := row.(*TermFrequencyRow)
if ok {
// need to decrement counter
summaryKey := tfr.SummaryKey()
wb.Merge(summaryKey, newTermSummaryDecr())
2014-04-17 22:55:53 +02:00
}
wb.Delete(row.Key())
2014-04-17 22:55:53 +02:00
}
// write out the batch
err = wb.Execute()
if err != nil {
return
}
2014-04-17 22:55:53 +02:00
return
}
// DocCount returns the cached number of documents in the index.
func (udc *UpsideDownCouch) DocCount() uint64 {
	return udc.docCount
}
func (udc *UpsideDownCouch) Open() error {
// start a writer for the open process
kvwriter := udc.store.Writer()
defer kvwriter.Close()
value, err := kvwriter.Get(VersionKey)
2014-04-17 22:55:53 +02:00
if err != nil {
return err
2014-04-17 22:55:53 +02:00
}
// init new index OR load schema
if value == nil {
err = udc.init(kvwriter)
2014-04-17 22:55:53 +02:00
if err != nil {
return err
2014-04-17 22:55:53 +02:00
}
} else {
err = udc.loadSchema(kvwriter)
2014-04-17 22:55:53 +02:00
if err != nil {
return err
2014-04-17 22:55:53 +02:00
}
}
// set doc count
udc.docCount = udc.countDocs(kvwriter)
return nil
2014-04-17 22:55:53 +02:00
}
func (udc *UpsideDownCouch) countDocs(kvreader store.KVReader) uint64 {
it := kvreader.Iterator([]byte{'b'})
2014-04-17 22:55:53 +02:00
defer it.Close()
2014-09-04 00:17:26 +02:00
var rv uint64
key, _, valid := it.Current()
for valid {
if !bytes.HasPrefix(key, []byte{'b'}) {
2014-04-17 22:55:53 +02:00
break
}
2014-09-04 00:17:26 +02:00
rv++
it.Next()
key, _, valid = it.Current()
2014-04-17 22:55:53 +02:00
}
2014-04-17 22:55:53 +02:00
return rv
}
func (udc *UpsideDownCouch) rowCount() uint64 {
// start an isolated reader for use during the rowcount
kvreader := udc.store.Reader()
defer kvreader.Close()
it := kvreader.Iterator([]byte{0})
2014-04-17 22:55:53 +02:00
defer it.Close()
2014-09-04 00:17:26 +02:00
var rv uint64
_, _, valid := it.Current()
for valid {
2014-09-04 00:17:26 +02:00
rv++
it.Next()
_, _, valid = it.Current()
2014-04-17 22:55:53 +02:00
}
2014-04-17 22:55:53 +02:00
return rv
}
// Close closes the underlying key/value store.
func (udc *UpsideDownCouch) Close() {
	udc.store.Close()
}
func (udc *UpsideDownCouch) Update(doc *document.Document) error {
// start a writer for this update
kvwriter := udc.store.Writer()
defer kvwriter.Close()
2014-04-17 22:55:53 +02:00
// first we lookup the backindex row for the doc id if it exists
// lookup the back index row
backIndexRow, err := udc.backIndexRowForDoc(kvwriter, doc.ID)
2014-04-17 22:55:53 +02:00
if err != nil {
return err
}
// prepare a list of rows
addRows := make([]UpsideDownCouchRow, 0)
updateRows := make([]UpsideDownCouchRow, 0)
deleteRows := make([]UpsideDownCouchRow, 0)
addRows, updateRows, deleteRows = udc.updateSingle(doc, backIndexRow, addRows, updateRows, deleteRows)
err = udc.batchRows(kvwriter, addRows, updateRows, deleteRows)
if err == nil && backIndexRow == nil {
2014-09-04 00:17:26 +02:00
udc.docCount++
}
return err
}
// updateSingle computes the rows needed to index one document, using
// the previous version's back index row (may be nil for a new doc) to
// split work into adds, updates, and deletes. The extended slices are
// returned; nothing is written here.
func (udc *UpsideDownCouch) updateSingle(doc *document.Document, backIndexRow *BackIndexRow, addRows, updateRows, deleteRows []UpsideDownCouchRow) ([]UpsideDownCouchRow, []UpsideDownCouchRow, []UpsideDownCouchRow) {
	// collect the keys written for the previous version of the doc
	// NOTE(review): assumes AllTermKeys/AllStoredKeys are nil-receiver
	// safe when the doc is new — confirm in the BackIndexRow implementation
	existingTermKeys := make(map[string]bool)
	for _, key := range backIndexRow.AllTermKeys() {
		existingTermKeys[string(key)] = true
	}
	existingStoredKeys := make(map[string]bool)
	for _, key := range backIndexRow.AllStoredKeys() {
		existingStoredKeys[string(key)] = true
	}

	// track our back index entries
	backIndexTermEntries := make([]*BackIndexTermEntry, 0)
	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)

	for _, field := range doc.Fields {
		fieldIndex, newFieldRow := udc.fieldNameToFieldIndex(field.Name())
		if newFieldRow != nil {
			// first time this field name was seen; persist its index
			updateRows = append(updateRows, newFieldRow)
		}

		if field.Options().IsIndexed() {
			fieldLength, tokenFreqs := field.Analyze()

			// see if any of the composite fields need this
			for _, compositeField := range doc.CompositeFields {
				compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
			}

			// encode this field
			indexAddRows, indexUpdateRows, indexBackIndexTermEntries := udc.indexField(doc.ID, field, fieldIndex, fieldLength, tokenFreqs, existingTermKeys)
			addRows = append(addRows, indexAddRows...)
			updateRows = append(updateRows, indexUpdateRows...)
			backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
		}

		if field.Options().IsStored() {
			storeAddRows, storeUpdateRows, indexBackIndexStoreEntries := udc.storeField(doc.ID, field, fieldIndex, existingStoredKeys)
			addRows = append(addRows, storeAddRows...)
			updateRows = append(updateRows, storeUpdateRows...)
			backIndexStoredEntries = append(backIndexStoredEntries, indexBackIndexStoreEntries...)
		}
	}

	// now index the composite fields (composed above from regular fields)
	for _, compositeField := range doc.CompositeFields {
		fieldIndex, newFieldRow := udc.fieldNameToFieldIndex(compositeField.Name())
		if newFieldRow != nil {
			updateRows = append(updateRows, newFieldRow)
		}
		if compositeField.Options().IsIndexed() {
			fieldLength, tokenFreqs := compositeField.Analyze()
			// encode this field
			indexAddRows, indexUpdateRows, indexBackIndexTermEntries := udc.indexField(doc.ID, compositeField, fieldIndex, fieldLength, tokenFreqs, existingTermKeys)
			addRows = append(addRows, indexAddRows...)
			updateRows = append(updateRows, indexUpdateRows...)
			backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
		}
	}

	// build the back index row
	backIndexRow = NewBackIndexRow(doc.ID, backIndexTermEntries, backIndexStoredEntries)
	updateRows = append(updateRows, backIndexRow)

	// any of the existing rows that weren't updated need to be deleted
	// (indexField/storeField removed reused keys from these maps)
	for existingTermKey := range existingTermKeys {
		termFreqRow, err := NewTermFrequencyRowK([]byte(existingTermKey))
		if err == nil {
			deleteRows = append(deleteRows, termFreqRow)
		}
	}

	// any of the existing stored fields that weren't updated need to be deleted
	for existingStoredKey := range existingStoredKeys {
		storedRow, err := NewStoredRowK([]byte(existingStoredKey))
		if err == nil {
			deleteRows = append(deleteRows, storedRow)
		}
	}

	return addRows, updateRows, deleteRows
}
2014-09-04 01:53:59 +02:00
func (udc *UpsideDownCouch) storeField(docID string, field document.Field, fieldIndex uint16, existingKeys map[string]bool) ([]UpsideDownCouchRow, []UpsideDownCouchRow, []*BackIndexStoreEntry) {
2014-07-21 23:05:55 +02:00
updateRows := make([]UpsideDownCouchRow, 0)
addRows := make([]UpsideDownCouchRow, 0)
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
fieldType := encodeFieldType(field)
2014-09-04 01:53:59 +02:00
storedRow := NewStoredRow(docID, fieldIndex, field.ArrayPositions(), fieldType, field.Value())
// record the back index entry
backIndexStoredEntry := BackIndexStoreEntry{Field: proto.Uint32(uint32(fieldIndex)), ArrayPositions: field.ArrayPositions()}
backIndexStoredEntries = append(backIndexStoredEntries, &backIndexStoredEntry)
storedRowKey := string(storedRow.Key())
_, existed := existingKeys[storedRowKey]
if existed {
2014-07-21 23:05:55 +02:00
// this is an update
updateRows = append(updateRows, storedRow)
// this field was stored last time, delete it from that map
delete(existingKeys, storedRowKey)
2014-07-21 23:05:55 +02:00
} else {
addRows = append(addRows, storedRow)
}
return addRows, updateRows, backIndexStoredEntries
2014-07-21 23:05:55 +02:00
}
// encodeFieldType maps a concrete field implementation to its one-byte
// type tag; unknown implementations encode as 'x'.
func encodeFieldType(f document.Field) byte {
	switch f.(type) {
	case *document.TextField:
		return 't'
	case *document.NumericField:
		return 'n'
	case *document.DateTimeField:
		return 'd'
	case *document.CompositeField:
		return 'c'
	}
	return 'x'
}
2014-09-04 01:53:59 +02:00
// indexField builds term frequency rows for one analyzed field,
// splitting them into adds (new term keys) and updates (keys present
// in existingKeys, which are consumed), and records a back index term
// entry for every token.
func (udc *UpsideDownCouch) indexField(docID string, field document.Field, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, existingKeys map[string]bool) ([]UpsideDownCouchRow, []UpsideDownCouchRow, []*BackIndexTermEntry) {
	updateRows := make([]UpsideDownCouchRow, 0)
	addRows := make([]UpsideDownCouchRow, 0)
	backIndexTermEntries := make([]*BackIndexTermEntry, 0)
	// length normalization factor
	// NOTE(review): +Inf when fieldLength is 0; presumably tokenFreqs is
	// then empty so the loop never runs — confirm with Analyze()
	fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))

	for _, tf := range tokenFreqs {
		var termFreqRow *TermFrequencyRow
		if field.Options().IncludeTermVectors() {
			// term vector locations may reference other fields, which can
			// mint new field rows that also need to be persisted
			tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
			updateRows = append(updateRows, newFieldRows...)
			termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
		} else {
			termFreqRow = NewTermFrequencyRow(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm)
		}

		// record the back index entry
		backIndexTermEntry := BackIndexTermEntry{Term: proto.String(string(tf.Term)), Field: proto.Uint32(uint32(fieldIndex))}
		backIndexTermEntries = append(backIndexTermEntries, &backIndexTermEntry)

		tfrKeyString := string(termFreqRow.Key())
		_, existed := existingKeys[tfrKeyString]
		if existed {
			// this is an update
			updateRows = append(updateRows, termFreqRow)
			// this term existed last time, delete it from that map
			delete(existingKeys, tfrKeyString)
		} else {
			// this is an add
			addRows = append(addRows, termFreqRow)
		}
	}
	return addRows, updateRows, backIndexTermEntries
}
// fieldNameToFieldIndex resolves a field name to its numeric index.
// The first time a name is seen it is assigned the next index and a
// new FieldRow is returned for the caller to persist; otherwise the
// FieldRow result is nil.
func (udc *UpsideDownCouch) fieldNameToFieldIndex(fieldName string) (uint16, *FieldRow) {
	if existing, ok := udc.fieldIndexes[fieldName]; ok {
		return existing, nil
	}
	// assign the next field id and remember it
	assigned := uint16(udc.lastFieldIndex + 1)
	udc.fieldIndexes[fieldName] = assigned
	udc.lastFieldIndex = int(assigned)
	// ensure this batch adds a row for this field
	return assigned, NewFieldRow(assigned, fieldName)
}
2014-04-17 22:55:53 +02:00
func (udc *UpsideDownCouch) Delete(id string) error {
// start a writer for this delete
kvwriter := udc.store.Writer()
defer kvwriter.Close()
2014-04-17 22:55:53 +02:00
// lookup the back index row
backIndexRow, err := udc.backIndexRowForDoc(kvwriter, id)
2014-04-17 22:55:53 +02:00
if err != nil {
return err
}
if backIndexRow == nil {
return nil
}
deleteRows := make([]UpsideDownCouchRow, 0)
deleteRows = udc.deleteSingle(id, backIndexRow, deleteRows)
err = udc.batchRows(kvwriter, nil, nil, deleteRows)
if err == nil {
2014-09-04 00:17:26 +02:00
udc.docCount--
}
return err
}
func (udc *UpsideDownCouch) deleteSingle(id string, backIndexRow *BackIndexRow, deleteRows []UpsideDownCouchRow) []UpsideDownCouchRow {
for _, backIndexEntry := range backIndexRow.termEntries {
tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), id, 0, 0)
deleteRows = append(deleteRows, tfr)
2014-04-17 22:55:53 +02:00
}
for _, se := range backIndexRow.storedEntries {
sf := NewStoredRow(id, uint16(*se.Field), se.ArrayPositions, 'x', nil)
deleteRows = append(deleteRows, sf)
}
2014-04-17 22:55:53 +02:00
// also delete the back entry itself
deleteRows = append(deleteRows, backIndexRow)
return deleteRows
2014-04-17 22:55:53 +02:00
}
func (udc *UpsideDownCouch) backIndexRowForDoc(kvreader store.KVReader, docID string) (*BackIndexRow, error) {
2014-04-17 22:55:53 +02:00
// use a temporary row structure to build key
tempRow := &BackIndexRow{
2014-09-04 01:53:59 +02:00
doc: []byte(docID),
2014-04-17 22:55:53 +02:00
}
key := tempRow.Key()
value, err := kvreader.Get(key)
2014-04-17 22:55:53 +02:00
if err != nil {
return nil, err
}
if value == nil {
return nil, nil
}
backIndexRow, err := NewBackIndexRowKV(key, value)
if err != nil {
return nil, err
}
2014-04-17 22:55:53 +02:00
return backIndexRow, nil
}
func (udc *UpsideDownCouch) backIndexRowsForBatch(kvreader store.KVReader, batch index.Batch) (map[string]*BackIndexRow, error) {
// FIXME faster to order the ids and scan sequentially
// for now just get it working
rv := make(map[string]*BackIndexRow, 0)
2014-09-04 01:53:59 +02:00
for docID := range batch {
backIndexRow, err := udc.backIndexRowForDoc(kvreader, docID)
if err != nil {
return nil, err
}
2014-09-04 01:53:59 +02:00
rv[docID] = backIndexRow
}
return rv, nil
}
// decodeFieldType reconstructs a document.Field from the one-byte type
// tag produced by encodeFieldType; unknown tags yield nil.
func decodeFieldType(typ byte, name string, value []byte) document.Field {
	noArrayPositions := []uint64{}
	switch typ {
	case 't':
		return document.NewTextField(name, noArrayPositions, value)
	case 'n':
		return document.NewNumericFieldFromBytes(name, noArrayPositions, value)
	case 'd':
		return document.NewDateTimeFieldFromBytes(name, noArrayPositions, value)
	}
	return nil
}
2014-04-17 22:55:53 +02:00
// frequencyFromTokenFreq returns the term frequency, i.e. the number
// of locations at which the token occurred.
func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
	return len(tf.Locations)
}
2014-07-21 23:05:55 +02:00
// termVectorsFromTokenFreq converts a token's locations into
// TermVector values. A location naming a different field triggers a
// field-index lookup, which may mint new FieldRows that the caller
// must persist.
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
	rv := make([]*TermVector, len(tf.Locations))
	newFieldRows := make([]UpsideDownCouchRow, 0)

	for i, l := range tf.Locations {
		var newFieldRow *FieldRow
		// default to the field being indexed
		fieldIndex := field
		if l.Field != "" {
			// lookup correct field
			fieldIndex, newFieldRow = udc.fieldNameToFieldIndex(l.Field)
			if newFieldRow != nil {
				newFieldRows = append(newFieldRows, newFieldRow)
			}
		}
		tv := TermVector{
			field: fieldIndex,
			pos:   uint64(l.Position),
			start: uint64(l.Start),
			end:   uint64(l.End),
		}
		rv[i] = &tv
	}

	return rv, newFieldRows
}
// termFieldVectorsFromTermVectors converts internal term vectors into
// the public index.TermFieldVector form, resolving numeric field
// indexes back to field names.
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
	rv := make([]*index.TermFieldVector, len(in))
	for i, tv := range in {
		rv[i] = &index.TermFieldVector{
			Field: udc.fieldIndexToName(tv.field),
			Pos:   tv.pos,
			Start: tv.start,
			End:   tv.end,
		}
	}
	return rv
}
// fieldIndexToName performs a reverse lookup in the field index map,
// returning "" when the index is unknown.
func (udc *UpsideDownCouch) fieldIndexToName(i uint16) string {
	for name, idx := range udc.fieldIndexes {
		if idx == i {
			return name
		}
	}
	return ""
}
// Batch applies a mixed set of updates (non-nil doc) and deletes (nil
// doc) in one KV batch, adjusting the cached doc count by the number
// of documents added and removed on success.
func (udc *UpsideDownCouch) Batch(batch index.Batch) error {
	// start a writer for this batch
	kvwriter := udc.store.Writer()
	defer kvwriter.Close()

	// first lookup all the back index rows
	backIndexRows, err := udc.backIndexRowsForBatch(kvwriter, batch)
	if err != nil {
		return err
	}

	// prepare a list of rows
	addRows := make([]UpsideDownCouchRow, 0)
	updateRows := make([]UpsideDownCouchRow, 0)
	deleteRows := make([]UpsideDownCouchRow, 0)

	docsAdded := uint64(0)
	docsDeleted := uint64(0)
	for docID, doc := range batch {
		backIndexRow := backIndexRows[docID]
		if doc == nil && backIndexRow != nil {
			// delete: nil doc with a previous version on disk
			deleteRows = udc.deleteSingle(docID, backIndexRow, deleteRows)
			docsDeleted++
		} else if doc != nil {
			addRows, updateRows, deleteRows = udc.updateSingle(doc, backIndexRow, addRows, updateRows, deleteRows)
			if backIndexRow == nil {
				// no previous version: this is a new document
				docsAdded++
			}
		}
	}

	err = udc.batchRows(kvwriter, addRows, updateRows, deleteRows)
	if err == nil {
		udc.docCount += docsAdded
		udc.docCount -= docsDeleted
	}
	return err
}
// SetInternal stores an application-provided key/value pair in the
// internal row namespace.
func (udc *UpsideDownCouch) SetInternal(key, val []byte) error {
	internalRow := NewInternalRow(key, val)
	writer := udc.store.Writer()
	defer writer.Close()
	return writer.Set(internalRow.Key(), internalRow.Value())
}
// DeleteInternal removes an application-provided key from the internal
// row namespace.
func (udc *UpsideDownCouch) DeleteInternal(key []byte) error {
	internalRow := NewInternalRow(key, nil)
	writer := udc.store.Writer()
	defer writer.Close()
	return writer.Delete(internalRow.Key())
}
// Reader returns an IndexReader over the current state of the index,
// backed by a fresh store reader and capturing the doc count at the
// time of creation.
func (udc *UpsideDownCouch) Reader() index.IndexReader {
	return &IndexReader{
		index:    udc,
		kvreader: udc.store.Reader(),
		docCount: udc.docCount,
	}
}