2014-04-17 22:55:53 +02:00
|
|
|
// Copyright (c) 2014 Couchbase, Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
|
|
|
// except in compliance with the License. You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
|
|
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
|
|
|
// either express or implied. See the License for the specific language governing permissions
|
|
|
|
// and limitations under the License.
|
|
|
|
package upside_down
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"fmt"
|
|
|
|
"math"
|
|
|
|
|
2014-08-28 21:38:57 +02:00
|
|
|
"github.com/blevesearch/bleve/analysis"
|
|
|
|
"github.com/blevesearch/bleve/document"
|
|
|
|
"github.com/blevesearch/bleve/index"
|
|
|
|
"github.com/blevesearch/bleve/index/store"
|
2014-08-19 14:58:26 +02:00
|
|
|
|
|
|
|
"code.google.com/p/goprotobuf/proto"
|
2014-04-17 22:55:53 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
// VERSION_KEY is the key looked up in the store by Open to decide whether
// the index has already been initialized.
var VERSION_KEY = []byte{'v'}

// VERSION is the index format version assigned to every UpsideDownCouch
// created by NewUpsideDownCouch.
const VERSION uint8 = 1
|
|
|
|
|
|
|
|
type UpsideDownCouch struct {
|
|
|
|
version uint8
|
|
|
|
path string
|
2014-05-09 22:37:04 +02:00
|
|
|
store store.KVStore
|
2014-04-17 22:55:53 +02:00
|
|
|
fieldIndexes map[string]uint16
|
|
|
|
lastFieldIndex int
|
|
|
|
analyzer map[string]*analysis.Analyzer
|
|
|
|
docCount uint64
|
|
|
|
}
|
|
|
|
|
2014-05-09 22:37:04 +02:00
|
|
|
func NewUpsideDownCouch(s store.KVStore) *UpsideDownCouch {
|
2014-04-17 22:55:53 +02:00
|
|
|
return &UpsideDownCouch{
|
|
|
|
version: VERSION,
|
|
|
|
analyzer: make(map[string]*analysis.Analyzer),
|
|
|
|
fieldIndexes: make(map[string]uint16),
|
2014-05-09 22:37:04 +02:00
|
|
|
store: s,
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) init() (err error) {
|
|
|
|
// prepare a list of rows
|
|
|
|
rows := make([]UpsideDownCouchRow, 0)
|
|
|
|
|
|
|
|
// version marker
|
|
|
|
rows = append(rows, NewVersionRow(udc.version))
|
|
|
|
|
|
|
|
return udc.batchRows(nil, rows, nil)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) loadSchema() (err error) {
|
|
|
|
|
2014-05-09 22:37:04 +02:00
|
|
|
keyPrefix := []byte{'f'}
|
|
|
|
it := udc.store.Iterator(keyPrefix)
|
2014-04-17 22:55:53 +02:00
|
|
|
defer it.Close()
|
|
|
|
|
|
|
|
it.Seek(keyPrefix)
|
2014-05-09 22:37:04 +02:00
|
|
|
key, val, valid := it.Current()
|
|
|
|
for valid {
|
|
|
|
|
2014-04-17 22:55:53 +02:00
|
|
|
// stop when
|
2014-05-09 22:37:04 +02:00
|
|
|
if !bytes.HasPrefix(key, keyPrefix) {
|
2014-04-17 22:55:53 +02:00
|
|
|
break
|
|
|
|
}
|
2014-05-09 22:37:04 +02:00
|
|
|
fieldRow, err := NewFieldRowKV(key, val)
|
2014-04-19 03:07:41 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-04-17 22:55:53 +02:00
|
|
|
udc.fieldIndexes[fieldRow.name] = fieldRow.index
|
|
|
|
if int(fieldRow.index) > udc.lastFieldIndex {
|
|
|
|
udc.lastFieldIndex = int(fieldRow.index)
|
|
|
|
}
|
2014-05-09 22:37:04 +02:00
|
|
|
|
|
|
|
it.Next()
|
|
|
|
key, val, valid = it.Current()
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) batchRows(addRows []UpsideDownCouchRow, updateRows []UpsideDownCouchRow, deleteRows []UpsideDownCouchRow) (err error) {
|
2014-08-11 22:27:18 +02:00
|
|
|
|
2014-04-17 22:55:53 +02:00
|
|
|
// prepare batch
|
2014-05-09 22:37:04 +02:00
|
|
|
wb := udc.store.NewBatch()
|
2014-04-17 22:55:53 +02:00
|
|
|
|
|
|
|
// add
|
|
|
|
for _, row := range addRows {
|
|
|
|
tfr, ok := row.(*TermFrequencyRow)
|
|
|
|
if ok {
|
|
|
|
// need to increment counter
|
|
|
|
tr := NewTermFrequencyRow(tfr.term, tfr.field, "", 0, 0)
|
2014-05-09 22:37:04 +02:00
|
|
|
val, err := udc.store.Get(tr.Key())
|
2014-04-17 22:55:53 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if val != nil {
|
2014-04-19 03:07:41 +02:00
|
|
|
tr, err = NewTermFrequencyRowKV(tr.Key(), val)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-04-17 22:55:53 +02:00
|
|
|
tr.freq += 1 // incr
|
|
|
|
} else {
|
|
|
|
tr = NewTermFrequencyRow(tfr.term, tfr.field, "", 1, 0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// now add this to the batch
|
2014-05-09 22:37:04 +02:00
|
|
|
wb.Set(tr.Key(), tr.Value())
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
2014-05-09 22:37:04 +02:00
|
|
|
wb.Set(row.Key(), row.Value())
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// update
|
|
|
|
for _, row := range updateRows {
|
2014-05-09 22:37:04 +02:00
|
|
|
wb.Set(row.Key(), row.Value())
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// delete
|
|
|
|
for _, row := range deleteRows {
|
|
|
|
tfr, ok := row.(*TermFrequencyRow)
|
|
|
|
if ok {
|
|
|
|
// need to decrement counter
|
|
|
|
tr := NewTermFrequencyRow(tfr.term, tfr.field, "", 0, 0)
|
2014-05-09 22:37:04 +02:00
|
|
|
val, err := udc.store.Get(tr.Key())
|
2014-04-17 22:55:53 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if val != nil {
|
2014-04-19 03:07:41 +02:00
|
|
|
tr, err = NewTermFrequencyRowKV(tr.Key(), val)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-04-17 22:55:53 +02:00
|
|
|
tr.freq -= 1 // incr
|
|
|
|
} else {
|
2014-04-22 19:57:13 +02:00
|
|
|
return fmt.Errorf("unexpected missing row, deleting term, expected count row to exist: %v", tr.Key())
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if tr.freq == 0 {
|
2014-04-18 22:09:34 +02:00
|
|
|
wb.Delete(tr.Key())
|
2014-04-17 22:55:53 +02:00
|
|
|
} else {
|
|
|
|
// now add this to the batch
|
2014-05-09 22:37:04 +02:00
|
|
|
wb.Set(tr.Key(), tr.Value())
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
2014-04-18 22:09:34 +02:00
|
|
|
wb.Delete(row.Key())
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// write out the batch
|
2014-05-09 22:37:04 +02:00
|
|
|
err = wb.Execute()
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
err = udc.store.Commit()
|
2014-04-17 22:55:53 +02:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) DocCount() uint64 {
|
|
|
|
return udc.docCount
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) Open() (err error) {
|
|
|
|
var value []byte
|
2014-05-09 22:37:04 +02:00
|
|
|
value, err = udc.store.Get(VERSION_KEY)
|
2014-04-17 22:55:53 +02:00
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// init new index OR load schema
|
|
|
|
if value == nil {
|
|
|
|
err = udc.init()
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
err = udc.loadSchema()
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// set doc count
|
|
|
|
udc.docCount = udc.countDocs()
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) countDocs() uint64 {
|
2014-05-09 22:37:04 +02:00
|
|
|
it := udc.store.Iterator([]byte{'b'})
|
2014-04-17 22:55:53 +02:00
|
|
|
defer it.Close()
|
|
|
|
|
|
|
|
var rv uint64 = 0
|
2014-05-09 22:37:04 +02:00
|
|
|
key, _, valid := it.Current()
|
|
|
|
for valid {
|
|
|
|
if !bytes.HasPrefix(key, []byte{'b'}) {
|
2014-04-17 22:55:53 +02:00
|
|
|
break
|
|
|
|
}
|
|
|
|
rv += 1
|
2014-05-09 22:37:04 +02:00
|
|
|
it.Next()
|
|
|
|
key, _, valid = it.Current()
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
2014-05-09 22:37:04 +02:00
|
|
|
|
2014-04-17 22:55:53 +02:00
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) rowCount() uint64 {
|
2014-05-09 22:37:04 +02:00
|
|
|
it := udc.store.Iterator([]byte{0})
|
2014-04-17 22:55:53 +02:00
|
|
|
defer it.Close()
|
|
|
|
|
|
|
|
var rv uint64 = 0
|
2014-05-09 22:37:04 +02:00
|
|
|
_, _, valid := it.Current()
|
|
|
|
for valid {
|
2014-04-17 22:55:53 +02:00
|
|
|
rv += 1
|
2014-05-09 22:37:04 +02:00
|
|
|
it.Next()
|
|
|
|
_, _, valid = it.Current()
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
2014-05-09 22:37:04 +02:00
|
|
|
|
2014-04-17 22:55:53 +02:00
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) Close() {
|
2014-05-09 22:37:04 +02:00
|
|
|
udc.store.Close()
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) Update(doc *document.Document) error {
|
|
|
|
// first we lookup the backindex row for the doc id if it exists
|
|
|
|
// lookup the back index row
|
|
|
|
backIndexRow, err := udc.backIndexRowForDoc(doc.ID)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2014-08-11 22:27:18 +02:00
|
|
|
// prepare a list of rows
|
|
|
|
addRows := make([]UpsideDownCouchRow, 0)
|
|
|
|
updateRows := make([]UpsideDownCouchRow, 0)
|
|
|
|
deleteRows := make([]UpsideDownCouchRow, 0)
|
|
|
|
|
|
|
|
addRows, updateRows, deleteRows = udc.updateSingle(doc, backIndexRow, addRows, updateRows, deleteRows)
|
|
|
|
|
|
|
|
err = udc.batchRows(addRows, updateRows, deleteRows)
|
|
|
|
if err == nil && backIndexRow == nil {
|
|
|
|
udc.docCount += 1
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) updateSingle(doc *document.Document, backIndexRow *BackIndexRow, addRows, updateRows, deleteRows []UpsideDownCouchRow) ([]UpsideDownCouchRow, []UpsideDownCouchRow, []UpsideDownCouchRow) {
|
|
|
|
|
2014-08-19 14:58:26 +02:00
|
|
|
existingTermKeys := make(map[string]bool)
|
|
|
|
for _, key := range backIndexRow.AllTermKeys() {
|
|
|
|
existingTermKeys[string(key)] = true
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
2014-08-19 14:58:26 +02:00
|
|
|
|
|
|
|
existingStoredKeys := make(map[string]bool)
|
|
|
|
for _, key := range backIndexRow.AllStoredKeys() {
|
|
|
|
existingStoredKeys[string(key)] = true
|
2014-06-26 17:43:13 +02:00
|
|
|
}
|
2014-04-17 22:55:53 +02:00
|
|
|
|
|
|
|
// track our back index entries
|
2014-08-19 14:58:26 +02:00
|
|
|
backIndexTermEntries := make([]*BackIndexTermEntry, 0)
|
|
|
|
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
|
2014-04-17 22:55:53 +02:00
|
|
|
|
|
|
|
for _, field := range doc.Fields {
|
2014-07-21 23:05:55 +02:00
|
|
|
fieldIndex, newFieldRow := udc.fieldNameToFieldIndex(field.Name())
|
|
|
|
if newFieldRow != nil {
|
|
|
|
updateRows = append(updateRows, newFieldRow)
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
2014-07-14 20:47:05 +02:00
|
|
|
if field.Options().IsIndexed() {
|
2014-06-26 17:43:13 +02:00
|
|
|
|
2014-07-21 23:05:55 +02:00
|
|
|
fieldLength, tokenFreqs := field.Analyze()
|
2014-04-17 22:55:53 +02:00
|
|
|
|
2014-07-21 23:05:55 +02:00
|
|
|
// see if any of the composite fields need this
|
|
|
|
for _, compositeField := range doc.CompositeFields {
|
|
|
|
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
|
2014-06-26 17:43:13 +02:00
|
|
|
}
|
2014-07-21 23:05:55 +02:00
|
|
|
|
|
|
|
// encode this field
|
2014-08-19 14:58:26 +02:00
|
|
|
indexAddRows, indexUpdateRows, indexBackIndexTermEntries := udc.indexField(doc.ID, field, fieldIndex, fieldLength, tokenFreqs, existingTermKeys)
|
2014-07-21 23:05:55 +02:00
|
|
|
addRows = append(addRows, indexAddRows...)
|
|
|
|
updateRows = append(updateRows, indexUpdateRows...)
|
2014-08-19 14:58:26 +02:00
|
|
|
backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
|
2014-06-26 17:43:13 +02:00
|
|
|
}
|
|
|
|
|
2014-07-14 20:47:05 +02:00
|
|
|
if field.Options().IsStored() {
|
2014-08-19 14:58:26 +02:00
|
|
|
storeAddRows, storeUpdateRows, indexBackIndexStoreEntries := udc.storeField(doc.ID, field, fieldIndex, existingStoredKeys)
|
2014-07-21 23:05:55 +02:00
|
|
|
addRows = append(addRows, storeAddRows...)
|
|
|
|
updateRows = append(updateRows, storeUpdateRows...)
|
2014-08-19 14:58:26 +02:00
|
|
|
backIndexStoredEntries = append(backIndexStoredEntries, indexBackIndexStoreEntries...)
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2014-07-21 23:05:55 +02:00
|
|
|
// now index the composite fields
|
|
|
|
for _, compositeField := range doc.CompositeFields {
|
|
|
|
fieldIndex, newFieldRow := udc.fieldNameToFieldIndex(compositeField.Name())
|
|
|
|
if newFieldRow != nil {
|
|
|
|
updateRows = append(updateRows, newFieldRow)
|
|
|
|
}
|
|
|
|
if compositeField.Options().IsIndexed() {
|
|
|
|
|
|
|
|
fieldLength, tokenFreqs := compositeField.Analyze()
|
|
|
|
// encode this field
|
2014-08-19 14:58:26 +02:00
|
|
|
indexAddRows, indexUpdateRows, indexBackIndexTermEntries := udc.indexField(doc.ID, compositeField, fieldIndex, fieldLength, tokenFreqs, existingTermKeys)
|
2014-07-21 23:05:55 +02:00
|
|
|
addRows = append(addRows, indexAddRows...)
|
|
|
|
updateRows = append(updateRows, indexUpdateRows...)
|
2014-08-19 14:58:26 +02:00
|
|
|
backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
|
2014-07-21 23:05:55 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-17 22:55:53 +02:00
|
|
|
// build the back index row
|
2014-08-19 14:58:26 +02:00
|
|
|
backIndexRow = NewBackIndexRow(doc.ID, backIndexTermEntries, backIndexStoredEntries)
|
2014-04-17 22:55:53 +02:00
|
|
|
updateRows = append(updateRows, backIndexRow)
|
|
|
|
|
|
|
|
// any of the existing rows that weren't updated need to be deleted
|
2014-08-19 14:58:26 +02:00
|
|
|
for existingTermKey, _ := range existingTermKeys {
|
|
|
|
termFreqRow, err := NewTermFrequencyRowK([]byte(existingTermKey))
|
|
|
|
if err == nil {
|
|
|
|
deleteRows = append(deleteRows, termFreqRow)
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
}
|
2014-08-19 14:58:26 +02:00
|
|
|
|
2014-06-26 17:43:13 +02:00
|
|
|
// any of the existing stored fields that weren't updated need to be deleted
|
2014-08-19 14:58:26 +02:00
|
|
|
for existingStoredKey, _ := range existingStoredKeys {
|
|
|
|
storedRow, err := NewStoredRowK([]byte(existingStoredKey))
|
|
|
|
if err == nil {
|
|
|
|
deleteRows = append(deleteRows, storedRow)
|
|
|
|
}
|
2014-06-26 17:43:13 +02:00
|
|
|
}
|
2014-04-17 22:55:53 +02:00
|
|
|
|
2014-08-11 22:27:18 +02:00
|
|
|
return addRows, updateRows, deleteRows
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
2014-08-19 14:58:26 +02:00
|
|
|
func (udc *UpsideDownCouch) storeField(docId string, field document.Field, fieldIndex uint16, existingKeys map[string]bool) ([]UpsideDownCouchRow, []UpsideDownCouchRow, []*BackIndexStoreEntry) {
|
2014-07-21 23:05:55 +02:00
|
|
|
updateRows := make([]UpsideDownCouchRow, 0)
|
|
|
|
addRows := make([]UpsideDownCouchRow, 0)
|
2014-08-19 14:58:26 +02:00
|
|
|
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
|
2014-08-06 19:52:20 +02:00
|
|
|
fieldType := encodeFieldType(field)
|
2014-08-19 14:58:26 +02:00
|
|
|
storedRow := NewStoredRow(docId, fieldIndex, field.ArrayPositions(), fieldType, field.Value())
|
|
|
|
|
|
|
|
// record the back index entry
|
|
|
|
backIndexStoredEntry := BackIndexStoreEntry{Field: proto.Uint32(uint32(fieldIndex)), ArrayPositions: field.ArrayPositions()}
|
|
|
|
backIndexStoredEntries = append(backIndexStoredEntries, &backIndexStoredEntry)
|
|
|
|
|
|
|
|
storedRowKey := string(storedRow.Key())
|
|
|
|
_, existed := existingKeys[storedRowKey]
|
|
|
|
if existed {
|
2014-07-21 23:05:55 +02:00
|
|
|
// this is an update
|
|
|
|
updateRows = append(updateRows, storedRow)
|
|
|
|
// this field was stored last time, delete it from that map
|
2014-08-19 14:58:26 +02:00
|
|
|
delete(existingKeys, storedRowKey)
|
2014-07-21 23:05:55 +02:00
|
|
|
} else {
|
|
|
|
addRows = append(addRows, storedRow)
|
|
|
|
}
|
2014-08-19 14:58:26 +02:00
|
|
|
return addRows, updateRows, backIndexStoredEntries
|
2014-07-21 23:05:55 +02:00
|
|
|
}
|
|
|
|
|
2014-08-06 19:52:20 +02:00
|
|
|
func encodeFieldType(f document.Field) byte {
|
|
|
|
fieldType := byte('x')
|
|
|
|
switch f.(type) {
|
|
|
|
case *document.TextField:
|
|
|
|
fieldType = 't'
|
|
|
|
case *document.NumericField:
|
|
|
|
fieldType = 'n'
|
|
|
|
case *document.DateTimeField:
|
|
|
|
fieldType = 'd'
|
|
|
|
case *document.CompositeField:
|
|
|
|
fieldType = 'c'
|
|
|
|
}
|
|
|
|
return fieldType
|
|
|
|
}
|
|
|
|
|
2014-08-19 14:58:26 +02:00
|
|
|
func (udc *UpsideDownCouch) indexField(docId string, field document.Field, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, existingKeys map[string]bool) ([]UpsideDownCouchRow, []UpsideDownCouchRow, []*BackIndexTermEntry) {
|
2014-07-21 23:05:55 +02:00
|
|
|
|
|
|
|
updateRows := make([]UpsideDownCouchRow, 0)
|
|
|
|
addRows := make([]UpsideDownCouchRow, 0)
|
2014-08-19 14:58:26 +02:00
|
|
|
backIndexTermEntries := make([]*BackIndexTermEntry, 0)
|
2014-07-21 23:05:55 +02:00
|
|
|
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
|
|
|
|
|
|
|
|
for _, tf := range tokenFreqs {
|
|
|
|
var termFreqRow *TermFrequencyRow
|
|
|
|
if field.Options().IncludeTermVectors() {
|
|
|
|
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
|
|
|
|
updateRows = append(updateRows, newFieldRows...)
|
|
|
|
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docId, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
|
|
|
|
} else {
|
|
|
|
termFreqRow = NewTermFrequencyRow(tf.Term, fieldIndex, docId, uint64(frequencyFromTokenFreq(tf)), fieldNorm)
|
|
|
|
}
|
|
|
|
|
|
|
|
// record the back index entry
|
2014-08-19 14:58:26 +02:00
|
|
|
backIndexTermEntry := BackIndexTermEntry{Term: proto.String(string(tf.Term)), Field: proto.Uint32(uint32(fieldIndex))}
|
|
|
|
backIndexTermEntries = append(backIndexTermEntries, &backIndexTermEntry)
|
|
|
|
|
|
|
|
tfrKeyString := string(termFreqRow.Key())
|
|
|
|
_, existed := existingKeys[tfrKeyString]
|
|
|
|
if existed {
|
|
|
|
// this is an update
|
|
|
|
updateRows = append(updateRows, termFreqRow)
|
|
|
|
// this term existed last time, delete it from that map
|
|
|
|
delete(existingKeys, tfrKeyString)
|
2014-07-21 23:05:55 +02:00
|
|
|
} else {
|
|
|
|
// this is an add
|
|
|
|
addRows = append(addRows, termFreqRow)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-08-19 14:58:26 +02:00
|
|
|
return addRows, updateRows, backIndexTermEntries
|
2014-07-21 23:05:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) fieldNameToFieldIndex(fieldName string) (uint16, *FieldRow) {
|
|
|
|
var fieldRow *FieldRow
|
|
|
|
fieldIndex, fieldExists := udc.fieldIndexes[fieldName]
|
|
|
|
if !fieldExists {
|
|
|
|
// assign next field id
|
|
|
|
fieldIndex = uint16(udc.lastFieldIndex + 1)
|
|
|
|
udc.fieldIndexes[fieldName] = fieldIndex
|
|
|
|
// ensure this batch adds a row for this field
|
|
|
|
fieldRow = NewFieldRow(uint16(fieldIndex), fieldName)
|
|
|
|
udc.lastFieldIndex = int(fieldIndex)
|
|
|
|
}
|
|
|
|
return fieldIndex, fieldRow
|
|
|
|
}
|
|
|
|
|
2014-04-17 22:55:53 +02:00
|
|
|
func (udc *UpsideDownCouch) Delete(id string) error {
|
|
|
|
// lookup the back index row
|
|
|
|
backIndexRow, err := udc.backIndexRowForDoc(id)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if backIndexRow == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-08-11 22:27:18 +02:00
|
|
|
deleteRows := make([]UpsideDownCouchRow, 0)
|
|
|
|
deleteRows = udc.deleteSingle(id, backIndexRow, deleteRows)
|
|
|
|
|
|
|
|
err = udc.batchRows(nil, nil, deleteRows)
|
|
|
|
if err == nil {
|
|
|
|
udc.docCount -= 1
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) deleteSingle(id string, backIndexRow *BackIndexRow, deleteRows []UpsideDownCouchRow) []UpsideDownCouchRow {
|
|
|
|
|
2014-08-19 14:58:26 +02:00
|
|
|
for _, backIndexEntry := range backIndexRow.termEntries {
|
|
|
|
tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), id, 0, 0)
|
2014-08-11 22:27:18 +02:00
|
|
|
deleteRows = append(deleteRows, tfr)
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
2014-08-19 14:58:26 +02:00
|
|
|
for _, se := range backIndexRow.storedEntries {
|
|
|
|
sf := NewStoredRow(id, uint16(*se.Field), se.ArrayPositions, 'x', nil)
|
2014-08-11 22:27:18 +02:00
|
|
|
deleteRows = append(deleteRows, sf)
|
2014-06-26 17:43:13 +02:00
|
|
|
}
|
2014-04-17 22:55:53 +02:00
|
|
|
|
|
|
|
// also delete the back entry itself
|
2014-08-11 22:27:18 +02:00
|
|
|
deleteRows = append(deleteRows, backIndexRow)
|
|
|
|
return deleteRows
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) backIndexRowForDoc(docId string) (*BackIndexRow, error) {
|
|
|
|
// use a temporary row structure to build key
|
|
|
|
tempRow := &BackIndexRow{
|
|
|
|
doc: []byte(docId),
|
|
|
|
}
|
2014-04-18 22:09:34 +02:00
|
|
|
key := tempRow.Key()
|
2014-05-09 22:37:04 +02:00
|
|
|
value, err := udc.store.Get(key)
|
2014-04-17 22:55:53 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if value == nil {
|
|
|
|
return nil, nil
|
|
|
|
}
|
2014-04-19 03:07:41 +02:00
|
|
|
backIndexRow, err := NewBackIndexRowKV(key, value)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2014-04-17 22:55:53 +02:00
|
|
|
return backIndexRow, nil
|
|
|
|
}
|
|
|
|
|
2014-08-11 22:27:18 +02:00
|
|
|
func (udc *UpsideDownCouch) backIndexRowsForBatch(batch index.Batch) (map[string]*BackIndexRow, error) {
|
|
|
|
// FIXME faster to order the ids and scan sequentially
|
|
|
|
// for now just get it working
|
|
|
|
rv := make(map[string]*BackIndexRow, 0)
|
|
|
|
for docId, _ := range batch {
|
|
|
|
backIndexRow, err := udc.backIndexRowForDoc(docId)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
rv[docId] = backIndexRow
|
|
|
|
}
|
|
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
|
2014-07-31 17:47:36 +02:00
|
|
|
func (udc *UpsideDownCouch) Fields() ([]string, error) {
|
|
|
|
rv := make([]string, 0)
|
|
|
|
it := udc.store.Iterator([]byte{'f'})
|
|
|
|
defer it.Close()
|
|
|
|
key, val, valid := it.Current()
|
|
|
|
for valid {
|
|
|
|
if !bytes.HasPrefix(key, []byte{'f'}) {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
row, err := ParseFromKeyValue(key, val)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if row != nil {
|
|
|
|
fieldRow, ok := row.(*FieldRow)
|
|
|
|
if ok {
|
|
|
|
rv = append(rv, fieldRow.name)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
it.Next()
|
|
|
|
key, val, valid = it.Current()
|
|
|
|
}
|
|
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
|
2014-04-17 22:55:53 +02:00
|
|
|
func (udc *UpsideDownCouch) TermFieldReader(term []byte, fieldName string) (index.TermFieldReader, error) {
|
|
|
|
fieldIndex, fieldExists := udc.fieldIndexes[fieldName]
|
|
|
|
if fieldExists {
|
|
|
|
return newUpsideDownCouchTermFieldReader(udc, term, uint16(fieldIndex))
|
|
|
|
}
|
2014-08-07 19:45:39 +02:00
|
|
|
return newUpsideDownCouchTermFieldReader(udc, []byte{BYTE_SEPARATOR}, ^uint16(0))
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) FieldReader(fieldName string, startTerm []byte, endTerm []byte) (index.FieldReader, error) {
|
|
|
|
fieldIndex, fieldExists := udc.fieldIndexes[fieldName]
|
|
|
|
if fieldExists {
|
|
|
|
return newUpsideDownCouchFieldReader(udc, uint16(fieldIndex), startTerm, endTerm)
|
|
|
|
}
|
|
|
|
return newUpsideDownCouchTermFieldReader(udc, []byte{BYTE_SEPARATOR}, ^uint16(0))
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
2014-07-11 20:24:28 +02:00
|
|
|
func (udc *UpsideDownCouch) DocIdReader(start, end string) (index.DocIdReader, error) {
|
|
|
|
return newUpsideDownCouchDocIdReader(udc, start, end)
|
|
|
|
}
|
|
|
|
|
2014-06-26 17:43:13 +02:00
|
|
|
func (udc *UpsideDownCouch) Document(id string) (*document.Document, error) {
|
2014-08-25 14:55:14 +02:00
|
|
|
// first hit the back index to confirm doc exists
|
|
|
|
backIndexRow, err := udc.backIndexRowForDoc(id)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if backIndexRow == nil {
|
|
|
|
return nil, nil
|
|
|
|
}
|
2014-06-26 17:43:13 +02:00
|
|
|
rv := document.NewDocument(id)
|
2014-08-19 14:58:26 +02:00
|
|
|
storedRow := NewStoredRow(id, 0, []uint64{}, 'x', nil)
|
2014-06-26 17:43:13 +02:00
|
|
|
storedRowScanPrefix := storedRow.ScanPrefixForDoc()
|
|
|
|
it := udc.store.Iterator(storedRowScanPrefix)
|
2014-08-25 21:13:15 +02:00
|
|
|
defer it.Close()
|
2014-06-26 17:43:13 +02:00
|
|
|
key, val, valid := it.Current()
|
|
|
|
for valid {
|
|
|
|
if !bytes.HasPrefix(key, storedRowScanPrefix) {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
row, err := NewStoredRowKV(key, val)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if row != nil {
|
2014-08-06 19:52:20 +02:00
|
|
|
fieldName := udc.fieldIndexToName(row.field)
|
|
|
|
field := decodeFieldType(row.typ, fieldName, row.value)
|
|
|
|
if field != nil {
|
|
|
|
rv.AddField(field)
|
|
|
|
}
|
2014-06-26 17:43:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
it.Next()
|
|
|
|
key, val, valid = it.Current()
|
|
|
|
}
|
|
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
|
2014-08-11 17:03:29 +02:00
|
|
|
func (udc *UpsideDownCouch) DocumentFieldTerms(id string) (index.FieldTerms, error) {
|
|
|
|
back, err := udc.backIndexRowForDoc(id)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2014-08-19 14:58:26 +02:00
|
|
|
rv := make(index.FieldTerms, len(back.termEntries))
|
|
|
|
for _, entry := range back.termEntries {
|
|
|
|
fieldName := udc.fieldIndexToName(uint16(*entry.Field))
|
2014-08-11 17:03:29 +02:00
|
|
|
terms, ok := rv[fieldName]
|
|
|
|
if !ok {
|
|
|
|
terms = make([]string, 0)
|
|
|
|
}
|
2014-08-19 14:58:26 +02:00
|
|
|
terms = append(terms, *entry.Term)
|
2014-08-11 17:03:29 +02:00
|
|
|
rv[fieldName] = terms
|
|
|
|
}
|
|
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
|
2014-08-06 19:52:20 +02:00
|
|
|
func decodeFieldType(typ byte, name string, value []byte) document.Field {
|
|
|
|
switch typ {
|
|
|
|
case 't':
|
2014-08-19 14:58:26 +02:00
|
|
|
return document.NewTextField(name, []uint64{}, value)
|
2014-08-06 19:52:20 +02:00
|
|
|
case 'n':
|
2014-08-19 14:58:26 +02:00
|
|
|
return document.NewNumericFieldFromBytes(name, []uint64{}, value)
|
2014-08-06 19:52:20 +02:00
|
|
|
case 'd':
|
2014-08-19 14:58:26 +02:00
|
|
|
return document.NewDateTimeFieldFromBytes(name, []uint64{}, value)
|
2014-08-06 19:52:20 +02:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-04-17 22:55:53 +02:00
|
|
|
func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
|
|
|
|
return len(tf.Locations)
|
|
|
|
}
|
|
|
|
|
2014-07-21 23:05:55 +02:00
|
|
|
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
|
2014-04-17 22:55:53 +02:00
|
|
|
rv := make([]*TermVector, len(tf.Locations))
|
2014-07-21 23:05:55 +02:00
|
|
|
newFieldRows := make([]UpsideDownCouchRow, 0)
|
2014-04-17 22:55:53 +02:00
|
|
|
|
|
|
|
for i, l := range tf.Locations {
|
2014-07-21 23:05:55 +02:00
|
|
|
var newFieldRow *FieldRow
|
|
|
|
fieldIndex := field
|
|
|
|
if l.Field != "" {
|
|
|
|
// lookup correct field
|
|
|
|
fieldIndex, newFieldRow = udc.fieldNameToFieldIndex(l.Field)
|
|
|
|
if newFieldRow != nil {
|
|
|
|
newFieldRows = append(newFieldRows, newFieldRow)
|
|
|
|
}
|
|
|
|
}
|
2014-04-17 22:55:53 +02:00
|
|
|
tv := TermVector{
|
2014-07-21 23:05:55 +02:00
|
|
|
field: fieldIndex,
|
2014-04-17 22:55:53 +02:00
|
|
|
pos: uint64(l.Position),
|
|
|
|
start: uint64(l.Start),
|
|
|
|
end: uint64(l.End),
|
|
|
|
}
|
|
|
|
rv[i] = &tv
|
|
|
|
}
|
|
|
|
|
2014-07-21 23:05:55 +02:00
|
|
|
return rv, newFieldRows
|
2014-04-17 22:55:53 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
|
|
|
|
rv := make([]*index.TermFieldVector, len(in))
|
|
|
|
|
|
|
|
for i, tv := range in {
|
|
|
|
fieldName := udc.fieldIndexToName(tv.field)
|
|
|
|
tfv := index.TermFieldVector{
|
|
|
|
Field: fieldName,
|
|
|
|
Pos: tv.pos,
|
|
|
|
Start: tv.start,
|
|
|
|
End: tv.end,
|
|
|
|
}
|
|
|
|
rv[i] = &tfv
|
|
|
|
}
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) fieldIndexToName(i uint16) string {
|
|
|
|
for fieldName, fieldIndex := range udc.fieldIndexes {
|
|
|
|
if i == fieldIndex {
|
|
|
|
return fieldName
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ""
|
|
|
|
}
|
2014-08-11 22:27:18 +02:00
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) Batch(batch index.Batch) error {
|
|
|
|
// first lookup all the back index rows
|
|
|
|
backIndexRows, err := udc.backIndexRowsForBatch(batch)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// prepare a list of rows
|
|
|
|
addRows := make([]UpsideDownCouchRow, 0)
|
|
|
|
updateRows := make([]UpsideDownCouchRow, 0)
|
|
|
|
deleteRows := make([]UpsideDownCouchRow, 0)
|
|
|
|
|
|
|
|
docsAdded := uint64(0)
|
|
|
|
docsDeleted := uint64(0)
|
|
|
|
for docId, doc := range batch {
|
|
|
|
backIndexRow := backIndexRows[docId]
|
|
|
|
if doc == nil && backIndexRow != nil {
|
|
|
|
//delete
|
|
|
|
deleteRows = udc.deleteSingle(docId, backIndexRow, deleteRows)
|
|
|
|
docsDeleted++
|
|
|
|
} else if doc != nil {
|
|
|
|
addRows, updateRows, deleteRows = udc.updateSingle(doc, backIndexRow, addRows, updateRows, deleteRows)
|
|
|
|
if backIndexRow == nil {
|
|
|
|
docsAdded++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
err = udc.batchRows(addRows, updateRows, deleteRows)
|
|
|
|
if err == nil {
|
|
|
|
udc.docCount += docsAdded
|
|
|
|
udc.docCount -= docsDeleted
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
2014-08-14 03:14:47 +02:00
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) SetInternal(key, val []byte) error {
|
|
|
|
internalRow := NewInternalRow(key, val)
|
|
|
|
return udc.store.Set(internalRow.Key(), internalRow.Value())
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) GetInternal(key []byte) ([]byte, error) {
|
2014-08-15 15:39:41 +02:00
|
|
|
internalRow := NewInternalRow(key, nil)
|
2014-08-14 03:14:47 +02:00
|
|
|
return udc.store.Get(internalRow.Key())
|
|
|
|
}
|
|
|
|
|
|
|
|
func (udc *UpsideDownCouch) DeleteInternal(key []byte) error {
|
2014-08-15 15:39:41 +02:00
|
|
|
internalRow := NewInternalRow(key, nil)
|
|
|
|
return udc.store.Delete(internalRow.Key())
|
|
|
|
}
|