
remove firestorm index scheme

firestorm was an experiment
we learned a lot, but it did not result in a usable index scheme
This commit is contained in:
Marty Schoch 2016-06-26 07:51:41 -04:00
parent 7e02e616ce
commit bd2a23fb6d
53 changed files with 0 additions and 8759 deletions


@@ -12,7 +12,6 @@ script:
- go get -u github.com/FiloSottile/gvt
- gvt restore
- go test -v $(go list ./... | grep -v vendor/)
- go test -v ./test -indexType=firestorm
- go vet $(go list ./... | grep -v vendor/)
- errcheck $(go list ./... | grep -v vendor/)
- docs/project-code-coverage.sh


@@ -21,8 +21,6 @@ import (
"github.com/blevesearch/bleve/index/upside_down"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/search/highlight/highlighters/html"
_ "github.com/blevesearch/bleve/index/firestorm"
)
var bleveExpVar = expvar.NewMap("bleve")


@@ -91,7 +91,6 @@ import (
_ "github.com/blevesearch/bleve/index/store/moss"
// index types
_ "github.com/blevesearch/bleve/index/firestorm"
_ "github.com/blevesearch/bleve/index/upside_down"
// byte array converters


@@ -1,169 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"math"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
)
func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
rv := &index.AnalysisResult{
DocID: d.ID,
Rows: make([]index.IndexRow, 0, 100),
}
docIDBytes := []byte(d.ID)
// add the _id row
rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
// information we collate as we merge fields with same name
fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
fieldLengths := make(map[uint16]int)
fieldIncludeTermVectors := make(map[uint16]bool)
fieldNames := make(map[uint16]string)
analyzeField := func(field document.Field, storable bool) {
fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
if newFieldRow != nil {
rv.Rows = append(rv.Rows, newFieldRow)
}
fieldNames[fieldIndex] = field.Name()
if field.Options().IsIndexed() {
fieldLength, tokenFreqs := field.Analyze()
existingFreqs := fieldTermFreqs[fieldIndex]
if existingFreqs == nil {
fieldTermFreqs[fieldIndex] = tokenFreqs
} else {
existingFreqs.MergeAll(field.Name(), tokenFreqs)
fieldTermFreqs[fieldIndex] = existingFreqs
}
fieldLengths[fieldIndex] += fieldLength
fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
}
if storable && field.Options().IsStored() {
storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex)
rv.Rows = append(rv.Rows, storeRow)
}
}
for _, field := range d.Fields {
analyzeField(field, true)
}
if len(d.CompositeFields) > 0 {
for fieldIndex, tokenFreqs := range fieldTermFreqs {
// see if any of the composite fields need this
for _, compositeField := range d.CompositeFields {
compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
}
}
for _, compositeField := range d.CompositeFields {
analyzeField(compositeField, false)
}
}
rowsCapNeeded := len(rv.Rows)
for _, tokenFreqs := range fieldTermFreqs {
rowsCapNeeded += len(tokenFreqs)
}
rows := make([]index.IndexRow, 0, rowsCapNeeded)
rv.Rows = append(rows, rv.Rows...)
// walk through the collated information and process
// once for each indexed field (unique name)
for fieldIndex, tokenFreqs := range fieldTermFreqs {
fieldLength := fieldLengths[fieldIndex]
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows)
}
return rv
}
func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow {
tfrs := make([]TermFreqRow, len(tokenFreqs))
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
if !includeTermVectors {
i := 0
for _, tf := range tokenFreqs {
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil))
i++
}
return rows
}
i := 0
for _, tf := range tokenFreqs {
var tv []*TermVector
tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows)
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv))
i++
}
return rows
}
func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
rv := make([]*TermVector, len(tf.Locations))
for i, l := range tf.Locations {
var newFieldRow *FieldRow
fieldIndex := field
if l.Field != "" {
// lookup correct field
fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field)
if newFieldRow != nil {
rows = append(rows, newFieldRow)
}
}
tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions)
rv[i] = tv
}
return rv, rows
}
func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow {
fieldValue := make([]byte, 1+len(field.Value()))
fieldValue[0] = encodeFieldType(field)
copy(fieldValue[1:], field.Value())
storedRow := NewStoredRow(docID, docNum, fieldIndex, field.ArrayPositions(), fieldValue)
return storedRow
}
func encodeFieldType(f document.Field) byte {
fieldType := byte('x')
switch f.(type) {
case *document.TextField:
fieldType = 't'
case *document.NumericField:
fieldType = 'n'
case *document.DateTimeField:
fieldType = 'd'
case *document.BooleanField:
fieldType = 'b'
case *document.CompositeField:
fieldType = 'c'
}
return fieldType
}
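The indexField code above derives each term row's fieldNorm as 1/sqrt(fieldLength). A minimal, self-contained sketch of that computation (a stand-in helper, not the removed API):

package main

import (
	"fmt"
	"math"
)

// fieldNorm mirrors the computation in the removed indexField:
// norm = 1 / sqrt(fieldLength), stored as a float32 on each term row.
func fieldNorm(fieldLength int) float32 {
	return float32(1.0 / math.Sqrt(float64(fieldLength)))
}

func main() {
	for _, l := range []int{1, 4, 100} {
		fmt.Printf("field length %3d -> norm %0.4f\n", l, fieldNorm(l))
	}
}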


@@ -1,192 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
"github.com/blevesearch/bleve/index/store/null"
"github.com/blevesearch/bleve/registry"
)
func TestAnalysis(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err := kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
// warmup to load field cache and set maxRead correctly
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
tests := []struct {
d *document.Document
r *index.AnalysisResult
}{
{
d: document.NewDocument("a").
AddField(
document.NewTextFieldWithIndexingOptions("name", nil, []byte("test"), document.IndexField|document.StoreField|document.IncludeTermVectors)),
r: &index.AnalysisResult{
DocID: "a",
Rows: []index.IndexRow{
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewFieldRow(1, "name"),
NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")),
NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
},
},
},
}
for _, test := range tests {
test.d.Number = 1
actual := f.Analyze(test.d)
if !reflect.DeepEqual(actual, test.r) {
t.Errorf("expected: %v got %v", test.r, actual)
}
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
}
func TestAnalysisBug328(t *testing.T) {
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
if err != nil {
t.Fatal(err)
}
analysisQueue := index.NewAnalysisQueue(1)
idx, err := NewFirestorm(gtreap.Name, nil, analysisQueue)
if err != nil {
t.Fatal(err)
}
d := document.NewDocument("1")
f := document.NewTextFieldCustom("title", nil, []byte("bleve"), document.IndexField|document.IncludeTermVectors, analyzer)
d.AddField(f)
f = document.NewTextFieldCustom("body", nil, []byte("bleve"), document.IndexField|document.IncludeTermVectors, analyzer)
d.AddField(f)
cf := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, []string{}, document.IndexField|document.IncludeTermVectors)
d.AddField(cf)
rv := idx.Analyze(d)
fieldIndexes := make(map[uint16]string)
for _, row := range rv.Rows {
if row, ok := row.(*FieldRow); ok {
fieldIndexes[row.index] = row.Name()
}
if row, ok := row.(*TermFreqRow); ok && string(row.term) == "bleve" {
for _, vec := range row.Vectors() {
if vec.GetField() != uint32(row.field) {
if fieldIndexes[row.field] != "_all" {
t.Errorf("row named %s field %d - vector field %d", fieldIndexes[row.field], row.field, vec.GetField())
}
}
}
}
}
}
func BenchmarkAnalyze(b *testing.B) {
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
if err != nil {
b.Fatal(err)
}
analysisQueue := index.NewAnalysisQueue(1)
idx, err := NewFirestorm(null.Name, nil, analysisQueue)
if err != nil {
b.Fatal(err)
}
d := document.NewDocument("1")
f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
d.AddField(f)
b.ResetTimer()
for i := 0; i < b.N; i++ {
rv := idx.Analyze(d)
if len(rv.Rows) < 92 || len(rv.Rows) > 93 {
b.Fatalf("expected 512-13 rows, got %d", len(rv.Rows))
}
}
}
var bleveWikiArticle1K = []byte(`Boiling liquid expanding vapor explosion
From Wikipedia, the free encyclopedia
See also: Boiler explosion and Steam explosion
Flames subsequent to a flammable liquid BLEVE from a tanker. BLEVEs do not necessarily involve fire.
This article's tone or style may not reflect the encyclopedic tone used on Wikipedia. See Wikipedia's guide to writing better articles for suggestions. (July 2013)
A boiling liquid expanding vapor explosion (BLEVE, /ˈblɛviː/ blev-ee) is an explosion caused by the rupture of a vessel containing a pressurized liquid above its boiling point.[1]
Contents [hide]
1 Mechanism
1.1 Water example
1.2 BLEVEs without chemical reactions
2 Fires
3 Incidents
4 Safety measures
5 See also
6 References
7 External links
Mechanism[edit]
This section needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. (July 2013)
There are three characteristics of liquids which are relevant to the discussion of a BLEVE:`)


@@ -1,70 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index/store/boltdb"
)
var boltTestConfig = map[string]interface{}{
"path": "test",
}
func BenchmarkBoltDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, boltdb.Name, boltTestConfig, DestroyTest, 1)
}
func BenchmarkBoltDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, boltdb.Name, boltTestConfig, DestroyTest, 2)
}
func BenchmarkBoltDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, boltdb.Name, boltTestConfig, DestroyTest, 4)
}
// batches
func BenchmarkBoltDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 1, 10)
}
func BenchmarkBoltDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 2, 10)
}
func BenchmarkBoltDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 4, 10)
}
func BenchmarkBoltDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 1, 100)
}
func BenchmarkBoltDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 2, 100)
}
func BenchmarkBoltDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 4, 100)
}
func BenchmarkBoltDBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 1, 1000)
}
func BenchmarkBoltDBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 2, 1000)
}
func BenchmarkBoltDBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 4, 1000)
}


@@ -1,144 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"os"
"strconv"
"testing"
_ "github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/registry"
)
var benchmarkDocBodies = []string{
"A boiling liquid expanding vapor explosion (BLEVE, /ˈblɛviː/ blev-ee) is an explosion caused by the rupture of a vessel containing a pressurized liquid above its boiling point.",
"A boiler explosion is a catastrophic failure of a boiler. As seen today, boiler explosions are of two kinds. One kind is a failure of the pressure parts of the steam and water sides. There can be many different causes, such as failure of the safety valve, corrosion of critical parts of the boiler, or low water level. Corrosion along the edges of lap joints was a common cause of early boiler explosions.",
"A boiler is a closed vessel in which water or other fluid is heated. The fluid does not necessarily boil. (In North America the term \"furnace\" is normally used if the purpose is not actually to boil the fluid.) The heated or vaporized fluid exits the boiler for use in various processes or heating applications,[1][2] including central heating, boiler-based power generation, cooking, and sanitation.",
"A pressure vessel is a closed container designed to hold gases or liquids at a pressure substantially different from the ambient pressure.",
"Pressure (symbol: p or P) is the ratio of force to the area over which that force is distributed.",
"Liquid is one of the four fundamental states of matter (the others being solid, gas, and plasma), and is the only state with a definite volume but no fixed shape.",
"The boiling point of a substance is the temperature at which the vapor pressure of the liquid equals the pressure surrounding the liquid[1][2] and the liquid changes into a vapor.",
"Vapor pressure or equilibrium vapor pressure is defined as the pressure exerted by a vapor in thermodynamic equilibrium with its condensed phases (solid or liquid) at a given temperature in a closed system.",
"Industrial gases are a group of gases that are specifically manufactured for use in a wide range of industries, which include oil and gas, petrochemicals, chemicals, power, mining, steelmaking, metals, environmental protection, medicine, pharmaceuticals, biotechnology, food, water, fertilizers, nuclear power, electronics and aerospace.",
"The expansion ratio of a liquefied and cryogenic substance is the volume of a given amount of that substance in liquid form compared to the volume of the same amount of substance in gaseous form, at room temperature and normal atmospheric pressure.",
}
type KVStoreDestroy func() error
func DestroyTest() error {
return os.RemoveAll("test")
}
func CommonBenchmarkIndex(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers int) {
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed("standard")
if err != nil {
b.Fatal(err)
}
indexDocument := document.NewDocument("").
AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[0]), analyzer))
b.ResetTimer()
b.StopTimer()
for i := 0; i < b.N; i++ {
analysisQueue := index.NewAnalysisQueue(analysisWorkers)
idx, err := NewFirestorm(storeName, storeConfig, analysisQueue)
if err != nil {
b.Fatal(err)
}
err = idx.Open()
if err != nil {
b.Fatal(err)
}
indexDocument.ID = strconv.Itoa(i)
// just time the indexing portion
b.StartTimer()
err = idx.Update(indexDocument)
if err != nil {
b.Fatal(err)
}
b.StopTimer()
err = idx.Close()
if err != nil {
b.Fatal(err)
}
err = destroy()
if err != nil {
b.Fatal(err)
}
analysisQueue.Close()
}
}
func CommonBenchmarkIndexBatch(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers, batchSize int) {
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed("standard")
if err != nil {
b.Fatal(err)
}
b.ResetTimer()
b.StopTimer()
for i := 0; i < b.N; i++ {
analysisQueue := index.NewAnalysisQueue(analysisWorkers)
idx, err := NewFirestorm(storeName, storeConfig, analysisQueue)
if err != nil {
b.Fatal(err)
}
err = idx.Open()
if err != nil {
b.Fatal(err)
}
b.StartTimer()
batch := index.NewBatch()
for j := 0; j < 1000; j++ {
if j%batchSize == 0 {
if len(batch.IndexOps) > 0 {
err := idx.Batch(batch)
if err != nil {
b.Fatal(err)
}
}
batch = index.NewBatch()
}
indexDocument := document.NewDocument("").
AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[j%10]), analyzer))
indexDocument.ID = strconv.Itoa(i) + "-" + strconv.Itoa(j)
batch.Update(indexDocument)
}
// close last batch
if len(batch.IndexOps) > 0 {
err := idx.Batch(batch)
if err != nil {
b.Fatal(err)
}
}
b.StopTimer()
err = idx.Close()
if err != nil {
b.Fatal(err)
}
err = destroy()
if err != nil {
b.Fatal(err)
}
analysisQueue.Close()
}
}
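The loop above flushes the accumulated batch whenever j reaches a multiple of batchSize, then closes out the final partial batch after the loop. A stripped-down sketch of the same flush pattern, with a hypothetical print standing in for idx.Batch:

package main

import "fmt"

func main() {
	const batchSize = 100
	batch := make([]int, 0, batchSize)
	flush := func() {
		// mirrors: if len(batch.IndexOps) > 0 { idx.Batch(batch) }
		if len(batch) > 0 {
			fmt.Printf("flushing %d ops\n", len(batch))
			batch = batch[:0]
		}
	}
	for j := 0; j < 1000; j++ {
		if j%batchSize == 0 {
			flush()
		}
		batch = append(batch, j)
	}
	flush() // close the last batch, as the benchmark does
}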


@@ -1,72 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build cznicb
package firestorm
import (
"testing"
"github.com/blevesearch/blevex/cznicb"
)
func DestroyCznicB() error {
return nil
}
func BenchmarkCznicBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, cznicb.Name, nil, DestroyCznicB, 1)
}
func BenchmarkCznicBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, cznicb.Name, nil, DestroyCznicB, 2)
}
func BenchmarkCznicBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, cznicb.Name, nil, DestroyCznicB, 4)
}
// batches
func BenchmarkCznicBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 1, 10)
}
func BenchmarkCznicBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 2, 10)
}
func BenchmarkCznicBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 4, 10)
}
func BenchmarkCznicBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 1, 100)
}
func BenchmarkCznicBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 2, 100)
}
func BenchmarkCznicBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 4, 100)
}
func BenchmarkCznicBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 1, 1000)
}
func BenchmarkCznicBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 2, 1000)
}
func BenchmarkCznicBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 4, 1000)
}


@@ -1,86 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build forestdb
package firestorm
import (
"os"
"testing"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/index/store/forestdb"
)
func CreateForestDB() (store.KVStore, error) {
err := os.MkdirAll("testdir", 0700)
if err != nil {
return nil, err
}
s, err := forestdb.New("testdir/test", true, nil)
if err != nil {
return nil, err
}
return s, nil
}
func DestroyForestDB() error {
return os.RemoveAll("testdir")
}
func BenchmarkForestDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateForestDB, DestroyForestDB, 1)
}
func BenchmarkForestDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateForestDB, DestroyForestDB, 2)
}
func BenchmarkForestDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateForestDB, DestroyForestDB, 4)
}
// batches
func BenchmarkForestDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 1, 10)
}
func BenchmarkForestDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 2, 10)
}
func BenchmarkForestDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 4, 10)
}
func BenchmarkForestDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 1, 100)
}
func BenchmarkForestDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 2, 100)
}
func BenchmarkForestDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 4, 100)
}
func BenchmarkForestDBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 1, 1000)
}
func BenchmarkForestDBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 2, 1000)
}
func BenchmarkForestDBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 4, 1000)
}


@@ -1,71 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index/store/goleveldb"
)
var goLevelDBTestOptions = map[string]interface{}{
"create_if_missing": true,
"path": "test",
}
func BenchmarkGoLevelDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1)
}
func BenchmarkGoLevelDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2)
}
func BenchmarkGoLevelDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4)
}
// batches
func BenchmarkGoLevelDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1, 10)
}
func BenchmarkGoLevelDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2, 10)
}
func BenchmarkGoLevelDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4, 10)
}
func BenchmarkGoLevelDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1, 100)
}
func BenchmarkGoLevelDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2, 100)
}
func BenchmarkGoLevelDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4, 100)
}
func BenchmarkGoLevelDBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1, 1000)
}
func BenchmarkGoLevelDBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2, 1000)
}
func BenchmarkGoLevelDBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4, 1000)
}


@@ -1,81 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build rocksdb
package firestorm
import (
	"os"
	"testing"

	"github.com/blevesearch/bleve/index/store"
	// assumed import path: the file calls rocksdb.New below, but this
	// diff does not show which rocksdb package it originally imported
	"github.com/blevesearch/blevex/rocksdb"
)
var rocksdbTestOptions = map[string]interface{}{
"create_if_missing": true,
}
func CreateGoRocksDB() (store.KVStore, error) {
return rocksdb.New("test", rocksdbTestOptions)
}
func DestroyGoRocksDB() error {
return os.RemoveAll("test")
}
func BenchmarkRocksDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateGoRocksDB, DestroyGoRocksDB, 1)
}
func BenchmarkRocksDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateGoRocksDB, DestroyGoRocksDB, 2)
}
func BenchmarkRocksDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateGoRocksDB, DestroyGoRocksDB, 4)
}
// batches
func BenchmarkRocksDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 1, 10)
}
func BenchmarkRocksDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 2, 10)
}
func BenchmarkRocksDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 4, 10)
}
func BenchmarkRocksDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 1, 100)
}
func BenchmarkRocksDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 2, 100)
}
func BenchmarkRocksDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 4, 100)
}
func BenchmarkRocksDBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 1, 1000)
}
func BenchmarkRocksDBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 2, 1000)
}
func BenchmarkRocksDBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 4, 1000)
}


@@ -1,70 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func DestroyGTreap() error {
return nil
}
func BenchmarkGTreapIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, gtreap.Name, nil, DestroyGTreap, 1)
}
func BenchmarkGTreapIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, gtreap.Name, nil, DestroyGTreap, 2)
}
func BenchmarkGTreapIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, gtreap.Name, nil, DestroyGTreap, 4)
}
// batches
func BenchmarkGTreapIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 1, 10)
}
func BenchmarkGTreapIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 2, 10)
}
func BenchmarkGTreapIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 4, 10)
}
func BenchmarkGTreapIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 1, 100)
}
func BenchmarkGTreapIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 2, 100)
}
func BenchmarkGTreapIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 4, 100)
}
func BenchmarkGTreapIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 1, 1000)
}
func BenchmarkGTreapIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 2, 1000)
}
func BenchmarkGTreapIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 4, 1000)
}


@@ -1,82 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build leveldb full
package firestorm
import (
"os"
"testing"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/index/store/leveldb"
)
var leveldbTestOptions = map[string]interface{}{
"create_if_missing": true,
}
func CreateLevelDB() (store.KVStore, error) {
return leveldb.New("test", leveldbTestOptions)
}
func DestroyLevelDB() error {
return os.RemoveAll("test")
}
func BenchmarkLevelDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateLevelDB, DestroyLevelDB, 1)
}
func BenchmarkLevelDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateLevelDB, DestroyLevelDB, 2)
}
func BenchmarkLevelDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateLevelDB, DestroyLevelDB, 4)
}
// batches
func BenchmarkLevelDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateLevelDB, DestroyLevelDB, 1, 10)
}
func BenchmarkLevelDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateLevelDB, DestroyLevelDB, 2, 10)
}
func BenchmarkLevelDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateLevelDB, DestroyLevelDB, 4, 10)
}
func BenchmarkLevelDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateLevelDB, DestroyLevelDB, 1, 100)
}
func BenchmarkLevelDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateLevelDB, DestroyLevelDB, 2, 100)
}
func BenchmarkLevelDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateLevelDB, DestroyLevelDB, 4, 100)
}
func BenchmarkLevelDBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateLevelDB, DestroyLevelDB, 1, 1000)
}
func BenchmarkLevelDBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateLevelDB, DestroyLevelDB, 2, 1000)
}
func BenchmarkLevelDBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateLevelDB, DestroyLevelDB, 4, 1000)
}


@@ -1,70 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index/store/null"
)
func DestroyNull() error {
return nil
}
func BenchmarkNullIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, null.Name, nil, DestroyNull, 1)
}
func BenchmarkNullIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, null.Name, nil, DestroyNull, 2)
}
func BenchmarkNullIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, null.Name, nil, DestroyNull, 4)
}
// batches
func BenchmarkNullIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 1, 10)
}
func BenchmarkNullIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 2, 10)
}
func BenchmarkNullIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 4, 10)
}
func BenchmarkNullIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 1, 100)
}
func BenchmarkNullIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 2, 100)
}
func BenchmarkNullIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 4, 100)
}
func BenchmarkNullIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 1, 1000)
}
func BenchmarkNullIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 2, 1000)
}
func BenchmarkNullIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 4, 1000)
}


@@ -1,156 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"math/rand"
"sort"
"sync"
"github.com/steveyen/gtreap"
"github.com/willf/bitset"
)
type Compensator struct {
inFlightMutex sync.RWMutex
maxRead uint64
inFlight *gtreap.Treap
deletedMutex sync.RWMutex
deletedDocNumbers *bitset.BitSet
}
func NewCompensator() *Compensator {
rv := Compensator{
inFlight: gtreap.NewTreap(inFlightItemCompare),
deletedDocNumbers: bitset.New(1000000),
}
return &rv
}
type Snapshot struct {
maxRead uint64
inFlight *gtreap.Treap
deletedDocNumbers *bitset.BitSet
}
// returns which doc number is valid
// if none, then 0
func (s *Snapshot) Which(docID []byte, docNumList DocNumberList) uint64 {
inFlightVal := s.inFlight.Get(&InFlightItem{docID: docID})
sort.Sort(docNumList) // Descending ordering.
for _, docNum := range docNumList {
if docNum > 0 && docNum <= s.maxRead &&
(inFlightVal == nil || inFlightVal.(*InFlightItem).docNum == docNum) &&
!s.deletedDocNumbers.Test(uint(docNum)) {
return docNum
}
}
return 0
}
func (s *Snapshot) Valid(docID []byte, docNum uint64) bool {
logger.Printf("checking validity of: '%s' - % x - %d", docID, docID, docNum)
if docNum > s.maxRead {
return false
}
logger.Printf("<= maxRead")
inFlightVal := s.inFlight.Get(&InFlightItem{docID: docID})
if inFlightVal != nil && inFlightVal.(*InFlightItem).docNum != docNum {
return false
}
logger.Printf("not in flight")
if s.deletedDocNumbers.Test(uint(docNum)) {
return false
}
logger.Printf("not deleted")
return true
}
func (c *Compensator) Mutate(docID []byte, docNum uint64) {
c.inFlightMutex.Lock()
defer c.inFlightMutex.Unlock()
c.inFlight = c.inFlight.Upsert(&InFlightItem{docID: docID, docNum: docNum}, rand.Int())
if docNum != 0 {
c.maxRead = docNum
}
}
func (c *Compensator) MutateBatch(inflightItems []*InFlightItem, lastDocNum uint64) {
c.inFlightMutex.Lock()
defer c.inFlightMutex.Unlock()
for _, item := range inflightItems {
c.inFlight = c.inFlight.Upsert(item, rand.Int())
}
c.maxRead = lastDocNum
}
func (c *Compensator) Migrate(docID []byte, docNum uint64, oldDocNums []uint64) {
c.inFlightMutex.Lock()
defer c.inFlightMutex.Unlock()
c.deletedMutex.Lock()
defer c.deletedMutex.Unlock()
// clone deleted doc numbers and mutate
if len(oldDocNums) > 0 {
newDeletedDocNumbers := c.deletedDocNumbers.Clone()
for _, oldDocNum := range oldDocNums {
newDeletedDocNumbers.Set(uint(oldDocNum))
}
// update pointer
c.deletedDocNumbers = newDeletedDocNumbers
}
// remove entry from in-flight if it still has same doc num
val := c.inFlight.Get(&InFlightItem{docID: docID})
if val != nil && val.(*InFlightItem).docNum == docNum {
c.inFlight = c.inFlight.Delete(&InFlightItem{docID: docID})
}
}
func (c *Compensator) GarbageCollect(docNums []uint64) {
c.deletedMutex.Lock()
defer c.deletedMutex.Unlock()
for _, docNum := range docNums {
c.deletedDocNumbers.Clear(uint(docNum))
}
}
func (c *Compensator) Snapshot() *Snapshot {
c.inFlightMutex.RLock()
defer c.inFlightMutex.RUnlock()
c.deletedMutex.RLock()
defer c.deletedMutex.RUnlock()
rv := Snapshot{
maxRead: c.maxRead,
inFlight: c.inFlight,
deletedDocNumbers: c.deletedDocNumbers,
}
return &rv
}
func (c *Compensator) GarbageCount() uint64 {
return uint64(c.deletedDocNumbers.Count())
}
//**************
type InFlightItem struct {
docID []byte
docNum uint64
}
func inFlightItemCompare(a, b interface{}) int {
return bytes.Compare(a.(*InFlightItem).docID, b.(*InFlightItem).docID)
}
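The snapshot logic above deems a doc number visible only if it is at or below maxRead, not superseded by an in-flight mutation for the same docID, and not marked deleted. A minimal sketch of that rule, using a plain map and set in place of the treap and bitset:

package main

import "fmt"

type snapshot struct {
	maxRead  uint64
	inFlight map[string]uint64 // docID -> in-flight doc number
	deleted  map[uint64]bool   // deleted doc numbers
}

// valid mirrors Snapshot.Valid above: visible only if at or below
// maxRead, not superseded in-flight, and not deleted.
func (s *snapshot) valid(docID string, docNum uint64) bool {
	if docNum > s.maxRead {
		return false
	}
	if n, ok := s.inFlight[docID]; ok && n != docNum {
		return false
	}
	return !s.deleted[docNum]
}

func main() {
	s := &snapshot{
		maxRead:  10,
		inFlight: map[string]uint64{"a": 7},
		deleted:  map[uint64]bool{3: true},
	}
	fmt.Println(s.valid("a", 7))  // true: matches the in-flight number
	fmt.Println(s.valid("a", 5))  // false: superseded by in-flight 7
	fmt.Println(s.valid("b", 3))  // false: deleted
	fmt.Println(s.valid("b", 11)) // false: beyond maxRead
}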


@@ -1,160 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"encoding/binary"
"fmt"
"sync"
"sync/atomic"
"time"
)
const DefaultDictUpdateThreshold = 10
var DefaultDictUpdateSleep = 1 * time.Second
type DictUpdater struct {
batchesStarted uint64
batchesFlushed uint64
f *Firestorm
dictUpdateSleep time.Duration
quit chan struct{}
incoming chan map[string]int64
mutex sync.RWMutex
workingSet map[string]int64
closeWait sync.WaitGroup
}
func NewDictUpdater(f *Firestorm) *DictUpdater {
rv := DictUpdater{
f: f,
dictUpdateSleep: DefaultDictUpdateSleep,
workingSet: make(map[string]int64),
batchesStarted: 1,
quit: make(chan struct{}),
incoming: make(chan map[string]int64, 8),
}
return &rv
}
func (d *DictUpdater) Notify(term string, usage int64) {
d.mutex.Lock()
defer d.mutex.Unlock()
d.workingSet[term] += usage
}
func (d *DictUpdater) NotifyBatch(termUsages map[string]int64) {
d.incoming <- termUsages
}
func (d *DictUpdater) Start() {
d.closeWait.Add(1)
go d.runIncoming()
go d.run()
}
func (d *DictUpdater) Stop() {
close(d.quit)
d.closeWait.Wait()
}
func (d *DictUpdater) runIncoming() {
for {
select {
case <-d.quit:
return
case termUsages, ok := <-d.incoming:
if !ok {
return
}
d.mutex.Lock()
for term, usage := range termUsages {
d.workingSet[term] += usage
}
d.mutex.Unlock()
}
}
}
func (d *DictUpdater) run() {
tick := time.Tick(d.dictUpdateSleep)
for {
select {
case <-d.quit:
logger.Printf("dictionary updater asked to quit")
d.closeWait.Done()
return
case <-tick:
logger.Printf("dictionary updater ticked")
d.update()
}
}
}
func (d *DictUpdater) update() {
d.mutex.Lock()
oldWorkingSet := d.workingSet
d.workingSet = make(map[string]int64)
atomic.AddUint64(&d.batchesStarted, 1)
d.mutex.Unlock()
// open a writer
writer, err := d.f.store.Writer()
if err != nil {
	// don't close the writer here: on error it may be nil
	logger.Printf("dict updater fatal: %v", err)
	return
}
// prepare batch
wb := writer.NewBatch()
dictionaryTermDelta := make([]byte, 8)
for term, delta := range oldWorkingSet {
binary.LittleEndian.PutUint64(dictionaryTermDelta, uint64(delta))
wb.Merge([]byte(term), dictionaryTermDelta)
}
err = writer.ExecuteBatch(wb)
if err != nil {
_ = writer.Close()
logger.Printf("dict updater fatal: %v", err)
return
}
atomic.AddUint64(&d.batchesFlushed, 1)
_ = writer.Close()
}
// this is not intended to be used publicly, only for unit tests
// which depend on consistency we no longer provide
func (d *DictUpdater) waitTasksDone(dur time.Duration) error {
initial := atomic.LoadUint64(&d.batchesStarted)
timeout := time.After(dur)
tick := time.Tick(100 * time.Millisecond)
for {
select {
// Got a timeout! fail with a timeout error
case <-timeout:
flushed := atomic.LoadUint64(&d.batchesFlushed)
return fmt.Errorf("timeout, %d/%d", initial, flushed)
// Got a tick, check whether a batch has flushed since we started
case <-tick:
flushed := atomic.LoadUint64(&d.batchesFlushed)
if flushed > initial {
return nil
}
}
}
}
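The updater above folds signed per-term deltas into a working set under a mutex, then periodically swaps the set out and flushes it as KV merge operations. A self-contained sketch of the accumulate-and-swap step, with a print standing in for wb.Merge:

package main

import (
	"fmt"
	"sync"
)

type updater struct {
	mu      sync.Mutex
	working map[string]int64
}

// notifyBatch mirrors the runIncoming loop: fold a batch of
// signed term-count deltas into the working set.
func (u *updater) notifyBatch(deltas map[string]int64) {
	u.mu.Lock()
	defer u.mu.Unlock()
	for term, d := range deltas {
		u.working[term] += d
	}
}

// swap mirrors the start of update(): take the working set and
// replace it with a fresh map so notifications can continue.
func (u *updater) swap() map[string]int64 {
	u.mu.Lock()
	defer u.mu.Unlock()
	old := u.working
	u.working = make(map[string]int64)
	return old
}

func main() {
	u := &updater{working: make(map[string]int64)}
	u.notifyBatch(map[string]int64{"cat": 3})
	u.notifyBatch(map[string]int64{"cat": -2, "dog": 1})
	for term, delta := range u.swap() {
		fmt.Printf("merge %q += %d\n", term, delta) // stands in for wb.Merge
	}
}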


@@ -1,169 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"runtime"
"testing"
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestDictUpdater(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
dictBatch := map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 3,
}
dictExpect := map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 3,
}
f.(*Firestorm).dictUpdater.NotifyBatch(dictBatch)
// invoke updater manually
for len(f.(*Firestorm).dictUpdater.incoming) > 0 {
runtime.Gosched()
}
err = f.(*Firestorm).dictUpdater.waitTasksDone(5 * time.Second)
if err != nil {
t.Fatal(err)
}
// assert that dictionary rows are correct
reader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
for key := range dictBatch {
v, err := reader.Get([]byte(key))
if err != nil {
t.Fatal(err)
}
if v == nil {
t.Fatal("unexpected dictionary value missing")
}
dr, err := NewDictionaryRowKV([]byte(key), v)
if err != nil {
t.Fatal(err)
}
expect := dictExpect[key]
if int64(dr.Count()) != expect {
t.Errorf("expected %d, got %d", expect, dr.Count())
}
}
err = reader.Close()
if err != nil {
t.Fatal(err)
}
// update it again
dictBatch = map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 1,
}
dictExpect = map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 4,
}
f.(*Firestorm).dictUpdater.NotifyBatch(dictBatch)
// invoke updater manually
for len(f.(*Firestorm).dictUpdater.incoming) > 0 {
runtime.Gosched()
}
err = f.(*Firestorm).dictUpdater.waitTasksDone(5 * time.Second)
if err != nil {
t.Fatal(err)
}
// assert that dictionary rows are correct
reader, err = f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
for key := range dictBatch {
v, err := reader.Get([]byte(key))
if err != nil {
t.Fatal(err)
}
dr, err := NewDictionaryRowKV([]byte(key), v)
if err != nil {
t.Fatal(err)
}
expect := dictExpect[key]
if int64(dr.Count()) != expect {
t.Errorf("expected %d, got %d", expect, dr.Count())
}
}
err = reader.Close()
if err != nil {
t.Fatal(err)
}
// update it again (decrement this time)
dictBatch = map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): -2,
}
dictExpect = map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 2,
}
f.(*Firestorm).dictUpdater.NotifyBatch(dictBatch)
// invoke updater manually
for len(f.(*Firestorm).dictUpdater.incoming) > 0 {
runtime.Gosched()
}
err = f.(*Firestorm).dictUpdater.waitTasksDone(5 * time.Second)
if err != nil {
t.Fatal(err)
}
// assert that dictionary rows are correct
reader, err = f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
for key := range dictBatch {
v, err := reader.Get([]byte(key))
if err != nil {
t.Fatal(err)
}
dr, err := NewDictionaryRowKV([]byte(key), v)
if err != nil {
t.Fatal(err)
}
expect := dictExpect[key]
if int64(dr.Count()) != expect {
t.Errorf("expected %d, got %d", expect, dr.Count())
}
}
err = reader.Close()
if err != nil {
t.Fatal(err)
}
}


@@ -1,128 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"encoding/binary"
"io"
"github.com/golang/protobuf/proto"
)
const ByteSeparator byte = 0xff
var DictionaryKeyPrefix = []byte{'d'}
type DictionaryRow struct {
field uint16
term []byte
value DictionaryValue
}
func NewDictionaryRow(field uint16, term []byte, count uint64) *DictionaryRow {
rv := DictionaryRow{
field: field,
term: term,
}
rv.value.Count = proto.Uint64(count)
return &rv
}
func NewDictionaryRowK(key []byte) (*DictionaryRow, error) {
rv := DictionaryRow{}
buf := bytes.NewBuffer(key)
_, err := buf.ReadByte() // type
if err != nil {
return nil, err
}
err = binary.Read(buf, binary.LittleEndian, &rv.field)
if err != nil {
return nil, err
}
rv.term, err = buf.ReadBytes(ByteSeparator)
// there is no separator expected here, should get EOF
if err != io.EOF {
return nil, err
}
return &rv, nil
}
func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
err := dr.value.Unmarshal(value)
if err != nil {
return err
}
return nil
}
func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) {
rv, err := NewDictionaryRowK(key)
if err != nil {
return nil, err
}
err = rv.parseDictionaryV(value)
if err != nil {
return nil, err
}
return rv, nil
}
func (dr *DictionaryRow) Count() uint64 {
return dr.value.GetCount()
}
func (dr *DictionaryRow) SetCount(count uint64) {
dr.value.Count = proto.Uint64(count)
}
func (dr *DictionaryRow) KeySize() int {
return 3 + len(dr.term)
}
func (dr *DictionaryRow) KeyTo(buf []byte) (int, error) {
copy(buf[0:], DictionaryKeyPrefix)
binary.LittleEndian.PutUint16(buf[1:3], dr.field)
copy(buf[3:], dr.term)
return 3 + len(dr.term), nil
}
func (dr *DictionaryRow) Key() []byte {
buf := make([]byte, dr.KeySize())
n, _ := dr.KeyTo(buf)
return buf[:n]
}
func (dr *DictionaryRow) ValueSize() int {
return dr.value.Size()
}
func (dr *DictionaryRow) ValueTo(buf []byte) (int, error) {
return dr.value.MarshalTo(buf)
}
func (dr *DictionaryRow) Value() []byte {
buf := make([]byte, dr.ValueSize())
n, _ := dr.ValueTo(buf)
return buf[:n]
}
func DictionaryRowKey(field uint16, term []byte) []byte {
buf := make([]byte, 3+len(term))
copy(buf[0:], DictionaryKeyPrefix)
binary.LittleEndian.PutUint16(buf[1:3], field)
copy(buf[3:], term)
return buf
}
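The dictionary key layout above is one prefix byte 'd', a little-endian uint16 field, then the raw term bytes. A runnable sketch of the same encoding using only the standard library, matching the expected keys in the row test below:

package main

import (
	"encoding/binary"
	"fmt"
)

// dictionaryRowKey mirrors DictionaryRowKey above: one 'd' prefix byte,
// a little-endian uint16 field, then the raw term bytes.
func dictionaryRowKey(field uint16, term []byte) []byte {
	buf := make([]byte, 3+len(term))
	buf[0] = 'd'
	binary.LittleEndian.PutUint16(buf[1:3], field)
	copy(buf[3:], term)
	return buf
}

func main() {
	// produces {'d', 3, 0, 'd', 'i', 'c', 't', 'i', 'o', 'n', 'a', 'r', 'y'},
	// the expected key in the dictionary row test
	fmt.Printf("% x\n", dictionaryRowKey(3, []byte("dictionary")))
}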


@@ -1,59 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestDictionaryRows(t *testing.T) {
tests := []struct {
input index.IndexRow
outKey []byte
outVal []byte
}{
{
NewDictionaryRow(0, []byte("test"), 3),
[]byte{DictionaryKeyPrefix[0], 0, 0, 't', 'e', 's', 't'},
[]byte{8, 3},
},
{
NewDictionaryRow(3, []byte("dictionary"), 734),
[]byte{DictionaryKeyPrefix[0], 3, 0, 'd', 'i', 'c', 't', 'i', 'o', 'n', 'a', 'r', 'y'},
[]byte{8, 222, 5},
},
}
// test going from struct to k/v bytes
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
// now test going back from k/v bytes to struct
for i, test := range tests {
row, err := NewDictionaryRowKV(test.outKey, test.outVal)
if err != nil {
t.Errorf("error parsking key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
}
}
}


@@ -1,92 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"fmt"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
)
// the functions in this file are only intended to be used by
// the bleve_dump utility and the debug http handlers
// if your application relies on them, you're doing something wrong
// they may change or be removed at any time
func (f *Firestorm) dumpPrefix(kvreader store.KVReader, rv chan interface{}, prefix []byte) error {
return visitPrefix(kvreader, prefix, func(key, val []byte) (bool, error) {
row, err := parseFromKeyValue(key, val)
if err != nil {
rv <- err
return false, err
}
rv <- row
return true, nil
})
}
func (f *Firestorm) dumpDoc(kvreader store.KVReader, rv chan interface{}, docID []byte) error {
// without a back index we have no choice but to walk the term freq and stored rows
// walk the term freqs
err := visitPrefix(kvreader, TermFreqKeyPrefix, func(key, val []byte) (bool, error) {
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
rv <- err
return false, err
}
if bytes.Compare(tfr.DocID(), docID) == 0 {
rv <- tfr
}
return true, nil
})
if err != nil {
return err
}
// now walk the stored
err = visitPrefix(kvreader, StoredKeyPrefix, func(key, val []byte) (bool, error) {
sr, err := NewStoredRowKV(key, val)
if err != nil {
rv <- err
return false, err
}
if bytes.Compare(sr.DocID(), docID) == 0 {
rv <- sr
}
return true, nil
})
return err
}
func parseFromKeyValue(key, value []byte) (index.IndexRow, error) {
if len(key) > 0 {
switch key[0] {
case VersionKey[0]:
return NewVersionRowV(value)
case FieldKeyPrefix[0]:
return NewFieldRowKV(key, value)
case DictionaryKeyPrefix[0]:
return NewDictionaryRowKV(key, value)
case TermFreqKeyPrefix[0]:
return NewTermFreqRowKV(key, value)
case StoredKeyPrefix[0]:
return NewStoredRowKV(key, value)
case InternalKeyPrefix[0]:
return NewInternalRowKV(key, value)
}
return nil, fmt.Errorf("Unknown field type '%s'", string(key[0]))
}
return nil, fmt.Errorf("Invalid empty key")
}
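parseFromKeyValue above dispatches purely on the first key byte. A minimal sketch of that dispatch; only the 'f' and 'd' prefixes are confirmed by this diff, so the other prefix bytes here are hypothetical:

package main

import "fmt"

// rowKind mirrors the switch in parseFromKeyValue: the first key
// byte alone determines how the row is decoded. The 'v', 't', 's',
// and 'i' values are assumed for illustration.
func rowKind(key []byte) (string, error) {
	if len(key) == 0 {
		return "", fmt.Errorf("invalid empty key")
	}
	switch key[0] {
	case 'v':
		return "version", nil
	case 'f':
		return "field", nil
	case 'd':
		return "dictionary", nil
	case 't':
		return "term frequency", nil
	case 's':
		return "stored", nil
	case 'i':
		return "internal", nil
	}
	return "", fmt.Errorf("unknown row type %q", key[0])
}

func main() {
	for _, k := range [][]byte{{'d', 3, 0}, {'t'}, {'z'}} {
		kind, err := rowKind(k)
		fmt.Println(kind, err)
	}
}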


@@ -1,129 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"time"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
var dictWaitDuration = 5 * time.Second
func TestDump(t *testing.T) {
analysisQueue := index.NewAnalysisQueue(1)
idx, err := NewFirestorm(gtreap.Name, nil, analysisQueue)
if err != nil {
t.Fatal(err)
}
err = idx.Open()
if err != nil {
t.Fatalf("error opening index: %v", err)
}
defer func() {
err := idx.Close()
if err != nil {
t.Fatal(err)
}
}()
var expectedCount uint64
docCount, err := idx.DocCount()
if err != nil {
t.Error(err)
}
if docCount != expectedCount {
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
}
doc := document.NewDocument("1")
doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField))
doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, document.IndexField|document.StoreField))
dateField, err := document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField)
if err != nil {
t.Error(err)
}
doc.AddField(dateField)
err = idx.Update(doc)
if err != nil {
t.Errorf("Error updating index: %v", err)
}
doc = document.NewDocument("2")
doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test2"), document.IndexField|document.StoreField))
doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, document.IndexField|document.StoreField))
dateField, err = document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField)
if err != nil {
t.Error(err)
}
doc.AddField(dateField)
err = idx.Update(doc)
if err != nil {
t.Errorf("Error updating index: %v", err)
}
fieldsCount := 0
fieldsRows := idx.DumpFields()
for range fieldsRows {
fieldsCount++
}
if fieldsCount != 4 { // _id field is automatic
t.Errorf("expected 4 fields, got %d", fieldsCount)
}
// 1 id term
// 1 text term
// 16 numeric terms
// 16 date terms
// 3 stored fields
expectedDocRowCount := int(1 + 1 + (2 * (64 / document.DefaultPrecisionStep)) + 3)
docRowCount := 0
docRows := idx.DumpDoc("1")
for range docRows {
docRowCount++
}
if docRowCount != expectedDocRowCount {
t.Errorf("expected %d rows for document, got %d", expectedDocRowCount, docRowCount)
}
docRowCount = 0
docRows = idx.DumpDoc("2")
for range docRows {
docRowCount++
}
if docRowCount != expectedDocRowCount {
t.Errorf("expected %d rows for document, got %d", expectedDocRowCount, docRowCount)
}
err = idx.(*Firestorm).dictUpdater.waitTasksDone(dictWaitDuration)
if err != nil {
t.Fatal(err)
}
// 1 version
// fieldsCount field rows
// 2 docs * expectedDocRowCount
// 2 text term row count (2 different text terms)
// 16 numeric term row counts (shared for both docs, same numeric value)
// 16 date term row counts (shared for both docs, same date value)
//
expectedAllRowCount := int(1 + fieldsCount + (2 * expectedDocRowCount) + 2 + int((2 * (64 / document.DefaultPrecisionStep))))
allRowCount := 0
allRows := idx.DumpAll()
for range allRows {
allRowCount++
}
if allRowCount != expectedAllRowCount {
t.Errorf("expected %d rows for all, got %d", expectedAllRowCount, allRowCount)
}
}

View File

@ -1,119 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"encoding/binary"
"fmt"
"github.com/blevesearch/bleve/index/store"
"github.com/golang/protobuf/proto"
)
var FieldKeyPrefix = []byte{'f'}
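// fieldIndexOrNewRow returns the index for the named field, plus a new
// FieldRow to persist if the field was not previously known (nil otherwise)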
func (f *Firestorm) fieldIndexOrNewRow(name string) (uint16, *FieldRow) {
index, existed := f.fieldCache.FieldNamed(name, true)
if !existed {
return index, NewFieldRow(uint16(index), name)
}
return index, nil
}
func (f *Firestorm) loadFields(reader store.KVReader) (err error) {
err = visitPrefix(reader, FieldKeyPrefix, func(key, val []byte) (bool, error) {
fieldRow, err := NewFieldRowKV(key, val)
if err != nil {
return false, err
}
f.fieldCache.AddExisting(fieldRow.Name(), fieldRow.Index())
return true, nil
})
return
}
type FieldRow struct {
index uint16
value FieldValue
}
func NewFieldRow(i uint16, name string) *FieldRow {
rv := FieldRow{
index: i,
}
rv.value.Name = proto.String(name)
return &rv
}
func NewFieldRowKV(key, value []byte) (*FieldRow, error) {
rv := FieldRow{}
buf := bytes.NewBuffer(key)
_, err := buf.ReadByte() // type
if err != nil {
return nil, err
}
err = binary.Read(buf, binary.LittleEndian, &rv.index)
if err != nil {
return nil, err
}
err = rv.value.Unmarshal(value)
if err != nil {
return nil, err
}
return &rv, nil
}
func (fr *FieldRow) KeySize() int {
return 3
}
func (fr *FieldRow) KeyTo(buf []byte) (int, error) {
buf[0] = 'f'
binary.LittleEndian.PutUint16(buf[1:3], fr.index)
return 3, nil
}
func (fr *FieldRow) Key() []byte {
buf := make([]byte, fr.KeySize())
n, _ := fr.KeyTo(buf)
return buf[:n]
}
func (fr *FieldRow) ValueSize() int {
return fr.value.Size()
}
func (fr *FieldRow) ValueTo(buf []byte) (int, error) {
return fr.value.MarshalTo(buf)
}
func (fr *FieldRow) Value() []byte {
buf := make([]byte, fr.ValueSize())
n, _ := fr.ValueTo(buf)
return buf[:n]
}
func (fr *FieldRow) Index() uint16 {
return fr.index
}
func (fr *FieldRow) Name() string {
return fr.value.GetName()
}
func (fr *FieldRow) String() string {
return fmt.Sprintf("FieldRow - Field: %d - Name: %s\n", fr.index, fr.Name())
}

View File

@ -1,59 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestFieldRows(t *testing.T) {
tests := []struct {
input index.IndexRow
outKey []byte
outVal []byte
}{
{
NewFieldRow(0, "_id"),
[]byte{FieldKeyPrefix[0], 0, 0},
[]byte{10, 3, '_', 'i', 'd'},
},
{
NewFieldRow(1, "name"),
[]byte{FieldKeyPrefix[0], 1, 0},
[]byte{10, 4, 'n', 'a', 'm', 'e'},
},
}
// test going from struct to k/v bytes
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
// now test going back from k/v bytes to struct
for i, test := range tests {
row, err := NewFieldRowKV(test.outKey, test.outVal)
if err != nil {
t.Errorf("error parsing key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
}
}
}

View File

@ -1,542 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"encoding/json"
"sync/atomic"
"time"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/registry"
)
const Name = "firestorm"
type Firestorm struct {
highDocNumber uint64
docCount uint64
storeName string
storeConfig map[string]interface{}
store store.KVStore
compensator *Compensator
analysisQueue *index.AnalysisQueue
fieldCache *index.FieldCache
garbageCollector *GarbageCollector
lookuper *Lookuper
dictUpdater *DictUpdater
stats *indexStat
}
func NewFirestorm(storeName string, storeConfig map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) {
rv := Firestorm{
storeName: storeName,
storeConfig: storeConfig,
compensator: NewCompensator(),
analysisQueue: analysisQueue,
fieldCache: index.NewFieldCache(),
docCount: 0,
highDocNumber: 0,
stats: &indexStat{},
}
rv.stats.f = &rv
rv.garbageCollector = NewGarbageCollector(&rv)
rv.lookuper = NewLookuper(&rv)
rv.dictUpdater = NewDictUpdater(&rv)
return &rv, nil
}
func (f *Firestorm) Open() (err error) {
// open the kv store
storeConstructor := registry.KVStoreConstructorByName(f.storeName)
if storeConstructor == nil {
err = index.ErrorUnknownStorageType
return
}
// now open the store
f.store, err = storeConstructor(&mergeOperator, f.storeConfig)
if err != nil {
return
}
// start a reader
var kvreader store.KVReader
kvreader, err = f.store.Reader()
if err != nil {
return
}
// assert correct version, and find out if this is new index
var newIndex bool
newIndex, err = f.checkVersion(kvreader)
if err != nil {
return
}
if !newIndex {
// process existing index before opening
err = f.warmup(kvreader)
if err != nil {
return
}
}
err = kvreader.Close()
if err != nil {
return
}
if newIndex {
// prepare a new index
err = f.bootstrap()
if err != nil {
return
}
}
// start the garbage collector
f.garbageCollector.Start()
// start the lookuper
f.lookuper.Start()
// start the dict updater
f.dictUpdater.Start()
return
}
func (f *Firestorm) Close() error {
f.garbageCollector.Stop()
f.lookuper.Stop()
f.dictUpdater.Stop()
return f.store.Close()
}
func (f *Firestorm) DocCount() (uint64, error) {
count := atomic.LoadUint64(&f.docCount)
return count, nil
}
func (f *Firestorm) Update(doc *document.Document) (err error) {
// assign this document a number
doc.Number = atomic.AddUint64(&f.highDocNumber, 1)
// do analysis before acquiring write lock
analysisStart := time.Now()
numPlainTextBytes := doc.NumPlainTextBytes()
resultChan := make(chan *index.AnalysisResult)
aw := index.NewAnalysisWork(f, doc, resultChan)
// put the work on the queue
f.analysisQueue.Queue(aw)
// wait for the result
result := <-resultChan
close(resultChan)
atomic.AddUint64(&f.stats.analysisTime, uint64(time.Since(analysisStart)))
// start a writer for this update
indexStart := time.Now()
var kvwriter store.KVWriter
kvwriter, err = f.store.Writer()
if err != nil {
return
}
defer func() {
if cerr := kvwriter.Close(); err == nil && cerr != nil {
err = cerr
}
}()
var dictionaryDeltas map[string]int64
dictionaryDeltas, err = f.batchRows(kvwriter, [][]index.IndexRow{result.Rows}, nil)
if err != nil {
_ = kvwriter.Close()
atomic.AddUint64(&f.stats.errors, 1)
return
}
f.compensator.Mutate([]byte(doc.ID), doc.Number)
f.lookuper.NotifyBatch([]*InFlightItem{{[]byte(doc.ID), doc.Number}})
f.dictUpdater.NotifyBatch(dictionaryDeltas)
atomic.AddUint64(&f.stats.indexTime, uint64(time.Since(indexStart)))
atomic.AddUint64(&f.stats.numPlainTextBytesIndexed, numPlainTextBytes)
return
}
func (f *Firestorm) Delete(id string) error {
indexStart := time.Now()
f.compensator.Mutate([]byte(id), 0)
f.lookuper.NotifyBatch([]*InFlightItem{{[]byte(id), 0}})
atomic.AddUint64(&f.stats.indexTime, uint64(time.Since(indexStart)))
return nil
}
func (f *Firestorm) batchRows(writer store.KVWriter, rowsOfRows [][]index.IndexRow, deleteKeys [][]byte) (map[string]int64, error) {
dictionaryDeltas := make(map[string]int64)
// count up bytes needed for buffering.
addNum := 0
addKeyBytes := 0
addValBytes := 0
deleteNum := 0
deleteKeyBytes := 0
var kbuf []byte
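// prepareBuf returns a slice of exactly sizeNeeded bytes, reusing buf's
// storage when it is already large enough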
prepareBuf := func(buf []byte, sizeNeeded int) []byte {
if cap(buf) < sizeNeeded {
return make([]byte, sizeNeeded, sizeNeeded+128)
}
return buf[0:sizeNeeded]
}
for _, rows := range rowsOfRows {
for _, row := range rows {
tfr, ok := row.(*TermFreqRow)
if ok {
if tfr.Field() != 0 {
kbuf = prepareBuf(kbuf, tfr.DictionaryRowKeySize())
klen, err := tfr.DictionaryRowKeyTo(kbuf)
if err != nil {
return nil, err
}
dictionaryDeltas[string(kbuf[0:klen])] += 1
}
}
addKeyBytes += row.KeySize()
addValBytes += row.ValueSize()
}
addNum += len(rows)
}
for _, dk := range deleteKeys {
deleteKeyBytes += len(dk)
}
deleteNum += len(deleteKeys)
// prepare batch
totBytes := addKeyBytes + addValBytes + deleteKeyBytes
buf, wb, err := writer.NewBatchEx(store.KVBatchOptions{
TotalBytes: totBytes,
NumSets: addNum,
NumDeletes: deleteNum,
NumMerges: 0,
})
if err != nil {
return nil, err
}
defer func() {
_ = wb.Close()
}()
for _, rows := range rowsOfRows {
for _, row := range rows {
klen, err := row.KeyTo(buf)
if err != nil {
return nil, err
}
vlen, err := row.ValueTo(buf[klen:])
if err != nil {
return nil, err
}
wb.Set(buf[0:klen], buf[klen:klen+vlen])
buf = buf[klen+vlen:]
}
}
for _, dk := range deleteKeys {
dklen := copy(buf, dk)
wb.Delete(buf[0:dklen])
buf = buf[dklen:]
}
// write out the batch
err = writer.ExecuteBatch(wb)
if err != nil {
return nil, err
}
return dictionaryDeltas, nil
}
func (f *Firestorm) Batch(batch *index.Batch) (err error) {
// acquire enough doc numbers for all updates in the batch
// FIXME we actually waste doc numbers because deletes are in the
// same map and we don't need numbers for them
lastDocNumber := atomic.AddUint64(&f.highDocNumber, uint64(len(batch.IndexOps)))
firstDocNumber := lastDocNumber - uint64(len(batch.IndexOps)) + 1
analysisStart := time.Now()
resultChan := make(chan *index.AnalysisResult)
var docsUpdated uint64
var docsDeleted uint64
var numPlainTextBytes uint64
for _, doc := range batch.IndexOps {
if doc != nil {
doc.Number = firstDocNumber // actually assign doc numbers here
firstDocNumber++
docsUpdated++
numPlainTextBytes += doc.NumPlainTextBytes()
} else {
docsDeleted++
}
}
go func() {
for _, doc := range batch.IndexOps {
if doc != nil {
aw := index.NewAnalysisWork(f, doc, resultChan)
// put the work on the queue
f.analysisQueue.Queue(aw)
}
}
}()
// extra 1 capacity for internal updates.
collectRows := make([][]index.IndexRow, 0, docsUpdated+1)
// wait for the result
var itemsDeQueued uint64
for itemsDeQueued < docsUpdated {
result := <-resultChan
collectRows = append(collectRows, result.Rows)
itemsDeQueued++
}
close(resultChan)
atomic.AddUint64(&f.stats.analysisTime, uint64(time.Since(analysisStart)))
var deleteKeys [][]byte
if len(batch.InternalOps) > 0 {
// add the internal ops
updateInternalRows := make([]index.IndexRow, 0, len(batch.InternalOps))
for internalKey, internalValue := range batch.InternalOps {
if internalValue == nil {
// delete
deleteInternalRow := NewInternalRow([]byte(internalKey), nil)
deleteKeys = append(deleteKeys, deleteInternalRow.Key())
} else {
updateInternalRow := NewInternalRow([]byte(internalKey), internalValue)
updateInternalRows = append(updateInternalRows, updateInternalRow)
}
}
collectRows = append(collectRows, updateInternalRows)
}
inflightItems := make([]*InFlightItem, 0, len(batch.IndexOps))
for docID, doc := range batch.IndexOps {
if doc != nil {
inflightItems = append(inflightItems,
&InFlightItem{[]byte(docID), doc.Number})
} else {
inflightItems = append(inflightItems,
&InFlightItem{[]byte(docID), 0})
}
}
indexStart := time.Now()
// start a writer for this batch
var kvwriter store.KVWriter
kvwriter, err = f.store.Writer()
if err != nil {
return
}
var dictionaryDeltas map[string]int64
dictionaryDeltas, err = f.batchRows(kvwriter, collectRows, deleteKeys)
if err != nil {
_ = kvwriter.Close()
atomic.AddUint64(&f.stats.errors, 1)
return
}
f.compensator.MutateBatch(inflightItems, lastDocNumber)
err = kvwriter.Close()
f.lookuper.NotifyBatch(inflightItems)
f.dictUpdater.NotifyBatch(dictionaryDeltas)
atomic.AddUint64(&f.stats.indexTime, uint64(time.Since(indexStart)))
if err == nil {
atomic.AddUint64(&f.stats.updates, docsUpdated)
atomic.AddUint64(&f.stats.deletes, docsDeleted)
atomic.AddUint64(&f.stats.batches, 1)
atomic.AddUint64(&f.stats.numPlainTextBytesIndexed, numPlainTextBytes)
} else {
atomic.AddUint64(&f.stats.errors, 1)
}
return
}
func (f *Firestorm) SetInternal(key, val []byte) (err error) {
internalRow := NewInternalRow(key, val)
var writer store.KVWriter
writer, err = f.store.Writer()
if err != nil {
return
}
defer func() {
if cerr := writer.Close(); err == nil && cerr != nil {
err = cerr
}
}()
wb := writer.NewBatch()
wb.Set(internalRow.Key(), internalRow.Value())
return writer.ExecuteBatch(wb)
}
func (f *Firestorm) DeleteInternal(key []byte) (err error) {
internalRow := NewInternalRow(key, nil)
var writer store.KVWriter
writer, err = f.store.Writer()
if err != nil {
return
}
defer func() {
if cerr := writer.Close(); err == nil && cerr != nil {
err = cerr
}
}()
wb := writer.NewBatch()
wb.Delete(internalRow.Key())
return writer.ExecuteBatch(wb)
}
func (f *Firestorm) DumpAll() chan interface{} {
rv := make(chan interface{})
go func() {
defer close(rv)
// start an isolated reader for use during the dump
kvreader, err := f.store.Reader()
if err != nil {
rv <- err
return
}
defer func() {
cerr := kvreader.Close()
if cerr != nil {
rv <- cerr
}
}()
err = f.dumpPrefix(kvreader, rv, nil)
if err != nil {
rv <- err
return
}
}()
return rv
}
func (f *Firestorm) DumpDoc(docID string) chan interface{} {
rv := make(chan interface{})
go func() {
defer close(rv)
// start an isolated reader for use during the dump
kvreader, err := f.store.Reader()
if err != nil {
rv <- err
return
}
defer func() {
cerr := kvreader.Close()
if cerr != nil {
rv <- cerr
}
}()
err = f.dumpDoc(kvreader, rv, []byte(docID))
if err != nil {
rv <- err
return
}
}()
return rv
}
func (f *Firestorm) DumpFields() chan interface{} {
rv := make(chan interface{})
go func() {
defer close(rv)
// start an isolated reader for use during the dump
kvreader, err := f.store.Reader()
if err != nil {
rv <- err
return
}
defer func() {
cerr := kvreader.Close()
if cerr != nil {
rv <- cerr
}
}()
err = f.dumpPrefix(kvreader, rv, FieldKeyPrefix)
if err != nil {
rv <- err
return
}
}()
return rv
}
func (f *Firestorm) Reader() (index.IndexReader, error) {
return newFirestormReader(f)
}
func (f *Firestorm) Stats() json.Marshaler {
return f.stats
}
func (f *Firestorm) StatsMap() map[string]interface{} {
return f.stats.statsMap()
}
func (f *Firestorm) Wait(timeout time.Duration) error {
return f.dictUpdater.waitTasksDone(timeout)
}
func (f *Firestorm) Advanced() (store.KVStore, error) {
return f.store, nil
}
func init() {
registry.RegisterIndexType(Name, NewFirestorm)
}
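For reference, a minimal usage sketch mirroring this package's tests (the gtreap store and the single-worker analysis queue are simply the choices those tests make; error handling elided):
// sketch: construct, open, and update a firestorm index
aq := index.NewAnalysisQueue(1)
idx, _ := NewFirestorm(gtreap.Name, nil, aq)
_ = idx.Open()
defer func() { _ = idx.Close() }()
doc := document.NewDocument("1")
doc.AddField(document.NewTextField("name", []uint64{}, []byte("test")))
_ = idx.Update(doc)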

View File

@ -1,382 +0,0 @@
# Firestorm
A new indexing scheme for Bleve.
## Background
### Goals
- Avoid a single writer that must pause writing to perform computation
- either by allowing multiple writers, if computation cannot be avoided
- or by having a single writer which can insert rows uninterrupted
- Avoid the need for a back index
- the back index is expensive from a space perspective
- by not writing it out, we should be able to obtain a higher indexing throughput
- consulting the back index is one of the read/think/update cycles mentioned above
### Considerations
- The cost for not maintaining a back index is paid in two places
- Searches may need to read more rows, because old/deleted rows may still exist
- These rows can be excluded, so correctness is not affected, but they will be slower
- Old/Deleted rows need to be cleaned up at some point
- This could either be through an explicit cleanup thread, the job of which is to constantly walk the kvstore looking for rows to delete
- Or, it could be integrated with a KV store's natural merge/compaction process (as in RocksDB)
### Semantics
It is helpful to review the desired semantics of the Index/Delete operations with respect to Term Searches.
#### Index(doc_id, doc)
- Empty Index
- Term Search for "cat" = empty result set
The Index operation should update the index such that after the operation returns, a matching search would return the document.
- Index("a", "small cat")
- Term Search for "cat" = {"a"}
Calling the Index operation again for the same doc_id should update the index such that after the operation returns, only searches matching the newest version return the document.
- Index("a", "big dog")
- Term Search for "cat" = empty result set
- Term Search for "dog" = {"a"}
NOTE:
- At no point during the second index operation would concurrent searches for "cat" and "dog" both return 0 results.
- At no point during the second index operation would concurrent searches for "cat" and "dog" both return 1 result.
#### Delete(doc_id)
- Index("a", "small cat")
- Term Search for "cat" = {"a"}
- Delete("a")
- Term Search for "cat" = empty result set
Once the Delete operation returns, the document should no longer be returned by any search.
## Details
### Terminology
Document ID (`doc_id`)
: The user-specified identifier (utf8 string). This never changes for a document.
Document Number (`doc_number`)
: The Bleve-internal identifier (uint64). These numbers are generated from an atomic counter.
DocIdNumber
: Concatenation of `<doc_id> 0xff <doc_number>`
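For illustration, a DocIdNumber could be assembled as in the sketch below (the byte order chosen for the uint64 is an assumption, not taken from the row encoding):
// sketch: concatenate <doc_id> 0xff <doc_number>
key := append(append([]byte{}, docID...), 0xff)
var num [8]byte
binary.LittleEndian.PutUint64(num[:], docNumber) // byte order assumed
key = append(key, num[:]...)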
### Theory of Operation
By including a new unique identifier as a part of every row generated, the index operation no longer concerns itself with updating existing values or deleting previous values.
Removal of old rows is handled independently by separate threads.
Correct semantics with respect to added/updated/deleted documents are maintained through synchronized in-memory data structures, which compensate for the decoupling of these other operations.
The Dictionary becomes a best-effort data element. In kill -9 scenarios it could become incorrect, but it is believed that this will generally only affect scoring, not correctness, and we can pursue read-repair operations.
### Index State
The following pseudo-structure will be used to explain changes to the internal state. Keep in mind the datatypes shown represent the logical structure required for correct behavior. The actual implementation may be different to achieve performance goals.
indexState {
docCount uint64
fieldCache map[string]uint16
nextDocNumber uint64
docIdNumberMutex sync.RWMutex // for protecting fields below
maxReadDocNumber uint64
inFlightDocIds map[string]uint64
deletedDocIdNumbers [][]byte
}
### Operation
#### Creating New Index
- New KV Batch
- SET VersionRow{version=X}
- SET FieldRow{field_id=0 field_name="_id"}
- Execute Batch
- Index State initialized to:
{
docCount = 0
fieldCache = {
"_id": 0
}
nextDocNumber = 1
maxReadDocNumber = 0
inFlightDocIds = {}
deletedDocIdNumbers = {}
}
- Garbage Collector Thread is started
- Old Doc Number Lookup Thread is started
- Index marked open
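A sketch of that bootstrap batch follows; NewVersionRow and Version are hypothetical names used only for illustration (this package's code shows only the NewVersionRowV parser):
// sketch of the bootstrap batch; error handling elided
wb := kvwriter.NewBatch()
vr := NewVersionRow(Version) // hypothetical constructor and constant
fr := NewFieldRow(0, "_id")
wb.Set(vr.Key(), vr.Value())
wb.Set(fr.Key(), fr.Value())
err := kvwriter.ExecuteBatch(wb)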
#### Opening an Existing Index
- GET VersionRow, assert current version or exit
- ITERATE all FieldRows{}
- ITERATE all TermFrequencyRow{ where field_id = 0 }
- Identify consecutive rows with same doc_id but different doc_number
- Lower document numbers are added to the deletedDocIdNumbers list
- Count all non-duplicate rows, seed the docCount
- Observe highest document number seen, seed nextDocNumber
- Index State initialized to:
{
docCount = <as counted above>
fieldCache = {
"_id": 0
<as scanned above>
}
nextDocNumber = <as scanned above> + 1
maxReadDocNumber = <same as nextDocNumber>
inFlightDocIds = {}
deletedDocIdNumbers = {<as scanned above>}
}
- Garbage Collector Thread is started
- Old Doc Number Lookup Thread is started
- Index marked open
#### Garbage Collector Thread
The role of the Garbage Collector thread is to clean up rows referring to document numbers that are no longer relevant (document was deleted or updated).
Currently, only two types of rows include document numbers:
- Term Frequency Rows
- Stored Rows
The current thought is that the garbage collector thread will use a single iterator to iterate the following key spaces:
- TermFrequencyRow { where field_id > 0}
- StoredRow {all}
For any row referring to a document number on the deletedDocNumbers list, that key will be DELETED.
The garbage collector will track loop iterations or the start key for each deletedDocNumber so that it knows when it has walked a full circle for a given doc number. At that point, the following happen in order:
- docNumber is removed from the deletedDocNumbers list
- DELETE is issued on TermFreqRow{ field_id=0, term=doc_id, doc_id=doc_id_number }
The last thing we do is delete the TermFreqRow for field 0. If anything crashes at any point prior to this, we will again read this record on our next warmup and that doc_id_number will again go through the garbage collection process.
#### Old Doc Number Lookup Thread
The role of the Old Doc Number Lookup thread is to asynchronously look up old document numbers in use for a given document id.
It waits in a select loop reading from a channel, through which it is notified of a doc_id where work is to be done. When a doc_id comes in, the following is performed:
- Acquire indexState.docIdNumberMutex for reading:
- Read maxReadDocNumber
- Find doc_id/doc_number k/v pair in the inFlightDocIds map
- Release indexState.docIdNumberMutex
- Start Iterator at TermFrequency{ field_id=0 term=doc_id}
- Iterator until term != doc_id
All doc_numbers found that are less than maxReadDocNumber and not equal to the doc_number in the inFlightDocIds map are now scheduled for deletion.
- Acquire indexState.docIdNumberMutex for writing:
- add doc numbers to deletedDocIdNumbers
- check if doc_number in inFlightDocIds is still the same
- if so delete it
- if not, it was updated again, so we must leave it
- Release indexState.docIdNumberMutex
Notify Garbage Collector Thread directly of new doc_numbers.
#### Term Dictionary Updater Thread
The role of the Term Dictionary Updater thread is to asynchronously perform best-effort updates to the Term Dictionary. Note the contents of the Term Dictionary only affect scoring, and not correctness of query results.
NOTE: one case where correctness could be affected is if the dictionary is completely missing a term which has non-zero usage. Since the garbage collector thread is continually looking at these rows, its help could be enlisted to detect/repair this situation.
It is notified via a channel of increased term usage (by index ops) and of decreased term usage (by the garbage collector cleaning up old usage).
#### Indexing a Document
- Perform all analysis on the document.
- new_doc_number = indexState.nextDocNumber++
- Create New Batch
- Batch will contain SET operations for:
- any new Fields
- Term Frequency Rows for indexed fields terms
- Stored Rows for stored fields
- Execute Batch
- Acquire indexState.docIdNumberMutex for writing:
- set maxReadDocNumber = new_doc_number
- set inFlightDocIds{ docId = new_doc_number }
- Release indexState.docIdNumberMutex
- Notify the Term Dictionary Updater thread of increased term usage.
- Notify Old Doc Number Lookup Thread of doc_id.
The key property is that a search matching the updated document *SHOULD* return the document once this method returns. If the document was an update, it should return the previous document until this method returns. There should be no period of time where neither document matches.
#### Deleting a Document
- Acquire indexState.docIdNumberMutex for writing:
- set inFlightDocIds{ docId = 0 } // 0 is a doc number we never use, indicating pending deletion of docId
- Release indexState.docIdNumberMutex
- Notify Old Doc Number Lookup Thread of doc_id.
#### Batch Operations
Batch operations look largely like the individual indexing/deleting operations. Two additional optimizations come into play:
- More SET operations in the underlying batch
- Larger aggregated updates can be passed to the Term Dictionary Updater Thread
#### Term Field Iteration
- Acquire indexState.docIdNumberMutex for reading:
- Get copy of: (it is assumed some COW data structure is used, or MVCC is accommodated in some way by the impl)
- maxReadDocNumber
- inFlightDocIds
- deletedDocIdNumbers
- Release indexState.docIdNumberMutex
Term Field Iteration is used by the basic term search. It produces the set of documents (and related info like term vectors) which used the specified term in the specified field.
Iterator starts at key:
```'t' <field id uint16> <term utf8> 0xff```
Iterator ends when the term does not match.
- Any row with doc_number > maxReadDocNumber MUST be ignored.
- Any row with doc_id_number on the deletedDocIdNumber list MUST be ignored.
- Any row with the same doc_id as an entry in the inFlightDocIds map, MUST have the same number.
Any row satisfying the above conditions is a candidate document.
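A sketch of that filter, with the snapshot state simplified to plain Go values (the deletedDocIdNumbers list is reduced to a set of doc numbers for brevity):
// sketch: decide whether a term frequency row is a live candidate,
// applying the three rules above
func isCandidate(docID []byte, docNum uint64, maxReadDocNumber uint64,
inFlightDocIds map[string]uint64, deletedDocNums map[uint64]bool) bool {
if docNum > maxReadDocNumber {
return false // written after this reader's snapshot
}
if deletedDocNums[docNum] {
return false // known old/deleted doc number
}
if n, ok := inFlightDocIds[string(docID)]; ok && n != docNum {
return false // superseded by an in-flight update
}
return true
}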
### Row Encoding
All keys are manually encoded to ensure a precise row ordering.
Internal Row values are opaque byte arrays.
All other values are encoded using protobuf for a balance of efficiency and flexibility. Dictionary and TermFrequency rows are the most likely to take advantage of this flexibility, but other rows are read/written infrequently enough that the flexibility outweighs any overhead.
#### Version
There is a single version row which records which version of the firestorm indexing scheme is in use.
| Key | Value |
|---------|------------|
|```'v'```|```<VersionValue protobuf>```|
message VersionValue {
required uint64 version = 1;
}
#### Field
Field rows record the mapping between numeric field ids and field names
| Key | Value |
|---------|------------|
|```'f' <field id uint16>```|```<FieldValue protobuf>```|
message FieldValue {
required string name = 1;
}
#### Dictionary
Dictionary rows record which terms are used in a particular field. The value can be used to store additional information about the term usage. The value will be encoded using protobuf so that future versions can add data to this structure.
| Key | Value |
|---------|------------|
|```'d' <field id uint16> <term utf8>```|```<DictionaryValue protobuf>```|
message DictionaryValue {
optional uint64 count = 1; // number of documents using this term in this field
}
#### Term Frequency
Term Frequency rows record which documents use a term in a particular field. The value must record how often the term occurs. It may optionally include other details such as a normalization value (precomputed scoring adjustment for the length of the field) and term vectors (where the term occurred within the field). The value will be encoded using protobuf so that future versions can add data to this structure.
| Key | Value |
|---------|------------|
|```'t' <field id uint16> <term utf8> 0xff <doc_id utf8 > 0xff <doc number uint64>```|```<TermFreqValue protobuf>```|
message TermVectorEntry {
optional uint32 field = 1; // field optional if redundant, required for composite fields
optional uint64 pos = 2; // positional offset within the field
optional uint64 start = 3; // start byte offset
optional uint64 end = 4; // end byte offset
repeated uint64 arrayPositions = 5; // array positions
}
message TermFrequencyValue {
required uint64 freq = 1; // frequency of the term occurrence within this field
optional float norm = 2; // normalization factor
repeated TermVectorEntry vectors = 3; // term vectors
}
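Extending the DocIdNumber sketch above, a term frequency key with this layout could be assembled as follows (the little-endian field id matches the field row encoding; the doc number byte order is again an assumption):
// sketch: 't' <field id> <term> 0xff <doc_id> 0xff <doc number>
key := []byte{'t'}
var fid [2]byte
binary.LittleEndian.PutUint16(fid[:], fieldID)
key = append(key, fid[:]...)
key = append(key, term...)
key = append(key, 0xff)
key = append(key, docID...)
key = append(key, 0xff)
var num [8]byte
binary.LittleEndian.PutUint64(num[:], docNum) // byte order assumed
key = append(key, num[:]...)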
#### Stored
Stored rows record the original values used to produce the index. At the row encoding level this is an opaque sequence of bytes.
| Key | Value |
|---------------------------|-------------------------|
|```'s' <doc id utf8> 0xff <doc number uint64> <field id uint16>```|```<StoredValue protobuf>```|
message StoredValue {
optional bytes raw = 1; // raw bytes
}
NOTE: we currently encode stored values as raw bytes; however, we have other proposals in flight to do something better than this. By using protobuf here as well, we can support existing functionality through the raw field, but allow for more strongly typed information in the future.
#### Internal
Internal rows are a reserved keyspace which the layer above can use for anything it wants.
| Key | Value |
|---------------------------|-------------------------|
|```'i' <application key []byte>```|```<application value []byte>```|
### FAQ
1. How do you ensure correct semantics while updating a document in the index?
Let us consider 5 possible states:
a. Document X#1 is in the index, maxReadDocNumber=1, inFlightDocIds{}, deletedDocIdNumbers{}
b. Document X#1 and X#2 are in the index, maxReadDocNumber=1, inFlightDocIds{}, deletedDocIdNumbers{}
c. Document X#1 and X#2 are in the index, maxReadDocNumber=2, inFlightDocIds{X:2}, deletedDocIdNumbers{}
d. Document X#1 and X#2 are in the index, maxReadDocNumber=2, inFlightDocIds{}, deletedDocIdNumbers{X#1}
e. Document X#2 is in the index, maxReadDocNumber=2, inFlightDocIds{}, deletedDocIdNumbers{}
In state a, we have a steady state where one document has been indexed with id X.
In state b, we have executed the batch that writes the new rows corresponding to the new version of X, but we have not yet updated our in-memory compensation data structures. This is OK, because maxReadDocNumber is still 1, so all readers will ignore the new rows we just wrote. This is also OK because we are still inside the Index() method, so there is not yet any expectation to see the updated document.
In state c, we have updated both the maxReadDocNumber to 2 and added X:2 to the inFlightDocIds map. This means that searchers could find rows corresponding to X#1 and X#2. However, they are forced to disregard any row for X where the document number is not 2.
In state d, we have completed the lookup for the old document numbers of X, and found 1. Now deletedDocIdNumbers contains X#1. Now readers that encounter this doc_id_number will ignore it.
In state e, the garbage collector has removed all record of X#1.
The Index method returns after it has transitioned to state c, which maintains the semantics we desire.
2\. Wait, what happens if I kill -9 the process, won't you forget about the deleted documents?
No, our proposal is for a warmup process to walk a subset of the keyspace (TermFreq{ where field_id=0 }). This warmup process will identify all not-yet cleaned up document numbers, and seed the deletedDocIdNumbers state as well as the Garbage Collector Thread.
3\. Wait, but what will happen to the inFlightDocIds in a kill -9 scenario?
It turns out they actually don't matter. That list was just an optimization to get us through the window of time while we hadn't yet looked up the old document numbers for a given document id. But during the warmup phase we still identify all those keys, and they go directly onto the deletedDocIdNumbers list.

File diff suppressed because it is too large

View File

@ -1,31 +0,0 @@
package firestorm;
message VersionValue {
required uint64 version = 1;
}
message FieldValue {
required string name = 1;
}
message DictionaryValue {
optional uint64 count = 1; // number of documents using this term in this field
}
message TermVector {
optional uint32 field = 1; // field optional if redundant, required for composite fields
optional uint64 pos = 2; // positional offset within the field
optional uint64 start = 3; // start byte offset
optional uint64 end = 4; // end byte offset
repeated uint64 arrayPositions = 5; // array positions
}
message TermFreqValue {
required uint64 freq = 1; // frequency of the term occurrence within this field
optional float norm = 2; // normalization factor
repeated TermVector vectors = 3; // term vectors
}
message StoredValue {
optional bytes raw = 1; // raw bytes
}

File diff suppressed because it is too large

View File

@ -1,235 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"math"
"sync"
"time"
)
const DefaultGarbageThreshold = 10
const DefaultMaxDocsPerPass = 1000
var DefaultGarbageSleep = 15 * time.Second
type GarbageCollector struct {
f *Firestorm
garbageThreshold int
garbageSleep time.Duration
maxDocsPerPass int
quit chan struct{}
mutex sync.RWMutex
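// workingSet maps a deleted doc number to its doc id, pending cleanup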
workingSet map[uint64][]byte
closeWait sync.WaitGroup
}
func NewGarbageCollector(f *Firestorm) *GarbageCollector {
rv := GarbageCollector{
f: f,
garbageThreshold: DefaultGarbageThreshold,
garbageSleep: DefaultGarbageSleep,
maxDocsPerPass: DefaultMaxDocsPerPass,
quit: make(chan struct{}),
workingSet: make(map[uint64][]byte),
}
return &rv
}
func (gc *GarbageCollector) Notify(docNum uint64, docId []byte) {
gc.mutex.Lock()
defer gc.mutex.Unlock()
gc.workingSet[docNum] = docId
}
func (gc *GarbageCollector) Start() {
gc.closeWait.Add(1)
go gc.run()
}
func (gc *GarbageCollector) Stop() {
close(gc.quit)
gc.closeWait.Wait()
}
func (gc *GarbageCollector) run() {
tick := time.Tick(gc.garbageSleep)
for {
select {
case <-gc.quit:
logger.Printf("garbage collector asked to quit")
gc.closeWait.Done()
return
case <-tick:
logger.Printf("garbage collector ticked")
garbageSize := gc.f.compensator.GarbageCount()
docSize, err := gc.f.DocCount()
if err != nil {
logger.Printf("garbage collector error getting doc count: %v", err)
continue
}
if docSize == 0 {
continue
}
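// note: integer division, so this is the whole number of garbage docs per live doc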
garbageRatio := int(uint64(garbageSize) / docSize)
if garbageRatio > gc.garbageThreshold {
gc.cleanup()
} else {
logger.Printf("garbage ratio only %d, waiting", garbageRatio)
}
}
}
}
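// NextBatch returns up to n deleted doc numbers from the working set,
// in no particular order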
func (gc *GarbageCollector) NextBatch(n int) []uint64 {
gc.mutex.RLock()
defer gc.mutex.RUnlock()
rv := make([]uint64, 0, n)
i := 0
for k := range gc.workingSet {
rv = append(rv, k)
i++
if i >= n {
break
}
}
return rv
}
func (gc *GarbageCollector) cleanup() {
logger.Printf("garbage collector starting")
// get list of deleted doc numbers to work on this pass
deletedDocNumsList := gc.NextBatch(gc.maxDocsPerPass)
logger.Printf("found %d doc numbers to cleanup", len(deletedDocNumsList))
// put these documents numbers in a map, for faster checking
// and for organized keys to be deleted
deletedDocNums := make(map[uint64][][]byte)
for _, deletedDocNum := range deletedDocNumsList {
deletedDocNums[deletedDocNum] = make([][]byte, 0)
}
reader, err := gc.f.store.Reader()
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
defer func() {
// cleanup returns nothing, so just log any close error
if cerr := reader.Close(); cerr != nil {
logger.Printf("garbage collector error closing reader: %v", cerr)
}
}()
// walk all the term freq rows (where field > 0)
termFreqStart := TermFreqIteratorStart(0, []byte{ByteSeparator})
termFreqEnd := TermFreqIteratorStart(math.MaxUint16, []byte{ByteSeparator})
var tfr TermFreqRow
dictionaryDeltas := make(map[string]int64)
err = visitRange(reader, termFreqStart, termFreqEnd, func(key, val []byte) (bool, error) {
err := tfr.ParseKey(key)
if err != nil {
return false, err
}
docNum := tfr.DocNum()
if docNumKeys, deleted := deletedDocNums[docNum]; deleted {
// this doc number has been deleted, place key into map
deletedDocNums[docNum] = append(docNumKeys, key)
if tfr.Field() != 0 {
drk := tfr.DictionaryRowKey()
dictionaryDeltas[string(drk)] -= 1
}
}
return true, nil
})
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
// walk all the stored rows
var sr StoredRow
err = visitPrefix(reader, StoredKeyPrefix, func(key, val []byte) (bool, error) {
err := sr.ParseKey(key)
if err != nil {
return false, err
}
docNum := sr.DocNum()
if docNumKeys, deleted := deletedDocNums[docNum]; deleted {
// this doc number has been deleted, place key into map
deletedDocNums[docNum] = append(docNumKeys, key)
}
return true, nil
})
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
// now process each doc one at a time
for docNum, docKeys := range deletedDocNums {
// delete keys for a doc number
logger.Printf("deleting keys for %d", docNum)
// open a writer
writer, err := gc.f.store.Writer()
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
// prepare batch
wb := writer.NewBatch()
for _, k := range docKeys {
wb.Delete(k)
}
err = writer.ExecuteBatch(wb)
if err != nil {
_ = writer.Close()
logger.Printf("garbage collector fatal: %v", err)
return
}
logger.Printf("deleted %d keys", len(docKeys))
// remove it from delete keys list
docID := gc.workingSet[docNum]
delete(gc.workingSet, docNum)
gc.f.compensator.GarbageCollect([]uint64{docNum})
// now delete the original marker row (field 0)
tfidrow := NewTermFreqRow(0, nil, docID, docNum, 0, 0, nil)
markerRowKey := tfidrow.Key()
markerBatch := writer.NewBatch()
markerBatch.Delete(markerRowKey)
err = writer.ExecuteBatch(markerBatch)
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
err = writer.Close()
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
}
// updating dictionary in one batch
gc.f.dictUpdater.NotifyBatch(dictionaryDeltas)
logger.Printf("garbage collector finished")
}

View File

@ -1,132 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestGarbageCleanup(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []struct {
row index.IndexRow
garbage bool
}{
// needed for warmup to work
{NewFieldRow(0, IDFieldName), false},
// 3 documents, with 2 older versions
{NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil), true},
{NewTermFreqRow(0, nil, []byte("a"), 2, 0, 0.0, nil), false},
{NewTermFreqRow(0, nil, []byte("b"), 3, 0, 0.0, nil), false},
{NewTermFreqRow(0, nil, []byte("c"), 4, 0, 0.0, nil), true},
{NewTermFreqRow(0, nil, []byte("c"), 5, 0, 0.0, nil), false},
// additional records for these docs which should be removed
{NewTermFreqRow(1, []byte("cat"), []byte("a"), 1, 3, 2.0, nil), true},
{NewTermFreqRow(1, []byte("cat"), []byte("c"), 4, 1, 1.0, nil), true},
{NewStoredRow([]byte("a"), 1, 1, nil, []byte("tcat")), true},
{NewStoredRow([]byte("c"), 4, 1, nil, []byte("tcat")), true},
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.row.Key(), row.row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
// warmup ensures that deletedDocNums is seeded correctly
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
// now invoke garbage collector cleanup manually
f.(*Firestorm).garbageCollector.cleanup()
// assert that garbage rows are gone
reader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
for _, row := range rows {
v, err := reader.Get(row.row.Key())
if err != nil {
t.Fatal(err)
}
if v != nil && row.garbage {
t.Errorf("garbage row not deleted, key: %s", row.row.Key())
}
if v == nil && !row.garbage {
t.Errorf("non-garbage row deleted, key: %s", row.row.Key())
}
}
err = reader.Close()
if err != nil {
t.Fatal(err)
}
// assert that deletedDocsNumbers size is 0
if f.(*Firestorm).compensator.GarbageCount() != 0 {
t.Errorf("expected deletedDocsNumbers size to be 0, got %d", f.(*Firestorm).compensator.GarbageCount())
}
}
func TestGarbageDontPanicOnEmptyDocs(t *testing.T) {
idx, err := NewFirestorm("", nil, index.NewAnalysisQueue(1))
if err != nil {
t.Fatal(err)
}
f := idx.(*Firestorm)
gc := NewGarbageCollector(f)
gc.garbageSleep = 30 * time.Millisecond
gc.Start()
time.Sleep(40 * time.Millisecond)
gc.Stop()
}

View File

@ -1,67 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import "fmt"
var InternalKeyPrefix = []byte{'i'}
type InternalRow struct {
key []byte
val []byte
}
func NewInternalRow(key, val []byte) *InternalRow {
rv := InternalRow{
key: key,
val: val,
}
return &rv
}
func NewInternalRowKV(key, value []byte) (*InternalRow, error) {
rv := InternalRow{}
rv.key = key[1:]
rv.val = value
return &rv, nil
}
func (ir *InternalRow) KeySize() int {
return 1 + len(ir.key)
}
func (ir *InternalRow) KeyTo(buf []byte) (int, error) {
buf[0] = 'i'
copy(buf[1:], ir.key)
return 1 + len(ir.key), nil
}
func (ir *InternalRow) Key() []byte {
buf := make([]byte, ir.KeySize())
n, _ := ir.KeyTo(buf)
return buf[:n]
}
func (ir *InternalRow) ValueSize() int {
return len(ir.val)
}
func (ir *InternalRow) ValueTo(buf []byte) (int, error) {
copy(buf, ir.val)
return len(ir.val), nil
}
func (ir *InternalRow) Value() []byte {
return ir.val
}
func (ir *InternalRow) String() string {
return fmt.Sprintf("InternalRow - Key: %s (% x) Val: %s (% x)", ir.key, ir.key, ir.val, ir.val)
}

View File

@ -1,54 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestInternalRows(t *testing.T) {
tests := []struct {
input index.IndexRow
outKey []byte
outVal []byte
}{
{
NewInternalRow([]byte("key"), []byte("val")),
[]byte{'i', 'k', 'e', 'y'},
[]byte{'v', 'a', 'l'},
},
}
// test going from struct to k/v bytes
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
// now test going back from k/v bytes to struct
for i, test := range tests {
row, err := NewInternalRowKV(test.outKey, test.outVal)
if err != nil {
t.Errorf("error parsing key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
}
}
}

View File

@ -1,146 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"fmt"
"sync"
"sync/atomic"
"time"
)
const channelBufferSize = 1000
type Lookuper struct {
tasksQueued uint64
tasksDone uint64
f *Firestorm
workChan chan []*InFlightItem
quit chan struct{}
closeWait sync.WaitGroup
}
func NewLookuper(f *Firestorm) *Lookuper {
rv := Lookuper{
f: f,
workChan: make(chan []*InFlightItem, channelBufferSize),
quit: make(chan struct{}),
}
return &rv
}
func (l *Lookuper) NotifyBatch(items []*InFlightItem) {
atomic.AddUint64(&l.tasksQueued, 1)
l.workChan <- items
}
func (l *Lookuper) Start() {
l.closeWait.Add(1)
go l.run()
}
func (l *Lookuper) Stop() {
close(l.quit)
l.closeWait.Wait()
}
func (l *Lookuper) run() {
for {
select {
case <-l.quit:
logger.Printf("lookuper asked to quit")
l.closeWait.Done()
return
case items, ok := <-l.workChan:
if !ok {
logger.Printf("lookuper work channel closed unexpectedly, stopping")
return
}
l.lookupItems(items)
}
}
}
func (l *Lookuper) lookupItems(items []*InFlightItem) {
for _, item := range items {
l.lookup(item)
}
atomic.AddUint64(&l.tasksDone, 1)
}
func (l *Lookuper) lookup(item *InFlightItem) {
reader, err := l.f.store.Reader()
if err != nil {
logger.Printf("lookuper fatal: %v", err)
return
}
defer func() {
if cerr := reader.Close(); err == nil && cerr != nil {
err = cerr
}
}()
prefix := TermFreqPrefixFieldTermDocId(0, nil, item.docID)
logger.Printf("lookuper prefix - % x", prefix)
var tfk TermFreqRow
docNums := make(DocNumberList, 0)
err = visitPrefix(reader, prefix, func(key, val []byte) (bool, error) {
logger.Printf("lookuper sees key % x", key)
err := tfk.ParseKey(key)
if err != nil {
return false, err
}
docNum := tfk.DocNum()
docNums = append(docNums, docNum)
return true, nil
})
if err != nil {
logger.Printf("lookuper fatal: %v", err)
return
}
oldDocNums := make(DocNumberList, 0, len(docNums))
for _, docNum := range docNums {
if item.docNum == 0 || docNum < item.docNum {
oldDocNums = append(oldDocNums, docNum)
}
}
logger.Printf("lookup migrating '%s' - %d - oldDocNums: %v", item.docID, item.docNum, oldDocNums)
l.f.compensator.Migrate(item.docID, item.docNum, oldDocNums)
if len(oldDocNums) == 0 && item.docNum != 0 {
// this was an add, not an update
atomic.AddUint64(&l.f.docCount, 1)
} else if len(oldDocNums) > 0 && item.docNum == 0 {
// this was a delete (and it previously existed)
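// adding ^uint64(0) (i.e. -1) atomically decrements the counter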
atomic.AddUint64(&l.f.docCount, ^uint64(0))
}
}
// this is not intended to be used publicly, only for unit tests
// which depend on consistency we no longer provide
func (l *Lookuper) waitTasksDone(d time.Duration) error {
timeout := time.After(d)
tick := time.Tick(100 * time.Millisecond)
for {
select {
// Got a timeout! fail with a timeout error
case <-timeout:
return fmt.Errorf("timeout")
// Got a tick, check whether all queued tasks are done
case <-tick:
queued := atomic.LoadUint64(&l.tasksQueued)
done := atomic.LoadUint64(&l.tasksDone)
if queued == done {
return nil
}
}
}
}

View File

@ -1,83 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestLookups(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []struct {
row index.IndexRow
garbage bool
}{
// needed for warmup to work
{NewFieldRow(0, IDFieldName), false},
// 3 documents, with 2 older versions
{NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil), true},
{NewTermFreqRow(0, nil, []byte("a"), 2, 0, 0.0, nil), false},
{NewTermFreqRow(0, nil, []byte("b"), 3, 0, 0.0, nil), false},
{NewTermFreqRow(0, nil, []byte("c"), 4, 0, 0.0, nil), true},
{NewTermFreqRow(0, nil, []byte("c"), 5, 0, 0.0, nil), false},
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.row.Key(), row.row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
// also see the compensator
if tfr, ok := row.row.(*TermFreqRow); ok {
f.(*Firestorm).compensator.Mutate(tfr.DocID(), tfr.DocNum())
// expect this mutation to be in the in-flight list
val := f.(*Firestorm).compensator.inFlight.Get(&InFlightItem{docID: tfr.DocID()})
if val == nil {
t.Errorf("expected key: % x to be in the inflight list", tfr.DocID())
}
f.(*Firestorm).lookuper.lookup(&InFlightItem{docID: tfr.DocID(), docNum: tfr.DocNum()})
// now expect this mutation to NOT be in the in-flight list
val = f.(*Firestorm).compensator.inFlight.Get(&InFlightItem{docID: tfr.DocID()})
if val != nil {
t.Errorf("expected key: % x to NOT be in the inflight list, got %v", tfr.DocID(), val)
}
}
}
// check that doc count is 3 at the end of this
docCount, err := f.DocCount()
if err != nil {
t.Fatal(err)
}
if docCount != 3 {
t.Errorf("expected doc count 3, got %d", docCount)
}
}

View File

@ -1,71 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"encoding/binary"
)
var mergeOperator firestormMerge
var dictionaryTermIncr []byte
var dictionaryTermDecr []byte
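// the operands are 8-byte little-endian two's-complement deltas (+1 and -1)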
func init() {
dictionaryTermIncr = make([]byte, 8)
binary.LittleEndian.PutUint64(dictionaryTermIncr, uint64(1))
dictionaryTermDecr = make([]byte, 8)
var negOne = int64(-1)
binary.LittleEndian.PutUint64(dictionaryTermDecr, uint64(negOne))
}
type firestormMerge struct{}
func (m *firestormMerge) FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) {
// set up record based on key
dr, err := NewDictionaryRowK(key)
if err != nil {
return nil, false
}
if len(existingValue) > 0 {
// if existing value, parse it
err = dr.parseDictionaryV(existingValue)
if err != nil {
return nil, false
}
}
// now process operands
for _, operand := range operands {
next := int64(binary.LittleEndian.Uint64(operand))
if next < 0 && uint64(-next) > dr.Count() {
// subtracting next from existing would overflow
dr.SetCount(0)
} else if next < 0 {
dr.SetCount(dr.Count() - uint64(-next))
} else {
dr.SetCount(dr.Count() + uint64(next))
}
}
return dr.Value(), true
}
func (m *firestormMerge) PartialMerge(key, leftOperand, rightOperand []byte) ([]byte, bool) {
left := int64(binary.LittleEndian.Uint64(leftOperand))
right := int64(binary.LittleEndian.Uint64(rightOperand))
rv := make([]byte, 8)
binary.LittleEndian.PutUint64(rv, uint64(left+right))
return rv, true
}
func (m *firestormMerge) Name() string {
return "firestormMerge"
}

View File

@ -1,93 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"encoding/binary"
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestPartialMerge(t *testing.T) {
tests := []struct {
in [][]byte
out uint64
}{
{
in: [][]byte{dictionaryTermIncr, dictionaryTermIncr, dictionaryTermIncr, dictionaryTermIncr, dictionaryTermIncr},
out: 5,
},
}
mo := &firestormMerge{}
for _, test := range tests {
curr := test.in[0]
for _, next := range test.in[1:] {
var ok bool
curr, ok = mo.PartialMerge([]byte("key"), curr, next)
if !ok {
t.Errorf("expected partial merge ok")
}
}
actual := decodeCount(curr)
if actual != test.out {
t.Errorf("expected %d, got %d", test.out, actual)
}
}
}
func decodeCount(in []byte) uint64 {
// merge operands are fixed-width little-endian values, not varints
return binary.LittleEndian.Uint64(in)
}
func TestFullMerge(t *testing.T) {
tests := []struct {
existing index.IndexRow
operands [][]byte
result index.IndexRow
success bool
}{
{
existing: NewDictionaryRow(1, []byte("term"), 3),
operands: [][]byte{dictionaryTermIncr, dictionaryTermIncr},
result: NewDictionaryRow(1, []byte("term"), 5),
success: true,
},
{
existing: NewDictionaryRow(1, []byte("term"), 3),
operands: [][]byte{dictionaryTermDecr, dictionaryTermDecr},
result: NewDictionaryRow(1, []byte("term"), 1),
success: true,
},
}
mo := &firestormMerge{}
for _, test := range tests {
existingVal := test.existing.Value()
actual, success := mo.FullMerge([]byte("key"), existingVal, test.operands)
if success != test.success {
t.Errorf("expected error %t, got %t", test.success, success)
}
expectedVal := test.result.Value()
if !reflect.DeepEqual(expectedVal, actual) {
t.Errorf("expected result %v, got %v", expectedVal, actual)
}
}
}

View File

@ -1,220 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"fmt"
"sort"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
)
type firestormReader struct {
f *Firestorm
r store.KVReader
s *Snapshot
docCount uint64
}
func newFirestormReader(f *Firestorm) (index.IndexReader, error) {
r, err := f.store.Reader()
if err != nil {
return nil, fmt.Errorf("error opening store reader: %v", err)
}
docCount, err := f.DocCount()
if err != nil {
return nil, fmt.Errorf("error opening store reader: %v", err)
}
rv := firestormReader{
f: f,
r: r,
s: f.compensator.Snapshot(),
docCount: docCount,
}
return &rv, nil
}
func (r *firestormReader) TermFieldReader(term []byte, field string) (index.TermFieldReader, error) {
fieldIndex, fieldExists := r.f.fieldCache.FieldNamed(field, false)
if fieldExists {
return newFirestormTermFieldReader(r, uint16(fieldIndex), term)
}
return newFirestormTermFieldReader(r, ^uint16(0), []byte{ByteSeparator})
}
func (r *firestormReader) DocIDReader(start, end string) (index.DocIDReader, error) {
return newFirestormDocIDReader(r, start, end)
}
func (r *firestormReader) FieldDict(field string) (index.FieldDict, error) {
return r.FieldDictRange(field, nil, nil)
}
func (r *firestormReader) FieldDictRange(field string, startTerm []byte, endTerm []byte) (index.FieldDict, error) {
fieldIndex, fieldExists := r.f.fieldCache.FieldNamed(field, false)
if fieldExists {
return newFirestormDictionaryReader(r, uint16(fieldIndex), startTerm, endTerm)
}
return newFirestormDictionaryReader(r, ^uint16(0), []byte{ByteSeparator}, []byte{})
}
func (r *firestormReader) FieldDictPrefix(field string, termPrefix []byte) (index.FieldDict, error) {
return r.FieldDictRange(field, termPrefix, incrementBytes(termPrefix))
}
func (r *firestormReader) Document(id string) (*document.Document, error) {
docID := []byte(id)
docNum, err := r.currDocNumForId(docID)
if err != nil {
return nil, err
} else if docNum == 0 {
return nil, nil
}
rv := document.NewDocument(id)
prefix := StoredPrefixDocIDNum(docID, docNum)
err = visitPrefix(r.r, prefix, func(key, val []byte) (bool, error) {
safeVal := make([]byte, len(val))
copy(safeVal, val)
row, err := NewStoredRowKV(key, safeVal)
if err != nil {
return false, err
}
if row != nil {
fieldName := r.f.fieldCache.FieldIndexed(row.field)
field := r.decodeFieldType(fieldName, row.arrayPositions, row.value.GetRaw())
if field != nil {
rv.AddField(field)
}
}
return true, nil
})
if err != nil {
return nil, err
}
return rv, nil
}
func (r *firestormReader) decodeFieldType(name string, pos []uint64, value []byte) document.Field {
switch value[0] {
case 't':
return document.NewTextField(name, pos, value[1:])
case 'n':
return document.NewNumericFieldFromBytes(name, pos, value[1:])
case 'd':
return document.NewDateTimeFieldFromBytes(name, pos, value[1:])
case 'b':
return document.NewBooleanFieldFromBytes(name, pos, value[1:])
}
return nil
}
func (r *firestormReader) currDocNumForId(docID []byte) (uint64, error) {
prefix := TermFreqPrefixFieldTermDocId(0, nil, docID)
docNums := make(DocNumberList, 0)
err := visitPrefix(r.r, prefix, func(key, val []byte) (bool, error) {
tfk, err := NewTermFreqRowKV(key, val)
if err != nil {
return false, err
}
docNum := tfk.DocNum()
docNums = append(docNums, docNum)
return true, nil
})
if err != nil {
return 0, err
}
if len(docNums) > 0 {
sort.Sort(docNums)
return docNums[0], nil
}
return 0, nil
}
func (r *firestormReader) DocumentFieldTerms(id string) (index.FieldTerms, error) {
docID := []byte(id)
docNum, err := r.currDocNumForId(docID)
if err != nil {
return nil, err
} else if docNum == 0 {
return nil, nil
}
rv := make(index.FieldTerms, 0)
// walk the term freqs
err = visitPrefix(r.r, TermFreqKeyPrefix, func(key, val []byte) (bool, error) {
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return false, err
}
if bytes.Equal(tfr.DocID(), docID) && tfr.DocNum() == docNum && tfr.Field() != 0 {
fieldName := r.f.fieldCache.FieldIndexed(uint16(tfr.Field()))
terms, ok := rv[fieldName]
if !ok {
terms = make([]string, 0, 1)
}
terms = append(terms, string(tfr.Term()))
rv[fieldName] = terms
}
return true, nil
})
if err != nil {
return nil, err
}
return rv, nil
}
func (r *firestormReader) Fields() ([]string, error) {
fields := make([]string, 0)
err := visitPrefix(r.r, FieldKeyPrefix, func(key, val []byte) (bool, error) {
fieldRow, err := NewFieldRowKV(key, val)
if err != nil {
return false, err
}
fields = append(fields, fieldRow.Name())
return true, nil
})
if err != nil {
return nil, err
}
return fields, nil
}
func (r *firestormReader) GetInternal(key []byte) ([]byte, error) {
internalRow := NewInternalRow(key, nil)
return r.r.Get(internalRow.Key())
}
func (r *firestormReader) DocCount() uint64 {
return r.docCount
}
func (r *firestormReader) Close() error {
return r.r.Close()
}
func incrementBytes(in []byte) []byte {
rv := make([]byte, len(in))
copy(rv, in)
for i := len(rv) - 1; i >= 0; i-- {
rv[i] = rv[i] + 1
if rv[i] != 0 {
// didn't overflow, so stop
break
}
}
return rv
}
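
incrementBytes computes the exclusive end key for a prefix scan: adding one to the last byte (with carry) yields the smallest byte string greater than every key sharing the prefix, so FieldDictPrefix can scan the range [prefix, incrementBytes(prefix)). One edge case worth noting: an input of all 0xff bytes wraps around to all zeros. A standalone copy to make the behavior concrete:

package main

import "fmt"

// incrementBytes is copied verbatim from the reader above so the
// example runs on its own.
func incrementBytes(in []byte) []byte {
	rv := make([]byte, len(in))
	copy(rv, in)
	for i := len(rv) - 1; i >= 0; i-- {
		rv[i] = rv[i] + 1
		if rv[i] != 0 {
			break // no carry needed, stop
		}
	}
	return rv
}

func main() {
	fmt.Printf("%q\n", incrementBytes([]byte("cat")))        // "cau"
	fmt.Printf("% x\n", incrementBytes([]byte{0x01, 0xff})) // 02 00
}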

View File

@ -1,70 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"fmt"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
)
type firestormDictionaryReader struct {
r *firestormReader
field uint16
start []byte
i store.KVIterator
}
func newFirestormDictionaryReader(r *firestormReader, field uint16, start, end []byte) (*firestormDictionaryReader, error) {
startKey := DictionaryRowKey(field, start)
logger.Printf("start key '%s' - % x", startKey, startKey)
if end == nil {
end = []byte{ByteSeparator}
}
endKey := DictionaryRowKey(field, end)
logger.Printf("end key '%s' - % x", endKey, endKey)
i := r.r.RangeIterator(startKey, endKey)
rv := firestormDictionaryReader{
r: r,
field: field,
start: startKey,
i: i,
}
return &rv, nil
}
func (r *firestormDictionaryReader) Next() (*index.DictEntry, error) {
key, val, valid := r.i.Current()
if !valid {
return nil, nil
}
logger.Printf("see key '%s' - % x", key, key)
currRow, err := NewDictionaryRowKV(key, val)
if err != nil {
return nil, fmt.Errorf("unexpected error parsing dictionary row kv: %v", err)
}
rv := index.DictEntry{
Term: string(currRow.term),
Count: currRow.Count(),
}
// advance the iterator to the next term
r.i.Next()
return &rv, nil
}
func (r *firestormDictionaryReader) Close() error {
if r.i != nil {
return r.i.Close()
}
return nil
}

View File

@ -1,225 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"regexp"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
var testAnalyzer = &analysis.Analyzer{
Tokenizer: regexp_tokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)),
}
func TestDictionaryReader(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
NewFieldRow(1, "name"),
NewFieldRow(2, "desc"),
NewFieldRow(3, "prefix"),
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
kvwriter, err = f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows = []index.IndexRow{
// dictionary entries
NewDictionaryRow(1, []byte("test"), 4),
NewDictionaryRow(2, []byte("eat"), 1),
NewDictionaryRow(2, []byte("more"), 1),
NewDictionaryRow(2, []byte("rice"), 1),
NewDictionaryRow(3, []byte("bob"), 1),
NewDictionaryRow(3, []byte("cat"), 1),
NewDictionaryRow(3, []byte("cats"), 1),
NewDictionaryRow(3, []byte("catting"), 1),
NewDictionaryRow(3, []byte("dog"), 1),
NewDictionaryRow(3, []byte("doggy"), 1),
NewDictionaryRow(3, []byte("zoo"), 1),
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
// now try it
r, err := f.Reader()
if err != nil {
t.Fatal(err)
}
dict, err := r.FieldDict("name")
if err != nil {
t.Errorf("error creating reader: %v", err)
}
termCount := 0
curr, err := dict.Next()
for err == nil && curr != nil {
termCount++
if curr.Term != "test" {
t.Errorf("expected term to be 'test', got '%s'", curr.Term)
}
curr, err = dict.Next()
}
if termCount != 1 {
t.Errorf("expected 1 term for this field, got %d", termCount)
}
err = dict.Close()
if err != nil {
t.Fatal(err)
}
dict, err = r.FieldDict("desc")
if err != nil {
t.Errorf("error creating reader: %v", err)
}
termCount = 0
terms := make([]string, 0)
curr, err = dict.Next()
for err == nil && curr != nil {
termCount++
terms = append(terms, curr.Term)
curr, err = dict.Next()
}
if termCount != 3 {
t.Errorf("expected 3 term for this field, got %d", termCount)
}
expectedTerms := []string{"eat", "more", "rice"}
if !reflect.DeepEqual(expectedTerms, terms) {
t.Errorf("expected %#v, got %#v", expectedTerms, terms)
}
err = dict.Close()
if err != nil {
t.Fatal(err)
}
// test start and end range
dict, err = r.FieldDictRange("desc", []byte("fun"), []byte("nice"))
if err != nil {
t.Errorf("error creating reader: %v", err)
}
termCount = 0
terms = make([]string, 0)
curr, err = dict.Next()
for err == nil && curr != nil {
termCount++
terms = append(terms, curr.Term)
curr, err = dict.Next()
}
if termCount != 1 {
t.Errorf("expected 1 term for this field, got %d", termCount)
}
expectedTerms = []string{"more"}
if !reflect.DeepEqual(expectedTerms, terms) {
t.Errorf("expected %#v, got %#v", expectedTerms, terms)
}
err = dict.Close()
if err != nil {
t.Fatal(err)
}
// test use case for prefix
dict, err = r.FieldDictPrefix("prefix", []byte("cat"))
if err != nil {
t.Errorf("error creating reader: %v", err)
}
termCount = 0
terms = make([]string, 0)
curr, err = dict.Next()
for err == nil && curr != nil {
termCount++
terms = append(terms, curr.Term)
curr, err = dict.Next()
}
if termCount != 3 {
t.Errorf("expected 3 term for this field, got %d", termCount)
}
expectedTerms = []string{"cat", "cats", "catting"}
if !reflect.DeepEqual(expectedTerms, terms) {
t.Errorf("expected %#v, got %#v", expectedTerms, terms)
}
err = dict.Close()
if err != nil {
t.Fatal(err)
}
err = r.Close()
if err != nil {
t.Fatal(err)
}
}

View File

@ -1,120 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"github.com/blevesearch/bleve/index/store"
)
type firestormDocIDReader struct {
r *firestormReader
start []byte
i store.KVIterator
}
func newFirestormDocIDReader(r *firestormReader, start, end string) (*firestormDocIDReader, error) {
startKey := TermFreqIteratorStart(0, nil)
if start != "" {
startKey = TermFreqPrefixFieldTermDocId(0, nil, []byte(start))
}
logger.Printf("start key '%s' - % x", startKey, startKey)
endKey := TermFreqIteratorStart(0, []byte{ByteSeparator})
if end != "" {
endKey = TermFreqPrefixFieldTermDocId(0, nil, []byte(end))
}
logger.Printf("end key '%s' - % x", endKey, endKey)
i := r.r.RangeIterator(startKey, endKey)
rv := firestormDocIDReader{
r: r,
start: startKey,
i: i,
}
return &rv, nil
}
func (r *firestormDocIDReader) Next() (string, error) {
if r.i != nil {
key, val, valid := r.i.Current()
for valid {
logger.Printf("see key: '%s' - % x", key, key)
tfrsByDocNum := make(map[uint64]*TermFreqRow)
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return "", err
}
tfrsByDocNum[tfr.DocNum()] = tfr
// now we have a possible row, but there may be more rows for the same docid
// find these now
err = r.findNextTfrsWithSameDocId(tfrsByDocNum, tfr.DocID())
if err != nil {
return "", err
}
docNumList := make(DocNumberList, 0, len(tfrsByDocNum))
for dn := range tfrsByDocNum {
docNumList = append(docNumList, dn)
}
logger.Printf("docNumList: %v", docNumList)
highestValidDocNum := r.r.s.Which(tfr.docID, docNumList)
if highestValidDocNum == 0 {
// no valid doc number
key, val, valid = r.i.Current()
continue
}
logger.Printf("highest valid: %d", highestValidDocNum)
tfr = tfrsByDocNum[highestValidDocNum]
return string(tfr.DocID()), nil
}
}
return "", nil
}
// FIXME this is identical to the one in reader_terms.go
func (r *firestormDocIDReader) findNextTfrsWithSameDocId(tfrsByDocNum map[uint64]*TermFreqRow, docID []byte) error {
tfrDocIdPrefix := TermFreqPrefixFieldTermDocId(0, nil, docID)
r.i.Next()
key, val, valid := r.i.Current()
for valid && bytes.HasPrefix(key, tfrDocIdPrefix) {
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return err
}
tfrsByDocNum[tfr.DocNum()] = tfr
r.i.Next()
key, val, valid = r.i.Current()
}
return nil
}
func (r *firestormDocIDReader) Advance(docID string) (string, error) {
if r.i != nil {
tfrDocIdPrefix := TermFreqPrefixFieldTermDocId(0, nil, []byte(docID))
r.i.Seek(tfrDocIdPrefix)
return r.Next()
}
return "", nil
}
func (r *firestormDocIDReader) Close() error {
if r.i != nil {
return r.i.Close()
}
return nil
}

View File

@ -1,187 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"math/rand"
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestDocIDReaderSomeGarbage(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
NewFieldRow(1, "desc"),
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("b"), 2, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("c"), 3, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("d"), 4, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("a"), 5, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("b"), 6, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("e"), 7, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("g"), 8, 0, 0.0, nil),
// first version of all docs have cat
NewTermFreqRow(1, []byte("cat"), []byte("a"), 1, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("b"), 2, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("c"), 3, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("d"), 4, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("e"), 7, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("g"), 8, 1, 1.0, nil),
// updated version of a still has cat
NewTermFreqRow(1, []byte("cat"), []byte("a"), 5, 1, 1.0, nil),
// updated version of b does NOT have cat
// c has delete in-flight
// d has delete not-yet-garbage-collected
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
f.(*Firestorm).compensator.inFlight = f.(*Firestorm).compensator.inFlight.Upsert(&InFlightItem{docID: []byte("c"), docNum: 0}, rand.Int())
f.(*Firestorm).compensator.deletedDocNumbers.Set(4)
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
// warmup to load field cache and set maxRead correctly
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
r, err := f.Reader()
if err != nil {
t.Fatal(err)
}
dr, err := r.DocIDReader("", "")
if err != nil {
t.Fatal(err)
}
expectedDocIds := []string{"a", "b", "e", "g"}
foundDocIds := make([]string, 0)
next, err := dr.Next()
for next != "" && err == nil {
foundDocIds = append(foundDocIds, next)
next, err = dr.Next()
}
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(expectedDocIds, foundDocIds) {
t.Errorf("expected: %v, got %v", expectedDocIds, foundDocIds)
}
err = dr.Close()
if err != nil {
t.Fatal(err)
}
// now test with some doc id ranges
dr, err = r.DocIDReader("b", "f")
if err != nil {
t.Fatal(err)
}
expectedDocIds = []string{"b", "e"}
foundDocIds = make([]string, 0)
next, err = dr.Next()
for next != "" && err == nil {
foundDocIds = append(foundDocIds, next)
next, err = dr.Next()
}
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(expectedDocIds, foundDocIds) {
t.Errorf("expected: %v, got %v", expectedDocIds, foundDocIds)
}
err = dr.Close()
if err != nil {
t.Fatal(err)
}
//now try again and Advance to skip over "e"
dr, err = r.DocIDReader("b", "")
if err != nil {
t.Fatal(err)
}
expectedDocIds = []string{"b", "g"}
foundDocIds = make([]string, 0)
next, err = dr.Next()
if err != nil {
t.Fatal(err)
} else {
foundDocIds = append(foundDocIds, next)
}
next, err = dr.Advance("f")
if err != nil {
t.Fatal(err)
} else {
foundDocIds = append(foundDocIds, next)
}
if !reflect.DeepEqual(expectedDocIds, foundDocIds) {
t.Errorf("expected: %v, got %v", expectedDocIds, foundDocIds)
}
err = dr.Close()
if err != nil {
t.Fatal(err)
}
err = r.Close()
if err != nil {
t.Fatal(err)
}
}

View File

@ -1,162 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"sync/atomic"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
)
type firestormTermFieldReader struct {
r *firestormReader
field uint16
term []byte
prefix []byte
count uint64
i store.KVIterator
}
func newFirestormTermFieldReader(r *firestormReader, field uint16, term []byte) (index.TermFieldReader, error) {
dictionaryKey := DictionaryRowKey(field, term)
dictionaryValue, err := r.r.Get(dictionaryKey)
if err != nil {
return nil, err
}
prefix := TermFreqIteratorStart(field, term)
logger.Printf("starting term freq iterator at: '%s' - % x", prefix, prefix)
i := r.r.PrefixIterator(prefix)
rv := firestormTermFieldReader{
r: r,
field: field,
term: term,
prefix: prefix,
i: i,
}
// NOTE: in firestorm the dictionary row is advisory in nature
// it *may* tell us the correct count
// if this record does not exist, it does NOT mean that there is no
// usage; we must scan the term frequencies to be sure
if dictionaryValue != nil {
dictionaryRow, err := NewDictionaryRowKV(dictionaryKey, dictionaryValue)
if err != nil {
return nil, err
}
rv.count = dictionaryRow.Count()
}
atomic.AddUint64(&r.f.stats.termSearchersStarted, uint64(1))
return &rv, nil
}
func (r *firestormTermFieldReader) Next() (*index.TermFieldDoc, error) {
if r.i != nil {
key, val, valid := r.i.Current()
for valid {
logger.Printf("see key: '%s' - % x", key, key)
tfrsByDocNum := make(map[uint64]*TermFreqRow)
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return nil, err
}
tfrsByDocNum[tfr.DocNum()] = tfr
// now we have a possible row, but there may be more rows for the same docid
// find these now
err = r.findNextTfrsWithSameDocId(tfrsByDocNum, tfr.DocID())
if err != nil {
return nil, err
}
docNumList := make(DocNumberList, 0, len(tfrsByDocNum))
for dn := range tfrsByDocNum {
docNumList = append(docNumList, dn)
}
logger.Printf("docNumList: %v", docNumList)
highestValidDocNum := r.r.s.Which(tfr.docID, docNumList)
if highestValidDocNum == 0 {
// no valid doc number
key, val, valid = r.i.Current()
continue
}
logger.Printf("highest valid: %d", highestValidDocNum)
tfr = tfrsByDocNum[highestValidDocNum]
return &index.TermFieldDoc{
ID: string(tfr.DocID()),
Freq: tfr.Freq(),
Norm: float64(tfr.Norm()),
Vectors: r.termFieldVectorsFromTermVectors(tfr.Vectors()),
}, nil
}
}
return nil, nil
}
func (r *firestormTermFieldReader) findNextTfrsWithSameDocId(tfrsByDocNum map[uint64]*TermFreqRow, docID []byte) error {
tfrDocIdPrefix := TermFreqPrefixFieldTermDocId(r.field, r.term, docID)
r.i.Next()
key, val, valid := r.i.Current()
for valid && bytes.HasPrefix(key, tfrDocIdPrefix) {
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return err
}
tfrsByDocNum[tfr.DocNum()] = tfr
r.i.Next()
key, val, valid = r.i.Current()
}
return nil
}
func (r *firestormTermFieldReader) Advance(docID string) (*index.TermFieldDoc, error) {
if r.i != nil {
tfrDocIdPrefix := TermFreqPrefixFieldTermDocId(r.field, r.term, []byte(docID))
r.i.Seek(tfrDocIdPrefix)
return r.Next()
}
return nil, nil
}
func (r *firestormTermFieldReader) Count() uint64 {
return r.count
}
func (r *firestormTermFieldReader) Close() error {
atomic.AddUint64(&r.r.f.stats.termSearchersFinished, uint64(1))
if r.i != nil {
return r.i.Close()
}
return nil
}
func (r *firestormTermFieldReader) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
rv := make([]*index.TermFieldVector, len(in))
for i, tv := range in {
fieldName := r.r.f.fieldCache.FieldIndexed(uint16(tv.GetField()))
tfv := index.TermFieldVector{
Field: fieldName,
ArrayPositions: tv.GetArrayPositions(),
Pos: tv.GetPos(),
Start: tv.GetStart(),
End: tv.GetEnd(),
}
rv[i] = &tfv
}
return rv
}
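
Both firestormTermFieldReader.Next above and firestormDocIDReader.Next (see the FIXME noting the duplicated helper) follow the same compensation pattern: updates append a new doc number instead of rewriting in place, so several TermFreqRows can coexist for one docID until garbage collection catches up. The readers gather every row for a docID, then ask the snapshot which of those doc numbers is the highest still-valid one; an answer of 0 means every version is dead and the docID is skipped. A toy model of that selection (the real Snapshot.Which also consults in-flight documents and maxRead, which this sketch omits):

package main

import (
	"fmt"
	"sort"
)

// highestValid returns the largest doc number not marked deleted,
// or 0 when every version of the document is gone.
func highestValid(candidates []uint64, deleted map[uint64]bool) uint64 {
	sort.Slice(candidates, func(i, j int) bool { return candidates[i] > candidates[j] })
	for _, dn := range candidates {
		if !deleted[dn] {
			return dn
		}
	}
	return 0
}

func main() {
	// document indexed as doc number 1, then updated as doc number 5
	deleted := map[uint64]bool{1: true}
	fmt.Println(highestValid([]uint64{1, 5}, deleted)) // 5
	// once the update is deleted too, the document simply disappears
	deleted[5] = true
	fmt.Println(highestValid([]uint64{1, 5}, deleted)) // 0
}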

View File

@ -1,254 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"math/rand"
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestTermReaderNoGarbage(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
NewFieldRow(1, "desc"),
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("b"), 2, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("c"), 3, 0, 0.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("a"), 1, 3, 2.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("c"), 3, 1, 1.0, nil),
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
// warmup to load field cache and set maxRead correctly
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
r, err := f.Reader()
if err != nil {
t.Fatal(err)
}
tfr, err := r.TermFieldReader([]byte("cat"), "desc")
if err != nil {
t.Fatal(err)
}
expectedDocIds := []string{"a", "c"}
foundDocIds := make([]string, 0)
next, err := tfr.Next()
for next != nil && err == nil {
foundDocIds = append(foundDocIds, next.ID)
next, err = tfr.Next()
}
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(expectedDocIds, foundDocIds) {
t.Errorf("expected: %v, got %v", expectedDocIds, foundDocIds)
}
err = tfr.Close()
if err != nil {
t.Fatal(err)
}
err = r.Close()
if err != nil {
t.Fatal(err)
}
}
func TestTermReaderSomeGarbage(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
NewFieldRow(1, "desc"),
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("b"), 2, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("c"), 3, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("d"), 4, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("a"), 5, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("b"), 6, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("e"), 7, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("g"), 8, 0, 0.0, nil),
// first version of all docs have cat
NewTermFreqRow(1, []byte("cat"), []byte("a"), 1, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("b"), 2, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("c"), 3, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("d"), 4, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("e"), 7, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("g"), 8, 1, 1.0, nil),
// updated version of a still has cat
NewTermFreqRow(1, []byte("cat"), []byte("a"), 5, 1, 1.0, nil),
// updated version of b does NOT have cat
// c has delete in-flight
// d has delete not-yet-garbage-collected
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
f.(*Firestorm).compensator.inFlight = f.(*Firestorm).compensator.inFlight.Upsert(&InFlightItem{docID: []byte("c"), docNum: 0}, rand.Int())
f.(*Firestorm).compensator.deletedDocNumbers.Set(4)
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
// warmup to load field cache and set maxRead correctly
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
r, err := f.Reader()
if err != nil {
t.Fatal(err)
}
tfr, err := r.TermFieldReader([]byte("cat"), "desc")
if err != nil {
t.Fatal(err)
}
expectedDocIds := []string{"a", "e", "g"}
foundDocIds := make([]string, 0)
next, err := tfr.Next()
for next != nil && err == nil {
foundDocIds = append(foundDocIds, next.ID)
next, err = tfr.Next()
}
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(expectedDocIds, foundDocIds) {
t.Errorf("expected: %v, got %v", expectedDocIds, foundDocIds)
}
err = tfr.Close()
if err != nil {
t.Fatal(err)
}
// now try again and Advance to skip over "e"
tfr, err = r.TermFieldReader([]byte("cat"), "desc")
if err != nil {
t.Fatal(err)
}
expectedDocIds = []string{"a", "g"}
foundDocIds = make([]string, 0)
next, err = tfr.Next()
if err != nil {
t.Fatal(err)
} else {
foundDocIds = append(foundDocIds, next.ID)
}
next, err = tfr.Advance("f")
if err != nil {
t.Fatal(err)
} else {
foundDocIds = append(foundDocIds, next.ID)
}
if !reflect.DeepEqual(expectedDocIds, foundDocIds) {
t.Errorf("expected: %v, got %v", expectedDocIds, foundDocIds)
}
err = tfr.Close()
if err != nil {
t.Fatal(err)
}
err = r.Close()
if err != nil {
t.Fatal(err)
}
}

View File

@ -1,51 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"encoding/json"
"sync/atomic"
"github.com/blevesearch/bleve/index/store"
)
type indexStat struct {
updates, deletes, batches, errors uint64
analysisTime, indexTime uint64
termSearchersStarted uint64
termSearchersFinished uint64
numPlainTextBytesIndexed uint64
f *Firestorm
}
func (i *indexStat) statsMap() map[string]interface{} {
m := map[string]interface{}{}
m["updates"] = atomic.LoadUint64(&i.updates)
m["deletes"] = atomic.LoadUint64(&i.deletes)
m["batches"] = atomic.LoadUint64(&i.batches)
m["errors"] = atomic.LoadUint64(&i.errors)
m["analysis_time"] = atomic.LoadUint64(&i.analysisTime)
m["index_time"] = atomic.LoadUint64(&i.indexTime)
m["lookup_queue_len"] = len(i.f.lookuper.workChan)
m["term_searchers_started"] = atomic.LoadUint64(&i.termSearchersStarted)
m["term_searchers_finished"] = atomic.LoadUint64(&i.termSearchersFinished)
m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&i.numPlainTextBytesIndexed)
if o, ok := i.f.store.(store.KVStoreStats); ok {
m["kv"] = o.StatsMap()
}
return m
}
func (i *indexStat) MarshalJSON() ([]byte, error) {
m := i.statsMap()
return json.Marshal(m)
}
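
statsMap reads every counter through atomic.LoadUint64 because the term field readers increment them concurrently (atomic.AddUint64 when searchers start and finish). Mixing plain reads with atomic writes would be a data race, so both sides must go through sync/atomic. A minimal sketch of that pattern, with illustrative names:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type counters struct {
	searchersStarted  uint64
	searchersFinished uint64
}

func main() {
	var c counters
	var wg sync.WaitGroup
	for i := 0; i < 100; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			atomic.AddUint64(&c.searchersStarted, 1)
			atomic.AddUint64(&c.searchersFinished, 1)
		}()
	}
	wg.Wait()
	// reads must also go through atomic to avoid torn or stale values
	fmt.Println(atomic.LoadUint64(&c.searchersStarted),
		atomic.LoadUint64(&c.searchersFinished))
}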

View File

@ -1,164 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"encoding/binary"
"fmt"
)
var StoredKeyPrefix = []byte{'s'}
type StoredRow struct {
docID []byte
docNum uint64
field uint16
arrayPositions []uint64
value StoredValue
}
func NewStoredRow(docID []byte, docNum uint64, field uint16, arrayPositions []uint64, value []byte) *StoredRow {
rv := StoredRow{
docID: docID,
docNum: docNum,
field: field,
arrayPositions: arrayPositions,
}
if len(arrayPositions) < 1 {
rv.arrayPositions = make([]uint64, 0)
}
rv.value.Raw = value // FIXME review do we need to copy?
return &rv
}
func NewStoredRowKV(key, value []byte) (*StoredRow, error) {
rv := StoredRow{}
err := rv.ParseKey(key)
if err != nil {
return nil, err
}
err = rv.value.Unmarshal(value)
if err != nil {
return nil, err
}
return &rv, nil
}
func (sr *StoredRow) ParseKey(key []byte) error {
buf := bytes.NewBuffer(key)
_, err := buf.ReadByte() // type
if err != nil {
return err
}
sr.docID, err = buf.ReadBytes(ByteSeparator)
if err != nil {
return err
}
if len(sr.docID) < 2 { // 1 for min doc id length, 1 for separator
return fmt.Errorf("invalid stored row key, doc ID missing")
}
sr.docID = sr.docID[:len(sr.docID)-1] // trim off separator byte
sr.docNum, err = binary.ReadUvarint(buf)
if err != nil {
return err
}
err = binary.Read(buf, binary.LittleEndian, &sr.field)
if err != nil {
return err
}
sr.arrayPositions = make([]uint64, 0)
nextArrayPos, err := binary.ReadUvarint(buf)
for err == nil {
sr.arrayPositions = append(sr.arrayPositions, nextArrayPos)
nextArrayPos, err = binary.ReadUvarint(buf)
}
return nil
}
func (sr *StoredRow) KeySize() int {
return 1 + len(sr.docID) + 1 + binary.MaxVarintLen64 + 2 + (binary.MaxVarintLen64 * len(sr.arrayPositions))
}
func (sr *StoredRow) KeyTo(buf []byte) (int, error) {
buf[0] = 's'
copy(buf[1:], sr.docID)
buf[1+len(sr.docID)] = ByteSeparator
bytesUsed := 1 + len(sr.docID) + 1
bytesUsed += binary.PutUvarint(buf[bytesUsed:], sr.docNum)
binary.LittleEndian.PutUint16(buf[bytesUsed:], sr.field)
bytesUsed += 2
for _, arrayPosition := range sr.arrayPositions {
varbytes := binary.PutUvarint(buf[bytesUsed:], arrayPosition)
bytesUsed += varbytes
}
return bytesUsed, nil
}
func (sr *StoredRow) Key() []byte {
buf := make([]byte, sr.KeySize())
n, _ := sr.KeyTo(buf)
return buf[:n]
}
func (sr *StoredRow) ValueSize() int {
return sr.value.Size()
}
func (sr *StoredRow) ValueTo(buf []byte) (int, error) {
return sr.value.MarshalTo(buf)
}
func (sr *StoredRow) Value() []byte {
buf := make([]byte, sr.ValueSize())
n, _ := sr.ValueTo(buf)
return buf[:n]
}
func (sr *StoredRow) DocID() []byte {
return sr.docID
}
func (sr *StoredRow) DocNum() uint64 {
return sr.docNum
}
func (sr *StoredRow) String() string {
return fmt.Sprintf("StoredRow - Field: %d\n", sr.field) +
fmt.Sprintf("DocID '%s' - % x\n", sr.docID, sr.docID) +
fmt.Sprintf("DocNum %d\n", sr.docNum) +
fmt.Sprintf("Array Positions:\n%v", sr.arrayPositions) +
fmt.Sprintf("Value: % x", sr.value.GetRaw())
}
func StoredIteratorStartDocID(docID []byte) []byte {
docLen := len(docID)
buf := make([]byte, 1+docLen+1)
buf[0] = 's'
copy(buf[1:], docID)
buf[1+docLen] = ByteSeparator
return buf
}
func StoredPrefixDocIDNum(docID []byte, docNum uint64) []byte {
docLen := len(docID)
buf := make([]byte, 1+docLen+1+binary.MaxVarintLen64)
buf[0] = 's'
copy(buf[1:], docID)
buf[1+docLen] = ByteSeparator
bytesUsed := 1 + docLen + 1
bytesUsed += binary.PutUvarint(buf[bytesUsed:], docNum)
return buf[0:bytesUsed]
}
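
The KeySize/KeyTo/Key pattern above over-allocates binary.MaxVarintLen64 (10) bytes for each varint field and then trims to what was actually written; that is why KeyTo returns a length and Key slices with buf[:n]. A two-line demonstration of the trim:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	buf := make([]byte, binary.MaxVarintLen64) // 10 bytes reserved
	n := binary.PutUvarint(buf, 254)           // doc number 254 needs only 2
	fmt.Println(n, buf[:n])                    // 2 [254 1]
}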

View File

@ -1,59 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestStoredRows(t *testing.T) {
tests := []struct {
input index.IndexRow
outKey []byte
outVal []byte
}{
{
NewStoredRow([]byte("doca"), 5, 7, nil, []byte("tcat")),
[]byte{StoredKeyPrefix[0], 'd', 'o', 'c', 'a', ByteSeparator, 5, 7, 0},
[]byte{10, 4, 't', 'c', 'a', 't'},
},
{
NewStoredRow([]byte("doca"), 5, 7, []uint64{1, 1}, []byte("tcat")),
[]byte{StoredKeyPrefix[0], 'd', 'o', 'c', 'a', ByteSeparator, 5, 7, 0, 1, 1},
[]byte{10, 4, 't', 'c', 'a', 't'},
},
}
// test going from struct to k/v bytes
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
// now test going back from k/v bytes to struct
for i, test := range tests {
row, err := NewStoredRowKV(test.outKey, test.outVal)
if err != nil {
t.Errorf("error parsking key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
}
}
}

View File

@ -1,209 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"encoding/binary"
"fmt"
"github.com/golang/protobuf/proto"
)
var TermFreqKeyPrefix = []byte{'t'}
type TermFreqRow struct {
field uint16
term []byte
docID []byte
docNum uint64
value TermFreqValue
}
func NewTermVector(field uint16, pos uint64, start uint64, end uint64, arrayPos []uint64) *TermVector {
rv := TermVector{}
rv.Field = proto.Uint32(uint32(field))
rv.Pos = proto.Uint64(pos)
rv.Start = proto.Uint64(start)
rv.End = proto.Uint64(end)
if len(arrayPos) > 0 {
rv.ArrayPositions = make([]uint64, len(arrayPos))
for i, apv := range arrayPos {
rv.ArrayPositions[i] = apv
}
}
return &rv
}
func NewTermFreqRow(field uint16, term []byte, docID []byte, docNum uint64, freq uint64, norm float32, termVectors []*TermVector) *TermFreqRow {
return InitTermFreqRow(&TermFreqRow{}, field, term, docID, docNum, freq, norm, termVectors)
}
func InitTermFreqRow(tfr *TermFreqRow, field uint16, term []byte, docID []byte, docNum uint64, freq uint64, norm float32, termVectors []*TermVector) *TermFreqRow {
tfr.field = field
tfr.term = term
tfr.docID = docID
tfr.docNum = docNum
tfr.value.Freq = proto.Uint64(freq)
tfr.value.Norm = proto.Float32(norm)
tfr.value.Vectors = termVectors
return tfr
}
func NewTermFreqRowKV(key, value []byte) (*TermFreqRow, error) {
rv := TermFreqRow{}
err := rv.ParseKey(key)
if err != nil {
return nil, err
}
err = rv.value.Unmarshal(value)
if err != nil {
return nil, err
}
return &rv, nil
}
func (tfr *TermFreqRow) ParseKey(key []byte) error {
keyLen := len(key)
if keyLen < 3 {
return fmt.Errorf("invalid term frequency key, no valid field")
}
tfr.field = binary.LittleEndian.Uint16(key[1:3])
termStartPos := 3
termEndPos := bytes.IndexByte(key[termStartPos:], ByteSeparator)
if termEndPos < 0 {
return fmt.Errorf("invalid term frequency key, no byte separator terminating term")
}
tfr.term = key[termStartPos : termStartPos+termEndPos]
docStartPos := termStartPos + termEndPos + 1
docEndPos := bytes.IndexByte(key[docStartPos:], ByteSeparator)
if docEndPos < 0 {
return fmt.Errorf("invalid term frequency key, no byte separator terminating doc id")
}
tfr.docID = key[docStartPos : docStartPos+docEndPos]
docNumPos := docStartPos + docEndPos + 1
tfr.docNum, _ = binary.Uvarint(key[docNumPos:])
return nil
}
func (tfr *TermFreqRow) KeySize() int {
return 3 + len(tfr.term) + 1 + len(tfr.docID) + 1 + binary.MaxVarintLen64
}
func (tfr *TermFreqRow) KeyTo(buf []byte) (int, error) {
buf[0] = 't'
binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
termLen := copy(buf[3:], tfr.term)
buf[3+termLen] = ByteSeparator
docLen := copy(buf[3+termLen+1:], tfr.docID)
buf[3+termLen+1+docLen] = ByteSeparator
used := binary.PutUvarint(buf[3+termLen+1+docLen+1:], tfr.docNum)
return 3 + termLen + 1 + docLen + 1 + used, nil
}
func (tfr *TermFreqRow) Key() []byte {
buf := make([]byte, tfr.KeySize())
n, _ := tfr.KeyTo(buf)
return buf[:n]
}
func (tfr *TermFreqRow) ValueSize() int {
return tfr.value.Size()
}
func (tfr *TermFreqRow) ValueTo(buf []byte) (int, error) {
return tfr.value.MarshalTo(buf)
}
func (tfr *TermFreqRow) Value() []byte {
buf := make([]byte, tfr.ValueSize())
n, _ := tfr.ValueTo(buf)
return buf[:n]
}
func (tfr *TermFreqRow) String() string {
vectors := ""
for i, v := range tfr.value.GetVectors() {
vectors += fmt.Sprintf("%d - Field: %d Pos: %d Start: %d End: %d ArrayPos: %v - %#v\n", i, v.GetField(), v.GetPos(), v.GetStart(), v.GetEnd(), v.GetArrayPositions(), v.ArrayPositions)
}
return fmt.Sprintf("TermFreqRow - Field: %d\n", tfr.field) +
fmt.Sprintf("Term '%s' - % x\n", tfr.term, tfr.term) +
fmt.Sprintf("DocID '%s' - % x\n", tfr.docID, tfr.docID) +
fmt.Sprintf("DocNum %d\n", tfr.docNum) +
fmt.Sprintf("Freq: %d\n", tfr.value.GetFreq()) +
fmt.Sprintf("Norm: %f\n", tfr.value.GetNorm()) +
fmt.Sprintf("Vectors:\n%s", vectors)
}
func (tfr *TermFreqRow) Field() uint16 {
return tfr.field
}
func (tfr *TermFreqRow) Term() []byte {
return tfr.term
}
func (tfr *TermFreqRow) DocID() []byte {
return tfr.docID
}
func (tfr *TermFreqRow) DocNum() uint64 {
return tfr.docNum
}
func (tfr *TermFreqRow) Norm() float32 {
return tfr.value.GetNorm()
}
func (tfr *TermFreqRow) Freq() uint64 {
return tfr.value.GetFreq()
}
func (tfr *TermFreqRow) Vectors() []*TermVector {
return tfr.value.GetVectors()
}
func (tfr *TermFreqRow) DictionaryRowKeySize() int {
return 3 + len(tfr.term)
}
func (tfr *TermFreqRow) DictionaryRowKeyTo(buf []byte) (int, error) {
dr := NewDictionaryRow(tfr.field, tfr.term, 0)
return dr.KeyTo(buf)
}
func (tfr *TermFreqRow) DictionaryRowKey() []byte {
dr := NewDictionaryRow(tfr.field, tfr.term, 0)
return dr.Key()
}
func TermFreqIteratorStart(field uint16, term []byte) []byte {
buf := make([]byte, 3+len(term)+1)
buf[0] = 't'
binary.LittleEndian.PutUint16(buf[1:3], field)
termLen := copy(buf[3:], term)
buf[3+termLen] = ByteSeparator
return buf
}
func TermFreqPrefixFieldTermDocId(field uint16, term []byte, docID []byte) []byte {
buf := make([]byte, 3+len(term)+1+len(docID)+1)
buf[0] = 't'
binary.LittleEndian.PutUint16(buf[1:3], field)
termLen := copy(buf[3:], term)
buf[3+termLen] = ByteSeparator
docLen := copy(buf[3+termLen+1:], docID)
buf[3+termLen+1+docLen] = ByteSeparator
return buf
}
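
Term frequency keys sort by field, then term, then docID, then doc number, which is exactly what lets TermFreqIteratorStart and TermFreqPrefixFieldTermDocId serve as prefix bounds for the readers earlier in this change. A standalone sketch of the layout, assuming ByteSeparator is the high sentinel 0xff (its use as a range end key elsewhere in the package suggests this, but treat it as an assumption):

package main

import (
	"encoding/binary"
	"fmt"
)

const byteSeparator byte = 0xff // assumed value of ByteSeparator

// termFreqKey mirrors TermFreqRow.KeyTo to make the byte layout visible.
func termFreqKey(field uint16, term, docID []byte, docNum uint64) []byte {
	buf := make([]byte, 3+len(term)+1+len(docID)+1+binary.MaxVarintLen64)
	buf[0] = 't'
	binary.LittleEndian.PutUint16(buf[1:3], field)
	n := 3 + copy(buf[3:], term)
	buf[n] = byteSeparator
	n++
	n += copy(buf[n:], docID)
	buf[n] = byteSeparator
	n++
	n += binary.PutUvarint(buf[n:], docNum)
	return buf[:n]
}

func main() {
	fmt.Printf("% x\n", termFreqKey(2, []byte("cats"), []byte("docb"), 254))
	// 74 | 02 00 | 63 61 74 73 | ff | 64 6f 63 62 | ff | fe 01
	// 't'  field    "cats"      sep    "docb"      sep   uvarint(254)
}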

View File

@ -1,85 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestTermFreqRows(t *testing.T) {
tests := []struct {
input index.IndexRow
outKey []byte
outVal []byte
}{
{
NewTermFreqRow(0, []byte("test"), []byte("doca"), 1, 3, 7.0, nil),
[]byte{TermFreqKeyPrefix[0], 0, 0, 't', 'e', 's', 't', ByteSeparator, 'd', 'o', 'c', 'a', ByteSeparator, 1},
[]byte{8, 3, 21, 0, 0, 224, 64},
},
{
NewTermFreqRow(2, []byte("cats"), []byte("docb"), 254, 3, 7.0, nil),
[]byte{TermFreqKeyPrefix[0], 2, 0, 'c', 'a', 't', 's', ByteSeparator, 'd', 'o', 'c', 'b', ByteSeparator, 254, 1},
[]byte{8, 3, 21, 0, 0, 224, 64},
},
{
NewTermFreqRow(2, []byte("cats"), []byte("docb"), 254, 7, 3.0, nil),
[]byte{TermFreqKeyPrefix[0], 2, 0, 'c', 'a', 't', 's', ByteSeparator, 'd', 'o', 'c', 'b', ByteSeparator, 254, 1},
[]byte{8, 7, 21, 0, 0, 64, 64},
},
{
NewTermFreqRow(2, []byte("cats"), []byte("docb"), 254, 7, 3.0, []*TermVector{NewTermVector(2, 1, 0, 5, nil)}),
[]byte{TermFreqKeyPrefix[0], 2, 0, 'c', 'a', 't', 's', ByteSeparator, 'd', 'o', 'c', 'b', ByteSeparator, 254, 1},
[]byte{8, 7, 21, 0, 0, 64, 64, 26, 8, 8, 2, 16, 1, 24, 0, 32, 5},
},
{
NewTermFreqRow(2, []byte("cats"), []byte("docb"), 254, 7, 3.0, []*TermVector{NewTermVector(2, 1, 0, 5, []uint64{0})}),
[]byte{TermFreqKeyPrefix[0], 2, 0, 'c', 'a', 't', 's', ByteSeparator, 'd', 'o', 'c', 'b', ByteSeparator, 254, 1},
[]byte{8, 7, 21, 0, 0, 64, 64, 26, 10, 8, 2, 16, 1, 24, 0, 32, 5, 40, 0},
},
{
NewTermFreqRow(2, []byte("cats"), []byte("docb"), 254, 7, 3.0, []*TermVector{NewTermVector(2, 1, 0, 5, []uint64{0, 1, 2})}),
[]byte{TermFreqKeyPrefix[0], 2, 0, 'c', 'a', 't', 's', ByteSeparator, 'd', 'o', 'c', 'b', ByteSeparator, 254, 1},
[]byte{8, 7, 21, 0, 0, 64, 64, 26, 14, 8, 2, 16, 1, 24, 0, 32, 5, 40, 0, 40, 1, 40, 2},
},
// test empty term, used by _id field
{
NewTermFreqRow(0, []byte{}, []byte("doca"), 1, 0, 0.0, nil),
[]byte{TermFreqKeyPrefix[0], 0, 0, ByteSeparator, 'd', 'o', 'c', 'a', ByteSeparator, 1},
[]byte{8, 0, 21, 0, 0, 0, 0},
},
}
// test going from struct to k/v bytes
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
// now test going back from k/v bytes to struct
for i, test := range tests {
row, err := NewTermFreqRowKV(test.outKey, test.outVal)
if err != nil {
t.Errorf("error parsking key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected:\n%vgot:\n%vfor %d", test.input, row, i)
}
}
}

View File

@ -1,99 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"io/ioutil"
"log"
"github.com/blevesearch/bleve/index/store"
)
type KVVisitor func(key, val []byte) (bool, error)
func visitPrefix(reader store.KVReader, prefix []byte, visitor KVVisitor) (err error) {
start := prefix
if start == nil {
start = []byte{}
}
it := reader.PrefixIterator(start)
defer func() {
if cerr := it.Close(); err == nil && cerr != nil {
err = cerr
}
}()
k, v, valid := it.Current()
for valid {
var cont bool
cont, err = visitor(k, v)
if err != nil {
// visitor encountered an error, stop and return it
return
}
if !cont {
// visitor has requested we stop iteration, return nil
return
}
it.Next()
k, v, valid = it.Current()
}
return
}
func visitRange(reader store.KVReader, start, end []byte, visitor KVVisitor) (err error) {
it := reader.RangeIterator(start, end)
defer func() {
if cerr := it.Close(); err == nil && cerr != nil {
err = cerr
}
}()
k, v, valid := it.Current()
for valid {
var cont bool
cont, err = visitor(k, v)
if err != nil {
// visitor encountered an error, stop and return it
return
}
if !cont {
// visitor has requested we stop iteration, return nil
return
}
it.Next()
k, v, valid = it.Current()
}
return
}
type DocNumberList []uint64
func (l DocNumberList) Len() int { return len(l) }
func (l DocNumberList) Less(i, j int) bool { return l[i] > l[j] }
func (l DocNumberList) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
// HighestValid returns the highest valid doc number
// from a *SORTED* DocNumberList;
// if no doc number in the list is valid, it returns 0
func (l DocNumberList) HighestValid(maxRead uint64) uint64 {
for _, dn := range l {
if dn <= maxRead {
return dn
}
}
return 0
}
var logger = log.New(ioutil.Discard, "bleve.index.firestorm ", 0)
// SetLog sets the logger used for logging
// by default log messages are sent to ioutil.Discard
func SetLog(l *log.Logger) {
logger = l
}
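
Everything in this package logs through this discarded-by-default logger, including the per-key Printf calls in the readers. To see that output while debugging, a caller could wire it to stderr (the import path is where this package lived prior to this commit):

package main

import (
	"log"
	"os"

	"github.com/blevesearch/bleve/index/firestorm"
)

func main() {
	// route firestorm's debug logging to stderr instead of ioutil.Discard
	firestorm.SetLog(log.New(os.Stderr, "bleve.index.firestorm ", log.LstdFlags))
}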

View File

@ -1,108 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"fmt"
"github.com/golang/protobuf/proto"
"github.com/blevesearch/bleve/index/store"
)
const Version uint64 = 1
var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version)
var VersionKey = []byte{'v'}
type VersionRow struct {
value VersionValue
}
func NewVersionRow(version uint64) *VersionRow {
rv := VersionRow{}
rv.value.Version = proto.Uint64(version)
return &rv
}
func NewVersionRowV(val []byte) (*VersionRow, error) {
rv := VersionRow{}
err := rv.value.Unmarshal(val)
if err != nil {
return nil, err
}
return &rv, nil
}
func (vr *VersionRow) KeySize() int {
return 1
}
func (vr *VersionRow) KeyTo(buf []byte) (int, error) {
buf[0] = VersionKey[0]
return 1, nil
}
func (vr *VersionRow) Key() []byte {
return VersionKey
}
func (vr *VersionRow) ValueSize() int {
return vr.value.Size()
}
func (vr *VersionRow) ValueTo(buf []byte) (int, error) {
return vr.value.MarshalTo(buf)
}
func (vr *VersionRow) Value() []byte {
buf := make([]byte, vr.ValueSize())
n, _ := vr.value.MarshalTo(buf)
return buf[:n]
}
func (vr *VersionRow) Version() uint64 {
return vr.value.GetVersion()
}
func (f *Firestorm) checkVersion(reader store.KVReader) (newIndex bool, err error) {
value, err := reader.Get(VersionKey)
if err != nil {
return
}
if value == nil {
newIndex = true
return
}
var vr *VersionRow
vr, err = NewVersionRowV(value)
if err != nil {
return
}
// assert correct version
if vr.Version() != Version {
err = IncompatibleVersion
return
}
return
}
func (f *Firestorm) storeVersion(writer store.KVWriter) error {
vr := NewVersionRow(Version)
wb := writer.NewBatch()
wb.Set(vr.Key(), vr.Value())
err := writer.ExecuteBatch(wb)
return err
}

View File

@ -1,59 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestVersionRows(t *testing.T) {
tests := []struct {
input index.IndexRow
outKey []byte
outVal []byte
}{
{
NewVersionRow(1),
[]byte{VersionKey[0]},
[]byte{8, 1},
},
{
NewVersionRow(1025),
[]byte{VersionKey[0]},
[]byte{8, 129, 8},
},
}
// test going from struct to k/v bytes
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
// now test going back from k/v bytes to struct
for i, test := range tests {
row, err := NewVersionRowV(test.outVal)
if err != nil {
t.Errorf("error parsking key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
}
}
}

View File

@ -1,129 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"fmt"
"sort"
"sync/atomic"
"github.com/blevesearch/bleve/index/store"
)
const IDFieldName = "_id"
func (f *Firestorm) bootstrap() (err error) {
kvwriter, err := f.store.Writer()
if err != nil {
return
}
defer func() {
if cerr := kvwriter.Close(); err == nil && cerr != nil {
err = cerr
}
}()
// record version
err = f.storeVersion(kvwriter)
if err != nil {
return
}
// define _id field
_, idFieldRow := f.fieldIndexOrNewRow(IDFieldName)
wb := kvwriter.NewBatch()
wb.Set(idFieldRow.Key(), idFieldRow.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
return
}
return
}
func (f *Firestorm) warmup(reader store.KVReader) error {
// load all the existing fields
err := f.loadFields(reader)
if err != nil {
return err
}
// walk the term frequency info for _id
// this allows us to find deleted doc numbers
// and seed the doc count
idField, existed := f.fieldCache.FieldNamed(IDFieldName, false)
if !existed {
return fmt.Errorf("_id field missing, cannot proceed")
}
tfkPrefix := TermFreqIteratorStart(idField, nil)
var tfk TermFreqRow
var lastDocId []byte
lastDocNumbers := make(DocNumberList, 1)
err = visitPrefix(reader, tfkPrefix, func(key, val []byte) (bool, error) {
err := tfk.ParseKey(key)
if err != nil {
return false, err
}
docID := tfk.DocID()
docNum := tfk.DocNum()
if docNum > f.highDocNumber {
f.highDocNumber = docNum
}
if docNum > f.compensator.maxRead {
f.compensator.maxRead = docNum
}
// check for consecutive records
if bytes.Equal(docID, lastDocId) {
lastDocNumbers = append(lastDocNumbers, docNum)
} else {
// new doc id
atomic.AddUint64(&f.docCount, 1)
// last docID had multiple doc numbers
if len(lastDocNumbers) > 1 {
f.addOldDocNumbers(lastDocNumbers, lastDocId)
// reset size to 1
lastDocNumbers = make(DocNumberList, 1)
}
lastDocNumbers = lastDocNumbers[:1]
lastDocNumbers[0] = docNum
lastDocId = make([]byte, len(docID))
copy(lastDocId, docID)
}
return true, nil
})
if err != nil {
return err
}
// be sure to finish up check on final row
if len(lastDocNumbers) > 1 {
f.addOldDocNumbers(lastDocNumbers, lastDocId)
}
return nil
}
func (f *Firestorm) addOldDocNumbers(docNumberList DocNumberList, docID []byte) {
sort.Sort(docNumberList)
// high doc number is OK, rest are deleted
for _, dn := range docNumberList[1:] {
// f.deletedDocNumbers.Add(dn, docID)
f.compensator.deletedDocNumbers.Set(uint(dn))
f.garbageCollector.Notify(dn, docID)
}
}
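
warmup depends on the _id prefix scan returning all rows for one docID consecutively: a docID with more than one doc number means stale versions exist, and every number except the highest is handed to the compensator and garbage collector. addOldDocNumbers implements that rule through the descending sort that DocNumberList defines. The rule in isolation, as a minimal sketch:

package main

import (
	"fmt"
	"sort"
)

// keepHighest mirrors addOldDocNumbers: of all doc numbers recorded for
// a single docID, the highest survives and the rest are garbage.
func keepHighest(nums []uint64) (live uint64, garbage []uint64) {
	sort.Slice(nums, func(i, j int) bool { return nums[i] > nums[j] })
	return nums[0], nums[1:]
}

func main() {
	live, garbage := keepHighest([]uint64{1, 5})
	fmt.Println(live, garbage) // 5 [1]
}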

View File

@ -1,237 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestBootstrap(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open() // open calls bootstrap
if err != nil {
t.Fatal(err)
}
// assert that version is set
reader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
val, err := reader.Get(VersionKey)
if err != nil {
t.Fatal(err)
}
verRow, err := NewVersionRowV(val)
if err != nil {
t.Fatal(err)
}
if verRow.Version() != Version {
t.Errorf("expected version %d, got %d", Version, verRow.Version())
}
// assert that field cache has _id
id, existed := f.(*Firestorm).fieldCache.FieldNamed(IDFieldName, false)
if !existed {
t.Errorf("expect '%s' in field cache", IDFieldName)
}
if id != 0 {
t.Errorf("expected '%s' to have index 0, got %d", IDFieldName, id)
}
// assert that field is recorded in kv store
fRowExpected := NewFieldRow(id, IDFieldName)
fRowKey := fRowExpected.Key()
val, err = reader.Get(fRowKey)
if err != nil {
t.Fatal(err)
}
fRowActual, err := NewFieldRowKV(fRowKey, val)
if err != nil {
t.Fatal(err)
}
if fRowExpected.Name() != fRowActual.Name() {
t.Errorf("expected name '%s' got '%s'", fRowExpected.Name(), fRowActual.Name())
}
// assert that highDocNumber is 0
if f.(*Firestorm).highDocNumber != 0 {
t.Errorf("expected highDocNumber to be 0, got %d", f.(*Firestorm).highDocNumber)
}
}
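// TestWarmupNoGarbage seeds three docs with one doc number each, so
// warmup should count three docs and find no garbage.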
func TestWarmupNoGarbage(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("b"), 2, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("c"), 3, 0, 0.0, nil),
}
expectedCount := uint64(3)
expectedGarbage := uint64(0)
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
// assert that doc count is correct
count, err := f.DocCount()
if err != nil {
t.Fatal(err)
}
if count != expectedCount {
t.Errorf("expected doc count %d, got %d", expectedCount, count)
}
// assert that deleted doc numbers size is 0
if f.(*Firestorm).compensator.GarbageCount() != expectedGarbage {
t.Errorf("expected 0 deleted doc numbers, got %d", f.(*Firestorm).compensator.GarbageCount())
}
// assert that highDocNumber is 3
if f.(*Firestorm).highDocNumber != 3 {
t.Errorf("expected highDocNumber to be 3, got %d", f.(*Firestorm).highDocNumber)
}
}
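// TestWarmupSomeGarbage seeds docs "a" and "c" with two doc numbers
// each; warmup should keep the highest number per docID and mark the
// older ones (1 and 4) deleted.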
func TestWarmupSomeGarbage(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("a"), 2, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("b"), 3, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("c"), 4, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("c"), 5, 0, 0.0, nil),
}
expectedCount := uint64(3)
expectedGarbage := uint64(2)
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
// assert that doc count is correct
count, err := f.DocCount()
if err != nil {
t.Fatal(err)
}
if count != expectedCount {
t.Errorf("expected doc count %d, got %d", expectedCount, count)
}
// assert that deleted doc numbers size is 2
if f.(*Firestorm).compensator.GarbageCount() != expectedGarbage {
t.Errorf("expected %d deleted doc numbers, got %d", expectedGarbage, f.(*Firestorm).compensator.GarbageCount())
}
// assert that doc numbers 1 and 4 are on the deleted list
if !f.(*Firestorm).compensator.deletedDocNumbers.Test(1) {
t.Errorf("expected doc number 1 to be deleted")
}
if !f.(*Firestorm).compensator.deletedDocNumbers.Test(4) {
t.Errorf("expected doc number 4 to be deleted")
}
// assert that highDocNumber is 5
if f.(*Firestorm).highDocNumber != 5 {
t.Errorf("expected highDocNumber to be 5, got %d", f.(*Firestorm).highDocNumber)
}
}
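For reference, the fixture above gives docID "a" doc numbers {1, 2}, "b" {3}, and "c" {4, 5}; warmup keeps the highest number per docID (2, 3, 5), so the doc count is 3, doc numbers 1 and 4 are garbage, and the high doc number is 5.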