From 1b10c286e7e9532bb86a1227916baffdbdba2197 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 20 Aug 2016 14:03:46 -0400 Subject: [PATCH] adding initial attempt at numeric ids in index index scheme is named smolder compiles and unit tests pass, that is all --- index/smolder/analysis.go | 117 ++ index/smolder/analysis_test.go | 101 ++ index/smolder/benchmark_all.sh | 8 + index/smolder/benchmark_boltdb_test.go | 70 ++ index/smolder/benchmark_common_test.go | 144 +++ index/smolder/benchmark_cznicb_test.go | 77 ++ index/smolder/benchmark_forestdb_test.go | 208 ++++ index/smolder/benchmark_goleveldb_test.go | 71 ++ index/smolder/benchmark_gorocksdb_test.go | 73 ++ index/smolder/benchmark_gtreap_test.go | 66 + index/smolder/benchmark_leveldb_test.go | 73 ++ index/smolder/benchmark_null_test.go | 66 + index/smolder/dump.go | 212 ++++ index/smolder/dump_test.go | 130 ++ index/smolder/field_dict.go | 67 + index/smolder/field_dict_test.go | 181 +++ index/smolder/index_reader.go | 227 ++++ index/smolder/reader.go | 297 +++++ index/smolder/reader_test.go | 302 +++++ index/smolder/row.go | 876 +++++++++++++ index/smolder/row_merge.go | 71 ++ index/smolder/row_merge_test.go | 52 + index/smolder/row_test.go | 357 ++++++ index/smolder/smolder.pb.go | 684 +++++++++++ index/smolder/smolder.proto | 14 + index/smolder/smoldering.go | 1107 +++++++++++++++++ index/smolder/smoldering_test.go | 1351 +++++++++++++++++++++ index/smolder/stats.go | 50 + index/smolder/varint.go | 94 ++ 29 files changed, 7146 insertions(+) create mode 100644 index/smolder/analysis.go create mode 100644 index/smolder/analysis_test.go create mode 100755 index/smolder/benchmark_all.sh create mode 100644 index/smolder/benchmark_boltdb_test.go create mode 100644 index/smolder/benchmark_common_test.go create mode 100644 index/smolder/benchmark_cznicb_test.go create mode 100644 index/smolder/benchmark_forestdb_test.go create mode 100644 index/smolder/benchmark_goleveldb_test.go create mode 100644 index/smolder/benchmark_gorocksdb_test.go create mode 100644 index/smolder/benchmark_gtreap_test.go create mode 100644 index/smolder/benchmark_leveldb_test.go create mode 100644 index/smolder/benchmark_null_test.go create mode 100644 index/smolder/dump.go create mode 100644 index/smolder/dump_test.go create mode 100644 index/smolder/field_dict.go create mode 100644 index/smolder/field_dict_test.go create mode 100644 index/smolder/index_reader.go create mode 100644 index/smolder/reader.go create mode 100644 index/smolder/reader_test.go create mode 100644 index/smolder/row.go create mode 100644 index/smolder/row_merge.go create mode 100644 index/smolder/row_merge_test.go create mode 100644 index/smolder/row_test.go create mode 100644 index/smolder/smolder.pb.go create mode 100644 index/smolder/smolder.proto create mode 100644 index/smolder/smoldering.go create mode 100644 index/smolder/smoldering_test.go create mode 100644 index/smolder/stats.go create mode 100644 index/smolder/varint.go diff --git a/index/smolder/analysis.go b/index/smolder/analysis.go new file mode 100644 index 00000000..9d9dd836 --- /dev/null +++ b/index/smolder/analysis.go @@ -0,0 +1,117 @@ +// Copyright (c) 2015 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult { + rv := &index.AnalysisResult{ + DocID: d.ID, + Rows: make([]index.IndexRow, 0, 100), + } + + // track our back index entries + backIndexStoredEntries := make([]*BackIndexStoreEntry, 0) + + // information we collate as we merge fields with same name + fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies) + fieldLengths := make(map[uint16]int) + fieldIncludeTermVectors := make(map[uint16]bool) + fieldNames := make(map[uint16]string) + + // set the value for the _id field + fieldTermFreqs[0] = analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Term: []byte(d.ID), + Position: 1, + Start: 0, + End: len(d.ID), + }, + }, nil, false) + + analyzeField := func(field document.Field, storable bool) { + fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name()) + if newFieldRow != nil { + rv.Rows = append(rv.Rows, newFieldRow) + } + fieldNames[fieldIndex] = field.Name() + + if field.Options().IsIndexed() { + fieldLength, tokenFreqs := field.Analyze() + existingFreqs := fieldTermFreqs[fieldIndex] + if existingFreqs == nil { + fieldTermFreqs[fieldIndex] = tokenFreqs + } else { + existingFreqs.MergeAll(field.Name(), tokenFreqs) + fieldTermFreqs[fieldIndex] = existingFreqs + } + fieldLengths[fieldIndex] += fieldLength + fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors() + } + + if storable && field.Options().IsStored() { + rv.Rows, backIndexStoredEntries = udc.storeField(d.Number, field, fieldIndex, rv.Rows, backIndexStoredEntries) + } + } + + // walk all the fields, record stored fields now + // place information about indexed fields into map + // this collates information across fields with + // same names (arrays) + for _, field := range d.Fields { + analyzeField(field, true) + } + + if len(d.CompositeFields) > 0 { + for fieldIndex, tokenFreqs := range fieldTermFreqs { + if fieldIndex == 0 { + // dont add id to any composite field + continue + } + // see if any of the composite fields need this + for _, compositeField := range d.CompositeFields { + compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs) + } + } + + for _, compositeField := range d.CompositeFields { + analyzeField(compositeField, false) + } + } + + rowsCapNeeded := len(rv.Rows) + 1 + for _, tokenFreqs := range fieldTermFreqs { + rowsCapNeeded += len(tokenFreqs) + } + + rv.Rows = append(make([]index.IndexRow, 0, rowsCapNeeded), rv.Rows...) 
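+
+	// the back index entries collected below power updates and deletes:
+	// a back index row maps a document number to every term entry and
+	// stored entry written for it, so old rows can be located and
+	// removed without re-analyzing the document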
+	backIndexTermEntries := make([]*BackIndexTermEntry, 0, rowsCapNeeded)
+
+	// walk through the collated information and process
+	// once for each indexed field (unique name)
+	for fieldIndex, tokenFreqs := range fieldTermFreqs {
+		fieldLength := fieldLengths[fieldIndex]
+		includeTermVectors := fieldIncludeTermVectors[fieldIndex]
+
+		// encode this field
+		rv.Rows, backIndexTermEntries = udc.indexField(d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermEntries)
+	}
+
+	// build the back index row
+	backIndexRow := NewBackIndexRow(d.Number, backIndexTermEntries, backIndexStoredEntries)
+	rv.Rows = append(rv.Rows, backIndexRow)
+
+	return rv
+}
diff --git a/index/smolder/analysis_test.go b/index/smolder/analysis_test.go
new file mode 100644
index 00000000..f16083ac
--- /dev/null
+++ b/index/smolder/analysis_test.go
@@ -0,0 +1,101 @@
+package smolder
+
+import (
+	"testing"
+
+	"github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
+	"github.com/blevesearch/bleve/document"
+	"github.com/blevesearch/bleve/index"
+	"github.com/blevesearch/bleve/index/store/null"
+	"github.com/blevesearch/bleve/registry"
+)
+
+func TestAnalysisBug328(t *testing.T) {
+	cache := registry.NewCache()
+	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewSmolderingCouch(null.Name, nil, analysisQueue)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	d := document.NewDocument("1")
+	f := document.NewTextFieldCustom("title", nil, []byte("bleve"), document.IndexField|document.IncludeTermVectors, analyzer)
+	d.AddField(f)
+	f = document.NewTextFieldCustom("body", nil, []byte("bleve"), document.IndexField|document.IncludeTermVectors, analyzer)
+	d.AddField(f)
+	cf := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, []string{}, document.IndexField|document.IncludeTermVectors)
+	d.AddField(cf)
+
+	rv := idx.Analyze(d)
+	fieldIndexes := make(map[uint16]string)
+	for _, row := range rv.Rows {
+		if row, ok := row.(*FieldRow); ok {
+			fieldIndexes[row.index] = row.name
+		}
+		if row, ok := row.(*TermFrequencyRow); ok && string(row.term) == "bleve" {
+			for _, vec := range row.vectors {
+				if vec.field != row.field {
+					if fieldIndexes[row.field] != "_all" {
+						t.Errorf("row named %s field %d - vector field %d", fieldIndexes[row.field], row.field, vec.field)
+					}
+				}
+			}
+		}
+	}
+}
+
+func BenchmarkAnalyze(b *testing.B) {
+
+	cache := registry.NewCache()
+	analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewSmolderingCouch(null.Name, nil, analysisQueue)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	d := document.NewDocument("1")
+	f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
+	d.AddField(f)
+
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		rv := idx.Analyze(d)
+		if len(rv.Rows) < 92 || len(rv.Rows) > 93 {
+			b.Fatalf("expected 92-93 rows, got %d", len(rv.Rows))
+		}
+	}
+}
+
+var bleveWikiArticle1K = []byte(`Boiling liquid expanding vapor explosion
+From Wikipedia, the free encyclopedia
+See also: Boiler explosion and Steam explosion
+
+Flames subsequent to a flammable liquid BLEVE from a tanker. BLEVEs do not necessarily involve fire.
+
+This article's tone or style may not reflect the encyclopedic tone used on Wikipedia. See Wikipedia's guide to writing better articles for suggestions.
(July 2013) +A boiling liquid expanding vapor explosion (BLEVE, /ˈblɛviː/ blev-ee) is an explosion caused by the rupture of a vessel containing a pressurized liquid above its boiling point.[1] +Contents [hide] +1 Mechanism +1.1 Water example +1.2 BLEVEs without chemical reactions +2 Fires +3 Incidents +4 Safety measures +5 See also +6 References +7 External links +Mechanism[edit] + +This section needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. (July 2013) +There are three characteristics of liquids which are relevant to the discussion of a BLEVE:`) diff --git a/index/smolder/benchmark_all.sh b/index/smolder/benchmark_all.sh new file mode 100755 index 00000000..079fef18 --- /dev/null +++ b/index/smolder/benchmark_all.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +BENCHMARKS=`grep "func Benchmark" *_test.go | sed 's/.*func //' | sed s/\(.*{//` + +for BENCHMARK in $BENCHMARKS +do + go test -v -run=xxx -bench=^$BENCHMARK$ -benchtime=10s -tags 'forestdb leveldb' | grep -v ok | grep -v PASS +done diff --git a/index/smolder/benchmark_boltdb_test.go b/index/smolder/benchmark_boltdb_test.go new file mode 100644 index 00000000..6559b961 --- /dev/null +++ b/index/smolder/benchmark_boltdb_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
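+
+// These benchmarks exercise CommonBenchmarkIndex and CommonBenchmarkIndexBatch
+// (benchmark_common_test.go) against the boltdb store, varying the number of
+// analysis workers (1, 2, 4) and the batch size (10, 100, 1000).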
+
+package smolder
+
+import (
+	"testing"
+
+	"github.com/blevesearch/bleve/index/store/boltdb"
+)
+
+var boltTestConfig = map[string]interface{}{
+	"path": "test",
+}
+
+func BenchmarkBoltDBIndexing1Workers(b *testing.B) {
+	CommonBenchmarkIndex(b, boltdb.Name, boltTestConfig, DestroyTest, 1)
+}
+
+func BenchmarkBoltDBIndexing2Workers(b *testing.B) {
+	CommonBenchmarkIndex(b, boltdb.Name, boltTestConfig, DestroyTest, 2)
+}
+
+func BenchmarkBoltDBIndexing4Workers(b *testing.B) {
+	CommonBenchmarkIndex(b, boltdb.Name, boltTestConfig, DestroyTest, 4)
+}
+
+// batches
+
+func BenchmarkBoltDBIndexing1Workers10Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 1, 10)
+}
+
+func BenchmarkBoltDBIndexing2Workers10Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 2, 10)
+}
+
+func BenchmarkBoltDBIndexing4Workers10Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 4, 10)
+}
+
+func BenchmarkBoltDBIndexing1Workers100Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 1, 100)
+}
+
+func BenchmarkBoltDBIndexing2Workers100Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 2, 100)
+}
+
+func BenchmarkBoltDBIndexing4Workers100Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 4, 100)
+}
+
+func BenchmarkBoltDBIndexing1Workers1000Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 1, 1000)
+}
+
+func BenchmarkBoltDBIndexing2Workers1000Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 2, 1000)
+}
+
+func BenchmarkBoltDBIndexing4Workers1000Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 4, 1000)
+}
diff --git a/index/smolder/benchmark_common_test.go b/index/smolder/benchmark_common_test.go
new file mode 100644
index 00000000..48737db5
--- /dev/null
+++ b/index/smolder/benchmark_common_test.go
@@ -0,0 +1,144 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+
+package smolder
+
+import (
+	"os"
+	"strconv"
+	"testing"
+
+	_ "github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
+	"github.com/blevesearch/bleve/document"
+	"github.com/blevesearch/bleve/index"
+	"github.com/blevesearch/bleve/registry"
+)
+
+var benchmarkDocBodies = []string{
+	"A boiling liquid expanding vapor explosion (BLEVE, /ˈblɛviː/ blev-ee) is an explosion caused by the rupture of a vessel containing a pressurized liquid above its boiling point.",
+	"A boiler explosion is a catastrophic failure of a boiler. As seen today, boiler explosions are of two kinds. One kind is a failure of the pressure parts of the steam and water sides. There can be many different causes, such as failure of the safety valve, corrosion of critical parts of the boiler, or low water level.
Corrosion along the edges of lap joints was a common cause of early boiler explosions.", + "A boiler is a closed vessel in which water or other fluid is heated. The fluid does not necessarily boil. (In North America the term \"furnace\" is normally used if the purpose is not actually to boil the fluid.) The heated or vaporized fluid exits the boiler for use in various processes or heating applications,[1][2] including central heating, boiler-based power generation, cooking, and sanitation.", + "A pressure vessel is a closed container designed to hold gases or liquids at a pressure substantially different from the ambient pressure.", + "Pressure (symbol: p or P) is the ratio of force to the area over which that force is distributed.", + "Liquid is one of the four fundamental states of matter (the others being solid, gas, and plasma), and is the only state with a definite volume but no fixed shape.", + "The boiling point of a substance is the temperature at which the vapor pressure of the liquid equals the pressure surrounding the liquid[1][2] and the liquid changes into a vapor.", + "Vapor pressure or equilibrium vapor pressure is defined as the pressure exerted by a vapor in thermodynamic equilibrium with its condensed phases (solid or liquid) at a given temperature in a closed system.", + "Industrial gases are a group of gases that are specifically manufactured for use in a wide range of industries, which include oil and gas, petrochemicals, chemicals, power, mining, steelmaking, metals, environmental protection, medicine, pharmaceuticals, biotechnology, food, water, fertilizers, nuclear power, electronics and aerospace.", + "The expansion ratio of a liquefied and cryogenic substance is the volume of a given amount of that substance in liquid form compared to the volume of the same amount of substance in gaseous form, at room temperature and normal atmospheric pressure.", +} + +type KVStoreDestroy func() error + +func DestroyTest() error { + return os.RemoveAll("test") +} + +func CommonBenchmarkIndex(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers int) { + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed("standard") + if err != nil { + b.Fatal(err) + } + + indexDocument := document.NewDocument(""). 
+ AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[0]), analyzer)) + + b.ResetTimer() + b.StopTimer() + for i := 0; i < b.N; i++ { + analysisQueue := index.NewAnalysisQueue(analysisWorkers) + idx, err := NewSmolderingCouch(storeName, storeConfig, analysisQueue) + if err != nil { + b.Fatal(err) + } + + err = idx.Open() + if err != nil { + b.Fatal(err) + } + indexDocument.ID = strconv.Itoa(i) + // just time the indexing portion + b.StartTimer() + err = idx.Update(indexDocument) + if err != nil { + b.Fatal(err) + } + b.StopTimer() + err = idx.Close() + if err != nil { + b.Fatal(err) + } + err = destroy() + if err != nil { + b.Fatal(err) + } + analysisQueue.Close() + } +} + +func CommonBenchmarkIndexBatch(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers, batchSize int) { + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed("standard") + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + b.StopTimer() + for i := 0; i < b.N; i++ { + + analysisQueue := index.NewAnalysisQueue(analysisWorkers) + idx, err := NewSmolderingCouch(storeName, storeConfig, analysisQueue) + if err != nil { + b.Fatal(err) + } + + err = idx.Open() + if err != nil { + b.Fatal(err) + } + + b.StartTimer() + batch := index.NewBatch() + for j := 0; j < 1000; j++ { + if j%batchSize == 0 { + if len(batch.IndexOps) > 0 { + err := idx.Batch(batch) + if err != nil { + b.Fatal(err) + } + } + batch = index.NewBatch() + } + indexDocument := document.NewDocument(""). + AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[j%10]), analyzer)) + indexDocument.ID = strconv.Itoa(i) + "-" + strconv.Itoa(j) + batch.Update(indexDocument) + } + // close last batch + if len(batch.IndexOps) > 0 { + err := idx.Batch(batch) + if err != nil { + b.Fatal(err) + } + } + b.StopTimer() + err = idx.Close() + if err != nil { + b.Fatal(err) + } + err = destroy() + if err != nil { + b.Fatal(err) + } + analysisQueue.Close() + } +} diff --git a/index/smolder/benchmark_cznicb_test.go b/index/smolder/benchmark_cznicb_test.go new file mode 100644 index 00000000..f39eb8d0 --- /dev/null +++ b/index/smolder/benchmark_cznicb_test.go @@ -0,0 +1,77 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
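+
+// These benchmarks follow the storeName/storeConfig convention used by the
+// other benchmark files; the cznicb store is assumed to register itself
+// under cznicb.Name and to need no configuration.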
+
+// +build cznicb
+
+package smolder
+
+import (
+	"testing"
+
+	"github.com/blevesearch/blevex/cznicb"
+)
+
+// the cznicb store is purely in-memory, so there is
+// nothing on disk to destroy between runs
func DestroyCznicB() error {
+	return nil
+}
+
+func BenchmarkCznicBIndexing1Workers(b *testing.B) {
+	CommonBenchmarkIndex(b, cznicb.Name, nil, DestroyCznicB, 1)
+}
+
+func BenchmarkCznicBIndexing2Workers(b *testing.B) {
+	CommonBenchmarkIndex(b, cznicb.Name, nil, DestroyCznicB, 2)
+}
+
+func BenchmarkCznicBIndexing4Workers(b *testing.B) {
+	CommonBenchmarkIndex(b, cznicb.Name, nil, DestroyCznicB, 4)
+}
+
+// batches
+
+func BenchmarkCznicBIndexing1Workers10Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 1, 10)
+}
+
+func BenchmarkCznicBIndexing2Workers10Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 2, 10)
+}
+
+func BenchmarkCznicBIndexing4Workers10Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 4, 10)
+}
+
+func BenchmarkCznicBIndexing1Workers100Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 1, 100)
+}
+
+func BenchmarkCznicBIndexing2Workers100Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 2, 100)
+}
+
+func BenchmarkCznicBIndexing4Workers100Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 4, 100)
+}
+
+func BenchmarkCznicBIndexing1Workers1000Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 1, 1000)
+}
+
+func BenchmarkCznicBIndexing2Workers1000Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 2, 1000)
+}
+
+func BenchmarkCznicBIndexing4Workers1000Batch(b *testing.B) {
+	CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 4, 1000)
+}
diff --git a/index/smolder/benchmark_forestdb_test.go b/index/smolder/benchmark_forestdb_test.go
new file mode 100644
index 00000000..9104d02e
--- /dev/null
+++ b/index/smolder/benchmark_forestdb_test.go
@@ -0,0 +1,208 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
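+
+// The forestdb store keeps real files on disk, so each benchmark creates a
+// testdir directory up front, and DestroyForestDB both removes and re-makes
+// it so the store always has a directory to open on the next run.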
+ +// +build forestdb + +package smolder + +import ( + "os" + "testing" + + "github.com/blevesearch/blevex/forestdb" +) + +var forestDBTestOption = map[string]interface{}{ + "path": "testdir/test", + "create_if_missing": true, +} + +// internally used to reset, so we also +// re-make the testdir +func DestroyForestDB() error { + err := os.RemoveAll("testdir") + if err != nil { + return err + } + err = os.MkdirAll("testdir", 0700) + if err != nil { + return err + } + return nil +} + +func BenchmarkForestDBIndexing1Workers(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndex(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 1) +} + +func BenchmarkForestDBIndexing2Workers(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndex(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 2) +} + +func BenchmarkForestDBIndexing4Workers(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndex(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 4) +} + +// batches + +func BenchmarkForestDBIndexing1Workers10Batch(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndexBatch(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 1, 10) +} + +func BenchmarkForestDBIndexing2Workers10Batch(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndexBatch(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 2, 10) +} + +func BenchmarkForestDBIndexing4Workers10Batch(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndexBatch(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 4, 10) +} + +func BenchmarkForestDBIndexing1Workers100Batch(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndexBatch(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 1, 100) +} + +func BenchmarkForestDBIndexing2Workers100Batch(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndexBatch(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 2, 100) +} + +func BenchmarkForestDBIndexing4Workers100Batch(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndexBatch(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 4, 100) +} + +func BenchmarkForestDBIndexing1Workers1000Batch(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + 
err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndexBatch(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 1, 1000) +} + +func BenchmarkForestDBIndexing2Workers1000Batch(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndexBatch(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 2, 1000) +} + +func BenchmarkForestDBIndexing4Workers1000Batch(b *testing.B) { + err := os.MkdirAll("testdir", 0700) + if err != nil { + b.Fatal(err) + } + defer func() { + err := os.RemoveAll("testdir") + if err != nil { + b.Fatal(err) + } + }() + CommonBenchmarkIndexBatch(b, forestdb.Name, forestDBTestOption, DestroyForestDB, 4, 1000) +} diff --git a/index/smolder/benchmark_goleveldb_test.go b/index/smolder/benchmark_goleveldb_test.go new file mode 100644 index 00000000..1c92a1b9 --- /dev/null +++ b/index/smolder/benchmark_goleveldb_test.go @@ -0,0 +1,71 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "testing" + + "github.com/blevesearch/bleve/index/store/goleveldb" +) + +var goLevelDBTestOptions = map[string]interface{}{ + "create_if_missing": true, + "path": "test", +} + +func BenchmarkGoLevelDBIndexing1Workers(b *testing.B) { + CommonBenchmarkIndex(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1) +} + +func BenchmarkGoLevelDBIndexing2Workers(b *testing.B) { + CommonBenchmarkIndex(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2) +} + +func BenchmarkGoLevelDBIndexing4Workers(b *testing.B) { + CommonBenchmarkIndex(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4) +} + +// batches + +func BenchmarkGoLevelDBIndexing1Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1, 10) +} + +func BenchmarkGoLevelDBIndexing2Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2, 10) +} + +func BenchmarkGoLevelDBIndexing4Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4, 10) +} + +func BenchmarkGoLevelDBIndexing1Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1, 100) +} + +func BenchmarkGoLevelDBIndexing2Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2, 100) +} + +func BenchmarkGoLevelDBIndexing4Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4, 100) +} + +func BenchmarkGoLevelDBIndexing1Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1, 1000) +} + +func BenchmarkGoLevelDBIndexing2Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2, 
1000) +} + +func BenchmarkGoLevelDBIndexing4Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4, 1000) +} diff --git a/index/smolder/benchmark_gorocksdb_test.go b/index/smolder/benchmark_gorocksdb_test.go new file mode 100644 index 00000000..19aaa46a --- /dev/null +++ b/index/smolder/benchmark_gorocksdb_test.go @@ -0,0 +1,73 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +// +build rocksdb + +package smolder + +import ( + "testing" + + "github.com/blevesearch/blevex/rocksdb" +) + +var rocksdbTestOptions = map[string]interface{}{ + "path": "test", + "create_if_missing": true, +} + +func BenchmarkRocksDBIndexing1Workers(b *testing.B) { + CommonBenchmarkIndex(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 1) +} + +func BenchmarkRocksDBIndexing2Workers(b *testing.B) { + CommonBenchmarkIndex(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 2) +} + +func BenchmarkRocksDBIndexing4Workers(b *testing.B) { + CommonBenchmarkIndex(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 4) +} + +// batches + +func BenchmarkRocksDBIndexing1Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 1, 10) +} + +func BenchmarkRocksDBIndexing2Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 2, 10) +} + +func BenchmarkRocksDBIndexing4Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 4, 10) +} + +func BenchmarkRocksDBIndexing1Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 1, 100) +} + +func BenchmarkRocksDBIndexing2Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 2, 100) +} + +func BenchmarkRocksDBIndexing4Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 4, 100) +} + +func BenchmarkRocksDBIndexing1Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 1, 1000) +} + +func BenchmarkRocksDBIndexing2Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 2, 1000) +} + +func BenchmarkRocksDBIndexing4Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, rocksdb.Name, rocksdbTestOptions, DestroyTest, 4, 1000) +} diff --git a/index/smolder/benchmark_gtreap_test.go b/index/smolder/benchmark_gtreap_test.go new file mode 100644 index 00000000..867615c0 --- /dev/null +++ b/index/smolder/benchmark_gtreap_test.go @@ -0,0 +1,66 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "testing" + + "github.com/blevesearch/bleve/index/store/gtreap" +) + +func BenchmarkGTreapIndexing1Workers(b *testing.B) { + CommonBenchmarkIndex(b, gtreap.Name, nil, DestroyTest, 1) +} + +func BenchmarkGTreapIndexing2Workers(b *testing.B) { + CommonBenchmarkIndex(b, gtreap.Name, nil, DestroyTest, 2) +} + +func BenchmarkGTreapIndexing4Workers(b *testing.B) { + CommonBenchmarkIndex(b, gtreap.Name, nil, DestroyTest, 4) +} + +// batches + +func BenchmarkGTreapIndexing1Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyTest, 1, 10) +} + +func BenchmarkGTreapIndexing2Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyTest, 2, 10) +} + +func BenchmarkGTreapIndexing4Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyTest, 4, 10) +} + +func BenchmarkGTreapIndexing1Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyTest, 1, 100) +} + +func BenchmarkGTreapIndexing2Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyTest, 2, 100) +} + +func BenchmarkGTreapIndexing4Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyTest, 4, 100) +} + +func BenchmarkGTreapIndexing1Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyTest, 1, 1000) +} + +func BenchmarkGTreapIndexing2Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyTest, 2, 1000) +} + +func BenchmarkGTreapIndexing4Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyTest, 4, 1000) +} diff --git a/index/smolder/benchmark_leveldb_test.go b/index/smolder/benchmark_leveldb_test.go new file mode 100644 index 00000000..7840b149 --- /dev/null +++ b/index/smolder/benchmark_leveldb_test.go @@ -0,0 +1,73 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+ +// +build leveldb + +package smolder + +import ( + "testing" + + "github.com/blevesearch/blevex/leveldb" +) + +var leveldbTestOptions = map[string]interface{}{ + "path": "test", + "create_if_missing": true, +} + +func BenchmarkLevelDBIndexing1Workers(b *testing.B) { + CommonBenchmarkIndex(b, leveldb.Name, leveldbTestOptions, DestroyTest, 1) +} + +func BenchmarkLevelDBIndexing2Workers(b *testing.B) { + CommonBenchmarkIndex(b, leveldb.Name, leveldbTestOptions, DestroyTest, 2) +} + +func BenchmarkLevelDBIndexing4Workers(b *testing.B) { + CommonBenchmarkIndex(b, leveldb.Name, leveldbTestOptions, DestroyTest, 4) +} + +// batches + +func BenchmarkLevelDBIndexing1Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyTest, 1, 10) +} + +func BenchmarkLevelDBIndexing2Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyTest, 2, 10) +} + +func BenchmarkLevelDBIndexing4Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyTest, 4, 10) +} + +func BenchmarkLevelDBIndexing1Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyTest, 1, 100) +} + +func BenchmarkLevelDBIndexing2Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyTest, 2, 100) +} + +func BenchmarkLevelDBIndexing4Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyTest, 4, 100) +} + +func BenchmarkLevelDBIndexing1Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyTest, 1, 1000) +} + +func BenchmarkLevelDBIndexing2Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyTest, 2, 1000) +} + +func BenchmarkLevelDBIndexing4Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyTest, 4, 1000) +} diff --git a/index/smolder/benchmark_null_test.go b/index/smolder/benchmark_null_test.go new file mode 100644 index 00000000..685aae0f --- /dev/null +++ b/index/smolder/benchmark_null_test.go @@ -0,0 +1,66 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
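+
+// The null store accepts and discards every write, so these benchmarks
+// isolate the analysis and row-building cost of the smolder scheme from
+// the cost of the underlying KV storage.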
+ +package smolder + +import ( + "testing" + + "github.com/blevesearch/bleve/index/store/null" +) + +func BenchmarkNullIndexing1Workers(b *testing.B) { + CommonBenchmarkIndex(b, null.Name, nil, DestroyTest, 1) +} + +func BenchmarkNullIndexing2Workers(b *testing.B) { + CommonBenchmarkIndex(b, null.Name, nil, DestroyTest, 2) +} + +func BenchmarkNullIndexing4Workers(b *testing.B) { + CommonBenchmarkIndex(b, null.Name, nil, DestroyTest, 4) +} + +// batches + +func BenchmarkNullIndexing1Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyTest, 1, 10) +} + +func BenchmarkNullIndexing2Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyTest, 2, 10) +} + +func BenchmarkNullIndexing4Workers10Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyTest, 4, 10) +} + +func BenchmarkNullIndexing1Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyTest, 1, 100) +} + +func BenchmarkNullIndexing2Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyTest, 2, 100) +} + +func BenchmarkNullIndexing4Workers100Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyTest, 4, 100) +} + +func BenchmarkNullIndexing1Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyTest, 1, 1000) +} + +func BenchmarkNullIndexing2Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyTest, 2, 1000) +} + +func BenchmarkNullIndexing4Workers1000Batch(b *testing.B) { + CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyTest, 4, 1000) +} diff --git a/index/smolder/dump.go b/index/smolder/dump.go new file mode 100644 index 00000000..13f259ee --- /dev/null +++ b/index/smolder/dump.go @@ -0,0 +1,212 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
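+
+// Rows come back from the KV store as raw key/value pairs; ParseFromKeyValue
+// inspects the key (for example the 'f' prefix used by DumpFields below) to
+// rebuild a typed row for display.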
+ +package smolder + +import ( + "bytes" + "log" + "sort" + + "github.com/blevesearch/bleve/index/store" +) + +// the functions in this file are only intended to be used by +// the bleve_dump utility and the debug http handlers +// if your application relies on them, you're doing something wrong +// they may change or be removed at any time + +func (udc *SmolderingCouch) dumpPrefix(kvreader store.KVReader, rv chan interface{}, prefix []byte) { + start := prefix + if start == nil { + start = []byte{0} + } + it := kvreader.PrefixIterator(start) + defer func() { + cerr := it.Close() + if cerr != nil { + rv <- cerr + } + }() + key, val, valid := it.Current() + for valid { + ck := make([]byte, len(key)) + copy(ck, key) + cv := make([]byte, len(val)) + copy(cv, val) + row, err := ParseFromKeyValue(ck, cv) + if err != nil { + rv <- err + return + } + rv <- row + + it.Next() + key, val, valid = it.Current() + } +} + +func (udc *SmolderingCouch) dumpRange(kvreader store.KVReader, rv chan interface{}, start, end []byte) { + it := kvreader.RangeIterator(start, end) + defer func() { + cerr := it.Close() + if cerr != nil { + rv <- cerr + } + }() + key, val, valid := it.Current() + for valid { + ck := make([]byte, len(key)) + copy(ck, key) + cv := make([]byte, len(val)) + copy(cv, val) + row, err := ParseFromKeyValue(ck, cv) + if err != nil { + rv <- err + return + } + rv <- row + + it.Next() + key, val, valid = it.Current() + } +} + +func (udc *SmolderingCouch) DumpAll() chan interface{} { + rv := make(chan interface{}) + go func() { + defer close(rv) + + // start an isolated reader for use during the dump + kvreader, err := udc.store.Reader() + if err != nil { + rv <- err + return + } + defer func() { + cerr := kvreader.Close() + if cerr != nil { + rv <- cerr + } + }() + + udc.dumpRange(kvreader, rv, nil, nil) + }() + return rv +} + +func (udc *SmolderingCouch) DumpFields() chan interface{} { + rv := make(chan interface{}) + go func() { + defer close(rv) + + // start an isolated reader for use during the dump + kvreader, err := udc.store.Reader() + if err != nil { + rv <- err + return + } + defer func() { + cerr := kvreader.Close() + if cerr != nil { + rv <- cerr + } + }() + + udc.dumpPrefix(kvreader, rv, []byte{'f'}) + }() + return rv +} + +type keyset [][]byte + +func (k keyset) Len() int { return len(k) } +func (k keyset) Swap(i, j int) { k[i], k[j] = k[j], k[i] } +func (k keyset) Less(i, j int) bool { return bytes.Compare(k[i], k[j]) < 0 } + +// DumpDoc returns all rows in the index related to this doc id +func (udc *SmolderingCouch) DumpDoc(id string) chan interface{} { + rv := make(chan interface{}) + + go func() { + defer close(rv) + + indexReader, err := udc.Reader() + if err != nil { + rv <- err + return + } + + defer func() { + cerr := indexReader.Close() + if cerr != nil { + rv <- cerr + } + }() + + back, err := udc.backIndexRowForDoc(indexReader, nil, id) + if err != nil { + rv <- err + return + } + + // no such doc + if back == nil { + log.Printf("no such doc") + return + } + // build sorted list of term keys + keys := make(keyset, 0) + for _, entry := range back.termEntries { + // tfr := NewTermFrequencyRow([]byte(*entry.Term), uint16(*entry.Field), back.docNumber, 0, 0) + tfr := TermFrequencyRowStart([]byte(*entry.Term), uint16(*entry.Field), back.docNumber) + key := tfr.Key() + keys = append(keys, key) + } + sort.Sort(keys) + + // start an isolated reader for use during the dump + kvreader := indexReader.(*IndexReader).kvreader + + // first add all the stored rows + storedRowPrefix := 
NewStoredRowDocBytes(back.docNumber, 0, []uint64{}, 'x', []byte{}).ScanPrefixForDoc()
+		udc.dumpPrefix(kvreader, rv, storedRowPrefix)
+
+		// now walk term keys in order and add them as well
+		if len(keys) > 0 {
+			it := kvreader.RangeIterator(keys[0], nil)
+			defer func() {
+				cerr := it.Close()
+				if cerr != nil {
+					rv <- cerr
+				}
+			}()
+
+			for _, key := range keys {
+				it.Seek(key)
+				rkey, rval, valid := it.Current()
+				if !valid {
+					break
+				}
+				// copy the key and value actually under the iterator
+				// (rkey/rval), since the iterator owns those buffers
+				rck := make([]byte, len(rkey))
+				copy(rck, rkey)
+				rcv := make([]byte, len(rval))
+				copy(rcv, rval)
+				row, err := ParseFromKeyValue(rck, rcv)
+				if err != nil {
+					rv <- err
+					return
+				}
+				rv <- row
+			}
+		}
+	}()
+
+	return rv
+}
diff --git a/index/smolder/dump_test.go b/index/smolder/dump_test.go
new file mode 100644
index 00000000..d99121ef
--- /dev/null
+++ b/index/smolder/dump_test.go
@@ -0,0 +1,130 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+
+package smolder
+
+import (
+	"testing"
+	"time"
+
+	"github.com/blevesearch/bleve/index"
+	"github.com/blevesearch/bleve/index/store/boltdb"
+
+	"github.com/blevesearch/bleve/document"
+)
+
+func TestDump(t *testing.T) {
+	defer func() {
+		err := DestroyTest()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue)
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = idx.Open()
+	if err != nil {
+		t.Errorf("error opening index: %v", err)
+	}
+	defer func() {
+		err := idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	var expectedCount uint64
+	docCount, err := idx.DocCount()
+	if err != nil {
+		t.Error(err)
+	}
+	if docCount != expectedCount {
+		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
+	}
+
+	doc := document.NewDocument("1")
+	doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, document.IndexField|document.StoreField))
+	dateField, err := document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField)
+	if err != nil {
+		t.Error(err)
+	}
+	doc.AddField(dateField)
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	doc = document.NewDocument("2")
+	doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test2"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, document.IndexField|document.StoreField))
+	dateField, err = document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField)
+	if err != nil {
+		t.Error(err)
+	}
+	doc.AddField(dateField)
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	fieldsCount := 0
+	fieldsRows := idx.DumpFields()
+	for range fieldsRows {
+		fieldsCount++
+	}
+	if fieldsCount != 4 {
+		t.Errorf("expected 4 fields, got %d", fieldsCount)
+	}
+
+	// 1 text term
+	// 16 numeric terms
+	// 16 date terms
+	// 3 stored fields
+	// 1 id term row
+	expectedDocRowCount := int(1+(2*(64/document.DefaultPrecisionStep))+3) + 1
+	docRowCount := 0
+	docRows := idx.DumpDoc("1")
+	for range docRows {
+		docRowCount++
+	}
+	if docRowCount != expectedDocRowCount {
+		t.Errorf("expected %d rows for document, got %d", expectedDocRowCount, docRowCount)
+	}
+
+	docRowCount = 0
+	docRows = idx.DumpDoc("2")
+	for range docRows {
+		docRowCount++
+	}
+	if docRowCount != expectedDocRowCount {
+		t.Errorf("expected %d rows for document, got %d", expectedDocRowCount, docRowCount)
+	}
+
+	// 1 version
+	// fieldsCount field rows
+	// 2 docs * expectedDocRowCount
+	// 2 back index rows
+	// 4 text term row count (4 different text terms)
+	// 16 numeric term row counts (shared for both docs, same numeric value)
+	// 16 date term row counts (shared for both docs, same date value)
+	expectedAllRowCount := int(1 + fieldsCount + (2 * expectedDocRowCount) + 2 + 4 + int((2 * (64 / document.DefaultPrecisionStep))))
+	allRowCount := 0
+	allRows := idx.DumpAll()
+	for range allRows {
+		allRowCount++
+	}
+	if allRowCount != expectedAllRowCount {
+		t.Errorf("expected %d rows for all, got %d", expectedAllRowCount, allRowCount)
+	}
+}
diff --git a/index/smolder/field_dict.go b/index/smolder/field_dict.go
new file mode 100644
index 00000000..5640970a
--- /dev/null
+++ b/index/smolder/field_dict.go
@@ -0,0 +1,67 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
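+
+// The field dictionary iterates the dictionary rows of a single field. The
+// end of the key range is incremented (see incrementBytes), which makes a
+// FieldDictRange end term inclusive and lets FieldDictPrefix pass the prefix
+// as both start and end to cover every term sharing that prefix.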
+ +package smolder + +import ( + "fmt" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/store" +) + +type SmolderingCouchFieldDict struct { + indexReader *IndexReader + iterator store.KVIterator + field uint16 +} + +func newSmolderingCouchFieldDict(indexReader *IndexReader, field uint16, startTerm, endTerm []byte) (*SmolderingCouchFieldDict, error) { + + startKey := NewDictionaryRow(startTerm, field, 0).Key() + if endTerm == nil { + endTerm = []byte{ByteSeparator} + } else { + endTerm = incrementBytes(endTerm) + } + endKey := NewDictionaryRow(endTerm, field, 0).Key() + + it := indexReader.kvreader.RangeIterator(startKey, endKey) + + return &SmolderingCouchFieldDict{ + indexReader: indexReader, + iterator: it, + field: field, + }, nil + +} + +func (r *SmolderingCouchFieldDict) Next() (*index.DictEntry, error) { + key, val, valid := r.iterator.Current() + if !valid { + return nil, nil + } + + currRow, err := NewDictionaryRowKV(key, val) + if err != nil { + return nil, fmt.Errorf("unexpected error parsing dictionary row kv: %v", err) + } + rv := index.DictEntry{ + Term: string(currRow.term), + Count: currRow.count, + } + // advance the iterator to the next term + r.iterator.Next() + return &rv, nil + +} + +func (r *SmolderingCouchFieldDict) Close() error { + return r.iterator.Close() +} diff --git a/index/smolder/field_dict_test.go b/index/smolder/field_dict_test.go new file mode 100644 index 00000000..fb64d213 --- /dev/null +++ b/index/smolder/field_dict_test.go @@ -0,0 +1,181 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+ +package smolder + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/store/boltdb" +) + +func TestIndexFieldDict(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextFieldWithAnalyzer("name", []uint64{}, []byte("test test test"), testAnalyzer)) + doc.AddField(document.NewTextFieldCustom("desc", []uint64{}, []byte("eat more rice"), document.IndexField|document.IncludeTermVectors, testAnalyzer)) + doc.AddField(document.NewTextFieldCustom("prefix", []uint64{}, []byte("bob cat cats catting dog doggy zoo"), document.IndexField|document.IncludeTermVectors, testAnalyzer)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + dict, err := indexReader.FieldDict("name") + if err != nil { + t.Errorf("error creating reader: %v", err) + } + defer func() { + err := dict.Close() + if err != nil { + t.Fatal(err) + } + }() + + termCount := 0 + curr, err := dict.Next() + for err == nil && curr != nil { + termCount++ + if curr.Term != "test" { + t.Errorf("expected term to be 'test', got '%s'", curr.Term) + } + curr, err = dict.Next() + } + if termCount != 1 { + t.Errorf("expected 1 term for this field, got %d", termCount) + } + + dict2, err := indexReader.FieldDict("desc") + if err != nil { + t.Errorf("error creating reader: %v", err) + } + defer func() { + err := dict2.Close() + if err != nil { + t.Fatal(err) + } + }() + + termCount = 0 + terms := make([]string, 0) + curr, err = dict2.Next() + for err == nil && curr != nil { + termCount++ + terms = append(terms, curr.Term) + curr, err = dict2.Next() + } + if termCount != 3 { + t.Errorf("expected 3 term for this field, got %d", termCount) + } + expectedTerms := []string{"eat", "more", "rice"} + if !reflect.DeepEqual(expectedTerms, terms) { + t.Errorf("expected %#v, got %#v", expectedTerms, terms) + } + + // test start and end range + dict3, err := indexReader.FieldDictRange("desc", []byte("fun"), []byte("nice")) + if err != nil { + t.Errorf("error creating reader: %v", err) + } + defer func() { + err := dict3.Close() + if err != nil { + t.Fatal(err) + } + }() + + termCount = 0 + terms = make([]string, 0) + curr, err = dict3.Next() + for err == nil && curr != nil { + termCount++ + terms = append(terms, curr.Term) + curr, err = dict3.Next() + } + if termCount != 1 { + t.Errorf("expected 1 term for this field, got %d", termCount) + } + expectedTerms = []string{"more"} + if !reflect.DeepEqual(expectedTerms, terms) { + t.Errorf("expected %#v, got %#v", expectedTerms, terms) + } + + // test use case for prefix + dict4, err := indexReader.FieldDictPrefix("prefix", 
[]byte("cat")) + if err != nil { + t.Errorf("error creating reader: %v", err) + } + defer func() { + err := dict4.Close() + if err != nil { + t.Fatal(err) + } + }() + + termCount = 0 + terms = make([]string, 0) + curr, err = dict4.Next() + for err == nil && curr != nil { + termCount++ + terms = append(terms, curr.Term) + curr, err = dict4.Next() + } + if termCount != 3 { + t.Errorf("expected 3 term for this field, got %d", termCount) + } + expectedTerms = []string{"cat", "cats", "catting"} + if !reflect.DeepEqual(expectedTerms, terms) { + t.Errorf("expected %#v, got %#v", expectedTerms, terms) + } +} diff --git a/index/smolder/index_reader.go b/index/smolder/index_reader.go new file mode 100644 index 00000000..ff0d2166 --- /dev/null +++ b/index/smolder/index_reader.go @@ -0,0 +1,227 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "fmt" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/store" +) + +type IndexReader struct { + index *SmolderingCouch + kvreader store.KVReader + docCount uint64 +} + +func (i *IndexReader) TermFieldReader(term []byte, fieldName string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { + fieldIndex, fieldExists := i.index.fieldCache.FieldNamed(fieldName, false) + if fieldExists { + return newSmolderingCouchTermFieldReader(i, term, uint16(fieldIndex), includeFreq, includeNorm, includeTermVectors) + } + return newSmolderingCouchTermFieldReader(i, []byte{ByteSeparator}, ^uint16(0), includeFreq, includeNorm, includeTermVectors) +} + +func (i *IndexReader) FieldDict(fieldName string) (index.FieldDict, error) { + return i.FieldDictRange(fieldName, nil, nil) +} + +func (i *IndexReader) FieldDictRange(fieldName string, startTerm []byte, endTerm []byte) (index.FieldDict, error) { + fieldIndex, fieldExists := i.index.fieldCache.FieldNamed(fieldName, false) + if fieldExists { + return newSmolderingCouchFieldDict(i, uint16(fieldIndex), startTerm, endTerm) + } + return newSmolderingCouchFieldDict(i, ^uint16(0), []byte{ByteSeparator}, []byte{}) +} + +func (i *IndexReader) FieldDictPrefix(fieldName string, termPrefix []byte) (index.FieldDict, error) { + return i.FieldDictRange(fieldName, termPrefix, termPrefix) +} + +func (i *IndexReader) DocIDReader(start, end string) (index.DocIDReader, error) { + return newSmolderingCouchDocIDReader(i, start, end) +} + +func (i *IndexReader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) { + return newSmolderingCouchDocIDReaderOnly(i, ids) +} + +func (i *IndexReader) Document(id string) (doc *document.Document, err error) { + + // first hit the back index to confirm doc exists + var backIndexRow *BackIndexRow + backIndexRow, err = i.index.backIndexRowForDoc(i, nil, id) + if err != nil { + return + } + if backIndexRow == nil { + return + } + doc = document.NewDocument(id) + storedRowScanPrefix := NewStoredRowDocBytes(backIndexRow.docNumber, 0, []uint64{}, 'x', 
nil).ScanPrefixForDoc() + it := i.kvreader.PrefixIterator(storedRowScanPrefix) + defer func() { + if cerr := it.Close(); err == nil && cerr != nil { + err = cerr + } + }() + key, val, valid := it.Current() + for valid { + safeVal := make([]byte, len(val)) + copy(safeVal, val) + var row *StoredRow + row, err = NewStoredRowKV(key, safeVal) + if err != nil { + doc = nil + return + } + if row != nil { + fieldName := i.index.fieldCache.FieldIndexed(row.field) + field := decodeFieldType(row.typ, fieldName, row.arrayPositions, row.value) + if field != nil { + doc.AddField(field) + } + } + + it.Next() + key, val, valid = it.Current() + } + return +} + +func (i *IndexReader) DocumentFieldTerms(id index.IndexInternalID) (index.FieldTerms, error) { + back, err := i.index.backIndexRowForDoc(i, id, "") + if err != nil { + return nil, err + } + if back == nil { + return nil, nil + } + rv := make(index.FieldTerms, len(back.termEntries)) + for _, entry := range back.termEntries { + fieldName := i.index.fieldCache.FieldIndexed(uint16(*entry.Field)) + terms, ok := rv[fieldName] + if !ok { + terms = make([]string, 0) + } + terms = append(terms, *entry.Term) + rv[fieldName] = terms + } + return rv, nil +} + +func (i *IndexReader) DocumentFieldTermsForFields(id index.IndexInternalID, fields []string) (index.FieldTerms, error) { + back, err := i.index.backIndexRowForDoc(i, id, "") + if err != nil { + return nil, err + } + rv := make(index.FieldTerms, len(fields)) + fieldsMap := make(map[uint16]string, len(fields)) + for _, f := range fields { + id, ok := i.index.fieldCache.FieldNamed(f, false) + if !ok { + return nil, fmt.Errorf("Field %s was not found in cache", f) + } + fieldsMap[id] = f + } + for _, entry := range back.termEntries { + if field, ok := fieldsMap[uint16(*entry.Field)]; ok { + terms, ok := rv[field] + if !ok { + terms = make([]string, 0) + } + terms = append(terms, *entry.Term) + rv[field] = terms + } + } + return rv, nil +} + +func (i *IndexReader) Fields() (fields []string, err error) { + fields = make([]string, 0) + it := i.kvreader.PrefixIterator([]byte{'f'}) + defer func() { + if cerr := it.Close(); err == nil && cerr != nil { + err = cerr + } + }() + key, val, valid := it.Current() + for valid { + var row SmolderingCouchRow + row, err = ParseFromKeyValue(key, val) + if err != nil { + fields = nil + return + } + if row != nil { + fieldRow, ok := row.(*FieldRow) + if ok { + fields = append(fields, fieldRow.name) + } + } + + it.Next() + key, val, valid = it.Current() + } + return +} + +func (i *IndexReader) GetInternal(key []byte) ([]byte, error) { + internalRow := NewInternalRow(key, nil) + return i.kvreader.Get(internalRow.Key()) +} + +func (i *IndexReader) DocCount() uint64 { + return i.docCount +} + +func (i *IndexReader) Close() error { + return i.kvreader.Close() +} + +func (i *IndexReader) ExternalID(id index.IndexInternalID) (string, error) { + ft, err := i.DocumentFieldTermsForFields(id, []string{"_id"}) + if err != nil { + return "", err + } + terms := ft["_id"] + if len(terms) < 1 { + return "", nil + } + return terms[0], nil +} + +func (i *IndexReader) InternalID(id string) (index.IndexInternalID, error) { + tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false) + if err != nil { + return nil, err + } + if tfr.Count() < 1 { + return nil, nil + } + tfd := index.TermFieldDoc{} + tfr.Next(&tfd) + return tfd.ID, nil +} + +func incrementBytes(in []byte) []byte { + rv := make([]byte, len(in)) + copy(rv, in) + for i := len(rv) - 1; i >= 0; i-- { + rv[i] = rv[i] + 1 + if 
rv[i] != 0 { + // didn't overflow, so stop + break + } + } + return rv +} diff --git a/index/smolder/reader.go b/index/smolder/reader.go new file mode 100644 index 00000000..df1402f2 --- /dev/null +++ b/index/smolder/reader.go @@ -0,0 +1,297 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "bytes" + "sort" + "sync/atomic" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/store" +) + +type SmolderingCouchTermFieldReader struct { + count uint64 + indexReader *IndexReader + iterator store.KVIterator + term []byte + tfrNext *TermFrequencyRow + field uint16 +} + +func newSmolderingCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*SmolderingCouchTermFieldReader, error) { + dictionaryRow := NewDictionaryRow(term, field, 0) + val, err := indexReader.kvreader.Get(dictionaryRow.Key()) + if err != nil { + return nil, err + } + if val == nil { + atomic.AddUint64(&indexReader.index.stats.termSearchersStarted, uint64(1)) + return &SmolderingCouchTermFieldReader{ + count: 0, + term: term, + tfrNext: &TermFrequencyRow{}, + field: field, + }, nil + } + + err = dictionaryRow.parseDictionaryV(val) + if err != nil { + return nil, err + } + + tfr := TermFrequencyRowStart(term, field, []byte{}) + it := indexReader.kvreader.PrefixIterator(tfr.Key()) + + atomic.AddUint64(&indexReader.index.stats.termSearchersStarted, uint64(1)) + return &SmolderingCouchTermFieldReader{ + indexReader: indexReader, + iterator: it, + count: dictionaryRow.count, + term: term, + tfrNext: &TermFrequencyRow{}, + field: field, + }, nil +} + +func (r *SmolderingCouchTermFieldReader) Count() uint64 { + return r.count +} + +func (r *SmolderingCouchTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { + if r.iterator != nil { + key, val, valid := r.iterator.Current() + if valid { + tfr := r.tfrNext + err := tfr.parseKDoc(key) + if err != nil { + return nil, err + } + err = tfr.parseV(val) + if err != nil { + return nil, err + } + rv := preAlloced + if rv == nil { + rv = &index.TermFieldDoc{} + } + rv.ID = append(rv.ID, tfr.docNumber...) + rv.Freq = tfr.freq + rv.Norm = float64(tfr.norm) + if tfr.vectors != nil { + rv.Vectors = r.indexReader.index.termFieldVectorsFromTermVectors(tfr.vectors) + } + r.iterator.Next() + return rv, nil + } + } + return nil, nil +} + +func (r *SmolderingCouchTermFieldReader) Advance(docID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { + if r.iterator != nil { + tfr := TermFrequencyRowStart(r.term, r.field, docID) + r.iterator.Seek(tfr.Key()) + key, val, valid := r.iterator.Current() + if valid { + tfr, err := NewTermFrequencyRowKV(key, val) + if err != nil { + return nil, err + } + rv := preAlloced + if rv == nil { + rv = &index.TermFieldDoc{} + } + rv.ID = append(rv.ID, tfr.docNumber...) 
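+ // as in Next, the internal ID is the varint-encoded document number
+ // taken from the term frequency key; preAlloced is assumed to arrive
+ // with its ID already reset, since the doc number is appended to it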
+ rv.Freq = tfr.freq + rv.Norm = float64(tfr.norm) + if tfr.vectors != nil { + rv.Vectors = r.indexReader.index.termFieldVectorsFromTermVectors(tfr.vectors) + } + r.iterator.Next() + return rv, nil + } + } + return nil, nil +} + +func (r *SmolderingCouchTermFieldReader) Close() error { + if r.iterator != nil { + return r.iterator.Close() + } + return nil +} + +type SmolderingCouchDocIDReader struct { + indexReader *IndexReader + iterator store.KVIterator + only []string + onlyPos int + onlyMode bool +} + +func newSmolderingCouchDocIDReader(indexReader *IndexReader, start, end string) (*SmolderingCouchDocIDReader, error) { + startBytes := []byte(start) + if start == "" { + startBytes = []byte{0x0} + } + endBytes := []byte(end) + if end == "" { + endBytes = []byte{0xff} + } + bisrk := BackIndexRowKey(startBytes) + bierk := BackIndexRowKey(endBytes) + it := indexReader.kvreader.RangeIterator(bisrk, bierk) + + return &SmolderingCouchDocIDReader{ + indexReader: indexReader, + iterator: it, + }, nil +} + +func newSmolderingCouchDocIDReaderOnly(indexReader *IndexReader, ids []string) (*SmolderingCouchDocIDReader, error) { + // ensure ids are sorted + sort.Strings(ids) + startBytes := []byte{0x0} + if len(ids) > 0 { + startBytes = []byte(ids[0]) + } + endBytes := []byte{0xff} + if len(ids) > 0 { + endBytes = incrementBytes([]byte(ids[len(ids)-1])) + } + bisrk := BackIndexRowKey(startBytes) + bierk := BackIndexRowKey(endBytes) + it := indexReader.kvreader.RangeIterator(bisrk, bierk) + + return &SmolderingCouchDocIDReader{ + indexReader: indexReader, + iterator: it, + only: ids, + onlyMode: true, + }, nil +} + +func (r *SmolderingCouchDocIDReader) Next() (index.IndexInternalID, error) { + key, val, valid := r.iterator.Current() + + if r.onlyMode { + var rv index.IndexInternalID + for valid && r.onlyPos < len(r.only) { + br, err := NewBackIndexRowKV(key, val) + if err != nil { + return nil, err + } + if !bytes.Equal(br.docNumber, []byte(r.only[r.onlyPos])) { + ok := r.nextOnly() + if !ok { + return nil, nil + } + birk := BackIndexRowKey([]byte(r.only[r.onlyPos])) + r.iterator.Seek(birk) + key, val, valid = r.iterator.Current() + continue + } else { + rv = append([]byte(nil), br.docNumber...) + break + } + } + if valid && r.onlyPos < len(r.only) { + ok := r.nextOnly() + if ok { + birk := BackIndexRowKey([]byte(r.only[r.onlyPos])) + r.iterator.Seek(birk) + } + return rv, nil + } + + } else { + if valid { + br, err := NewBackIndexRowKV(key, val) + if err != nil { + return nil, err + } + rv := append([]byte(nil), br.docNumber...) + r.iterator.Next() + return rv, nil + } + } + return nil, nil +} + +func (r *SmolderingCouchDocIDReader) Advance(docID index.IndexInternalID) (index.IndexInternalID, error) { + birk := BackIndexRowKey(docID) + r.iterator.Seek(birk) + key, val, valid := r.iterator.Current() + r.onlyPos = sort.SearchStrings(r.only, string(docID)) + + if r.onlyMode { + var rv index.IndexInternalID + for valid && r.onlyPos < len(r.only) { + br, err := NewBackIndexRowKV(key, val) + if err != nil { + return nil, err + } + if !bytes.Equal(br.docNumber, []byte(r.only[r.onlyPos])) { + ok := r.nextOnly() + if !ok { + return nil, nil + } + birk := BackIndexRowKey([]byte(r.only[r.onlyPos])) + r.iterator.Seek(birk) + continue + } else { + rv = append([]byte(nil), br.docNumber...) 
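+ // the current back index row matches the next requested id;
+ // copy the doc number out, then let the code below advance
+ // both cursors before returning it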
+ break
+ }
+ }
+ if valid && r.onlyPos < len(r.only) {
+ ok := r.nextOnly()
+ if ok {
+ birk := BackIndexRowKey([]byte(r.only[r.onlyPos]))
+ r.iterator.Seek(birk)
+ }
+ return rv, nil
+ }
+ } else {
+ if valid {
+ br, err := NewBackIndexRowKV(key, val)
+ if err != nil {
+ return nil, err
+ }
+ rv := append([]byte(nil), br.docNumber...)
+ r.iterator.Next()
+ return rv, nil
+ }
+ }
+ return nil, nil
+}
+
+func (r *SmolderingCouchDocIDReader) Close() error {
+ atomic.AddUint64(&r.indexReader.index.stats.termSearchersFinished, uint64(1))
+ return r.iterator.Close()
+}
+
+// move the r.only pos forward one, skipping duplicates
+// return true if there is more data, or false if we got to the end of the list
+func (r *SmolderingCouchDocIDReader) nextOnly() bool {
+
+ // advance 1 position, until we see a different key
+ // it's already sorted, so this skips duplicates
+ start := r.onlyPos
+ r.onlyPos++
+ for r.onlyPos < len(r.only) && r.only[r.onlyPos] == r.only[start] {
+ start = r.onlyPos
+ r.onlyPos++
+ }
+ // indicate if we got to the end of the list
+ return r.onlyPos < len(r.only)
+}
diff --git a/index/smolder/reader_test.go b/index/smolder/reader_test.go
new file mode 100644
index 00000000..f8e0627c
--- /dev/null
+++ b/index/smolder/reader_test.go
@@ -0,0 +1,302 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
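+
+// The tests below exercise the full read path. A minimal sketch of the
+// intended usage (assuming an opened *SmolderingCouch named idx, as
+// constructed in the tests):
+//
+//    reader, err := idx.Reader()
+//    if err != nil {
+//        // handle error
+//    }
+//    defer reader.Close()
+//
+//    tfr, err := reader.TermFieldReader([]byte("test"), "name", true, true, true)
+//    if err != nil {
+//        // handle error
+//    }
+//    d, err := tfr.Next(nil)
+//    for err == nil && d != nil {
+//        // d.ID holds the internal (varint-encoded) doc number;
+//        // d.Freq and d.Norm carry the term statistics
+//        d, err = tfr.Next(nil)
+//    }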
+ +package smolder + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/store/boltdb" +) + +func TestIndexReader(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextFieldWithAnalyzer("name", []uint64{}, []byte("test test test"), testAnalyzer)) + doc.AddField(document.NewTextFieldCustom("desc", []uint64{}, []byte("eat more rice"), document.IndexField|document.IncludeTermVectors, testAnalyzer)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + // first look for a term that doesn't exist + reader, err := indexReader.TermFieldReader([]byte("nope"), "name", true, true, true) + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + count := reader.Count() + if count != 0 { + t.Errorf("Expected doc count to be: %d got: %d", 0, count) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + reader, err = indexReader.TermFieldReader([]byte("test"), "name", true, true, true) + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + + expectedCount = 2 + count = reader.Count() + if count != expectedCount { + t.Errorf("Exptected doc count to be: %d got: %d", expectedCount, count) + } + + var match *index.TermFieldDoc + var actualCount uint64 + match, err = reader.Next(nil) + for err == nil && match != nil { + match, err = reader.Next(nil) + if err != nil { + t.Errorf("unexpected error reading next") + } + actualCount++ + } + if actualCount != count { + t.Errorf("count was 2, but only saw %d", actualCount) + } + + expectedMatch := &index.TermFieldDoc{ + ID: EncodeUvarintAscending(nil, 2), + Freq: 1, + Norm: 0.5773502588272095, + Vectors: []*index.TermFieldVector{ + { + Field: "desc", + Pos: 3, + Start: 9, + End: 13, + }, + }, + } + tfr, err := indexReader.TermFieldReader([]byte("rice"), "desc", true, true, true) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + match, err = tfr.Next(nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !reflect.DeepEqual(expectedMatch, match) { + t.Errorf("got %#v, expected %#v", match, expectedMatch) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // now test usage of advance + reader, err = indexReader.TermFieldReader([]byte("test"), "name", true, true, true) + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + + internalID2 := EncodeUvarintAscending(nil, 2) + match, err = reader.Advance(internalID2, nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if match == nil { + t.Fatalf("Expected match, got nil") + } + if 
!match.ID.Equals(internalID2) { + t.Errorf("Expected ID '2', got '%s'", match.ID) + } + internalID3 := EncodeUvarintAscending(nil, 3) + match, err = reader.Advance(internalID3, nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if match != nil { + t.Errorf("expected nil, got %v", match) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // now test creating a reader for a field that doesn't exist + reader, err = indexReader.TermFieldReader([]byte("water"), "doesnotexist", true, true, true) + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + count = reader.Count() + if count != 0 { + t.Errorf("expected count 0 for reader of non-existant field") + } + match, err = reader.Next(nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if match != nil { + t.Errorf("expected nil, got %v", match) + } + match, err = reader.Advance(index.IndexInternalID("anywhere"), nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if match != nil { + t.Errorf("expected nil, got %v", match) + } + +} + +func TestIndexDocIdReader(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test test test"))) + doc.AddField(document.NewTextFieldWithIndexingOptions("desc", []uint64{}, []byte("eat more rice"), document.IndexField|document.IncludeTermVectors)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Error(err) + } + }() + + // first get all doc ids + reader, err := indexReader.DocIDReader("", "") + if err != nil { + t.Errorf("Error accessing doc id reader: %v", err) + } + defer func() { + err := reader.Close() + if err != nil { + t.Fatal(err) + } + }() + + id, err := reader.Next() + count := uint64(0) + for id != nil { + count++ + id, err = reader.Next() + } + if count != expectedCount { + t.Errorf("expected %d, got %d", expectedCount, count) + } + + // try it again, but jump to the second doc this time + reader2, err := indexReader.DocIDReader("", "") + if err != nil { + t.Errorf("Error accessing doc id reader: %v", err) + } + defer func() { + err := reader2.Close() + if err != nil { + t.Error(err) + } + }() + + internalID2 := EncodeUvarintAscending(nil, 2) + id, err = reader2.Advance(internalID2) + if err != nil { + t.Error(err) + } + if !id.Equals(internalID2) { + t.Errorf("expected to find id '2', got '%s'", id) + } + + internalID3 := EncodeUvarintAscending(nil, 3) + id, err = reader2.Advance(internalID3) + if err != nil { + t.Error(err) + } + if id != nil { + t.Errorf("expected to find id '', got '%s'", id) + } +} diff --git a/index/smolder/row.go b/index/smolder/row.go new file mode 100644 index 00000000..52713850 --- 
/dev/null +++ b/index/smolder/row.go @@ -0,0 +1,876 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "math" + + "github.com/golang/protobuf/proto" +) + +const ByteSeparator byte = 0xff + +type SmolderingCouchRowStream chan SmolderingCouchRow + +type SmolderingCouchRow interface { + KeySize() int + KeyTo([]byte) (int, error) + Key() []byte + Value() []byte + ValueSize() int + ValueTo([]byte) (int, error) +} + +func ParseFromKeyValue(key, value []byte) (SmolderingCouchRow, error) { + if len(key) > 0 { + switch key[0] { + case 'v': + return NewVersionRowKV(key, value) + case 'f': + return NewFieldRowKV(key, value) + case 'd': + return NewDictionaryRowKV(key, value) + case 't': + return NewTermFrequencyRowKV(key, value) + case 'b': + return NewBackIndexRowKV(key, value) + case 's': + return NewStoredRowKV(key, value) + case 'i': + return NewInternalRowKV(key, value) + } + return nil, fmt.Errorf("Unknown field type '%s'", string(key[0])) + } + return nil, fmt.Errorf("Invalid empty key") +} + +// VERSION + +type VersionRow struct { + version uint8 +} + +func (v *VersionRow) Key() []byte { + return []byte{'v'} +} + +func (v *VersionRow) KeySize() int { + return 1 +} + +func (v *VersionRow) KeyTo(buf []byte) (int, error) { + buf[0] = 'v' + return 1, nil +} + +func (v *VersionRow) Value() []byte { + return []byte{byte(v.version)} +} + +func (v *VersionRow) ValueSize() int { + return 1 +} + +func (v *VersionRow) ValueTo(buf []byte) (int, error) { + buf[0] = v.version + return 1, nil +} + +func (v *VersionRow) String() string { + return fmt.Sprintf("Version: %d", v.version) +} + +func NewVersionRow(version uint8) *VersionRow { + return &VersionRow{ + version: version, + } +} + +func NewVersionRowKV(key, value []byte) (*VersionRow, error) { + rv := VersionRow{} + buf := bytes.NewBuffer(value) + err := binary.Read(buf, binary.LittleEndian, &rv.version) + if err != nil { + return nil, err + } + return &rv, nil +} + +// INTERNAL STORAGE + +type InternalRow struct { + key []byte + val []byte +} + +func (i *InternalRow) Key() []byte { + buf := make([]byte, i.KeySize()) + size, _ := i.KeyTo(buf) + return buf[:size] +} + +func (i *InternalRow) KeySize() int { + return len(i.key) + 1 +} + +func (i *InternalRow) KeyTo(buf []byte) (int, error) { + buf[0] = 'i' + actual := copy(buf[1:], i.key) + return 1 + actual, nil +} + +func (i *InternalRow) Value() []byte { + return i.val +} + +func (i *InternalRow) ValueSize() int { + return len(i.val) +} + +func (i *InternalRow) ValueTo(buf []byte) (int, error) { + actual := copy(buf, i.val) + return actual, nil +} + +func (i *InternalRow) String() string { + return fmt.Sprintf("InternalStore - Key: %s (% x) Val: %s (% x)", i.key, i.key, i.val, i.val) +} + +func NewInternalRow(key, val []byte) *InternalRow { + return &InternalRow{ + key: key, + val: val, + } +} + +func NewInternalRowKV(key, value []byte) (*InternalRow, error) { + rv := InternalRow{} + rv.key = 
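+ // the remainder of the key after the leading 'i' type byte is the
+ // caller-supplied internal key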
key[1:] + rv.val = value + return &rv, nil +} + +// FIELD definition + +type FieldRow struct { + index uint16 + name string +} + +func (f *FieldRow) Key() []byte { + buf := make([]byte, f.KeySize()) + size, _ := f.KeyTo(buf) + return buf[:size] +} + +func (f *FieldRow) KeySize() int { + return 3 +} + +func (f *FieldRow) KeyTo(buf []byte) (int, error) { + buf[0] = 'f' + binary.LittleEndian.PutUint16(buf[1:3], f.index) + return 3, nil +} + +func (f *FieldRow) Value() []byte { + return append([]byte(f.name), ByteSeparator) +} + +func (f *FieldRow) ValueSize() int { + return len(f.name) + 1 +} + +func (f *FieldRow) ValueTo(buf []byte) (int, error) { + size := copy(buf, f.name) + buf[size] = ByteSeparator + return size + 1, nil +} + +func (f *FieldRow) String() string { + return fmt.Sprintf("Field: %d Name: %s", f.index, f.name) +} + +func NewFieldRow(index uint16, name string) *FieldRow { + return &FieldRow{ + index: index, + name: name, + } +} + +func NewFieldRowKV(key, value []byte) (*FieldRow, error) { + rv := FieldRow{} + + buf := bytes.NewBuffer(key) + _, err := buf.ReadByte() // type + if err != nil { + return nil, err + } + err = binary.Read(buf, binary.LittleEndian, &rv.index) + if err != nil { + return nil, err + } + + buf = bytes.NewBuffer(value) + rv.name, err = buf.ReadString(ByteSeparator) + if err != nil { + return nil, err + } + rv.name = rv.name[:len(rv.name)-1] // trim off separator byte + + return &rv, nil +} + +// DICTIONARY + +const DictionaryRowMaxValueSize = binary.MaxVarintLen64 + +type DictionaryRow struct { + field uint16 + term []byte + count uint64 +} + +func (dr *DictionaryRow) Key() []byte { + buf := make([]byte, dr.KeySize()) + size, _ := dr.KeyTo(buf) + return buf[:size] +} + +func (dr *DictionaryRow) KeySize() int { + return len(dr.term) + 3 +} + +func (dr *DictionaryRow) KeyTo(buf []byte) (int, error) { + buf[0] = 'd' + binary.LittleEndian.PutUint16(buf[1:3], dr.field) + size := copy(buf[3:], dr.term) + return size + 3, nil +} + +func (dr *DictionaryRow) Value() []byte { + buf := make([]byte, dr.ValueSize()) + size, _ := dr.ValueTo(buf) + return buf[:size] +} + +func (dr *DictionaryRow) ValueSize() int { + return DictionaryRowMaxValueSize +} + +func (dr *DictionaryRow) ValueTo(buf []byte) (int, error) { + used := binary.PutUvarint(buf, dr.count) + return used, nil +} + +func (dr *DictionaryRow) String() string { + return fmt.Sprintf("Dictionary Term: `%s` Field: %d Count: %d ", string(dr.term), dr.field, dr.count) +} + +func NewDictionaryRow(term []byte, field uint16, count uint64) *DictionaryRow { + return &DictionaryRow{ + term: term, + field: field, + count: count, + } +} + +func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) { + rv, err := NewDictionaryRowK(key) + if err != nil { + return nil, err + } + + err = rv.parseDictionaryV(value) + if err != nil { + return nil, err + } + return rv, nil + +} + +func NewDictionaryRowK(key []byte) (*DictionaryRow, error) { + rv := DictionaryRow{} + buf := bytes.NewBuffer(key) + _, err := buf.ReadByte() // type + if err != nil { + return nil, err + } + + err = binary.Read(buf, binary.LittleEndian, &rv.field) + if err != nil { + return nil, err + } + + rv.term, err = buf.ReadBytes(ByteSeparator) + // there is no separator expected here, should get EOF + if err != io.EOF { + return nil, err + } + + return &rv, nil +} + +func (dr *DictionaryRow) parseDictionaryV(value []byte) error { + buf := bytes.NewBuffer(value) + + count, err := binary.ReadUvarint(buf) + if err != nil { + return err + } + dr.count = 
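+ // the entire dictionary value is this single uvarint count; the term
+ // field reader in reader.go serves Count() directly from it, without
+ // scanning the term frequency rows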
count + + return nil +} + +// TERM FIELD FREQUENCY + +type TermVector struct { + field uint16 + arrayPositions []uint64 + pos uint64 + start uint64 + end uint64 +} + +func (tv *TermVector) String() string { + return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions) +} + +type TermFrequencyRow struct { + term []byte + docNumber []byte + freq uint64 + vectors []*TermVector + norm float32 + field uint16 +} + +func (tfr *TermFrequencyRow) Term() []byte { + return tfr.term +} + +func (tfr *TermFrequencyRow) Freq() uint64 { + return tfr.freq +} + +func (tfr *TermFrequencyRow) ScanPrefixForField() []byte { + buf := make([]byte, 3) + buf[0] = 't' + binary.LittleEndian.PutUint16(buf[1:3], tfr.field) + return buf +} + +func (tfr *TermFrequencyRow) ScanPrefixForFieldTermPrefix() []byte { + buf := make([]byte, 3+len(tfr.term)) + buf[0] = 't' + binary.LittleEndian.PutUint16(buf[1:3], tfr.field) + copy(buf[3:], tfr.term) + return buf +} + +func (tfr *TermFrequencyRow) ScanPrefixForFieldTerm() []byte { + buf := make([]byte, 3+len(tfr.term)+1) + buf[0] = 't' + binary.LittleEndian.PutUint16(buf[1:3], tfr.field) + termLen := copy(buf[3:], tfr.term) + buf[3+termLen] = ByteSeparator + return buf +} + +func (tfr *TermFrequencyRow) Key() []byte { + buf := make([]byte, tfr.KeySize()) + size, _ := tfr.KeyTo(buf) + return buf[:size] +} + +func (tfr *TermFrequencyRow) KeySize() int { + return 3 + len(tfr.term) + 1 + len(tfr.docNumber) +} + +func (tfr *TermFrequencyRow) KeyTo(buf []byte) (int, error) { + buf[0] = 't' + binary.LittleEndian.PutUint16(buf[1:3], tfr.field) + termLen := copy(buf[3:], tfr.term) + buf[3+termLen] = ByteSeparator + docLen := copy(buf[3+termLen+1:], tfr.docNumber) + return 3 + termLen + 1 + docLen, nil +} + +func (tfr *TermFrequencyRow) DictionaryRowKey() []byte { + dr := NewDictionaryRow(tfr.term, tfr.field, 0) + return dr.Key() +} + +func (tfr *TermFrequencyRow) DictionaryRowKeySize() int { + dr := NewDictionaryRow(tfr.term, tfr.field, 0) + return dr.KeySize() +} + +func (tfr *TermFrequencyRow) DictionaryRowKeyTo(buf []byte) (int, error) { + dr := NewDictionaryRow(tfr.term, tfr.field, 0) + return dr.KeyTo(buf) +} + +func (tfr *TermFrequencyRow) Value() []byte { + buf := make([]byte, tfr.ValueSize()) + size, _ := tfr.ValueTo(buf) + return buf[:size] +} + +func (tfr *TermFrequencyRow) ValueSize() int { + bufLen := binary.MaxVarintLen64 + binary.MaxVarintLen64 + for _, vector := range tfr.vectors { + bufLen += (binary.MaxVarintLen64 * 4) + (1+len(vector.arrayPositions))*binary.MaxVarintLen64 + } + return bufLen +} + +func (tfr *TermFrequencyRow) ValueTo(buf []byte) (int, error) { + used := binary.PutUvarint(buf[:binary.MaxVarintLen64], tfr.freq) + + normuint32 := math.Float32bits(tfr.norm) + newbuf := buf[used : used+binary.MaxVarintLen64] + used += binary.PutUvarint(newbuf, uint64(normuint32)) + + for _, vector := range tfr.vectors { + used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(vector.field)) + used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.pos) + used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.start) + used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], vector.end) + used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], uint64(len(vector.arrayPositions))) + for _, arrayPosition := range vector.arrayPositions { + used += binary.PutUvarint(buf[used:used+binary.MaxVarintLen64], arrayPosition) + } + } + return used, nil +} + 
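+// The value written by ValueTo above, and parsed back by parseV below, is a
+// flat sequence of uvarints: the frequency, then the norm (stored as the raw
+// math.Float32bits of the float32), then one group of
+// field/pos/start/end/len(arrayPositions)/arrayPositions... per term vector.
+// A minimal sketch of decoding just the leading stats:
+//
+//    freq, n := binary.Uvarint(value)
+//    normBits, _ := binary.Uvarint(value[n:])
+//    norm := math.Float32frombits(uint32(normBits))
+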
+func (tfr *TermFrequencyRow) String() string { + _, dn, _ := DecodeUvarintAscending(tfr.docNumber) + return fmt.Sprintf("Term: `%s` Field: %d Document: %d Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, dn, tfr.freq, tfr.norm, tfr.vectors) +} + +func NewTermFrequencyRow(term []byte, field uint16, docNum uint64, freq uint64, norm float32) *TermFrequencyRow { + return &TermFrequencyRow{ + term: term, + field: field, + docNumber: EncodeUvarintAscending(nil, docNum), + freq: freq, + norm: norm, + } +} + +func TermFrequencyRowStart(term []byte, field uint16, docNum []byte) *TermFrequencyRow { + return &TermFrequencyRow{ + term: term, + field: field, + docNumber: docNum, + freq: 0, + norm: 0, + } +} + +func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docNum uint64, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow { + return &TermFrequencyRow{ + term: term, + field: field, + docNumber: EncodeUvarintAscending(nil, docNum), + freq: freq, + norm: norm, + vectors: vectors, + } +} + +func NewTermFrequencyRowK(key []byte) (*TermFrequencyRow, error) { + rv := &TermFrequencyRow{} + err := rv.parseK(key) + if err != nil { + return nil, err + } + return rv, nil +} + +func (tfr *TermFrequencyRow) parseK(key []byte) error { + keyLen := len(key) + if keyLen < 3 { + return fmt.Errorf("invalid term frequency key, no valid field") + } + tfr.field = binary.LittleEndian.Uint16(key[1:3]) + + termEndPos := bytes.IndexByte(key[3:], ByteSeparator) + if termEndPos < 0 { + return fmt.Errorf("invalid term frequency key, no byte separator terminating term") + } + tfr.term = key[3 : 3+termEndPos] + + docNumberLen := keyLen - (3 + termEndPos + 1) + if docNumberLen < 1 { + return fmt.Errorf("invalid term frequency key, empty docnum") + } + tfr.docNumber = key[3+termEndPos+1:] + + return nil +} + +func (tfr *TermFrequencyRow) parseKDoc(key []byte) error { + termEndPos := bytes.IndexByte(key[3:], ByteSeparator) + if termEndPos < 0 { + return fmt.Errorf("invalid term frequency key, no byte separator terminating term") + } + + tfr.docNumber = key[3+termEndPos+1:] + if len(tfr.docNumber) <= 0 { + return fmt.Errorf("invalid term frequency key, empty docnum") + } + + return nil +} + +func (tfr *TermFrequencyRow) parseV(value []byte) error { + var bytesRead int + tfr.freq, bytesRead = binary.Uvarint(value) + if bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, invalid frequency") + } + currOffset := bytesRead + + var norm uint64 + norm, bytesRead = binary.Uvarint(value[currOffset:]) + if bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, no norm") + } + currOffset += bytesRead + + tfr.norm = math.Float32frombits(uint32(norm)) + + tfr.vectors = nil + var field uint64 + field, bytesRead = binary.Uvarint(value[currOffset:]) + for bytesRead > 0 { + currOffset += bytesRead + tv := TermVector{} + tv.field = uint16(field) + // at this point we expect at least one term vector + if tfr.vectors == nil { + tfr.vectors = make([]*TermVector, 0) + } + + tv.pos, bytesRead = binary.Uvarint(value[currOffset:]) + if bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, vector contains no position") + } + currOffset += bytesRead + + tv.start, bytesRead = binary.Uvarint(value[currOffset:]) + if bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, vector contains no start") + } + currOffset += bytesRead + + tv.end, bytesRead = binary.Uvarint(value[currOffset:]) + if bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, 
vector contains no end") + } + currOffset += bytesRead + + var arrayPositionsLen uint64 = 0 + arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:]) + if bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen") + } + currOffset += bytesRead + + if arrayPositionsLen > 0 { + tv.arrayPositions = make([]uint64, arrayPositionsLen) + for i := 0; uint64(i) < arrayPositionsLen; i++ { + tv.arrayPositions[i], bytesRead = binary.Uvarint(value[currOffset:]) + if bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i) + } + currOffset += bytesRead + } + } + + tfr.vectors = append(tfr.vectors, &tv) + // try to read next record (may not exist) + field, bytesRead = binary.Uvarint(value[currOffset:]) + } + if len(value[currOffset:]) > 0 && bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, vector field invalid") + } + + return nil +} + +func NewTermFrequencyRowKV(key, value []byte) (*TermFrequencyRow, error) { + rv, err := NewTermFrequencyRowK(key) + if err != nil { + return nil, err + } + + err = rv.parseV(value) + if err != nil { + return nil, err + } + return rv, nil + +} + +type BackIndexRow struct { + docNumber []byte + termEntries []*BackIndexTermEntry + storedEntries []*BackIndexStoreEntry +} + +func (br *BackIndexRow) AllTermKeys() [][]byte { + if br == nil { + return nil + } + + rv := make([][]byte, len(br.termEntries)) + for i, termEntry := range br.termEntries { + termRow := TermFrequencyRowStart([]byte(termEntry.GetTerm()), uint16(termEntry.GetField()), br.docNumber) + rv[i] = termRow.Key() + } + return rv +} + +func (br *BackIndexRow) AllStoredKeys() [][]byte { + if br == nil { + return nil + } + rv := make([][]byte, len(br.storedEntries)) + for i, storedEntry := range br.storedEntries { + storedRow := NewStoredRowDocBytes(br.docNumber, uint16(storedEntry.GetField()), storedEntry.GetArrayPositions(), 'x', []byte{}) + rv[i] = storedRow.Key() + } + return rv +} + +func (br *BackIndexRow) Key() []byte { + buf := make([]byte, br.KeySize()) + size, _ := br.KeyTo(buf) + return buf[:size] +} + +func (br *BackIndexRow) KeySize() int { + return len(br.docNumber) + 1 +} + +func (br *BackIndexRow) KeyTo(buf []byte) (int, error) { + buf[0] = 'b' + used := copy(buf[1:], br.docNumber) + return used + 1, nil +} + +func (br *BackIndexRow) Value() []byte { + buf := make([]byte, br.ValueSize()) + size, _ := br.ValueTo(buf) + return buf[:size] +} + +func (br *BackIndexRow) ValueSize() int { + birv := &BackIndexRowValue{ + TermEntries: br.termEntries, + StoredEntries: br.storedEntries, + } + return birv.Size() +} + +func (br *BackIndexRow) ValueTo(buf []byte) (int, error) { + birv := &BackIndexRowValue{ + TermEntries: br.termEntries, + StoredEntries: br.storedEntries, + } + return birv.MarshalTo(buf) +} + +func (br *BackIndexRow) String() string { + _, dn, _ := DecodeUvarintAscending(br.docNumber) + return fmt.Sprintf("Backindex Document: %d Term Entries: %v, Stored Entries: %v", dn, br.termEntries, br.storedEntries) +} + +func NewBackIndexRow(docNum uint64, entries []*BackIndexTermEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow { + return &BackIndexRow{ + docNumber: EncodeUvarintAscending(nil, docNum), + termEntries: entries, + storedEntries: storedFields, + } +} + +func BackIndexRowKey(docNum []byte) []byte { + var buf = []byte{'b'} + buf = append(buf, docNum...) 
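+ // note: doc numbers are written with EncodeUvarintAscending, which is
+ // assumed to preserve numeric order bytewise, so the doc id readers in
+ // reader.go can range scan these keys between two encoded numbers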
+ return buf +} + +func NewBackIndexRowKV(key, value []byte) (*BackIndexRow, error) { + rv := BackIndexRow{} + + // buf := bytes.NewBuffer(key) + // _, err := buf.ReadByte() // type + // if err != nil { + // return nil, err + // } + // + // rv.docNumber, err = buf.ReadBytes(ByteSeparator) + // if err == io.EOF && len(rv.doc) < 1 { + // err = fmt.Errorf("invalid doc length 0 - % x", key) + // } + // if err != nil && err != io.EOF { + // return nil, err + // } else if err == nil { + // rv.doc = rv.doc[:len(rv.doc)-1] // trim off separator byte + // } + + //rv.docNumber = key[1:] + rv.docNumber = append(rv.docNumber, key[1:]...) + + var birv BackIndexRowValue + err := proto.Unmarshal(value, &birv) + if err != nil { + return nil, err + } + rv.termEntries = birv.TermEntries + rv.storedEntries = birv.StoredEntries + + return &rv, nil +} + +// STORED + +type StoredRow struct { + docNumber []byte + field uint16 + arrayPositions []uint64 + typ byte + value []byte +} + +func (s *StoredRow) Key() []byte { + buf := make([]byte, s.KeySize()) + size, _ := s.KeyTo(buf) + return buf[0:size] +} + +func (s *StoredRow) KeySize() int { + return 1 + len(s.docNumber) + 1 + 2 + (binary.MaxVarintLen64 * len(s.arrayPositions)) +} + +func (s *StoredRow) KeyTo(buf []byte) (int, error) { + docNumLen := len(s.docNumber) + buf[0] = 's' + copy(buf[1:], s.docNumber) + buf[1+docNumLen] = ByteSeparator // FIXME can we remove this ByteSeparator? we know the length of the docnum + binary.LittleEndian.PutUint16(buf[1+docNumLen+1:], s.field) + bytesUsed := 1 + docNumLen + 1 + 2 + for _, arrayPosition := range s.arrayPositions { + varbytes := binary.PutUvarint(buf[bytesUsed:], arrayPosition) + bytesUsed += varbytes + } + return bytesUsed, nil +} + +func (s *StoredRow) Value() []byte { + buf := make([]byte, s.ValueSize()) + size, _ := s.ValueTo(buf) + return buf[:size] +} + +func (s *StoredRow) ValueSize() int { + return len(s.value) + 1 +} + +func (s *StoredRow) ValueTo(buf []byte) (int, error) { + buf[0] = s.typ + used := copy(buf[1:], s.value) + return used + 1, nil +} + +func (s *StoredRow) String() string { + _, dn, _ := DecodeUvarintAscending(s.docNumber) + return fmt.Sprintf("Document: %d Field %d, Array Positions: %v, Type: %s Value: %s", dn, s.field, s.arrayPositions, string(s.typ), s.value) +} + +func (s *StoredRow) ScanPrefixForDoc() []byte { + docLen := len(s.docNumber) + buf := make([]byte, 1+docLen+1) + buf[0] = 's' + copy(buf[1:], s.docNumber) + buf[1+docLen] = ByteSeparator + return buf +} + +func NewStoredRow(docNum uint64, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow { + return &StoredRow{ + docNumber: EncodeUvarintAscending(nil, docNum), + field: field, + arrayPositions: arrayPositions, + typ: typ, + value: value, + } +} + +// FIXME consolidate +func NewStoredRowDocBytes(docNum []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow { + return &StoredRow{ + docNumber: docNum, + field: field, + arrayPositions: arrayPositions, + typ: typ, + value: value, + } +} + +func NewStoredRowK(key []byte) (*StoredRow, error) { + rv := StoredRow{} + + buf := bytes.NewBuffer(key) + _, err := buf.ReadByte() // type + if err != nil { + return nil, err + } + + rv.docNumber, err = buf.ReadBytes(ByteSeparator) + if len(rv.docNumber) < 2 { // 1 for min doc id length, 1 for separator + err = fmt.Errorf("invalid doc length 0") + return nil, err + } + + rv.docNumber = rv.docNumber[:len(rv.docNumber)-1] // trim off separator byte + + err = binary.Read(buf, binary.LittleEndian, 
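+ // the two bytes following the doc number separator hold the
+ // little-endian field index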
&rv.field) + if err != nil { + return nil, err + } + + rv.arrayPositions = make([]uint64, 0) + nextArrayPos, err := binary.ReadUvarint(buf) + for err == nil { + rv.arrayPositions = append(rv.arrayPositions, nextArrayPos) + nextArrayPos, err = binary.ReadUvarint(buf) + } + return &rv, nil +} + +func NewStoredRowKV(key, value []byte) (*StoredRow, error) { + rv, err := NewStoredRowK(key) + if err != nil { + return nil, err + } + rv.typ = value[0] + rv.value = value[1:] + return rv, nil +} diff --git a/index/smolder/row_merge.go b/index/smolder/row_merge.go new file mode 100644 index 00000000..a4bdfe28 --- /dev/null +++ b/index/smolder/row_merge.go @@ -0,0 +1,71 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "encoding/binary" +) + +var mergeOperator smolderingMerge + +var dictionaryTermIncr []byte +var dictionaryTermDecr []byte + +func init() { + dictionaryTermIncr = make([]byte, 8) + binary.LittleEndian.PutUint64(dictionaryTermIncr, uint64(1)) + dictionaryTermDecr = make([]byte, 8) + var negOne = int64(-1) + binary.LittleEndian.PutUint64(dictionaryTermDecr, uint64(negOne)) +} + +type smolderingMerge struct{} + +func (m *smolderingMerge) FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) { + // set up record based on key + dr, err := NewDictionaryRowK(key) + if err != nil { + return nil, false + } + if len(existingValue) > 0 { + // if existing value, parse it + err = dr.parseDictionaryV(existingValue) + if err != nil { + return nil, false + } + } + + // now process operands + for _, operand := range operands { + next := int64(binary.LittleEndian.Uint64(operand)) + if next < 0 && uint64(-next) > dr.count { + // subtracting next from existing would overflow + dr.count = 0 + } else if next < 0 { + dr.count -= uint64(-next) + } else { + dr.count += uint64(next) + } + } + + return dr.Value(), true +} + +func (m *smolderingMerge) PartialMerge(key, leftOperand, rightOperand []byte) ([]byte, bool) { + left := int64(binary.LittleEndian.Uint64(leftOperand)) + right := int64(binary.LittleEndian.Uint64(rightOperand)) + rv := make([]byte, 8) + binary.LittleEndian.PutUint64(rv, uint64(left+right)) + return rv, true +} + +func (m *smolderingMerge) Name() string { + return "smolderingMerge" +} diff --git a/index/smolder/row_merge_test.go b/index/smolder/row_merge_test.go new file mode 100644 index 00000000..020fc8a8 --- /dev/null +++ b/index/smolder/row_merge_test.go @@ -0,0 +1,52 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. 
See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "bytes" + "encoding/binary" + "testing" +) + +func TestPartialMerge(t *testing.T) { + + tests := []struct { + in [][]byte + out uint64 + }{ + { + in: [][]byte{dictionaryTermIncr, dictionaryTermIncr, dictionaryTermIncr, dictionaryTermIncr, dictionaryTermIncr}, + out: 5, + }, + } + + mo := &smolderingMerge{} + for _, test := range tests { + curr := test.in[0] + for _, next := range test.in[1:] { + var ok bool + curr, ok = mo.PartialMerge([]byte("key"), curr, next) + if !ok { + t.Errorf("expected partial merge ok") + } + } + actual := decodeCount(curr) + if actual != test.out { + t.Errorf("expected %d, got %d", test.out, actual) + } + } + +} + +func decodeCount(in []byte) uint64 { + buf := bytes.NewBuffer(in) + count, _ := binary.ReadUvarint(buf) + return count +} diff --git a/index/smolder/row_test.go b/index/smolder/row_test.go new file mode 100644 index 00000000..5d78e75b --- /dev/null +++ b/index/smolder/row_test.go @@ -0,0 +1,357 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "math" + "reflect" + "testing" + + "github.com/golang/protobuf/proto" +) + +func TestRows(t *testing.T) { + tests := []struct { + input SmolderingCouchRow + outKey []byte + outVal []byte + }{ + { + NewVersionRow(1), + []byte{'v'}, + []byte{0x1}, + }, + { + NewFieldRow(0, "name"), + []byte{'f', 0, 0}, + []byte{'n', 'a', 'm', 'e', ByteSeparator}, + }, + { + NewFieldRow(1, "desc"), + []byte{'f', 1, 0}, + []byte{'d', 'e', 's', 'c', ByteSeparator}, + }, + { + NewFieldRow(513, "style"), + []byte{'f', 1, 2}, + []byte{'s', 't', 'y', 'l', 'e', ByteSeparator}, + }, + { + NewDictionaryRow([]byte{'b', 'e', 'e', 'r'}, 0, 27), + []byte{'d', 0, 0, 'b', 'e', 'e', 'r'}, + []byte{27}, + }, + { + NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, 1, 3, 3.14), + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137}, + []byte{3, 195, 235, 163, 130, 4}, + }, + { + NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, 1, 3, 3.14), + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137}, + []byte{3, 195, 235, 163, 130, 4}, + }, + { + NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, 1, 3, 3.14, []*TermVector{{field: 0, pos: 1, start: 3, end: 11}, {field: 0, pos: 2, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}), + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137}, + []byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0}, + }, + // test larger varints + { + NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, 1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11}, {field: 0, pos: 2198, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}), + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137}, + []byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0}, + }, + // test vectors with 
arrayPositions + { + NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, 1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, {field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, {field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}), + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137}, + []byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5}, + }, + { + NewBackIndexRow(1, []*BackIndexTermEntry{{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil), + []byte{'b', 137}, + []byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0}, + }, + { + NewBackIndexRow(1, []*BackIndexTermEntry{{Term: proto.String("beer"), Field: proto.Uint32(0)}, {Term: proto.String("beat"), Field: proto.Uint32(1)}}, nil), + []byte{'b', 137}, + []byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1}, + }, + { + NewBackIndexRow(1, []*BackIndexTermEntry{{Term: proto.String("beer"), Field: proto.Uint32(0)}, {Term: proto.String("beat"), Field: proto.Uint32(1)}}, []*BackIndexStoreEntry{{Field: proto.Uint32(3)}, {Field: proto.Uint32(4)}, {Field: proto.Uint32(5)}}), + []byte{'b', 137}, + []byte{10, 8, 10, 4, 'b', 'e', 'e', 'r', 16, 0, 10, 8, 10, 4, 'b', 'e', 'a', 't', 16, 1, 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5}, + }, + { + NewStoredRow(1, 0, []uint64{}, byte('t'), []byte("an american beer")), + []byte{'s', 137, ByteSeparator, 0, 0}, + []byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'}, + }, + { + NewStoredRow(1, 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")), + []byte{'s', 137, ByteSeparator, 0, 0, 2, 166, 2, 134, 24}, + []byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'}, + }, + { + NewInternalRow([]byte("mapping"), []byte(`{"mapping":"json content"}`)), + []byte{'i', 'm', 'a', 'p', 'p', 'i', 'n', 'g'}, + []byte{'{', '"', 'm', 'a', 'p', 'p', 'i', 'n', 'g', '"', ':', '"', 'j', 's', 'o', 'n', ' ', 'c', 'o', 'n', 't', 'e', 'n', 't', '"', '}'}, + }, + } + + // test going from struct to k/v bytes + for i, test := range tests { + rk := test.input.Key() + if !reflect.DeepEqual(rk, test.outKey) { + t.Errorf("Expected key to be %v got: %v", test.outKey, rk) + } + rv := test.input.Value() + if !reflect.DeepEqual(rv, test.outVal) { + t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i) + } + } + + // now test going back from k/v bytes to struct + for i, test := range tests { + row, err := ParseFromKeyValue(test.outKey, test.outVal) + if err != nil { + t.Errorf("error parsking key/value: %v", err) + } + if !reflect.DeepEqual(row, test.input) { + t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i) + } + } + +} + +func TestInvalidRows(t *testing.T) { + tests := []struct { + key []byte + val []byte + }{ + // empty key + { + []byte{}, + []byte{}, + }, + // no such type q + { + []byte{'q'}, + []byte{}, + }, + // type v, invalid empty value + { + []byte{'v'}, + []byte{}, + }, + // type f, invalid key + { + []byte{'f'}, + []byte{}, + }, + // type f, valid key, invalid value + { + []byte{'f', 0, 0}, + []byte{}, + }, + // type t, invalid key (missing field) + { + []byte{'t'}, + []byte{}, + }, + // type t, invalid key (missing term) + { + []byte{'t', 0, 0}, + []byte{}, + }, + // type t, invalid key (missing id) + { + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator}, + []byte{}, + }, + // type t, invalid val 
(missing freq) + { + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{}, + }, + // type t, invalid val (missing norm) + { + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{3}, + }, + // type t, invalid val (half missing tv field, full missing is valid (no term vectors)) + { + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{3, 25, 255}, + }, + // type t, invalid val (missing tv pos) + { + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{3, 25, 0}, + }, + // type t, invalid val (missing tv start) + { + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{3, 25, 0, 0}, + }, + // type t, invalid val (missing tv end) + { + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{3, 25, 0, 0, 0}, + }, + // type b, invalid key (missing id) + { + []byte{'b'}, + []byte{'b', 'e', 'e', 'r', ByteSeparator, 0, 0}, + }, + // type b, invalid val (missing field) + { + []byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{'g', 'a', 'r', 'b', 'a', 'g', 'e'}, + }, + // type s, invalid key (missing id) + { + []byte{'s'}, + []byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'}, + }, + // type b, invalid val (missing field) + { + []byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator}, + []byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'}, + }, + } + + for _, test := range tests { + _, err := ParseFromKeyValue(test.key, test.val) + if err == nil { + t.Errorf("expected error, got nil") + } + } +} + +func TestDictionaryRowValueBug197(t *testing.T) { + // this was the smallest value that would trigger a crash + dr := &DictionaryRow{ + field: 0, + term: []byte("marty"), + count: 72057594037927936, + } + dr.Value() + // this is the maximum possible value + dr = &DictionaryRow{ + field: 0, + term: []byte("marty"), + count: math.MaxUint64, + } + dr.Value() + // neither of these should panic +} + +func BenchmarkTermFrequencyRowEncode(b *testing.B) { + row := NewTermFrequencyRowWithTermVectors( + []byte{'b', 'e', 'e', 'r'}, + 0, + 1, + 3, + 3.14, + []*TermVector{ + { + field: 0, + pos: 1, + start: 3, + end: 11, + }, + { + field: 0, + pos: 2, + start: 23, + end: 31, + }, + { + field: 0, + pos: 3, + start: 43, + end: 51, + }, + }) + b.ResetTimer() + for i := 0; i < b.N; i++ { + row.Key() + row.Value() + } +} + +func BenchmarkTermFrequencyRowDecode(b *testing.B) { + k := []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'} + v := []byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0} + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := NewTermFrequencyRowKV(k, v) + if err != nil { + b.Fatal(err) + } + } +} + +func BenchmarkBackIndexRowEncode(b *testing.B) { + field := uint32(1) + t1 := "term1" + row := NewBackIndexRow(1, + []*BackIndexTermEntry{ + { + Term: &t1, + Field: &field, + }, + }, + []*BackIndexStoreEntry{ + { + Field: &field, + }, + }) + b.ResetTimer() + for i := 0; i < b.N; i++ { + row.Key() + row.Value() + } +} + +func BenchmarkBackIndexRowDecode(b *testing.B) { + k := []byte{0x62, 0x62, 0x65, 0x65, 0x72, 0x6e, 0x61, 0x6d, 0x65} + v := []byte{0x0a, 0x09, 0x0a, 0x05, 0x74, 0x65, 0x72, 0x6d, 
0x31, 0x10, 0x01, 0x12, 0x02, 0x08, 0x01} + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := NewBackIndexRowKV(k, v) + if err != nil { + b.Fatal(err) + } + } +} + +func BenchmarkStoredRowEncode(b *testing.B) { + row := NewStoredRow(1, 0, []uint64{}, byte('t'), []byte("an american beer")) + b.ResetTimer() + for i := 0; i < b.N; i++ { + row.Key() + row.Value() + } +} + +func BenchmarkStoredRowDecode(b *testing.B) { + k := []byte{'s', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r', ByteSeparator, 0, 0} + v := []byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'} + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := NewStoredRowKV(k, v) + if err != nil { + b.Fatal(err) + } + } +} diff --git a/index/smolder/smolder.pb.go b/index/smolder/smolder.pb.go new file mode 100644 index 00000000..06d20cbf --- /dev/null +++ b/index/smolder/smolder.pb.go @@ -0,0 +1,684 @@ +// Code generated by protoc-gen-gogo. +// source: smolder.proto +// DO NOT EDIT! + +/* +Package smolder is a generated protocol buffer package. + +It is generated from these files: + smolder.proto + +It has these top-level messages: + BackIndexTermEntry + BackIndexStoreEntry + BackIndexRowValue +*/ +package smolder + +import proto "github.com/golang/protobuf/proto" +import math "math" + +import io "io" +import fmt "fmt" +import github_com_golang_protobuf_proto "github.com/golang/protobuf/proto" + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = math.Inf + +type BackIndexTermEntry struct { + Term *string `protobuf:"bytes,1,req,name=term" json:"term,omitempty"` + Field *uint32 `protobuf:"varint,2,req,name=field" json:"field,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *BackIndexTermEntry) Reset() { *m = BackIndexTermEntry{} } +func (m *BackIndexTermEntry) String() string { return proto.CompactTextString(m) } +func (*BackIndexTermEntry) ProtoMessage() {} + +func (m *BackIndexTermEntry) GetTerm() string { + if m != nil && m.Term != nil { + return *m.Term + } + return "" +} + +func (m *BackIndexTermEntry) GetField() uint32 { + if m != nil && m.Field != nil { + return *m.Field + } + return 0 +} + +type BackIndexStoreEntry struct { + Field *uint32 `protobuf:"varint,1,req,name=field" json:"field,omitempty"` + ArrayPositions []uint64 `protobuf:"varint,2,rep,name=arrayPositions" json:"arrayPositions,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *BackIndexStoreEntry) Reset() { *m = BackIndexStoreEntry{} } +func (m *BackIndexStoreEntry) String() string { return proto.CompactTextString(m) } +func (*BackIndexStoreEntry) ProtoMessage() {} + +func (m *BackIndexStoreEntry) GetField() uint32 { + if m != nil && m.Field != nil { + return *m.Field + } + return 0 +} + +func (m *BackIndexStoreEntry) GetArrayPositions() []uint64 { + if m != nil { + return m.ArrayPositions + } + return nil +} + +type BackIndexRowValue struct { + TermEntries []*BackIndexTermEntry `protobuf:"bytes,1,rep,name=termEntries" json:"termEntries,omitempty"` + StoredEntries []*BackIndexStoreEntry `protobuf:"bytes,2,rep,name=storedEntries" json:"storedEntries,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *BackIndexRowValue) Reset() { *m = BackIndexRowValue{} } +func (m *BackIndexRowValue) String() string { return proto.CompactTextString(m) } +func (*BackIndexRowValue) ProtoMessage() {} + +func (m *BackIndexRowValue) GetTermEntries() []*BackIndexTermEntry { + if m != nil { + return m.TermEntries + } + return nil +} + +func (m 
*BackIndexRowValue) GetStoredEntries() []*BackIndexStoreEntry { + if m != nil { + return m.StoredEntries + } + return nil +} + +func (m *BackIndexTermEntry) Unmarshal(data []byte) error { + var hasFields [1]uint64 + l := len(data) + iNdEx := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Term", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + stringLen |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + postIndex := iNdEx + int(stringLen) + if postIndex > l { + return io.ErrUnexpectedEOF + } + s := string(data[iNdEx:postIndex]) + m.Term = &s + iNdEx = postIndex + hasFields[0] |= uint64(0x00000001) + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Field", wireType) + } + var v uint32 + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + v |= (uint32(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + m.Field = &v + hasFields[0] |= uint64(0x00000002) + default: + var sizeOfWire int + for { + sizeOfWire++ + wire >>= 7 + if wire == 0 { + break + } + } + iNdEx -= sizeOfWire + skippy, err := skipSmolder(data[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthSmolder + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + if hasFields[0]&uint64(0x00000001) == 0 { + return new(github_com_golang_protobuf_proto.RequiredNotSetError) + } + if hasFields[0]&uint64(0x00000002) == 0 { + return new(github_com_golang_protobuf_proto.RequiredNotSetError) + } + + return nil +} +func (m *BackIndexStoreEntry) Unmarshal(data []byte) error { + var hasFields [1]uint64 + l := len(data) + iNdEx := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Field", wireType) + } + var v uint32 + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + v |= (uint32(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + m.Field = &v + hasFields[0] |= uint64(0x00000001) + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field ArrayPositions", wireType) + } + var v uint64 + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + v |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + m.ArrayPositions = append(m.ArrayPositions, v) + default: + var sizeOfWire int + for { + sizeOfWire++ + wire >>= 7 + if wire == 0 { + break + } + } + iNdEx -= sizeOfWire + skippy, err := skipSmolder(data[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthSmolder + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + if hasFields[0]&uint64(0x00000001) == 0 { + return new(github_com_golang_protobuf_proto.RequiredNotSetError) + } + + return nil +} +func (m *BackIndexRowValue) Unmarshal(data []byte) error { + l := len(data) + iNdEx := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field TermEntries", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + msglen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + postIndex := iNdEx + msglen + if msglen < 0 { + return ErrInvalidLengthSmolder + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.TermEntries = append(m.TermEntries, &BackIndexTermEntry{}) + if err := m.TermEntries[len(m.TermEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field StoredEntries", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + msglen |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + postIndex := iNdEx + msglen + if msglen < 0 { + return ErrInvalidLengthSmolder + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.StoredEntries = append(m.StoredEntries, &BackIndexStoreEntry{}) + if err := m.StoredEntries[len(m.StoredEntries)-1].Unmarshal(data[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + var sizeOfWire int + for { + sizeOfWire++ + wire >>= 7 + if wire == 0 { + break + } + } + iNdEx -= sizeOfWire + skippy, err := skipSmolder(data[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthSmolder + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, data[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + + return nil +} +func skipSmolder(data []byte) (n int, err error) { + l := len(data) + iNdEx := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch wireType { + case 0: + for { + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if data[iNdEx-1] < 0x80 { + break + } + } + return iNdEx, nil + case 1: + iNdEx += 8 + return iNdEx, nil + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + iNdEx += length + if length < 0 { + return 0, ErrInvalidLengthSmolder + } + return iNdEx, nil + case 3: + for { + var innerWire uint64 + var start int = iNdEx + for shift := uint(0); ; shift += 7 { + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := data[iNdEx] + iNdEx++ + innerWire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + innerWireType := int(innerWire & 0x7) + if innerWireType == 4 { + break + } + next, err := skipSmolder(data[start:]) + if err != nil { + return 0, err + } + iNdEx = start + next + } + return iNdEx, nil + case 4: + return iNdEx, nil + case 5: + iNdEx += 4 + return iNdEx, nil + default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + } + panic("unreachable") +} + +var ( + ErrInvalidLengthSmolder = fmt.Errorf("proto: negative length found during unmarshaling") +) + +func (m *BackIndexTermEntry) Size() (n int) { + var l int + _ = l + if m.Term != nil { + l = len(*m.Term) + n += 1 + l + sovSmolder(uint64(l)) + } + if m.Field != nil { + n += 1 + sovSmolder(uint64(*m.Field)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *BackIndexStoreEntry) Size() (n int) { + var l int + _ = l + if m.Field != nil { + n += 1 + sovSmolder(uint64(*m.Field)) + } + if len(m.ArrayPositions) > 0 { + for _, e := range m.ArrayPositions { + n += 1 + sovSmolder(uint64(e)) + } + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *BackIndexRowValue) Size() (n int) { + var l int + _ = l + if len(m.TermEntries) > 0 { + for _, e := range m.TermEntries { + l = e.Size() + n += 1 + l + sovSmolder(uint64(l)) + } + } + if len(m.StoredEntries) > 0 { + for _, e := range m.StoredEntries { + l = e.Size() + n += 1 + l + sovSmolder(uint64(l)) + } + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func sovSmolder(x uint64) (n int) { + for { + n++ + x >>= 7 + if x == 0 { + break + } + } + return n +} +func sozSmolder(x uint64) (n int) { + return sovSmolder(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} +func (m *BackIndexTermEntry) Marshal() (data []byte, err error) { + size := m.Size() + data = make([]byte, size) + n, err := m.MarshalTo(data) + if err != nil { + return nil, err + } + return data[:n], nil +} + +func (m *BackIndexTermEntry) MarshalTo(data []byte) (n int, err error) { + var i int + _ = i + var l int + _ = l + if m.Term == nil { + return 0, new(github_com_golang_protobuf_proto.RequiredNotSetError) + } else { + data[i] = 0xa + i++ + i = encodeVarintSmolder(data, i, uint64(len(*m.Term))) + i += copy(data[i:], *m.Term) + } + if m.Field == nil { + return 0, new(github_com_golang_protobuf_proto.RequiredNotSetError) + } else { + 
data[i] = 0x10 + i++ + i = encodeVarintSmolder(data, i, uint64(*m.Field)) + } + if m.XXX_unrecognized != nil { + i += copy(data[i:], m.XXX_unrecognized) + } + return i, nil +} + +func (m *BackIndexStoreEntry) Marshal() (data []byte, err error) { + size := m.Size() + data = make([]byte, size) + n, err := m.MarshalTo(data) + if err != nil { + return nil, err + } + return data[:n], nil +} + +func (m *BackIndexStoreEntry) MarshalTo(data []byte) (n int, err error) { + var i int + _ = i + var l int + _ = l + if m.Field == nil { + return 0, new(github_com_golang_protobuf_proto.RequiredNotSetError) + } else { + data[i] = 0x8 + i++ + i = encodeVarintSmolder(data, i, uint64(*m.Field)) + } + if len(m.ArrayPositions) > 0 { + for _, num := range m.ArrayPositions { + data[i] = 0x10 + i++ + i = encodeVarintSmolder(data, i, uint64(num)) + } + } + if m.XXX_unrecognized != nil { + i += copy(data[i:], m.XXX_unrecognized) + } + return i, nil +} + +func (m *BackIndexRowValue) Marshal() (data []byte, err error) { + size := m.Size() + data = make([]byte, size) + n, err := m.MarshalTo(data) + if err != nil { + return nil, err + } + return data[:n], nil +} + +func (m *BackIndexRowValue) MarshalTo(data []byte) (n int, err error) { + var i int + _ = i + var l int + _ = l + if len(m.TermEntries) > 0 { + for _, msg := range m.TermEntries { + data[i] = 0xa + i++ + i = encodeVarintSmolder(data, i, uint64(msg.Size())) + n, err := msg.MarshalTo(data[i:]) + if err != nil { + return 0, err + } + i += n + } + } + if len(m.StoredEntries) > 0 { + for _, msg := range m.StoredEntries { + data[i] = 0x12 + i++ + i = encodeVarintSmolder(data, i, uint64(msg.Size())) + n, err := msg.MarshalTo(data[i:]) + if err != nil { + return 0, err + } + i += n + } + } + if m.XXX_unrecognized != nil { + i += copy(data[i:], m.XXX_unrecognized) + } + return i, nil +} + +func encodeFixed64Smolder(data []byte, offset int, v uint64) int { + data[offset] = uint8(v) + data[offset+1] = uint8(v >> 8) + data[offset+2] = uint8(v >> 16) + data[offset+3] = uint8(v >> 24) + data[offset+4] = uint8(v >> 32) + data[offset+5] = uint8(v >> 40) + data[offset+6] = uint8(v >> 48) + data[offset+7] = uint8(v >> 56) + return offset + 8 +} +func encodeFixed32Smolder(data []byte, offset int, v uint32) int { + data[offset] = uint8(v) + data[offset+1] = uint8(v >> 8) + data[offset+2] = uint8(v >> 16) + data[offset+3] = uint8(v >> 24) + return offset + 4 +} +func encodeVarintSmolder(data []byte, offset int, v uint64) int { + for v >= 1<<7 { + data[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + data[offset] = uint8(v) + return offset + 1 +} diff --git a/index/smolder/smolder.proto b/index/smolder/smolder.proto new file mode 100644 index 00000000..27f24ccb --- /dev/null +++ b/index/smolder/smolder.proto @@ -0,0 +1,14 @@ +message BackIndexTermEntry { + required string term = 1; + required uint32 field = 2; +} + +message BackIndexStoreEntry { + required uint32 field = 1; + repeated uint64 arrayPositions = 2; +} + +message BackIndexRowValue { + repeated BackIndexTermEntry termEntries = 1; + repeated BackIndexStoreEntry storedEntries = 2; +} \ No newline at end of file diff --git a/index/smolder/smoldering.go b/index/smolder/smoldering.go new file mode 100644 index 00000000..0465c815 --- /dev/null +++ b/index/smolder/smoldering.go @@ -0,0 +1,1107 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +//go:generate protoc --gofast_out=. smolder.proto + +package smolder + +import ( + "encoding/binary" + "encoding/json" + "fmt" + "math" + "sync" + "sync/atomic" + "time" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/registry" + + "github.com/golang/protobuf/proto" +) + +const Name = "smolder" + +// RowBufferSize should ideally be the smallest size that can +// contain an index row key and its corresponding value. It is +// not a hard limit; if need be, a larger buffer is allocated, +// but performance is best when *most* rows fit this size. +const RowBufferSize = 4 * 1024 + +var VersionKey = []byte{'v'} + +const Version uint8 = 5 + +var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version) + +type SmolderingCouch struct { + version uint8 + path string + storeName string + storeConfig map[string]interface{} + store store.KVStore + fieldCache *index.FieldCache + analysisQueue *index.AnalysisQueue + stats *indexStat + + m sync.RWMutex + // fields protected by m + docCount uint64 + + writeMutex sync.Mutex + + maxInternalDocID uint64 +} + +type docBackIndexRow struct { + docID index.IndexInternalID + doc *document.Document // If deletion, doc will be nil.
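+ // backIndexRow is the prior back index entry for this document; nil means there was no previous version (a fresh insert).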
+ backIndexRow *BackIndexRow +} + +func NewSmolderingCouch(storeName string, storeConfig map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) { + rv := &SmolderingCouch{ + version: Version, + fieldCache: index.NewFieldCache(), + storeName: storeName, + storeConfig: storeConfig, + analysisQueue: analysisQueue, + } + rv.stats = &indexStat{i: rv} + return rv, nil +} + +func (udc *SmolderingCouch) init(kvwriter store.KVWriter) (err error) { + // version marker + rowsAll := [][]SmolderingCouchRow{ + {NewVersionRow(udc.version)}, + {NewFieldRow(0, "_id")}, + } + + udc.fieldCache.AddExisting("_id", 0) + + err = udc.batchRows(kvwriter, nil, rowsAll, nil) + return +} + +func (udc *SmolderingCouch) loadSchema(kvreader store.KVReader) (err error) { + + it := kvreader.PrefixIterator([]byte{'f'}) + defer func() { + if cerr := it.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + key, val, valid := it.Current() + for valid { + var fieldRow *FieldRow + fieldRow, err = NewFieldRowKV(key, val) + if err != nil { + return + } + udc.fieldCache.AddExisting(fieldRow.name, fieldRow.index) + + it.Next() + key, val, valid = it.Current() + } + + val, err = kvreader.Get([]byte{'v'}) + if err != nil { + return + } + var vr *VersionRow + vr, err = NewVersionRowKV([]byte{'v'}, val) + if err != nil { + return + } + if vr.version != Version { + err = IncompatibleVersion + return + } + + return +} + +var rowBufferPool sync.Pool + +func GetRowBuffer() []byte { + if rb, ok := rowBufferPool.Get().([]byte); ok { + return rb + } else { + return make([]byte, RowBufferSize) + } +} + +func PutRowBuffer(buf []byte) { + rowBufferPool.Put(buf) +} + +func (udc *SmolderingCouch) batchRows(writer store.KVWriter, addRowsAll [][]SmolderingCouchRow, updateRowsAll [][]SmolderingCouchRow, deleteRowsAll [][]SmolderingCouchRow) (err error) { + dictionaryDeltas := make(map[string]int64) + + // count up bytes needed for buffering. 
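+ // Everything is sized up front so that NewBatchEx can hand back a single buffer + // that every Set/Delete/Merge below slices into, avoiding per-row allocations. + // Illustrative math (hypothetical sizes, not taken from this patch): one add row + // with KeySize()==24 and ValueSize()==8, plus one dictionary delta with a 10-byte + // key, would request TotalBytes = (24+8) + 2*(10+DictionaryRowMaxValueSize), + // matching the totBytes expression computed below.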
addNum := 0 + addKeyBytes := 0 + addValBytes := 0 + + updateNum := 0 + updateKeyBytes := 0 + updateValBytes := 0 + + deleteNum := 0 + deleteKeyBytes := 0 + + rowBuf := GetRowBuffer() + + for _, addRows := range addRowsAll { + for _, row := range addRows { + tfr, ok := row.(*TermFrequencyRow) + if ok { + if tfr.DictionaryRowKeySize() > len(rowBuf) { + rowBuf = make([]byte, tfr.DictionaryRowKeySize()) + } + dictKeySize, err := tfr.DictionaryRowKeyTo(rowBuf) + if err != nil { + return err + } + dictionaryDeltas[string(rowBuf[:dictKeySize])] += 1 + } + addKeyBytes += row.KeySize() + addValBytes += row.ValueSize() + } + addNum += len(addRows) + } + + for _, updateRows := range updateRowsAll { + for _, row := range updateRows { + updateKeyBytes += row.KeySize() + updateValBytes += row.ValueSize() + } + updateNum += len(updateRows) + } + + for _, deleteRows := range deleteRowsAll { + for _, row := range deleteRows { + tfr, ok := row.(*TermFrequencyRow) + if ok { + // need to decrement counter + if tfr.DictionaryRowKeySize() > len(rowBuf) { + rowBuf = make([]byte, tfr.DictionaryRowKeySize()) + } + dictKeySize, err := tfr.DictionaryRowKeyTo(rowBuf) + if err != nil { + return err + } + dictionaryDeltas[string(rowBuf[:dictKeySize])] -= 1 + } + deleteKeyBytes += row.KeySize() + } + deleteNum += len(deleteRows) + } + + PutRowBuffer(rowBuf) + + mergeNum := len(dictionaryDeltas) + mergeKeyBytes := 0 + mergeValBytes := mergeNum * DictionaryRowMaxValueSize + + for dictRowKey := range dictionaryDeltas { + mergeKeyBytes += len(dictRowKey) + } + + // prepare batch + totBytes := addKeyBytes + addValBytes + + updateKeyBytes + updateValBytes + + deleteKeyBytes + + 2*(mergeKeyBytes+mergeValBytes) + + buf, wb, err := writer.NewBatchEx(store.KVBatchOptions{ + TotalBytes: totBytes, + NumSets: addNum + updateNum, + NumDeletes: deleteNum, + NumMerges: mergeNum, + }) + if err != nil { + return err + } + defer func() { + _ = wb.Close() + }() + + // fill the batch + for _, addRows := range addRowsAll { + for _, row := range addRows { + keySize, err := row.KeyTo(buf) + if err != nil { + return err + } + valSize, err := row.ValueTo(buf[keySize:]) + if err != nil { + return err + } + wb.Set(buf[:keySize], buf[keySize:keySize+valSize]) + buf = buf[keySize+valSize:] + } + } + + for _, updateRows := range updateRowsAll { + for _, row := range updateRows { + keySize, err := row.KeyTo(buf) + if err != nil { + return err + } + valSize, err := row.ValueTo(buf[keySize:]) + if err != nil { + return err + } + wb.Set(buf[:keySize], buf[keySize:keySize+valSize]) + buf = buf[keySize+valSize:] + } + } + + for _, deleteRows := range deleteRowsAll { + for _, row := range deleteRows { + keySize, err := row.KeyTo(buf) + if err != nil { + return err + } + wb.Delete(buf[:keySize]) + buf = buf[keySize:] + } + } + + for dictRowKey, delta := range dictionaryDeltas { + dictRowKeyLen := copy(buf, dictRowKey) + binary.LittleEndian.PutUint64(buf[dictRowKeyLen:], uint64(delta)) + wb.Merge(buf[:dictRowKeyLen], buf[dictRowKeyLen:dictRowKeyLen+DictionaryRowMaxValueSize]) + buf = buf[dictRowKeyLen+DictionaryRowMaxValueSize:] + } + + // write out the batch + return writer.ExecuteBatch(wb) +} + +func (udc *SmolderingCouch) DocCount() (uint64, error) { + udc.m.RLock() + defer udc.m.RUnlock() + return udc.docCount, nil +} + +func (udc *SmolderingCouch) Open() (err error) { + // acquire the write mutex for the duration of Open() + udc.writeMutex.Lock() + defer udc.writeMutex.Unlock() +
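+ // A missing version row means a brand new store that must be initialized; + // otherwise the schema is loaded and the doc count and highest doc number + // are recomputed by scanning the back index.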
// open the kv store + storeConstructor := registry.KVStoreConstructorByName(udc.storeName) + if storeConstructor == nil { + err = index.ErrorUnknownStorageType + return + } + + // now open the store + udc.store, err = storeConstructor(&mergeOperator, udc.storeConfig) + if err != nil { + return + } + + // start a reader to look at the index + var kvreader store.KVReader + kvreader, err = udc.store.Reader() + if err != nil { + return + } + + var value []byte + value, err = kvreader.Get(VersionKey) + if err != nil { + _ = kvreader.Close() + return + } + + if value != nil { + err = udc.loadSchema(kvreader) + if err != nil { + _ = kvreader.Close() + return + } + + // set doc count + udc.m.Lock() + udc.docCount, udc.maxInternalDocID, err = udc.countDocs(kvreader) + udc.m.Unlock() + + err = kvreader.Close() + } else { + // new index, close the reader and open writer to init + err = kvreader.Close() + if err != nil { + return + } + + var kvwriter store.KVWriter + kvwriter, err = udc.store.Writer() + if err != nil { + return + } + defer func() { + if cerr := kvwriter.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + // init the index + err = udc.init(kvwriter) + } + + return +} + +func (udc *SmolderingCouch) countDocs(kvreader store.KVReader) (count, highDocNum uint64, err error) { + it := kvreader.PrefixIterator([]byte{'b'}) + defer func() { + if cerr := it.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + var lastValidK []byte + k, _, valid := it.Current() + for valid { + lastValidK = lastValidK[:0] + lastValidK = append(lastValidK, k...) + count++ + it.Next() + k, _, valid = it.Current() + } + + if lastValidK != nil { + // decode the doc number from the last (highest) back index key seen; + // k is no longer valid once the iterator is exhausted + _, highDocNum, err = DecodeUvarintAscending(lastValidK[1:]) + if err != nil { + return 0, 0, err + } + } + + return +} + +func (udc *SmolderingCouch) rowCount() (count uint64, err error) { + // start an isolated reader for use during the rowcount + kvreader, err := udc.store.Reader() + if err != nil { + return + } + defer func() { + if cerr := kvreader.Close(); err == nil && cerr != nil { + err = cerr + } + }() + it := kvreader.RangeIterator(nil, nil) + defer func() { + if cerr := it.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + _, _, valid := it.Current() + for valid { + count++ + it.Next() + _, _, valid = it.Current() + } + + return +} + +func (udc *SmolderingCouch) Close() error { + return udc.store.Close() +} + +func (udc *SmolderingCouch) Update(doc *document.Document) (err error) { + + // get the next available doc number + doc.Number = atomic.AddUint64(&udc.maxInternalDocID, 1) + + analysisStart := time.Now() + numPlainTextBytes := doc.NumPlainTextBytes() + resultChan := make(chan *index.AnalysisResult) + aw := index.NewAnalysisWork(udc, doc, resultChan) + + // put the work on the queue + udc.analysisQueue.Queue(aw) + + // wait for the result + result := <-resultChan + close(resultChan) + atomic.AddUint64(&udc.stats.analysisTime, uint64(time.Since(analysisStart))) + + udc.writeMutex.Lock() + defer udc.writeMutex.Unlock() + + indexReader, err := udc.Reader() + if err != nil { + return + } + + // look up the back index row for this doc id, if it exists + var backIndexRow *BackIndexRow + backIndexRow, err = udc.backIndexRowForDoc(indexReader, nil, doc.ID) + if err != nil { + _ = indexReader.Close() + atomic.AddUint64(&udc.stats.errors, 1) + return + } + + err = indexReader.Close() + if err != nil { + return + } + + // start a writer for this update + indexStart := time.Now() + var kvwriter store.KVWriter + kvwriter, err = udc.store.Writer() + if
err != nil { + return + } + defer func() { + if cerr := kvwriter.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + // prepare a list of rows + var addRowsAll [][]SmolderingCouchRow + var updateRowsAll [][]SmolderingCouchRow + var deleteRowsAll [][]SmolderingCouchRow + + addRows, updateRows, deleteRows := udc.mergeOldAndNew(doc.ID, backIndexRow, result.Rows) + if len(addRows) > 0 { + addRowsAll = append(addRowsAll, addRows) + } + if len(updateRows) > 0 { + updateRowsAll = append(updateRowsAll, updateRows) + } + if len(deleteRows) > 0 { + deleteRowsAll = append(deleteRowsAll, deleteRows) + } + + err = udc.batchRows(kvwriter, addRowsAll, updateRowsAll, deleteRowsAll) + if err == nil && backIndexRow == nil { + udc.m.Lock() + udc.docCount++ + udc.m.Unlock() + } + atomic.AddUint64(&udc.stats.indexTime, uint64(time.Since(indexStart))) + if err == nil { + atomic.AddUint64(&udc.stats.updates, 1) + atomic.AddUint64(&udc.stats.numPlainTextBytesIndexed, numPlainTextBytes) + } else { + atomic.AddUint64(&udc.stats.errors, 1) + } + return +} + +func (udc *SmolderingCouch) mergeOldAndNew(externalDocId string, backIndexRow *BackIndexRow, rows []index.IndexRow) (addRows []SmolderingCouchRow, updateRows []SmolderingCouchRow, deleteRows []SmolderingCouchRow) { + addRows = make([]SmolderingCouchRow, 0, len(rows)) + updateRows = make([]SmolderingCouchRow, 0, len(rows)) + deleteRows = make([]SmolderingCouchRow, 0, len(rows)) + + existingTermKeys := make(map[string]bool) + for _, key := range backIndexRow.AllTermKeys() { + existingTermKeys[string(key)] = true + } + + existingStoredKeys := make(map[string]bool) + for _, key := range backIndexRow.AllStoredKeys() { + existingStoredKeys[string(key)] = true + } + + keyBuf := GetRowBuffer() + for _, row := range rows { + switch row := row.(type) { + case *BackIndexRow: + if backIndexRow != nil { + row.docNumber = backIndexRow.docNumber + // look through the backindex and update the term entry for _id + for _, te := range row.termEntries { + if *te.Field == 0 { + te.Term = &externalDocId + } + } + } + updateRows = append(updateRows, row) + case *TermFrequencyRow: + if backIndexRow != nil { + // if this is the id term-freq-row, set the doc number from the back-index-row + // this could be different if we now know we're doing an update + //if row.field == 0 { + row.docNumber = backIndexRow.docNumber + //} + } + if row.KeySize() > len(keyBuf) { + keyBuf = make([]byte, row.KeySize()) + } + keySize, _ := row.KeyTo(keyBuf) + if _, ok := existingTermKeys[string(keyBuf[:keySize])]; ok { + updateRows = append(updateRows, row) + delete(existingTermKeys, string(keyBuf[:keySize])) + } else { + addRows = append(addRows, row) + } + case *StoredRow: + if backIndexRow != nil { + // if this is the id term-freq-row, set the doc number from the back-index-row + // this could be different if we now know we're doing an update + //if row.field == 0 { + row.docNumber = backIndexRow.docNumber + //} + } + if row.KeySize() > len(keyBuf) { + keyBuf = make([]byte, row.KeySize()) + } + keySize, _ := row.KeyTo(keyBuf) + if _, ok := existingStoredKeys[string(keyBuf[:keySize])]; ok { + updateRows = append(updateRows, row) + delete(existingStoredKeys, string(keyBuf[:keySize])) + } else { + addRows = append(addRows, row) + } + default: + updateRows = append(updateRows, row) + } + } + PutRowBuffer(keyBuf) + + // any of the existing rows that weren't updated need to be deleted + for existingTermKey := range existingTermKeys { + termFreqRow, err := 
NewTermFrequencyRowK([]byte(existingTermKey)) + if err == nil { + deleteRows = append(deleteRows, termFreqRow) + } + } + + // any of the existing stored fields that weren't updated need to be deleted + for existingStoredKey := range existingStoredKeys { + storedRow, err := NewStoredRowK([]byte(existingStoredKey)) + if err == nil { + deleteRows = append(deleteRows, storedRow) + } + } + + return addRows, updateRows, deleteRows +} + +func (udc *SmolderingCouch) storeField(docNum uint64, field document.Field, fieldIndex uint16, rows []index.IndexRow, backIndexStoredEntries []*BackIndexStoreEntry) ([]index.IndexRow, []*BackIndexStoreEntry) { + fieldType := encodeFieldType(field) + storedRow := NewStoredRow(docNum, fieldIndex, field.ArrayPositions(), fieldType, field.Value()) + + // record the back index entry + backIndexStoredEntry := BackIndexStoreEntry{Field: proto.Uint32(uint32(fieldIndex)), ArrayPositions: field.ArrayPositions()} + + return append(rows, storedRow), append(backIndexStoredEntries, &backIndexStoredEntry) +} + +func encodeFieldType(f document.Field) byte { + fieldType := byte('x') + switch f.(type) { + case *document.TextField: + fieldType = 't' + case *document.NumericField: + fieldType = 'n' + case *document.DateTimeField: + fieldType = 'd' + case *document.BooleanField: + fieldType = 'b' + case *document.CompositeField: + fieldType = 'c' + } + return fieldType +} + +func (udc *SmolderingCouch) indexField(docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermEntries []*BackIndexTermEntry) ([]index.IndexRow, []*BackIndexTermEntry) { + fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength))) + + for k, tf := range tokenFreqs { + var termFreqRow *TermFrequencyRow + if includeTermVectors { + var tv []*TermVector + tv, rows = udc.termVectorsFromTokenFreq(fieldIndex, tf, rows) + termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docNum, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv) + } else { + termFreqRow = NewTermFrequencyRow(tf.Term, fieldIndex, docNum, uint64(frequencyFromTokenFreq(tf)), fieldNorm) + } + + // record the back index entry + backIndexTermEntry := BackIndexTermEntry{Term: proto.String(k), Field: proto.Uint32(uint32(fieldIndex))} + backIndexTermEntries = append(backIndexTermEntries, &backIndexTermEntry) + + rows = append(rows, termFreqRow) + } + + return rows, backIndexTermEntries +} + +func (udc *SmolderingCouch) Delete(id string) (err error) { + indexStart := time.Now() + + udc.writeMutex.Lock() + defer udc.writeMutex.Unlock() + + indexReader, err := udc.Reader() + if err != nil { + return + } + + // first we lookup the backindex row for the doc id if it exists + // lookup the back index row + var backIndexRow *BackIndexRow + backIndexRow, err = udc.backIndexRowForDoc(indexReader, nil, id) + if err != nil { + _ = indexReader.Close() + atomic.AddUint64(&udc.stats.errors, 1) + return + } + + err = indexReader.Close() + if err != nil { + return + } + + if backIndexRow == nil { + atomic.AddUint64(&udc.stats.deletes, 1) + return + } + + // start a writer for this delete + var kvwriter store.KVWriter + kvwriter, err = udc.store.Writer() + if err != nil { + return + } + defer func() { + if cerr := kvwriter.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + var deleteRowsAll [][]SmolderingCouchRow + + deleteRows := udc.deleteSingle(backIndexRow.docNumber, backIndexRow, nil) + if len(deleteRows) > 0 { + deleteRowsAll = 
append(deleteRowsAll, deleteRows) + } + + err = udc.batchRows(kvwriter, nil, nil, deleteRowsAll) + if err == nil { + udc.m.Lock() + udc.docCount-- + udc.m.Unlock() + } + atomic.AddUint64(&udc.stats.indexTime, uint64(time.Since(indexStart))) + if err == nil { + atomic.AddUint64(&udc.stats.deletes, 1) + } else { + atomic.AddUint64(&udc.stats.errors, 1) + } + return +} + +func (udc *SmolderingCouch) deleteSingle(id index.IndexInternalID, backIndexRow *BackIndexRow, deleteRows []SmolderingCouchRow) []SmolderingCouchRow { + for _, backIndexEntry := range backIndexRow.termEntries { + tfr := TermFrequencyRowStart([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), id) + //tfr := NewTermFrequencyRow([]byte(*backIndexEntry.Term), uint16(*backIndexEntry.Field), idBytes, 0, 0) + deleteRows = append(deleteRows, tfr) + } + for _, se := range backIndexRow.storedEntries { + //sf := NewStoredRow(idBytes, uint16(*se.Field), se.ArrayPositions, 'x', nil) + sf := NewStoredRowDocBytes(id, uint16(*se.Field), se.ArrayPositions, 'x', nil) + deleteRows = append(deleteRows, sf) + } + + // also delete the back entry itself + deleteRows = append(deleteRows, backIndexRow) + return deleteRows +} + +func (udc *SmolderingCouch) backIndexRowForDoc(indexReader index.IndexReader, docID index.IndexInternalID, externalDocID string) (*BackIndexRow, error) { + + var err error + // first look up the docID if it isn't known + if docID == nil { + // first get the internal identifier + docID, err = indexReader.InternalID(externalDocID) + if err != nil { + _ = indexReader.Close() + return nil, err + } + } + if len(docID) < 1 { + return nil, nil + } + + // use a temporary row structure to build key + tempRow := &BackIndexRow{ + docNumber: docID, + } + + keyBuf := GetRowBuffer() + if tempRow.KeySize() > len(keyBuf) { + keyBuf = make([]byte, 2*tempRow.KeySize()) + } + defer PutRowBuffer(keyBuf) + keySize, err := tempRow.KeyTo(keyBuf) + if err != nil { + return nil, err + } + + // open a reader for backindex lookup + var kvreader = indexReader.(*IndexReader).kvreader + + value, err := kvreader.Get(keyBuf[:keySize]) + if err != nil { + return nil, err + } + if value == nil { + return nil, nil + } + backIndexRow, err := NewBackIndexRowKV(keyBuf[:keySize], value) + if err != nil { + return nil, err + } + return backIndexRow, nil +} + +func decodeFieldType(typ byte, name string, pos []uint64, value []byte) document.Field { + switch typ { + case 't': + return document.NewTextField(name, pos, value) + case 'n': + return document.NewNumericFieldFromBytes(name, pos, value) + case 'd': + return document.NewDateTimeFieldFromBytes(name, pos, value) + case 'b': + return document.NewBooleanFieldFromBytes(name, pos, value) + } + return nil +} + +func frequencyFromTokenFreq(tf *analysis.TokenFreq) int { + return tf.Frequency() +} + +func (udc *SmolderingCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) { + rv := make([]*TermVector, len(tf.Locations)) + + for i, l := range tf.Locations { + var newFieldRow *FieldRow + fieldIndex := field + if l.Field != "" { + // lookup correct field + fieldIndex, newFieldRow = udc.fieldIndexOrNewRow(l.Field) + if newFieldRow != nil { + rows = append(rows, newFieldRow) + } + } + tv := TermVector{ + field: fieldIndex, + arrayPositions: l.ArrayPositions, + pos: uint64(l.Position), + start: uint64(l.Start), + end: uint64(l.End), + } + rv[i] = &tv + } + + return rv, rows +} + +func (udc *SmolderingCouch) termFieldVectorsFromTermVectors(in 
[]*TermVector) []*index.TermFieldVector { + if len(in) <= 0 { + return nil + } + + rv := make([]*index.TermFieldVector, len(in)) + + for i, tv := range in { + fieldName := udc.fieldCache.FieldIndexed(tv.field) + tfv := index.TermFieldVector{ + Field: fieldName, + ArrayPositions: tv.arrayPositions, + Pos: tv.pos, + Start: tv.start, + End: tv.end, + } + rv[i] = &tfv + } + return rv +} + +func (udc *SmolderingCouch) Batch(batch *index.Batch) (err error) { + // acquire enough doc numbers for all updates in the batch + // FIXME we actually waste doc numbers because deletes are in the + // same map and we don't need numbers for them + lastDocNumber := atomic.AddUint64(&udc.maxInternalDocID, uint64(len(batch.IndexOps))) + nextDocNumber := lastDocNumber - uint64(len(batch.IndexOps)) + 1 + + analysisStart := time.Now() + + resultChan := make(chan *index.AnalysisResult, len(batch.IndexOps)) + + var numUpdates uint64 + var numPlainTextBytes uint64 + for _, doc := range batch.IndexOps { + if doc != nil { + doc.Number = nextDocNumber // actually assign doc numbers here + nextDocNumber++ + numUpdates++ + numPlainTextBytes += doc.NumPlainTextBytes() + } + } + + go func() { + for _, doc := range batch.IndexOps { + if doc != nil { + aw := index.NewAnalysisWork(udc, doc, resultChan) + // put the work on the queue + udc.analysisQueue.Queue(aw) + } + } + }() + + // retrieve back index rows concurrent with analysis + docBackIndexRowErr := error(nil) + docBackIndexRowCh := make(chan *docBackIndexRow, len(batch.IndexOps)) + + udc.writeMutex.Lock() + defer udc.writeMutex.Unlock() + + go func() { + defer close(docBackIndexRowCh) + + // open a reader for backindex lookup + + indexReader, err := udc.Reader() + if err != nil { + docBackIndexRowErr = err + return + } + + for docID, doc := range batch.IndexOps { + backIndexRow, err := udc.backIndexRowForDoc(indexReader, nil, docID) + if err != nil { + docBackIndexRowErr = err + return + } + + var docNumber []byte + if backIndexRow != nil { + docNumber = backIndexRow.docNumber + } + docBackIndexRowCh <- &docBackIndexRow{docNumber, doc, backIndexRow} + } + + err = indexReader.Close() + if err != nil { + docBackIndexRowErr = err + return + } + }() + + // wait for analysis result + newRowsMap := make(map[string][]index.IndexRow) + var itemsDeQueued uint64 + for itemsDeQueued < numUpdates { + result := <-resultChan + newRowsMap[result.DocID] = result.Rows + itemsDeQueued++ + } + close(resultChan) + + atomic.AddUint64(&udc.stats.analysisTime, uint64(time.Since(analysisStart))) + + docsAdded := uint64(0) + docsDeleted := uint64(0) + + indexStart := time.Now() + + // prepare a list of rows + var addRowsAll [][]SmolderingCouchRow + var updateRowsAll [][]SmolderingCouchRow + var deleteRowsAll [][]SmolderingCouchRow + + // add the internal ops + var updateRows []SmolderingCouchRow + var deleteRows []SmolderingCouchRow + + for internalKey, internalValue := range batch.InternalOps { + if internalValue == nil { + // delete + deleteInternalRow := NewInternalRow([]byte(internalKey), nil) + deleteRows = append(deleteRows, deleteInternalRow) + } else { + updateInternalRow := NewInternalRow([]byte(internalKey), internalValue) + updateRows = append(updateRows, updateInternalRow) + } + } + + if len(updateRows) > 0 { + updateRowsAll = append(updateRowsAll, updateRows) + } + if len(deleteRows) > 0 { + deleteRowsAll = append(deleteRowsAll, deleteRows) + } + + // process back index rows as they arrive + for dbir := range docBackIndexRowCh { + if dbir.doc == nil && dbir.backIndexRow != nil { + 
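+ // a nil doc paired with an existing back index row marks this op as the removal of a previously indexed document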
// delete + deleteRows := udc.deleteSingle(dbir.docID, dbir.backIndexRow, nil) + if len(deleteRows) > 0 { + deleteRowsAll = append(deleteRowsAll, deleteRows) + } + docsDeleted++ + } else if dbir.doc != nil { + addRows, updateRows, deleteRows := udc.mergeOldAndNew(dbir.doc.ID, dbir.backIndexRow, newRowsMap[dbir.doc.ID]) + if len(addRows) > 0 { + addRowsAll = append(addRowsAll, addRows) + } + if len(updateRows) > 0 { + updateRowsAll = append(updateRowsAll, updateRows) + } + if len(deleteRows) > 0 { + deleteRowsAll = append(deleteRowsAll, deleteRows) + } + if dbir.backIndexRow == nil { + docsAdded++ + } + } + } + + if docBackIndexRowErr != nil { + return docBackIndexRowErr + } + + // start a writer for this batch + var kvwriter store.KVWriter + kvwriter, err = udc.store.Writer() + if err != nil { + return + } + + err = udc.batchRows(kvwriter, addRowsAll, updateRowsAll, deleteRowsAll) + if err != nil { + _ = kvwriter.Close() + atomic.AddUint64(&udc.stats.errors, 1) + return + } + + err = kvwriter.Close() + + atomic.AddUint64(&udc.stats.indexTime, uint64(time.Since(indexStart))) + + if err == nil { + udc.m.Lock() + udc.docCount += docsAdded + udc.docCount -= docsDeleted + udc.m.Unlock() + atomic.AddUint64(&udc.stats.updates, numUpdates) + atomic.AddUint64(&udc.stats.deletes, docsDeleted) + atomic.AddUint64(&udc.stats.batches, 1) + atomic.AddUint64(&udc.stats.numPlainTextBytesIndexed, numPlainTextBytes) + } else { + atomic.AddUint64(&udc.stats.errors, 1) + } + return +} + +func (udc *SmolderingCouch) SetInternal(key, val []byte) (err error) { + internalRow := NewInternalRow(key, val) + udc.writeMutex.Lock() + defer udc.writeMutex.Unlock() + var writer store.KVWriter + writer, err = udc.store.Writer() + if err != nil { + return + } + defer func() { + if cerr := writer.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + batch := writer.NewBatch() + batch.Set(internalRow.Key(), internalRow.Value()) + + return writer.ExecuteBatch(batch) +} + +func (udc *SmolderingCouch) DeleteInternal(key []byte) (err error) { + internalRow := NewInternalRow(key, nil) + udc.writeMutex.Lock() + defer udc.writeMutex.Unlock() + var writer store.KVWriter + writer, err = udc.store.Writer() + if err != nil { + return + } + defer func() { + if cerr := writer.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + batch := writer.NewBatch() + batch.Delete(internalRow.Key()) + return writer.ExecuteBatch(batch) +} + +func (udc *SmolderingCouch) Reader() (index.IndexReader, error) { + kvr, err := udc.store.Reader() + if err != nil { + return nil, fmt.Errorf("error opening store reader: %v", err) + } + udc.m.RLock() + defer udc.m.RUnlock() + return &IndexReader{ + index: udc, + kvreader: kvr, + docCount: udc.docCount, + }, nil +} + +func (udc *SmolderingCouch) Stats() json.Marshaler { + return udc.stats +} + +func (udc *SmolderingCouch) StatsMap() map[string]interface{} { + return udc.stats.statsMap() +} + +func (udc *SmolderingCouch) Advanced() (store.KVStore, error) { + return udc.store, nil +} + +func (udc *SmolderingCouch) fieldIndexOrNewRow(name string) (uint16, *FieldRow) { + index, existed := udc.fieldCache.FieldNamed(name, true) + if !existed { + return index, NewFieldRow(index, name) + } + return index, nil +} + +func init() { + registry.RegisterIndexType(Name, NewSmolderingCouch) +} diff --git a/index/smolder/smoldering_test.go b/index/smolder/smoldering_test.go new file mode 100644 index 00000000..5298ace4 --- /dev/null +++ b/index/smolder/smoldering_test.go @@ -0,0 +1,1351 @@ +// Copyright (c) 
2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. + +package smolder + +import ( + "log" + "reflect" + "regexp" + "strconv" + "sync" + "testing" + "time" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer" + "github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/store/boltdb" + "github.com/blevesearch/bleve/index/store/null" + "github.com/blevesearch/bleve/registry" +) + +var testAnalyzer = &analysis.Analyzer{ + Tokenizer: regexp_tokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)), +} + +func TestIndexOpenReopen(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + + var expectedCount uint64 + docCount, err := idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + // opening the database should have inserted a version and _id field + expectedLength := uint64(2) + rowCount, err := idx.(*SmolderingCouch).rowCount() + if err != nil { + t.Error(err) + } + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } + + // now close it + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + idx, err = NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + + // now close it + err = idx.Close() + if err != nil { + t.Fatal(err) + } +} + +func TestIndexInsert(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + docCount, err := idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + docCount, err = idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + // should have 4 rows (1 for 
version, 1 for schema field, and 1 for single term, and 1 for the term count, and 1 for the back index entry) + // +1 for id term + // +1 for id term dictionary + // +1 for id field def + expectedLength := uint64(1 + 1 + 1 + 1 + 1 + 1 + 1 + 1) + rowCount, err := idx.(*SmolderingCouch).rowCount() + if err != nil { + t.Error(err) + } + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } +} + +func TestIndexInsertThenDelete(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + expectedRows := 2 + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + docCount, err := idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + expectedRows += 4 // 2 dictionary 2 terms + + doc2 := document.NewDocument("2") + doc2.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc2) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + expectedRows += 4 // 2 dictionary 2 terms + + docCount, err = idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + err = idx.Delete("1") + if err != nil { + t.Errorf("Error deleting entry from index: %v", err) + } + expectedCount-- + expectedRows -= 2 //2 terms + + docCount, err = idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + err = idx.Delete("2") + if err != nil { + t.Errorf("Error deleting entry from index: %v", err) + } + expectedCount-- + expectedRows -= 2 //2 terms + + docCount, err = idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + // should have 2 rows (1 for version, 2 for schema field, 3 for dictionary row garbage) + //expectedLength := uint64(1 + 2 + 3) + rowCount, err := idx.(*SmolderingCouch).rowCount() + if err != nil { + t.Error(err) + } + if rowCount != uint64(expectedRows) { + t.Errorf("expected %d rows, got: %d", expectedRows, rowCount) + } +} + +func TestIndexInsertThenUpdate(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + // this update should 
overwrite one term, and introduce one new one + doc = document.NewDocument("1") + doc.AddField(document.NewTextFieldWithAnalyzer("name", []uint64{}, []byte("test fail"), testAnalyzer)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + // should have 10 rows (1 for version, 2 for schema fields, 3 for the terms, 3 for the term dictionary, and 1 for the back index entry) + expectedLength := uint64(1 + 2 + 3 + 3 + 1) + rowCount, err := idx.(*SmolderingCouch).rowCount() + if err != nil { + t.Error(err) + } + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + allRows := idx.DumpAll() + for ar := range allRows { + t.Logf("%v", ar) + } + } + + // now do another update that should remove one of the terms + doc = document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("fail"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + // should have 9 rows (1 for version, 2 for schema fields, 2 for the remaining terms, 3 for the term dictionary (the row for the removed term lingers), and 1 for the back index entry) + expectedLength = uint64(1 + 2 + 2 + 3 + 1) + rowCount, err = idx.(*SmolderingCouch).rowCount() + if err != nil { + t.Error(err) + } + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + allRows := idx.DumpAll() + for ar := range allRows { + t.Logf("%v", ar) + } + } +} + +func TestIndexInsertMultiple(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + + var expectedCount uint64 + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + // should have 12 rows (1 for version, 2 for schema fields, 4 for terms, 3 for the term dictionary, and 2 for the back index entries) + expectedLength := uint64(1 + 2 + 4 + 3 + 2) + rowCount, err := idx.(*SmolderingCouch).rowCount() + if err != nil { + t.Error(err) + } + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } + + // close, reopen and add one more to test that counting works correctly + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + idx, err = NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc = document.NewDocument("3") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + docCount, err := idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("expected doc count: %d, got %d", expectedCount, docCount) + } +} + +func
TestIndexInsertWithStore(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + docCount, err := idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + docCount, err = idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + // should have 6 rows (1 for version, 2 for schema field, and 2 for terms, and 1 for the stored field and 2 for the term counts, and 1 for the back index entry) + expectedLength := uint64(1 + 2 + 2 + 1 + 2 + 1) + rowCount, err := idx.(*SmolderingCouch).rowCount() + if err != nil { + t.Error(err) + } + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + storedDoc, err := indexReader.Document("1") + if err != nil { + t.Error(err) + } + + if len(storedDoc.Fields) != 1 { + t.Errorf("expected 1 stored field, got %d", len(storedDoc.Fields)) + } + textField, ok := storedDoc.Fields[0].(*document.TextField) + if !ok { + t.Errorf("expected text field") + } + if string(textField.Value()) != "test" { + t.Errorf("expected field content 'test', got '%s'", string(textField.Value())) + } +} + +func TestIndexInternalCRUD(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + + // get something that doesn't exist yet + val, err := indexReader.GetInternal([]byte("key")) + if err != nil { + t.Error(err) + } + if val != nil { + t.Errorf("expected nil, got %s", val) + } + + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + + // set + err = idx.SetInternal([]byte("key"), []byte("abc")) + if err != nil { + t.Error(err) + } + + indexReader2, err := idx.Reader() + if err != nil { + t.Error(err) + } + + // get + val, err = indexReader2.GetInternal([]byte("key")) + if err != nil { + t.Error(err) + } + if string(val) != "abc" { + t.Errorf("expected %s, got '%s'", "abc", val) + } + + err = indexReader2.Close() + if err != nil { + t.Fatal(err) + } + + // delete + err = idx.DeleteInternal([]byte("key")) + if err != nil { + t.Error(err) + } + + indexReader3, err := idx.Reader() + if err != nil { + t.Error(err) + } + + // get again + val, err 
= indexReader3.GetInternal([]byte("key")) + if err != nil { + t.Error(err) + } + if val != nil { + t.Errorf("expected nil, got %s", val) + } + + err = indexReader3.Close() + if err != nil { + t.Fatal(err) + } +} + +func TestIndexBatch(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + + // first create 2 docs the old fashioned way + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + // now create a batch which does 3 things + // insert new doc + // update existing doc + // delete existing doc + // net document count change 0 + + batch := index.NewBatch() + doc = document.NewDocument("3") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) + batch.Update(doc) + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2updated"))) + batch.Update(doc) + batch.Delete("1") + + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + docCount := indexReader.DocCount() + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + docIDReader, err := indexReader.DocIDReader("", "") + if err != nil { + t.Error(err) + } + docIds := make([]index.IndexInternalID, 0) + docID, err := docIDReader.Next() + for docID != nil && err == nil { + docIds = append(docIds, docID) + docID, err = docIDReader.Next() + } + if err != nil { + t.Error(err) + } + expectedDocIds := []index.IndexInternalID{EncodeUvarintAscending(nil, 2), EncodeUvarintAscending(nil, 3)} + if !reflect.DeepEqual(docIds, expectedDocIds) { + t.Errorf("expected ids: %v, got ids: %v", expectedDocIds, docIds) + allRows := idx.DumpAll() + for ar := range allRows { + t.Logf("%v", ar) + } + } +} + +func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + docCount, err := idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField)) + 
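+ // note: the numeric and date fields below each expand into 64/document.DefaultPrecisionStep term rows (plus matching dictionary rows), which is where the large row-count total below comes from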
+	doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, document.IndexField|document.StoreField))
+	df, err := document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField)
+	if err != nil {
+		t.Error(err)
+	}
+	doc.AddField(df)
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+	expectedCount++
+
+	docCount, err = idx.DocCount()
+	if err != nil {
+		t.Error(err)
+	}
+	if docCount != expectedCount {
+		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
+	}
+
+	// should have 77 rows
+	// 1 for version
+	// 4 for schema fields
+	// 1 for id term
+	// 1 for text term
+	// 16 for numeric terms
+	// 16 for date terms
+	// 3 for the stored field
+	// 1 for id term count
+	// 1 for the text term count
+	// 16 for numeric term counts
+	// 16 for date term counts
+	// 1 for the back index entry
+	expectedLength := uint64(1 + 4 + 1 + 1 + (64 / document.DefaultPrecisionStep) + (64 / document.DefaultPrecisionStep) + 3 + 1 + 1 + (64 / document.DefaultPrecisionStep) + (64 / document.DefaultPrecisionStep) + 1)
+	rowCount, err := idx.(*SmolderingCouch).rowCount()
+	if err != nil {
+		t.Error(err)
+	}
+	if rowCount != expectedLength {
+		t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
+	}
+
+	indexReader, err := idx.Reader()
+	if err != nil {
+		t.Error(err)
+	}
+
+	storedDoc, err := indexReader.Document("1")
+	if err != nil {
+		t.Error(err)
+	}
+
+	err = indexReader.Close()
+	if err != nil {
+		t.Error(err)
+	}
+
+	if len(storedDoc.Fields) != 3 {
+		t.Errorf("expected 3 stored fields, got %d", len(storedDoc.Fields))
+	}
+	textField, ok := storedDoc.Fields[0].(*document.TextField)
+	if !ok {
+		t.Errorf("expected text field")
+	}
+	if string(textField.Value()) != "test" {
+		t.Errorf("expected field content 'test', got '%s'", string(textField.Value()))
+	}
+	numField, ok := storedDoc.Fields[1].(*document.NumericField)
+	if !ok {
+		t.Errorf("expected numeric field")
+	}
+	numFieldNumber, err := numField.Number()
+	if err != nil {
+		t.Error(err)
+	} else {
+		if numFieldNumber != 35.99 {
+			t.Errorf("expected numeric value 35.99, got %f", numFieldNumber)
+		}
+	}
+	dateField, ok := storedDoc.Fields[2].(*document.DateTimeField)
+	if !ok {
+		t.Errorf("expected date field")
+	}
+	dateFieldDate, err := dateField.DateTime()
+	if err != nil {
+		t.Error(err)
+	} else {
+		if dateFieldDate != time.Unix(0, 0).UTC() {
+			t.Errorf("expected date value unix epoch, got %v", dateFieldDate)
+		}
+	}
+
+	// now update the document, but omit one of the fields
+	doc = document.NewDocument("1")
+	doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("testup"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 36.99, document.IndexField|document.StoreField))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	indexReader2, err := idx.Reader()
+	if err != nil {
+		t.Error(err)
+	}
+
+	// expected doc count shouldn't have changed
+	docCount = indexReader2.DocCount()
+	if docCount != expectedCount {
+		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
+	}
+
+	// should only get 2 fields back now though
+	storedDoc, err = indexReader2.Document("1")
+	if err != nil {
+		t.Error(err)
+	}
+
+	err = indexReader2.Close()
+	if err != nil {
+		t.Error(err)
+	}
+
+	if len(storedDoc.Fields) != 2 {
+		t.Errorf("expected 2 stored fields, got %d", len(storedDoc.Fields))
+	}
field, got %d", len(storedDoc.Fields)) + } + textField, ok = storedDoc.Fields[0].(*document.TextField) + if !ok { + t.Errorf("expected text field") + } + if string(textField.Value()) != "testup" { + t.Errorf("expected field content 'testup', got '%s'", string(textField.Value())) + } + numField, ok = storedDoc.Fields[1].(*document.NumericField) + if !ok { + t.Errorf("expected numeric field") + } + numFieldNumer, err = numField.Number() + if err != nil { + t.Error(err) + } else { + if numFieldNumer != 36.99 { + t.Errorf("expeted numeric value 36.99, got %f", numFieldNumer) + } + } + + // now delete the document + err = idx.Delete("1") + expectedCount-- + + // expected doc count shouldn't have changed + docCount, err = idx.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } +} + +func TestIndexInsertFields(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField)) + doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, document.IndexField|document.StoreField)) + dateField, err := document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField) + if err != nil { + t.Error(err) + } + doc.AddField(dateField) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + fields, err := indexReader.Fields() + if err != nil { + t.Error(err) + } else { + expectedFields := []string{"_id", "name", "age", "unixEpoch"} + if !reflect.DeepEqual(fields, expectedFields) { + t.Errorf("expected fields: %v, got %v", expectedFields, fields) + } + } + +} + +func TestIndexUpdateComposites(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField)) + doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("mister"), document.IndexField|document.StoreField)) + doc.AddField(document.NewCompositeFieldWithIndexingOptions("_all", true, nil, nil, document.IndexField)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + // 1 for version + // 4 for schema fields + // 5 for text term + // 2 for the stored field + // 5 for the text term count + // 1 for the back index entry + expectedLength := uint64(1 + 4 + 5 + 2 
+	expectedLength := uint64(1 + 4 + 5 + 2 + 5 + 1)
+	rowCount, err := idx.(*SmolderingCouch).rowCount()
+	if err != nil {
+		t.Error(err)
+	}
+	if rowCount != expectedLength {
+		t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
+	}
+
+	// now let's update it
+	doc = document.NewDocument("1")
+	doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("testupdated"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("misterupdated"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewCompositeFieldWithIndexingOptions("_all", true, nil, nil, document.IndexField))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	indexReader, err := idx.Reader()
+	if err != nil {
+		t.Error(err)
+	}
+	defer func() {
+		err := indexReader.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	// make sure new values are in index
+	storedDoc, err := indexReader.Document("1")
+	if err != nil {
+		t.Error(err)
+	}
+	if len(storedDoc.Fields) != 2 {
+		t.Errorf("expected 2 stored fields, got %d", len(storedDoc.Fields))
+	}
+	textField, ok := storedDoc.Fields[0].(*document.TextField)
+	if !ok {
+		t.Errorf("expected text field")
+	}
+	if string(textField.Value()) != "testupdated" {
+		t.Errorf("expected field content 'testupdated', got '%s'", string(textField.Value()))
+	}
+
+	// should have the same row count as before, plus 4 term dictionary garbage rows
+	expectedLength += 4
+	rowCount, err = idx.(*SmolderingCouch).rowCount()
+	if err != nil {
+		t.Error(err)
+	}
+	if rowCount != expectedLength {
+		t.Errorf("expected %d rows, got: %d", expectedLength, rowCount)
+	}
+}
+
+func TestIndexFieldsMisc(t *testing.T) {
+	defer func() {
+		err := DestroyTest()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue)
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = idx.Open()
+	if err != nil {
+		t.Errorf("error opening index: %v", err)
+	}
+	defer func() {
+		err := idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	doc := document.NewDocument("1")
+	doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("mister"), document.IndexField|document.StoreField))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	fieldName0 := idx.(*SmolderingCouch).fieldCache.FieldIndexed(0)
+	if fieldName0 != "_id" {
+		t.Errorf("expected field named '_id', got '%s'", fieldName0)
+	}
+	fieldName1 := idx.(*SmolderingCouch).fieldCache.FieldIndexed(1)
+	if fieldName1 != "name" {
+		t.Errorf("expected field named 'name', got '%s'", fieldName1)
+	}
+	fieldName2 := idx.(*SmolderingCouch).fieldCache.FieldIndexed(2)
+	if fieldName2 != "title" {
+		t.Errorf("expected field named 'title', got '%s'", fieldName2)
+	}
+	fieldName3 := idx.(*SmolderingCouch).fieldCache.FieldIndexed(3)
+	if fieldName3 != "" {
+		t.Errorf("expected field named '', got '%s'", fieldName3)
+	}
+}
+
+func TestIndexTermReaderCompositeFields(t *testing.T) {
+	defer func() {
+		err := DestroyTest()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue)
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = idx.Open()
+	if err != nil {
+		t.Errorf("error opening index: %v", err)
+	}
index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("mister"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + doc.AddField(document.NewCompositeFieldWithIndexingOptions("_all", true, nil, nil, document.IndexField|document.IncludeTermVectors)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + termFieldReader, err := indexReader.TermFieldReader([]byte("mister"), "_all", true, true, true) + if err != nil { + t.Error(err) + } + + tfd, err := termFieldReader.Next(nil) + for tfd != nil && err == nil { + if !tfd.ID.Equals(EncodeUvarintAscending(nil, 1)) { + t.Errorf("expected to find document id 1") + } + tfd, err = termFieldReader.Next(nil) + } + if err != nil { + t.Error(err) + } +} + +func TestIndexDocumentFieldTerms(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("mister"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + fieldTerms, err := indexReader.DocumentFieldTerms(EncodeUvarintAscending(nil, 1)) + if err != nil { + t.Error(err) + } + expectedFieldTerms := index.FieldTerms{ + "name": []string{"test"}, + "title": []string{"mister"}, + "_id": []string{"1"}, + } + if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) { + t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) + } +} + +func BenchmarkBatch(b *testing.B) { + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name) + if err != nil { + b.Fatal(err) + } + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewSmolderingCouch(null.Name, nil, analysisQueue) + if err != nil { + b.Fatal(err) + } + err = idx.Open() + if err != nil { + b.Fatal(err) + } + + batch := index.NewBatch() + for i := 0; i < 100; i++ { + d := document.NewDocument(strconv.Itoa(i)) + f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer) + d.AddField(f) + batch.Update(d) + } + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + err = idx.Batch(batch) + if err != nil { + b.Fatal(err) + } + } +} + +func TestConcurrentUpdate(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + 
+func TestConcurrentUpdate(t *testing.T) {
+	defer func() {
+		err := DestroyTest()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue)
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = idx.Open()
+	if err != nil {
+		t.Errorf("error opening index: %v", err)
+	}
+	defer func() {
+		err := idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	// do some concurrent updates
+	var wg sync.WaitGroup
+	for i := 0; i < 10; i++ {
+		wg.Add(1)
+		go func(i int) {
+			doc := document.NewDocument("1")
+			doc.AddField(document.NewTextFieldWithIndexingOptions(strconv.Itoa(i), []uint64{}, []byte(strconv.Itoa(i)), document.StoreField))
+			err := idx.Update(doc)
+			if err != nil {
+				t.Errorf("Error updating index: %v", err)
+			}
+			wg.Done()
+		}(i)
+	}
+	wg.Wait()
+
+	// now load the name field and see what we get
+	r, err := idx.Reader()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	doc, err := r.Document("1")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(doc.Fields) > 1 {
+		t.Errorf("expected single field, found %d", len(doc.Fields))
+	}
+
+	err = r.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestLargeField(t *testing.T) {
+	defer func() {
+		err := DestroyTest()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewSmolderingCouch(boltdb.Name, boltTestConfig, analysisQueue)
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = idx.Open()
+	if err != nil {
+		t.Errorf("error opening index: %v", err)
+	}
+	defer func() {
+		err := idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	largeFieldValue := make([]byte, 0)
+	for len(largeFieldValue) < RowBufferSize {
+		largeFieldValue = append(largeFieldValue, bleveWikiArticle1K...)
+	}
+	t.Logf("large field size: %d", len(largeFieldValue))
+
+	d := document.NewDocument("large")
+	f := document.NewTextFieldWithIndexingOptions("desc", nil, largeFieldValue, document.IndexField|document.StoreField)
+	d.AddField(f)
+
+	err = idx.Update(d)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/index/smolder/stats.go b/index/smolder/stats.go
new file mode 100644
index 00000000..2c93f7a1
--- /dev/null
+++ b/index/smolder/stats.go
@@ -0,0 +1,50 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+
+package smolder
+
+import (
+	"encoding/json"
+	"sync/atomic"
+
+	"github.com/blevesearch/bleve/index/store"
+)
+
+type indexStat struct {
+	updates, deletes, batches, errors uint64
+	analysisTime, indexTime           uint64
+	termSearchersStarted              uint64
+	termSearchersFinished             uint64
+	numPlainTextBytesIndexed          uint64
+	i                                 *SmolderingCouch
+}
+
+func (i *indexStat) statsMap() map[string]interface{} {
+	m := map[string]interface{}{}
+	m["updates"] = atomic.LoadUint64(&i.updates)
+	m["deletes"] = atomic.LoadUint64(&i.deletes)
+	m["batches"] = atomic.LoadUint64(&i.batches)
+	m["errors"] = atomic.LoadUint64(&i.errors)
+	m["analysis_time"] = atomic.LoadUint64(&i.analysisTime)
+	m["index_time"] = atomic.LoadUint64(&i.indexTime)
+	m["term_searchers_started"] = atomic.LoadUint64(&i.termSearchersStarted)
+	m["term_searchers_finished"] = atomic.LoadUint64(&i.termSearchersFinished)
+	m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&i.numPlainTextBytesIndexed)
+
+	if o, ok := i.i.store.(store.KVStoreStats); ok {
+		m["kv"] = o.StatsMap()
+	}
+
+	return m
+}
+
+func (i *indexStat) MarshalJSON() ([]byte, error) {
+	m := i.statsMap()
+	return json.Marshal(m)
+}
diff --git a/index/smolder/varint.go b/index/smolder/varint.go
new file mode 100644
index 00000000..3b48ad68
--- /dev/null
+++ b/index/smolder/varint.go
@@ -0,0 +1,94 @@
+// Copyright 2014 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+//
+// Author: Tobias Schottdorf (tobias.schottdorf@gmail.com)

+// Bleve changes:
+// - changed package name
+// - removed dep on pkg/errors (even though it's awesome and we should use it)
+
+package smolder
+
+import "fmt"
+
+const (
+	// IntMin is chosen such that the range of int tags does not overlap the
+	// ASCII character set that is frequently used in testing.
+	IntMin = 0x80
+	// IntMax is the maximum int tag value.
+	IntMax = 0xfd
+
+	intMaxWidth = 8
+	intZero     = IntMin + intMaxWidth
+	intSmall    = IntMax - intZero - intMaxWidth // 109
+)
+
+// EncodeUvarintAscending encodes the uint64 value using a variable length
+// (length-prefixed) representation. The length is encoded as a single
+// byte indicating the number of encoded bytes (-8) to follow. See the
+// CockroachDB encoding package's EncodeVarintAscending for the rationale.
+// The encoded bytes are appended to the supplied buffer and the final
+// buffer is returned.
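+//
+// For example (values follow from the constants above): v=1 encodes to the
+// single byte intZero+1 (0x89), v=255 to [IntMax-7, 0xff], and v=256 to
+// [IntMax-6, 0x01, 0x00]; encodings compare bytewise in the same order as
+// the values they encode.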
+func EncodeUvarintAscending(b []byte, v uint64) []byte {
+	switch {
+	case v <= intSmall:
+		return append(b, intZero+byte(v))
+	case v <= 0xff:
+		return append(b, IntMax-7, byte(v))
+	case v <= 0xffff:
+		return append(b, IntMax-6, byte(v>>8), byte(v))
+	case v <= 0xffffff:
+		return append(b, IntMax-5, byte(v>>16), byte(v>>8), byte(v))
+	case v <= 0xffffffff:
+		return append(b, IntMax-4, byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
+	case v <= 0xffffffffff:
+		return append(b, IntMax-3, byte(v>>32), byte(v>>24), byte(v>>16), byte(v>>8),
+			byte(v))
+	case v <= 0xffffffffffff:
+		return append(b, IntMax-2, byte(v>>40), byte(v>>32), byte(v>>24), byte(v>>16),
+			byte(v>>8), byte(v))
+	case v <= 0xffffffffffffff:
+		return append(b, IntMax-1, byte(v>>48), byte(v>>40), byte(v>>32), byte(v>>24),
+			byte(v>>16), byte(v>>8), byte(v))
+	default:
+		return append(b, IntMax, byte(v>>56), byte(v>>48), byte(v>>40), byte(v>>32),
+			byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
+	}
+}
+
+// DecodeUvarintAscending decodes a varint encoded uint64 from the input
+// buffer. The remainder of the input buffer and the decoded uint64
+// are returned.
+func DecodeUvarintAscending(b []byte) ([]byte, uint64, error) {
+	if len(b) == 0 {
+		return nil, 0, fmt.Errorf("insufficient bytes to decode uvarint value")
+	}
+	length := int(b[0]) - intZero
+	b = b[1:] // skip length byte
+	if length <= intSmall {
+		return b, uint64(length), nil
+	}
+	length -= intSmall
+	if length < 0 || length > 8 {
+		return nil, 0, fmt.Errorf("invalid uvarint length of %d", length)
+	} else if len(b) < length {
+		return nil, 0, fmt.Errorf("insufficient bytes to decode uvarint value: %v", b)
+	}
+	var v uint64
+	// It is faster to range over the elements in a slice than to index
+	// into the slice on each loop iteration.
+	for _, t := range b[:length] {
+		v = (v << 8) | uint64(t)
+	}
+	return b[length:], v, nil
+}
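
A minimal sketch (not part of the patch; the test name is hypothetical) of the two properties the smolder index relies on from this encoding, written as a test in the same package:

package smolder

import (
	"bytes"
	"testing"
)

func TestUvarintAscendingProperties(t *testing.T) {
	// Property 1: a document number round-trips losslessly.
	key := EncodeUvarintAscending(nil, 42)
	rest, v, err := DecodeUvarintAscending(key)
	if err != nil || v != 42 || len(rest) != 0 {
		t.Fatalf("round-trip failed: v=%d err=%v rest=%v", v, err, rest)
	}

	// Property 2: bytewise order of encodings matches numeric order of the
	// encoded values, so doc number 2 sorts before doc number 10 in the KV
	// store, even though "10" < "2" as decimal strings.
	if bytes.Compare(EncodeUvarintAscending(nil, 2), EncodeUvarintAscending(nil, 10)) >= 0 {
		t.Fatal("encodings do not sort in numeric order")
	}
}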