From 23755049e811f39f9811ab0dbef74ea2b74baba7 Mon Sep 17 00:00:00 2001
From: Marty Schoch
Date: Sun, 11 Sep 2016 18:11:09 -0400
Subject: [PATCH] slight tweak to API to only encode docNum->docNumBytes once

---
 index/smolder/analysis.go   | 10 ++++++----
 index/smolder/row.go        | 16 ++++++++--------
 index/smolder/row_test.go   | 30 +++++++++++++++++-------------
 index/smolder/smoldering.go |  4 ++--
 4 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/index/smolder/analysis.go b/index/smolder/analysis.go
index 4c0774d9..350cec36 100644
--- a/index/smolder/analysis.go
+++ b/index/smolder/analysis.go
@@ -21,6 +21,8 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
 		Rows:     make([]index.IndexRow, 0, 100),
 	}
 
+	docNumBytes := EncodeUvarintAscending(nil, d.Number)
+
 	// track our back index entries
 	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
 
@@ -42,7 +44,7 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
 		}, nil, false)
 		// store the _id field as well
 		f := document.NewTextField("_id", nil, []byte(idBytes))
-		rv.Rows, backIndexStoredEntries = udc.storeField(d.Number, f, 0, rv.Rows, backIndexStoredEntries)
+		rv.Rows, backIndexStoredEntries = udc.storeField(docNumBytes, f, 0, rv.Rows, backIndexStoredEntries)
 
 	analyzeField := func(field document.Field, storable bool) {
 		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
@@ -65,7 +67,7 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
 		}
 
 		if storable && field.Options().IsStored() {
-			rv.Rows, backIndexStoredEntries = udc.storeField(d.Number, field, fieldIndex, rv.Rows, backIndexStoredEntries)
+			rv.Rows, backIndexStoredEntries = udc.storeField(docNumBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)
 		}
 	}
 
@@ -110,11 +112,11 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
 		includeTermVectors := fieldIncludeTermVectors[fieldIndex]
 
 		// encode this field
-		rv.Rows, backIndexTermsEntries = udc.indexField(d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)
+		rv.Rows, backIndexTermsEntries = udc.indexField(docNumBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)
 	}
 
 	// build the back index row
-	backIndexRow := NewBackIndexRow(d.Number, backIndexTermsEntries, backIndexStoredEntries)
+	backIndexRow := NewBackIndexRow(docNumBytes, backIndexTermsEntries, backIndexStoredEntries)
 	rv.Rows = append(rv.Rows, backIndexRow)
 
 	return rv
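The analysis.go hunks above hoist the uvarint encoding of the document number out of the per-row helpers: Analyze now encodes d.Number once and threads the resulting byte slice through storeField, indexField, and NewBackIndexRow. A minimal standalone sketch of that pattern (illustrative only, not part of the patch; encoding/binary.AppendUvarint stands in for this package's EncodeUvarintAscending):

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	docNum := uint64(42)

	// Encode the document number once per document...
	docNumBytes := binary.AppendUvarint(nil, docNum)

	// ...then reuse the same slice for every row built for this document,
	// instead of re-encoding docNum inside each row constructor.
	for _, field := range []string{"_id", "name", "desc"} {
		fmt.Printf("row for field %q shares docNumBytes %v\n", field, docNumBytes)
	}
}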
diff --git a/index/smolder/row.go b/index/smolder/row.go
index 1061c448..26a9bf12 100644
--- a/index/smolder/row.go
+++ b/index/smolder/row.go
@@ -462,11 +462,11 @@ func (tfr *TermFrequencyRow) String() string {
 	return fmt.Sprintf("Term: `%s` Field: %d Document: %d Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, dn, tfr.freq, tfr.norm, tfr.vectors)
 }
 
-func NewTermFrequencyRow(term []byte, field uint16, docNum uint64, freq uint64, norm float32) *TermFrequencyRow {
+func NewTermFrequencyRow(term []byte, field uint16, docNum []byte, freq uint64, norm float32) *TermFrequencyRow {
 	return &TermFrequencyRow{
 		term:      term,
 		field:     field,
-		docNumber: EncodeUvarintAscending(nil, docNum),
+		docNumber: docNum,
 		freq:      freq,
 		norm:      norm,
 	}
@@ -500,11 +500,11 @@ func TermFrequencyRowStartField(field uint16) []byte {
 	return buf
 }
 
-func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docNum uint64, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
+func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docNum []byte, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
 	return &TermFrequencyRow{
 		term:      term,
 		field:     field,
-		docNumber: EncodeUvarintAscending(nil, docNum),
+		docNumber: docNum,
 		freq:      freq,
 		norm:      norm,
 		vectors:   vectors,
@@ -722,9 +722,9 @@ func (br *BackIndexRow) String() string {
 	return fmt.Sprintf("Backindex Document: %d Terms Entries: %v, Stored Entries: %v", dn, br.termsEntries, br.storedEntries)
 }
 
-func NewBackIndexRow(docNum uint64, entries []*BackIndexTermsEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
+func NewBackIndexRow(docNum []byte, entries []*BackIndexTermsEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
 	return &BackIndexRow{
-		docNumber:     EncodeUvarintAscending(nil, docNum),
+		docNumber:     docNum,
 		termsEntries:  entries,
 		storedEntries: storedFields,
 	}
@@ -825,9 +825,9 @@ func (s *StoredRow) ScanPrefixForDoc() []byte {
 	return buf
 }
 
-func NewStoredRow(docNum uint64, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
+func NewStoredRow(docNum []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
 	return &StoredRow{
-		docNumber:      EncodeUvarintAscending(nil, docNum),
+		docNumber:      docNum,
 		field:          field,
 		arrayPositions: arrayPositions,
 		typ:            typ,
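With the row.go signatures above, the row constructors accept a pre-encoded document number ([]byte) instead of a uint64, so callers pay the encoding cost once per document rather than once per row. A caller-side sketch (illustrative only, assuming it sits inside the smolder package; buildRowsSketch is a hypothetical helper, not part of the patch):

// buildRowsSketch shows the intended calling pattern: encode once, reuse for every row.
func buildRowsSketch(docNum uint64, term []byte, value []byte) []SmolderingCouchRow {
	docNumBytes := EncodeUvarintAscending(nil, docNum) // encode exactly once

	return []SmolderingCouchRow{
		NewTermFrequencyRow(term, 0, docNumBytes, 1, 1.0),
		NewStoredRow(docNumBytes, 0, nil, 't', value),
		NewBackIndexRow(docNumBytes, nil, nil),
	}
}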
diff --git a/index/smolder/row_test.go b/index/smolder/row_test.go
index 6eb1b7e2..a10d0afe 100644
--- a/index/smolder/row_test.go
+++ b/index/smolder/row_test.go
@@ -18,6 +18,7 @@ import (
 )
 
 func TestRows(t *testing.T) {
+	docNumBytes1 := EncodeUvarintAscending(nil, 1)
 	tests := []struct {
 		input  SmolderingCouchRow
 		outKey []byte
@@ -49,39 +50,39 @@ func TestRows(t *testing.T) {
 			[]byte{27},
 		},
 		{
-			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, 1, 3, 3.14),
+			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 3, 3.14),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
 			[]byte{3, 195, 235, 163, 130, 4},
 		},
 		{
-			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, 1, 3, 3.14),
+			NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 3, 3.14),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
 			[]byte{3, 195, 235, 163, 130, 4},
 		},
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, 1, 3, 3.14, []*TermVector{{field: 0, pos: 1, start: 3, end: 11}, {field: 0, pos: 2, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 3, 3.14, []*TermVector{{field: 0, pos: 1, start: 3, end: 11}, {field: 0, pos: 2, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
 			[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
 		},
 		// test larger varints
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, 1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11}, {field: 0, pos: 2198, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11}, {field: 0, pos: 2198, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
 			[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
 		},
 		// test vectors with arrayPositions
 		{
-			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, 1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, {field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, {field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
+			NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, {field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, {field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
 			[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
 			[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
 		},
 		{
-			NewBackIndexRow(1, []*BackIndexTermsEntry{{Field: proto.Uint32(0), Terms: []string{"beer"}}}, nil),
+			NewBackIndexRow(docNumBytes1, []*BackIndexTermsEntry{{Field: proto.Uint32(0), Terms: []string{"beer"}}}, nil),
 			[]byte{'b', 137},
 			[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r'},
 		},
 		{
-			NewBackIndexRow(1, []*BackIndexTermsEntry{
+			NewBackIndexRow(docNumBytes1, []*BackIndexTermsEntry{
 				{
 					Field: proto.Uint32(0),
 					Terms: []string{"beer"},
@@ -95,7 +96,7 @@ func TestRows(t *testing.T) {
 			[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't'},
 		},
 		{
-			NewBackIndexRow(1,
+			NewBackIndexRow(docNumBytes1,
 				[]*BackIndexTermsEntry{
 					{
 						Field: proto.Uint32(0),
@@ -122,12 +123,12 @@ func TestRows(t *testing.T) {
 			[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't', 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
 		},
 		{
-			NewStoredRow(1, 0, []uint64{}, byte('t'), []byte("an american beer")),
+			NewStoredRow(docNumBytes1, 0, []uint64{}, byte('t'), []byte("an american beer")),
 			[]byte{'s', 137, ByteSeparator, 0, 0},
 			[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
 		},
 		{
-			NewStoredRow(1, 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
+			NewStoredRow(docNumBytes1, 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
 			[]byte{'s', 137, ByteSeparator, 0, 0, 2, 166, 2, 134, 24},
 			[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
 		},
@@ -287,10 +288,11 @@ func TestDictionaryRowValueBug197(t *testing.T) {
 }
 
 func BenchmarkTermFrequencyRowEncode(b *testing.B) {
+	docNumBytes1 := EncodeUvarintAscending(nil, 1)
 	row := NewTermFrequencyRowWithTermVectors(
 		[]byte{'b', 'e', 'e', 'r'},
 		0,
-		1,
+		docNumBytes1,
 		3,
 		3.14,
 		[]*TermVector{
@@ -333,9 +335,10 @@ func BenchmarkTermFrequencyRowDecode(b *testing.B) {
 }
 
 func BenchmarkBackIndexRowEncode(b *testing.B) {
+	docNumBytes1 := EncodeUvarintAscending(nil, 1)
 	field := uint32(1)
 	t1 := "term1"
-	row := NewBackIndexRow(1,
+	row := NewBackIndexRow(docNumBytes1,
 		[]*BackIndexTermsEntry{
 			{
 				Field: &field,
@@ -367,7 +370,8 @@ func BenchmarkBackIndexRowDecode(b *testing.B) {
 }
 
 func BenchmarkStoredRowEncode(b *testing.B) {
-	row := NewStoredRow(1, 0, []uint64{}, byte('t'), []byte("an american beer"))
+	docNumBytes1 := EncodeUvarintAscending(nil, 1)
+	row := NewStoredRow(docNumBytes1, 0, []uint64{}, byte('t'), []byte("an american beer"))
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		row.Key()
diff --git a/index/smolder/smoldering.go b/index/smolder/smoldering.go
index 154c5bda..5d07a378 100644
--- a/index/smolder/smoldering.go
+++ b/index/smolder/smoldering.go
@@ -616,7 +616,7 @@ func (udc *SmolderingCouch) mergeOldAndNew(externalDocId string, backIndexRow *B
 	return addRows, updateRows, deleteRows
 }
 
-func (udc *SmolderingCouch) storeField(docNum uint64, field document.Field, fieldIndex uint16, rows []index.IndexRow, backIndexStoredEntries []*BackIndexStoreEntry) ([]index.IndexRow, []*BackIndexStoreEntry) {
+func (udc *SmolderingCouch) storeField(docNum []byte, field document.Field, fieldIndex uint16, rows []index.IndexRow, backIndexStoredEntries []*BackIndexStoreEntry) ([]index.IndexRow, []*BackIndexStoreEntry) {
 	fieldType := encodeFieldType(field)
 	storedRow := NewStoredRow(docNum, fieldIndex, field.ArrayPositions(), fieldType, field.Value())
 
@@ -643,7 +643,7 @@ func encodeFieldType(f document.Field) byte {
 	return fieldType
 }
 
-func (udc *SmolderingCouch) indexField(docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermsEntries []*BackIndexTermsEntry) ([]index.IndexRow, []*BackIndexTermsEntry) {
+func (udc *SmolderingCouch) indexField(docNum []byte, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermsEntries []*BackIndexTermsEntry) ([]index.IndexRow, []*BackIndexTermsEntry) {
 	fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
 
 	terms := make([]string, 0, len(tokenFreqs))