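Slight tweak to the API so that docNum is encoded to docNumBytes only once: Analyze encodes the document number up front, and the row constructors, storeField, and indexField now take the pre-encoded bytes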
This commit is contained in:
parent
035b7c91fc
commit
23755049e8
|
@ -21,6 +21,8 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
|
||||||
Rows: make([]index.IndexRow, 0, 100),
|
Rows: make([]index.IndexRow, 0, 100),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
docNumBytes := EncodeUvarintAscending(nil, d.Number)
|
||||||
|
|
||||||
// track our back index entries
|
// track our back index entries
|
||||||
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
|
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
|
||||||
|
|
||||||
|
@ -42,7 +44,7 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
|
||||||
}, nil, false)
|
}, nil, false)
|
||||||
// store the _id field as well
|
// store the _id field as well
|
||||||
f := document.NewTextField("_id", nil, []byte(idBytes))
|
f := document.NewTextField("_id", nil, []byte(idBytes))
|
||||||
rv.Rows, backIndexStoredEntries = udc.storeField(d.Number, f, 0, rv.Rows, backIndexStoredEntries)
|
rv.Rows, backIndexStoredEntries = udc.storeField(docNumBytes, f, 0, rv.Rows, backIndexStoredEntries)
|
||||||
|
|
||||||
analyzeField := func(field document.Field, storable bool) {
|
analyzeField := func(field document.Field, storable bool) {
|
||||||
fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
|
fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
|
||||||
|
@ -65,7 +67,7 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
|
||||||
}
|
}
|
||||||
|
|
||||||
if storable && field.Options().IsStored() {
|
if storable && field.Options().IsStored() {
|
||||||
rv.Rows, backIndexStoredEntries = udc.storeField(d.Number, field, fieldIndex, rv.Rows, backIndexStoredEntries)
|
rv.Rows, backIndexStoredEntries = udc.storeField(docNumBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -110,11 +112,11 @@ func (udc *SmolderingCouch) Analyze(d *document.Document) *index.AnalysisResult
|
||||||
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
|
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
|
||||||
|
|
||||||
// encode this field
|
// encode this field
|
||||||
rv.Rows, backIndexTermsEntries = udc.indexField(d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)
|
rv.Rows, backIndexTermsEntries = udc.indexField(docNumBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)
|
||||||
}
|
}
|
||||||
|
|
||||||
// build the back index row
|
// build the back index row
|
||||||
backIndexRow := NewBackIndexRow(d.Number, backIndexTermsEntries, backIndexStoredEntries)
|
backIndexRow := NewBackIndexRow(docNumBytes, backIndexTermsEntries, backIndexStoredEntries)
|
||||||
rv.Rows = append(rv.Rows, backIndexRow)
|
rv.Rows = append(rv.Rows, backIndexRow)
|
||||||
|
|
||||||
return rv
|
return rv
|
||||||
|
|
|
@ -462,11 +462,11 @@ func (tfr *TermFrequencyRow) String() string {
|
||||||
return fmt.Sprintf("Term: `%s` Field: %d Document: %d Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, dn, tfr.freq, tfr.norm, tfr.vectors)
|
return fmt.Sprintf("Term: `%s` Field: %d Document: %d Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, dn, tfr.freq, tfr.norm, tfr.vectors)
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewTermFrequencyRow(term []byte, field uint16, docNum uint64, freq uint64, norm float32) *TermFrequencyRow {
|
func NewTermFrequencyRow(term []byte, field uint16, docNum []byte, freq uint64, norm float32) *TermFrequencyRow {
|
||||||
return &TermFrequencyRow{
|
return &TermFrequencyRow{
|
||||||
term: term,
|
term: term,
|
||||||
field: field,
|
field: field,
|
||||||
docNumber: EncodeUvarintAscending(nil, docNum),
|
docNumber: docNum,
|
||||||
freq: freq,
|
freq: freq,
|
||||||
norm: norm,
|
norm: norm,
|
||||||
}
|
}
|
||||||
|
@ -500,11 +500,11 @@ func TermFrequencyRowStartField(field uint16) []byte {
|
||||||
return buf
|
return buf
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docNum uint64, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
|
func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, docNum []byte, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow {
|
||||||
return &TermFrequencyRow{
|
return &TermFrequencyRow{
|
||||||
term: term,
|
term: term,
|
||||||
field: field,
|
field: field,
|
||||||
docNumber: EncodeUvarintAscending(nil, docNum),
|
docNumber: docNum,
|
||||||
freq: freq,
|
freq: freq,
|
||||||
norm: norm,
|
norm: norm,
|
||||||
vectors: vectors,
|
vectors: vectors,
|
||||||
|
@ -722,9 +722,9 @@ func (br *BackIndexRow) String() string {
|
||||||
return fmt.Sprintf("Backindex Document: %d Terms Entries: %v, Stored Entries: %v", dn, br.termsEntries, br.storedEntries)
|
return fmt.Sprintf("Backindex Document: %d Terms Entries: %v, Stored Entries: %v", dn, br.termsEntries, br.storedEntries)
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewBackIndexRow(docNum uint64, entries []*BackIndexTermsEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
|
func NewBackIndexRow(docNum []byte, entries []*BackIndexTermsEntry, storedFields []*BackIndexStoreEntry) *BackIndexRow {
|
||||||
return &BackIndexRow{
|
return &BackIndexRow{
|
||||||
docNumber: EncodeUvarintAscending(nil, docNum),
|
docNumber: docNum,
|
||||||
termsEntries: entries,
|
termsEntries: entries,
|
||||||
storedEntries: storedFields,
|
storedEntries: storedFields,
|
||||||
}
|
}
|
||||||
|
@ -825,9 +825,9 @@ func (s *StoredRow) ScanPrefixForDoc() []byte {
|
||||||
return buf
|
return buf
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewStoredRow(docNum uint64, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
|
func NewStoredRow(docNum []byte, field uint16, arrayPositions []uint64, typ byte, value []byte) *StoredRow {
|
||||||
return &StoredRow{
|
return &StoredRow{
|
||||||
docNumber: EncodeUvarintAscending(nil, docNum),
|
docNumber: docNum,
|
||||||
field: field,
|
field: field,
|
||||||
arrayPositions: arrayPositions,
|
arrayPositions: arrayPositions,
|
||||||
typ: typ,
|
typ: typ,
|
||||||
|
|
|
@ -18,6 +18,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestRows(t *testing.T) {
|
func TestRows(t *testing.T) {
|
||||||
|
docNumBytes1 := EncodeUvarintAscending(nil, 1)
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
input SmolderingCouchRow
|
input SmolderingCouchRow
|
||||||
outKey []byte
|
outKey []byte
|
||||||
|
@ -49,39 +50,39 @@ func TestRows(t *testing.T) {
|
||||||
[]byte{27},
|
[]byte{27},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, 1, 3, 3.14),
|
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 3, 3.14),
|
||||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
||||||
[]byte{3, 195, 235, 163, 130, 4},
|
[]byte{3, 195, 235, 163, 130, 4},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, 1, 3, 3.14),
|
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 3, 3.14),
|
||||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
||||||
[]byte{3, 195, 235, 163, 130, 4},
|
[]byte{3, 195, 235, 163, 130, 4},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, 1, 3, 3.14, []*TermVector{{field: 0, pos: 1, start: 3, end: 11}, {field: 0, pos: 2, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
|
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 3, 3.14, []*TermVector{{field: 0, pos: 1, start: 3, end: 11}, {field: 0, pos: 2, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
|
||||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
||||||
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
|
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
|
||||||
},
|
},
|
||||||
// test larger varints
|
// test larger varints
|
||||||
{
|
{
|
||||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, 1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11}, {field: 0, pos: 2198, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
|
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11}, {field: 0, pos: 2198, start: 23, end: 31}, {field: 0, pos: 3, start: 43, end: 51}}),
|
||||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
||||||
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
|
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
|
||||||
},
|
},
|
||||||
// test vectors with arrayPositions
|
// test vectors with arrayPositions
|
||||||
{
|
{
|
||||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, 1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, {field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, {field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
|
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, docNumBytes1, 25896, 3.14, []*TermVector{{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, {field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, {field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
|
||||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 137},
|
||||||
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
|
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
NewBackIndexRow(1, []*BackIndexTermsEntry{{Field: proto.Uint32(0), Terms: []string{"beer"}}}, nil),
|
NewBackIndexRow(docNumBytes1, []*BackIndexTermsEntry{{Field: proto.Uint32(0), Terms: []string{"beer"}}}, nil),
|
||||||
[]byte{'b', 137},
|
[]byte{'b', 137},
|
||||||
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r'},
|
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r'},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
NewBackIndexRow(1, []*BackIndexTermsEntry{
|
NewBackIndexRow(docNumBytes1, []*BackIndexTermsEntry{
|
||||||
{
|
{
|
||||||
Field: proto.Uint32(0),
|
Field: proto.Uint32(0),
|
||||||
Terms: []string{"beer"},
|
Terms: []string{"beer"},
|
||||||
|
@ -95,7 +96,7 @@ func TestRows(t *testing.T) {
|
||||||
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't'},
|
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't'},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
NewBackIndexRow(1,
|
NewBackIndexRow(docNumBytes1,
|
||||||
[]*BackIndexTermsEntry{
|
[]*BackIndexTermsEntry{
|
||||||
{
|
{
|
||||||
Field: proto.Uint32(0),
|
Field: proto.Uint32(0),
|
||||||
|
@ -122,12 +123,12 @@ func TestRows(t *testing.T) {
|
||||||
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't', 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
|
[]byte{10, 8, 8, 0, 18, 4, 'b', 'e', 'e', 'r', 10, 8, 8, 1, 18, 4, 'b', 'e', 'a', 't', 18, 2, 8, 3, 18, 2, 8, 4, 18, 2, 8, 5},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
NewStoredRow(1, 0, []uint64{}, byte('t'), []byte("an american beer")),
|
NewStoredRow(docNumBytes1, 0, []uint64{}, byte('t'), []byte("an american beer")),
|
||||||
[]byte{'s', 137, ByteSeparator, 0, 0},
|
[]byte{'s', 137, ByteSeparator, 0, 0},
|
||||||
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
|
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
NewStoredRow(1, 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
|
NewStoredRow(docNumBytes1, 0, []uint64{2, 294, 3078}, byte('t'), []byte("an american beer")),
|
||||||
[]byte{'s', 137, ByteSeparator, 0, 0, 2, 166, 2, 134, 24},
|
[]byte{'s', 137, ByteSeparator, 0, 0, 2, 166, 2, 134, 24},
|
||||||
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
|
[]byte{'t', 'a', 'n', ' ', 'a', 'm', 'e', 'r', 'i', 'c', 'a', 'n', ' ', 'b', 'e', 'e', 'r'},
|
||||||
},
|
},
|
||||||
|
@ -287,10 +288,11 @@ func TestDictionaryRowValueBug197(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func BenchmarkTermFrequencyRowEncode(b *testing.B) {
|
func BenchmarkTermFrequencyRowEncode(b *testing.B) {
|
||||||
|
docNumBytes1 := EncodeUvarintAscending(nil, 1)
|
||||||
row := NewTermFrequencyRowWithTermVectors(
|
row := NewTermFrequencyRowWithTermVectors(
|
||||||
[]byte{'b', 'e', 'e', 'r'},
|
[]byte{'b', 'e', 'e', 'r'},
|
||||||
0,
|
0,
|
||||||
1,
|
docNumBytes1,
|
||||||
3,
|
3,
|
||||||
3.14,
|
3.14,
|
||||||
[]*TermVector{
|
[]*TermVector{
|
||||||
|
@ -333,9 +335,10 @@ func BenchmarkTermFrequencyRowDecode(b *testing.B) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func BenchmarkBackIndexRowEncode(b *testing.B) {
|
func BenchmarkBackIndexRowEncode(b *testing.B) {
|
||||||
|
docNumBytes1 := EncodeUvarintAscending(nil, 1)
|
||||||
field := uint32(1)
|
field := uint32(1)
|
||||||
t1 := "term1"
|
t1 := "term1"
|
||||||
row := NewBackIndexRow(1,
|
row := NewBackIndexRow(docNumBytes1,
|
||||||
[]*BackIndexTermsEntry{
|
[]*BackIndexTermsEntry{
|
||||||
{
|
{
|
||||||
Field: &field,
|
Field: &field,
|
||||||
|
@ -367,7 +370,8 @@ func BenchmarkBackIndexRowDecode(b *testing.B) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func BenchmarkStoredRowEncode(b *testing.B) {
|
func BenchmarkStoredRowEncode(b *testing.B) {
|
||||||
row := NewStoredRow(1, 0, []uint64{}, byte('t'), []byte("an american beer"))
|
docNumBytes1 := EncodeUvarintAscending(nil, 1)
|
||||||
|
row := NewStoredRow(docNumBytes1, 0, []uint64{}, byte('t'), []byte("an american beer"))
|
||||||
b.ResetTimer()
|
b.ResetTimer()
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
row.Key()
|
row.Key()
|
||||||
|
|
|
@ -616,7 +616,7 @@ func (udc *SmolderingCouch) mergeOldAndNew(externalDocId string, backIndexRow *B
|
||||||
return addRows, updateRows, deleteRows
|
return addRows, updateRows, deleteRows
|
||||||
}
|
}
|
||||||
|
|
||||||
func (udc *SmolderingCouch) storeField(docNum uint64, field document.Field, fieldIndex uint16, rows []index.IndexRow, backIndexStoredEntries []*BackIndexStoreEntry) ([]index.IndexRow, []*BackIndexStoreEntry) {
|
func (udc *SmolderingCouch) storeField(docNum []byte, field document.Field, fieldIndex uint16, rows []index.IndexRow, backIndexStoredEntries []*BackIndexStoreEntry) ([]index.IndexRow, []*BackIndexStoreEntry) {
|
||||||
fieldType := encodeFieldType(field)
|
fieldType := encodeFieldType(field)
|
||||||
storedRow := NewStoredRow(docNum, fieldIndex, field.ArrayPositions(), fieldType, field.Value())
|
storedRow := NewStoredRow(docNum, fieldIndex, field.ArrayPositions(), fieldType, field.Value())
|
||||||
|
|
||||||
|
@ -643,7 +643,7 @@ func encodeFieldType(f document.Field) byte {
|
||||||
return fieldType
|
return fieldType
|
||||||
}
|
}
|
||||||
|
|
||||||
func (udc *SmolderingCouch) indexField(docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermsEntries []*BackIndexTermsEntry) ([]index.IndexRow, []*BackIndexTermsEntry) {
|
func (udc *SmolderingCouch) indexField(docNum []byte, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow, backIndexTermsEntries []*BackIndexTermsEntry) ([]index.IndexRow, []*BackIndexTermsEntry) {
|
||||||
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
|
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
|
||||||
|
|
||||||
terms := make([]string, 0, len(tokenFreqs))
|
terms := make([]string, 0, len(tokenFreqs))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user