update term freq rows to use varint where possible
benchmark                                    old ns/op     new ns/op     delta
BenchmarkLevelDBIndexing1Workers             1138292       657901        -42.20%
BenchmarkLevelDBIndexing2Workers             1619323       647628        -60.01%
BenchmarkLevelDBIndexing4Workers             1172845       636478        -45.73%
BenchmarkLevelDBIndexing1Workers10Batch      465556545     448153394     -3.74%
BenchmarkLevelDBIndexing2Workers10Batch      504203911     449657355     -10.82%
BenchmarkLevelDBIndexing4Workers10Batch      510766435     439839335     -13.89%
BenchmarkLevelDBIndexing1Workers100Batch     307657846     268976464     -12.57%
BenchmarkLevelDBIndexing2Workers100Batch     302257400     269110215     -10.97%
BenchmarkLevelDBIndexing4Workers100Batch     305320485     259084902     -15.14%
BenchmarkLevelDBIndexing1Workers1000Batch    301320576     258070231     -14.35%
BenchmarkLevelDBIndexing2Workers1000Batch    334174454     261175641     -21.84%
BenchmarkLevelDBIndexing4Workers1000Batch    267732436     261461739     -2.34%

closes #165
This commit is contained in:
parent
ee1210bc82
commit
a2ad7634f2
|
@ -234,22 +234,22 @@ func (tfr *TermFrequencyRow) SummaryKey() []byte {
|
|||
}
|
||||
|
||||
func (tfr *TermFrequencyRow) Value() []byte {
|
||||
buf := make([]byte, 8+4+(len(tfr.vectors)*(2+8+8+8)))
|
||||
used := 0
|
||||
buf := make([]byte, 8+8+(len(tfr.vectors)*(8+8+8+8)))
|
||||
|
||||
binary.LittleEndian.PutUint64(buf[0:8], tfr.freq)
|
||||
used += binary.PutUvarint(buf[used:used+8], tfr.freq)
|
||||
|
||||
normuint32 := math.Float32bits(tfr.norm)
|
||||
binary.LittleEndian.PutUint32(buf[8:12], normuint32)
|
||||
newbuf := buf[used : used+8]
|
||||
used += binary.PutUvarint(newbuf, uint64(normuint32))
|
||||
|
||||
offset := 12
|
||||
for _, vector := range tfr.vectors {
|
||||
binary.LittleEndian.PutUint16(buf[offset:offset+2], vector.field)
|
||||
binary.LittleEndian.PutUint64(buf[offset+2:offset+10], vector.pos)
|
||||
binary.LittleEndian.PutUint64(buf[offset+10:offset+18], vector.start)
|
||||
binary.LittleEndian.PutUint64(buf[offset+18:offset+26], vector.end)
|
||||
offset += 26
|
||||
used += binary.PutUvarint(buf[used:used+8], uint64(vector.field))
|
||||
used += binary.PutUvarint(buf[used:used+8], vector.pos)
|
||||
used += binary.PutUvarint(buf[used:used+8], vector.start)
|
||||
used += binary.PutUvarint(buf[used:used+8], vector.end)
|
||||
}
|
||||
return buf
|
||||
return buf[0:used]
|
||||
}
|
||||
|
||||
func (tfr *TermFrequencyRow) String() string {
|
||||
|
@ -309,43 +309,56 @@ func NewTermFrequencyRowK(key []byte) (*TermFrequencyRow, error) {
|
|||
|
||||
func (tfr *TermFrequencyRow) parseV(value []byte) error {
|
||||
buf := bytes.NewBuffer((value))
|
||||
err := binary.Read(buf, binary.LittleEndian, &tfr.freq)
|
||||
|
||||
freq, err := binary.ReadUvarint(buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = binary.Read(buf, binary.LittleEndian, &tfr.norm)
|
||||
tfr.freq = freq
|
||||
|
||||
norm, err := binary.ReadUvarint(buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var field uint16
|
||||
err = binary.Read(buf, binary.LittleEndian, &field)
|
||||
tfr.norm = math.Float32frombits(uint32(norm))
|
||||
|
||||
field, err := binary.ReadUvarint(buf)
|
||||
if err != nil && err != io.EOF {
|
||||
return err
|
||||
}
|
||||
for err != io.EOF {
|
||||
tv := TermVector{}
|
||||
tv.field = field
|
||||
tv.field = uint16(field)
|
||||
// at this point we expect at least one term vector
|
||||
if tfr.vectors == nil {
|
||||
tfr.vectors = make([]*TermVector, 0)
|
||||
}
|
||||
|
||||
err = binary.Read(buf, binary.LittleEndian, &tv.pos)
|
||||
var pos uint64
|
||||
pos, err = binary.ReadUvarint(buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = binary.Read(buf, binary.LittleEndian, &tv.start)
|
||||
tv.pos = pos
|
||||
|
||||
var start uint64
|
||||
start, err = binary.ReadUvarint(buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = binary.Read(buf, binary.LittleEndian, &tv.end)
|
||||
tv.start = start
|
||||
|
||||
var end uint64
|
||||
end, err = binary.ReadUvarint(buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tv.end = end
|
||||
|
||||
tfr.vectors = append(tfr.vectors, &tv)
|
||||
// try to read next record (may not exist)
|
||||
err = binary.Read(buf, binary.LittleEndian, &field)
|
||||
field, err = binary.ReadUvarint(buf)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -45,17 +45,23 @@ func TestRows(t *testing.T) {
|
|||
{
|
||||
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "", 3, 3.14),
|
||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator},
|
||||
[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64},
|
||||
[]byte{3, 195, 235, 163, 130, 4},
|
||||
},
|
||||
{
|
||||
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14),
|
||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
||||
[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64},
|
||||
[]byte{3, 195, 235, 163, 130, 4},
|
||||
},
|
||||
{
|
||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
|
||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
||||
[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 51, 0, 0, 0, 0, 0, 0, 0},
|
||||
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 2, 23, 31, 0, 3, 43, 51},
|
||||
},
|
||||
// test larger varints
|
||||
{
|
||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
|
||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
||||
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 150, 17, 23, 31, 0, 3, 43, 51},
|
||||
},
|
||||
{
|
||||
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
|
||||
|
@ -85,14 +91,14 @@ func TestRows(t *testing.T) {
|
|||
}
|
||||
|
||||
// test going from struct to k/v bytes
|
||||
for _, test := range tests {
|
||||
for i, test := range tests {
|
||||
rk := test.input.Key()
|
||||
if !reflect.DeepEqual(rk, test.outKey) {
|
||||
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
|
||||
}
|
||||
rv := test.input.Value()
|
||||
if !reflect.DeepEqual(rv, test.outVal) {
|
||||
t.Errorf("Expected value to be %v got: %v", test.outVal, rv)
|
||||
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -100,7 +106,7 @@ func TestRows(t *testing.T) {
|
|||
for i, test := range tests {
|
||||
row, err := ParseFromKeyValue(test.outKey, test.outVal)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
t.Errorf("error parsking key/value: %v", err)
|
||||
}
|
||||
if !reflect.DeepEqual(row, test.input) {
|
||||
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
|
||||
|
@ -213,3 +219,45 @@ func TestInvalidRows(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkTermFrequencyRowEncode(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
row := NewTermFrequencyRowWithTermVectors(
|
||||
[]byte{'b', 'e', 'e', 'r'},
|
||||
0,
|
||||
"budweiser",
|
||||
3,
|
||||
3.14,
|
||||
[]*TermVector{
|
||||
&TermVector{
|
||||
field: 0,
|
||||
pos: 1,
|
||||
start: 3,
|
||||
end: 11,
|
||||
},
|
||||
&TermVector{
|
||||
field: 0,
|
||||
pos: 2,
|
||||
start: 23,
|
||||
end: 31,
|
||||
},
|
||||
&TermVector{
|
||||
field: 0,
|
||||
pos: 3,
|
||||
start: 43,
|
||||
end: 51,
|
||||
},
|
||||
})
|
||||
|
||||
row.Key()
|
||||
row.Value()
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkTermFrequencyRowDecode(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
k := []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}
|
||||
v := []byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 51, 0, 0, 0, 0, 0, 0, 0}
|
||||
NewTermFrequencyRowKV(k, v)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ import (
|
|||
|
||||
var VersionKey = []byte{'v'}
|
||||
|
||||
const Version uint8 = 2
|
||||
const Version uint8 = 3
|
||||
|
||||
var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version)
|
||||
|
||||
|
|
Loading…
Reference in New Issue