0
0
Fork 0

update term freq rows to use varint where possible

benchmark old ns/op new ns/op delta
BenchmarkLevelDBIndexing1Workers 1138292 657901 -42.20%
BenchmarkLevelDBIndexing2Workers 1619323 647628 -60.01%
BenchmarkLevelDBIndexing4Workers 1172845 636478 -45.73%
BenchmarkLevelDBIndexing1Workers10Batch 465556545 448153394 -3.74%
BenchmarkLevelDBIndexing2Workers10Batch 504203911 449657355 -10.82%
BenchmarkLevelDBIndexing4Workers10Batch 510766435 439839335 -13.89%
BenchmarkLevelDBIndexing1Workers100Batch 307657846 268976464 -12.57%
BenchmarkLevelDBIndexing2Workers100Batch 302257400 269110215 -10.97%
BenchmarkLevelDBIndexing4Workers100Batch 305320485 259084902 -15.14%
BenchmarkLevelDBIndexing1Workers1000Batch 301320576 258070231 -14.35%
BenchmarkLevelDBIndexing2Workers1000Batch 334174454 261175641 -21.84%
BenchmarkLevelDBIndexing4Workers1000Batch 267732436 261461739 -2.34%

closes #165
This commit is contained in:
Marty Schoch 2015-03-06 13:00:53 -05:00
parent ee1210bc82
commit a2ad7634f2
3 changed files with 87 additions and 26 deletions

View File

@@ -234,22 +234,22 @@ func (tfr *TermFrequencyRow) SummaryKey() []byte {
}
func (tfr *TermFrequencyRow) Value() []byte {
buf := make([]byte, 8+4+(len(tfr.vectors)*(2+8+8+8)))
used := 0
buf := make([]byte, 8+8+(len(tfr.vectors)*(8+8+8+8)))
binary.LittleEndian.PutUint64(buf[0:8], tfr.freq)
used += binary.PutUvarint(buf[used:used+8], tfr.freq)
normuint32 := math.Float32bits(tfr.norm)
binary.LittleEndian.PutUint32(buf[8:12], normuint32)
newbuf := buf[used : used+8]
used += binary.PutUvarint(newbuf, uint64(normuint32))
offset := 12
for _, vector := range tfr.vectors {
binary.LittleEndian.PutUint16(buf[offset:offset+2], vector.field)
binary.LittleEndian.PutUint64(buf[offset+2:offset+10], vector.pos)
binary.LittleEndian.PutUint64(buf[offset+10:offset+18], vector.start)
binary.LittleEndian.PutUint64(buf[offset+18:offset+26], vector.end)
offset += 26
used += binary.PutUvarint(buf[used:used+8], uint64(vector.field))
used += binary.PutUvarint(buf[used:used+8], vector.pos)
used += binary.PutUvarint(buf[used:used+8], vector.start)
used += binary.PutUvarint(buf[used:used+8], vector.end)
}
return buf
return buf[0:used]
}
func (tfr *TermFrequencyRow) String() string {
@@ -309,43 +309,56 @@ func NewTermFrequencyRowK(key []byte) (*TermFrequencyRow, error) {
func (tfr *TermFrequencyRow) parseV(value []byte) error {
buf := bytes.NewBuffer((value))
err := binary.Read(buf, binary.LittleEndian, &tfr.freq)
freq, err := binary.ReadUvarint(buf)
if err != nil {
return err
}
err = binary.Read(buf, binary.LittleEndian, &tfr.norm)
tfr.freq = freq
norm, err := binary.ReadUvarint(buf)
if err != nil {
return err
}
var field uint16
err = binary.Read(buf, binary.LittleEndian, &field)
tfr.norm = math.Float32frombits(uint32(norm))
field, err := binary.ReadUvarint(buf)
if err != nil && err != io.EOF {
return err
}
for err != io.EOF {
tv := TermVector{}
tv.field = field
tv.field = uint16(field)
// at this point we expect at least one term vector
if tfr.vectors == nil {
tfr.vectors = make([]*TermVector, 0)
}
err = binary.Read(buf, binary.LittleEndian, &tv.pos)
var pos uint64
pos, err = binary.ReadUvarint(buf)
if err != nil {
return err
}
err = binary.Read(buf, binary.LittleEndian, &tv.start)
tv.pos = pos
var start uint64
start, err = binary.ReadUvarint(buf)
if err != nil {
return err
}
err = binary.Read(buf, binary.LittleEndian, &tv.end)
tv.start = start
var end uint64
end, err = binary.ReadUvarint(buf)
if err != nil {
return err
}
tv.end = end
tfr.vectors = append(tfr.vectors, &tv)
// try to read next record (may not exist)
err = binary.Read(buf, binary.LittleEndian, &field)
field, err = binary.ReadUvarint(buf)
}
return nil
}

View File

@@ -45,17 +45,23 @@ func TestRows(t *testing.T) {
{
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "", 3, 3.14),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator},
[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64},
[]byte{3, 195, 235, 163, 130, 4},
},
{
NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64},
[]byte{3, 195, 235, 163, 130, 4},
},
{
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 51, 0, 0, 0, 0, 0, 0, 0},
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 2, 23, 31, 0, 3, 43, 51},
},
// test larger varints
{
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 150, 17, 23, 31, 0, 3, 43, 51},
},
{
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
@@ -85,14 +91,14 @@ func TestRows(t *testing.T) {
}
// test going from struct to k/v bytes
for _, test := range tests {
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v", test.outVal, rv)
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
@@ -100,7 +106,7 @@ func TestRows(t *testing.T) {
for i, test := range tests {
row, err := ParseFromKeyValue(test.outKey, test.outVal)
if err != nil {
t.Error(err)
t.Errorf("error parsking key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
@@ -213,3 +219,45 @@ func TestInvalidRows(t *testing.T) {
}
}
}
func BenchmarkTermFrequencyRowEncode(b *testing.B) {
for i := 0; i < b.N; i++ {
row := NewTermFrequencyRowWithTermVectors(
[]byte{'b', 'e', 'e', 'r'},
0,
"budweiser",
3,
3.14,
[]*TermVector{
&TermVector{
field: 0,
pos: 1,
start: 3,
end: 11,
},
&TermVector{
field: 0,
pos: 2,
start: 23,
end: 31,
},
&TermVector{
field: 0,
pos: 3,
start: 43,
end: 51,
},
})
row.Key()
row.Value()
}
}
// BenchmarkTermFrequencyRowDecode measures parsing a term frequency row from
// its serialized key and value.
//
// The value uses the uvarint layout: frequency, norm bits, then one
// (field, pos, start, end) group per term vector. The bytes are the same ones
// the TestRows table expects for this row (freq 3, norm 3.14, three vectors).
// Feeding the previous fixed-width bytes to the uvarint parser decodes
// meaningless vectors and ends in an error, so the benchmark would time a
// failure path instead of a real decode.
func BenchmarkTermFrequencyRowDecode(b *testing.B) {
	for i := 0; i < b.N; i++ {
		k := []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}
		v := []byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 2, 23, 31, 0, 3, 43, 51}
		// Fail loudly if the fixture ever drifts from the on-disk format
		// again, rather than silently benchmarking the error path.
		if _, err := NewTermFrequencyRowKV(k, v); err != nil {
			b.Fatal(err)
		}
	}
}

View File

@@ -27,7 +27,7 @@ import (
var VersionKey = []byte{'v'}
const Version uint8 = 2
const Version uint8 = 3
var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version)