Merge pull request #815 from blevesearch/loadchunk_minor

minor optimisation to loadChunk method
2018-03-16 08:15:37 +05:30 · 2018-03-16 08:15:37 +05:30 · 23cebae5a8
parent d1b84d4578 d1155c223a
commit 23cebae5a8
7 changed files with 279 additions and 50 deletions
--- a/index/scorch/segment/zap/build.go
+++ b/index/scorch/segment/zap/build.go
@ -22,7 +22,7 @@ import (
 	"github.com/Smerity/govarint"
 )

-const version uint32 = 4
+const version uint32 = 5

 const fieldNotUninverted = math.MaxUint64

--- a/index/scorch/segment/zap/contentcoder.go
+++ b/index/scorch/segment/zap/contentcoder.go
@ -156,9 +156,11 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) {
 	if err != nil {
 		return tw, err
 	}
-	// write out the chunk lens
-	for _, chunkLen := range c.chunkLens {
-		n := binary.PutUvarint(buf, uint64(chunkLen))
+
+	chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
+	// write out the chunk offsets
+	for _, chunkOffset := range chunkOffsets {
+		n := binary.PutUvarint(buf, chunkOffset)
 		nw, err = w.Write(buf[:n])
 		tw += nw
 		if err != nil {
--- a/index/scorch/segment/zap/contentcoder_test.go
+++ b/index/scorch/segment/zap/contentcoder_test.go
@ -46,7 +46,7 @@ func TestChunkContentCoder(t *testing.T) {
 				[]byte("scorch"),
 			},

-			expected: string([]byte{0x02, 0x0c, 0x0c, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14,
+			expected: string([]byte{0x02, 0x0c, 0x18, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14,
 				0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x00, 0x06, 0x06,
 				0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68}),
 		},
--- a/index/scorch/segment/zap/docvalues.go
+++ b/index/scorch/segment/zap/docvalues.go
@ -38,7 +38,7 @@ type docValueIterator struct {
 	field          string
 	curChunkNum    uint64
 	numChunks      uint64
-	chunkLens      []uint64
+	chunkOffsets   []uint64
 	dvDataLoc      uint64
 	curChunkHeader []MetaData
 	curChunkData   []byte // compressed data cache
@ -47,7 +47,7 @@ type docValueIterator struct {
 func (di *docValueIterator) size() int {
 	return reflectStaticSizedocValueIterator + size.SizeOfPtr +
 		len(di.field) +
-		len(di.chunkLens)*size.SizeOfUint64 +
+		len(di.chunkOffsets)*size.SizeOfUint64 +
 		len(di.curChunkHeader)*reflectStaticSizeMetaData +
 		len(di.curChunkData)
 }
@ -69,7 +69,7 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string,
 	}

 	// read the number of chunks, chunk lengths
-	var offset, clen uint64
+	var offset, loc uint64
 	numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64])
 	if read <= 0 {
 		return nil, fmt.Errorf("failed to read the field "+
@ -78,16 +78,16 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string,
 	offset += uint64(read)

 	fdvIter := &docValueIterator{
-		curChunkNum: math.MaxUint64,
-		field:       field,
-		chunkLens:   make([]uint64, int(numChunks)),
+		curChunkNum:  math.MaxUint64,
+		field:        field,
+		chunkOffsets: make([]uint64, int(numChunks)),
 	}
 	for i := 0; i < int(numChunks); i++ {
-		clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64])
+		loc, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64])
 		if read <= 0 {
-			return nil, fmt.Errorf("corrupted chunk length during segment load")
+			return nil, fmt.Errorf("corrupted chunk offset during segment load")
 		}
-		fdvIter.chunkLens[i] = clen
+		fdvIter.chunkOffsets[i] = loc
 		offset += uint64(read)
 	}

@ -99,12 +99,11 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
 	localDocNum uint64, s *SegmentBase) error {
 	// advance to the chunk where the docValues
 	// reside for the given docNum
-	destChunkDataLoc := di.dvDataLoc
-	for i := 0; i < int(chunkNumber); i++ {
-		destChunkDataLoc += di.chunkLens[i]
-	}
+	destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
+	start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
+	destChunkDataLoc += start
+	curChunkEnd += end

-	curChunkSize := di.chunkLens[chunkNumber]
 	// read the number of docs reside in the chunk
 	numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
 	if read <= 0 {
@ -124,7 +123,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
 	}

 	compressedDataLoc := chunkMetaLoc + offset
-	dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc
+	dataLength := curChunkEnd - compressedDataLoc
 	di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
 	di.curChunkNum = chunkNumber
 	return nil
--- a/index/scorch/segment/zap/intcoder.go
+++ b/index/scorch/segment/zap/intcoder.go
@ -111,10 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
 	}
 	buf := c.buf

-	// write out the number of chunks & each chunkLen
-	n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
-	for _, chunkLen := range c.chunkLens {
-		n += binary.PutUvarint(buf[n:], uint64(chunkLen))
+	// convert the chunk lengths into chunk offsets
+	chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
+
+	// write out the number of chunks & each chunk's end offset
+	n := binary.PutUvarint(buf, uint64(len(chunkOffsets)))
+	for _, chunkOffset := range chunkOffsets {
+		n += binary.PutUvarint(buf[n:], chunkOffset)
 	}

 	tw, err := w.Write(buf[:n])
@ -134,3 +137,36 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
 func (c *chunkedIntCoder) FinalSize() int {
 	return len(c.final)
 }
+
+// modifyLengthsToEndOffsets rewrites the given chunk length slice
+// in place so that each entry holds the end offset of its chunk,
+// i.e. the running total of all lengths up to and including it,
+// and returns that same slice. No copy is made, so the caller's
+// slice no longer contains lengths once this returns.
+// readChunkBoundary recovers a chunk's span from the result: chunk
+// i starts at entry i-1 (or at zero for chunk 0) and ends at entry i.
+// eg:
+// Lens ->  5 5 5 5 => 5 10 15 20
+// Lens ->  0 5 0 5 => 0 5 5 10
+// Lens ->  0 0 0 5 => 0 0 0 5
+// Lens ->  5 0 0 0 => 5 5 5 5
+// Lens ->  0 5 0 0 => 0 5 5 5
+// Lens ->  0 0 5 0 => 0 0 5 5
+func modifyLengthsToEndOffsets(lengths []uint64) []uint64 {
+	// total is the sum of every length consumed so far, which is
+	// exactly the end offset of the chunk currently being visited
+	var total uint64
+	for pos, chunkLen := range lengths {
+		total += chunkLen
+		lengths[pos] = total
+	}
+	return lengths
+}
+
+// readChunkBoundary returns chunk's [start, end) span; chunk 0 starts at zero.
+func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) {
+	if chunk <= 0 {
+		return 0, offsets[chunk]
+	}
+	return offsets[chunk-1], offsets[chunk]
+}
--- a/index/scorch/segment/zap/intcoder_test.go
+++ b/index/scorch/segment/zap/intcoder_test.go
@ -46,8 +46,8 @@ func TestChunkIntCoder(t *testing.T) {
 				[]uint64{3},
 				[]uint64{7},
 			},
-			// 2 chunks, chunk-0 length 1, chunk-1 length 1, value 3, value 7
-			expected: []byte{0x2, 0x1, 0x1, 0x3, 0x7},
+			// 2 chunks, chunk-0 offset 1, chunk-1 offset 2, value 3, value 7
+			expected: []byte{0x2, 0x1, 0x2, 0x3, 0x7},
 		},
 	}

@ -71,3 +71,199 @@ func TestChunkIntCoder(t *testing.T) {
 		}
 	}
 }
+
+// TestChunkLengthToOffsets checks modifyLengthsToEndOffsets on a table of length slices.
+func TestChunkLengthToOffsets(t *testing.T) {
+	cases := []struct {
+		in   []uint64
+		want []uint64
+	}{
+		{
+			in:   []uint64{5, 5, 5, 5, 5},
+			want: []uint64{5, 10, 15, 20, 25},
+		},
+		{
+			in:   []uint64{0, 5, 0, 5, 0},
+			want: []uint64{0, 5, 5, 10, 10},
+		},
+		{
+			in:   []uint64{0, 0, 0, 0, 5},
+			want: []uint64{0, 0, 0, 0, 5},
+		},
+		{
+			in:   []uint64{5, 0, 0, 0, 0},
+			want: []uint64{5, 5, 5, 5, 5},
+		},
+		{
+			in:   []uint64{0, 5, 0, 0, 0},
+			want: []uint64{0, 5, 5, 5, 5},
+		},
+		{
+			in:   []uint64{0, 0, 0, 5, 0},
+			want: []uint64{0, 0, 0, 5, 5},
+		},
+		{
+			in:   []uint64{0, 0, 0, 5, 5},
+			want: []uint64{0, 0, 0, 5, 10},
+		},
+		{
+			in:   []uint64{5, 5, 5, 0, 0},
+			want: []uint64{5, 10, 15, 15, 15},
+		},
+		{
+			in:   []uint64{5},
+			want: []uint64{5},
+		},
+		{
+			in:   []uint64{5, 5},
+			want: []uint64{5, 10},
+		},
+	}
+	for n, tc := range cases {
+		modifyLengthsToEndOffsets(tc.in)
+		if reflect.DeepEqual(tc.want, tc.in) {
+			continue
+		}
+		t.Errorf("Test: %d failed, got %+v, expected %+v", n, tc.in, tc.want)
+	}
+}
+
+// TestChunkReadBoundaryFromOffsets checks readChunkBoundary on a table of offset slices.
+func TestChunkReadBoundaryFromOffsets(t *testing.T) {
+	cases := []struct {
+		offsets   []uint64
+		chunk     int
+		wantStart uint64
+		wantEnd   uint64
+	}{
+		{
+			offsets:   []uint64{5, 10, 15, 20, 25},
+			chunk:     4,
+			wantStart: 20,
+			wantEnd:   25,
+		},
+		{
+			offsets:   []uint64{5, 10, 15, 20, 25},
+			chunk:     0,
+			wantStart: 0,
+			wantEnd:   5,
+		},
+		{
+			offsets:   []uint64{5, 10, 15, 20, 25},
+			chunk:     2,
+			wantStart: 10,
+			wantEnd:   15,
+		},
+		{
+			offsets:   []uint64{0, 5, 5, 10, 10},
+			chunk:     4,
+			wantStart: 10,
+			wantEnd:   10,
+		},
+		{
+			offsets:   []uint64{0, 5, 5, 10, 10},
+			chunk:     1,
+			wantStart: 0,
+			wantEnd:   5,
+		},
+		{
+			offsets:   []uint64{5, 5, 5, 5, 5},
+			chunk:     0,
+			wantStart: 0,
+			wantEnd:   5,
+		},
+		{
+			offsets:   []uint64{5, 5, 5, 5, 5},
+			chunk:     4,
+			wantStart: 5,
+			wantEnd:   5,
+		},
+		{
+			offsets:   []uint64{5, 5, 5, 5, 5},
+			chunk:     1,
+			wantStart: 5,
+			wantEnd:   5,
+		},
+		{
+			offsets:   []uint64{0, 5, 5, 5, 5},
+			chunk:     1,
+			wantStart: 0,
+			wantEnd:   5,
+		},
+		{
+			offsets:   []uint64{0, 5, 5, 5, 5},
+			chunk:     0,
+			wantStart: 0,
+			wantEnd:   0,
+		},
+		{
+			offsets:   []uint64{0, 0, 0, 5, 5},
+			chunk:     2,
+			wantStart: 0,
+			wantEnd:   0,
+		},
+		{
+			offsets:   []uint64{0, 0, 0, 5, 5},
+			chunk:     1,
+			wantStart: 0,
+			wantEnd:   0,
+		},
+		{
+			offsets:   []uint64{0, 0, 0, 0, 5},
+			chunk:     4,
+			wantStart: 0,
+			wantEnd:   5,
+		},
+		{
+			offsets:   []uint64{0, 0, 0, 0, 5},
+			chunk:     2,
+			wantStart: 0,
+			wantEnd:   0,
+		},
+		{
+			offsets:   []uint64{5, 10, 15, 15, 15},
+			chunk:     0,
+			wantStart: 0,
+			wantEnd:   5,
+		},
+		{
+			offsets:   []uint64{5, 10, 15, 15, 15},
+			chunk:     1,
+			wantStart: 5,
+			wantEnd:   10,
+		},
+		{
+			offsets:   []uint64{5, 10, 15, 15, 15},
+			chunk:     2,
+			wantStart: 10,
+			wantEnd:   15,
+		},
+		{
+			offsets:   []uint64{5, 10, 15, 15, 15},
+			chunk:     3,
+			wantStart: 15,
+			wantEnd:   15,
+		},
+		{
+			offsets:   []uint64{5, 10, 15, 15, 15},
+			chunk:     4,
+			wantStart: 15,
+			wantEnd:   15,
+		},
+		{
+			offsets:   []uint64{5},
+			chunk:     0,
+			wantStart: 0,
+			wantEnd:   5,
+		},
+	}
+	for n, tc := range cases {
+		gotStart, gotEnd := readChunkBoundary(tc.chunk, tc.offsets)
+		if tc.wantStart == gotStart && tc.wantEnd == gotEnd {
+			continue
+		}
+		t.Errorf("Test: %d failed for chunkNumber: %d got start: %d end: %d,"+
+			" expected start: %d end: %d", n, tc.chunk, gotStart, gotEnd,
+			tc.wantStart, tc.wantEnd)
+	}
+}
--- a/index/scorch/segment/zap/posting.go
+++ b/index/scorch/segment/zap/posting.go
@ -189,9 +189,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
 	var numFreqChunks uint64
 	numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
 	n += uint64(read)
-	rv.freqChunkLens = make([]uint64, int(numFreqChunks))
+	rv.freqChunkOffsets = make([]uint64, int(numFreqChunks))
 	for i := 0; i < int(numFreqChunks); i++ {
-		rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
+		rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
 		n += uint64(read)
 	}
 	rv.freqChunkStart = p.freqOffset + n
@ -201,9 +201,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
 	var numLocChunks uint64
 	numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
 	n += uint64(read)
-	rv.locChunkLens = make([]uint64, int(numLocChunks))
+	rv.locChunkOffsets = make([]uint64, int(numLocChunks))
 	for i := 0; i < int(numLocChunks); i++ {
-		rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
+		rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
 		n += uint64(read)
 	}
 	rv.locChunkStart = p.locOffset + n
@ -316,11 +316,11 @@ type PostingsIterator struct {
 	locDecoder        *govarint.Base128Decoder
 	locReader         *bytes.Reader

-	freqChunkLens  []uint64
-	freqChunkStart uint64
+	freqChunkOffsets []uint64
+	freqChunkStart   uint64

-	locChunkLens  []uint64
-	locChunkStart uint64
+	locChunkOffsets []uint64
+	locChunkStart   uint64

 	locBitmap *roaring.Bitmap

@ -337,8 +337,8 @@ func (i *PostingsIterator) Size() int {
 	sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr +
 		len(i.currChunkFreqNorm) +
 		len(i.currChunkLoc) +
-		len(i.freqChunkLens)*size.SizeOfUint64 +
-		len(i.locChunkLens)*size.SizeOfUint64 +
+		len(i.freqChunkOffsets)*size.SizeOfUint64 +
+		len(i.locChunkOffsets)*size.SizeOfUint64 +
 		i.next.Size()

 	if i.locBitmap != nil {
@ -353,16 +353,14 @@ func (i *PostingsIterator) Size() int {
 }

 func (i *PostingsIterator) loadChunk(chunk int) error {
-	if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) {
-		return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens))
+	if chunk >= len(i.freqChunkOffsets) || chunk >= len(i.locChunkOffsets) {
+		return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkOffsets), len(i.locChunkOffsets))
 	}

-	// load freq chunk bytes
-	start := i.freqChunkStart
-	for j := 0; j < chunk; j++ {
-		start += i.freqChunkLens[j]
-	}
-	end := start + i.freqChunkLens[chunk]
+	end, start := i.freqChunkStart, i.freqChunkStart
+	s, e := readChunkBoundary(chunk, i.freqChunkOffsets)
+	start += s
+	end += e
 	i.currChunkFreqNorm = i.postings.sb.mem[start:end]
 	if i.freqNormReader == nil {
 		i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm)
@ -371,12 +369,10 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
 		i.freqNormReader.Reset(i.currChunkFreqNorm)
 	}

-	// load loc chunk bytes
-	start = i.locChunkStart
-	for j := 0; j < chunk; j++ {
-		start += i.locChunkLens[j]
-	}
-	end = start + i.locChunkLens[chunk]
+	end, start = i.locChunkStart, i.locChunkStart
+	s, e = readChunkBoundary(chunk, i.locChunkOffsets)
+	start += s
+	end += e
 	i.currChunkLoc = i.postings.sb.mem[start:end]
 	if i.locReader == nil {
 		i.locReader = bytes.NewReader(i.currChunkLoc)