
minor optimisation to loadChunk method

Sreekanth Sivasankaran 2018-03-09 16:01:37 +05:30
parent b38a61d4cf
commit d6522e7e17
5 changed files with 294 additions and 38 deletions
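
In outline, the optimisation replaces a linear walk over per-chunk lengths with two lookups into a precomputed offsets array: the coders now convert chunk lengths into cumulative starting offsets once at write time, so loadChunk (and loadDvChunk) can locate any chunk's byte range in constant time instead of summing all preceding lengths. A minimal sketch of the before/after boundary computation, with illustrative function names (the real code operates on iterator fields, as shown in the diffs below):

// Before: O(chunk). Sum the lengths of all preceding chunks.
func boundaryFromLengths(chunk int, lens []uint64) (start, end uint64) {
	for i := 0; i < chunk; i++ {
		start += lens[i]
	}
	return start, start + lens[chunk]
}

// After: O(1). offsets[i] holds the start of chunk i, and offsets[0]
// doubles as the length of the last chunk, since chunk 0 always starts at 0.
func boundaryFromOffsets(chunk int, offsets []uint64) (start, end uint64) {
	if chunk > 0 {
		start = offsets[chunk]
	}
	if chunk < len(offsets)-1 {
		end = offsets[chunk+1]
	} else {
		end = start + offsets[0]
	}
	return start, end
}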

View File

@@ -156,7 +156,12 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) {
if err != nil {
return tw, err
}
// write out the chunk lens
if len(c.chunkLens) > 1 {
chunkLengthsToOffsets(c.chunkLens)
}
// write out the chunk starting offsets
for _, chunkLen := range c.chunkLens {
n := binary.PutUvarint(buf, uint64(chunkLen))
nw, err = w.Write(buf[:n])

View File

@@ -38,7 +38,7 @@ type docValueIterator struct {
field string
curChunkNum uint64
numChunks uint64
chunkLens []uint64
chunkOffsets []uint64
dvDataLoc uint64
curChunkHeader []MetaData
curChunkData []byte // compressed data cache
@@ -47,7 +47,7 @@ type docValueIterator struct {
func (di *docValueIterator) size() int {
return reflectStaticSizedocValueIterator + size.SizeOfPtr +
len(di.field) +
len(di.chunkLens)*size.SizeOfUint64 +
len(di.chunkOffsets)*size.SizeOfUint64 +
len(di.curChunkHeader)*reflectStaticSizeMetaData +
len(di.curChunkData)
}
@@ -78,16 +78,16 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string,
offset += uint64(read)
fdvIter := &docValueIterator{
curChunkNum: math.MaxUint64,
field: field,
chunkLens: make([]uint64, int(numChunks)),
curChunkNum: math.MaxUint64,
field: field,
chunkOffsets: make([]uint64, int(numChunks)),
}
for i := 0; i < int(numChunks); i++ {
clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64])
if read <= 0 {
return nil, fmt.Errorf("corrupted chunk length during segment load")
}
fdvIter.chunkLens[i] = clen
fdvIter.chunkOffsets[i] = clen
offset += uint64(read)
}
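
Each offset is persisted as an unsigned varint and decoded one entry at a time, as in the loop above. A standalone round-trip sketch using encoding/binary (buffer handling simplified relative to the segment code):

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	offsets := []uint64{5, 5, 10, 15}

	// encode: one uvarint per offset, as the coders' Write methods do
	var out []byte
	buf := make([]byte, binary.MaxVarintLen64)
	for _, off := range offsets {
		n := binary.PutUvarint(buf, off)
		out = append(out, buf[:n]...)
	}

	// decode: mirrors the loop in loadFieldDocValueIterator
	var decoded []uint64
	for pos := 0; pos < len(out); {
		v, read := binary.Uvarint(out[pos:])
		if read <= 0 {
			panic("corrupted offset during decode")
		}
		decoded = append(decoded, v)
		pos += read
	}
	fmt.Println(decoded) // [5 5 10 15]
}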
@@ -99,12 +99,11 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
localDocNum uint64, s *SegmentBase) error {
// advance to the chunk where the docValues
// reside for the given docNum
destChunkDataLoc := di.dvDataLoc
for i := 0; i < int(chunkNumber); i++ {
destChunkDataLoc += di.chunkLens[i]
}
destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
destChunkDataLoc += start
curChunkEnd += end
curChunkSize := di.chunkLens[chunkNumber]
// read the number of docs residing in the chunk
numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
if read <= 0 {
@@ -124,7 +123,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
}
compressedDataLoc := chunkMetaLoc + offset
dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc
dataLength := curChunkEnd - compressedDataLoc
di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
di.curChunkNum = chunkNumber
return nil

View File

@@ -111,7 +111,12 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
}
buf := c.buf
// write out the number of chunks & each chunkLen
// convert the chunk lengths into starting chunk offsets
if len(c.chunkLens) > 1 {
chunkLengthsToOffsets(c.chunkLens)
}
// write out the number of chunks & each chunk's starting offset
n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
for _, chunkLen := range c.chunkLens {
n += binary.PutUvarint(buf[n:], uint64(chunkLen))
@@ -134,3 +139,42 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
func (c *chunkedIntCoder) FinalSize() int {
return len(c.final)
}
// chunkLengthsToOffsets converts the chunk length array
// into a chunk starting offset array, in place.
// readChunkBoundary then derives the start and end of any
// chunk from these offsets. Since the starting offset of
// the first chunk is always zero, that slot is reused to
// store the length of the current last item in the array
// at any given point.
// For example:
// Lens -> 5 5 5 5 => 5 5 10 15
// Lens -> 0 5 0 5 => 5 0 5 5
// Lens -> 0 0 0 5 => 5 0 0 0
// Lens -> 5 0 0 0 => 0 5 5 5
// Lens -> 0 5 0 0 => 0 0 5 5
// Lens -> 0 0 5 0 => 0 0 0 5
func chunkLengthsToOffsets(lengths []uint64) {
	lengths[1], lengths[0] = lengths[0], lengths[1]
	for i := 2; i < len(lengths); i++ {
		cur := lengths[i]
		lengths[i] = lengths[i-1] + lengths[0]
		lengths[0] = cur
	}
}
// readChunkBoundary returns the start and end byte positions of the
// given chunk, as encoded in the offsets built by chunkLengthsToOffsets.
func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) {
	var start, end uint64
	if chunk > 0 {
		start = offsets[chunk]
	}
	// single element case
	if chunk == 0 && len(offsets) == 1 {
		end = offsets[chunk]
	} else if chunk < len(offsets)-1 {
		end = offsets[chunk+1]
	} else { // for last element
		end = start + offsets[0]
	}
	return start, end
}
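
A short usage sketch of the two helpers together, with values chosen to exercise a zero-length chunk (assumes both functions as defined above, plus fmt):

	lens := []uint64{5, 0, 3}   // per-chunk byte lengths
	chunkLengthsToOffsets(lens) // lens becomes {3, 5, 5}: starts, with lens[0] holding the last length
	for chunk := 0; chunk < 3; chunk++ {
		start, end := readChunkBoundary(chunk, lens)
		fmt.Printf("chunk %d: [%d, %d)\n", chunk, start, end)
	}
	// chunk 0: [0, 5)   chunk 1: [5, 5)   chunk 2: [5, 8)

Note that the coders only apply the conversion when there is more than one chunk; a single-element array keeps its raw length, which the chunk == 0 && len(offsets) == 1 branch of readChunkBoundary handles directly.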

View File

@@ -71,3 +71,215 @@ func TestChunkIntCoder(t *testing.T) {
}
}
}
func TestChunkLengthToOffsets(t *testing.T) {
tests := []struct {
lengths []uint64
expectedOffsets []uint64
}{
{
lengths: []uint64{5, 5, 5, 5, 5},
expectedOffsets: []uint64{5, 5, 10, 15, 20},
},
{
lengths: []uint64{0, 5, 0, 5, 0},
expectedOffsets: []uint64{0, 0, 5, 5, 10},
},
{
lengths: []uint64{0, 0, 0, 0, 5},
expectedOffsets: []uint64{5, 0, 0, 0, 0},
},
{
lengths: []uint64{5, 0, 0, 0, 0},
expectedOffsets: []uint64{0, 5, 5, 5, 5},
},
{
lengths: []uint64{0, 5, 0, 0, 0},
expectedOffsets: []uint64{0, 0, 5, 5, 5},
},
{
lengths: []uint64{0, 0, 0, 5, 0},
expectedOffsets: []uint64{0, 0, 0, 0, 5},
},
{
lengths: []uint64{0, 0, 0, 5, 5},
expectedOffsets: []uint64{5, 0, 0, 0, 5},
},
{
lengths: []uint64{5, 5, 5, 0, 0},
expectedOffsets: []uint64{0, 5, 10, 15, 15},
},
}
for i, test := range tests {
chunkLengthsToOffsets(test.lengths)
if !reflect.DeepEqual(test.expectedOffsets, test.lengths) {
t.Errorf("Test: %d failed, got %+v, expected %+v", i, test.lengths, test.expectedOffsets)
}
}
}
func TestChunkReadBoundaryFromOffsets(t *testing.T) {
tests := []struct {
chunkNumber int
offsets []uint64
expectedStart uint64
expectedEnd uint64
}{
{
offsets: []uint64{5, 5, 10, 15, 20},
chunkNumber: 4,
expectedStart: 20,
expectedEnd: 25,
},
{
offsets: []uint64{5, 5, 10, 15, 20},
chunkNumber: 0,
expectedStart: 0,
expectedEnd: 5,
},
{
offsets: []uint64{5, 5, 10, 15, 20},
chunkNumber: 2,
expectedStart: 10,
expectedEnd: 15,
},
{
offsets: []uint64{0, 0, 5, 5, 10},
chunkNumber: 4,
expectedStart: 10,
expectedEnd: 10,
},
{
offsets: []uint64{0, 0, 5, 5, 10},
chunkNumber: 1,
expectedStart: 0,
expectedEnd: 5,
},
{
offsets: []uint64{5, 0, 0, 0, 0},
chunkNumber: 0,
expectedStart: 0,
expectedEnd: 0,
},
{
offsets: []uint64{5, 0, 0, 0, 0},
chunkNumber: 4,
expectedStart: 0,
expectedEnd: 5,
},
{
offsets: []uint64{5, 0, 0, 0, 0},
chunkNumber: 1,
expectedStart: 0,
expectedEnd: 0,
},
{
offsets: []uint64{0, 5, 5, 5, 5},
chunkNumber: 1,
expectedStart: 5,
expectedEnd: 5,
},
{
offsets: []uint64{0, 5, 5, 5, 5},
chunkNumber: 0,
expectedStart: 0,
expectedEnd: 5,
},
{
offsets: []uint64{0, 0, 5, 5, 5},
chunkNumber: 2,
expectedStart: 5,
expectedEnd: 5,
},
{
offsets: []uint64{0, 0, 5, 5, 5},
chunkNumber: 1,
expectedStart: 0,
expectedEnd: 5,
},
{
offsets: []uint64{0, 0, 0, 0, 5},
chunkNumber: 4,
expectedStart: 5,
expectedEnd: 5,
},
{
offsets: []uint64{0, 0, 0, 0, 5},
chunkNumber: 3,
expectedStart: 0,
expectedEnd: 5,
},
{
offsets: []uint64{0, 0, 0, 0, 5},
chunkNumber: 2,
expectedStart: 0,
expectedEnd: 0,
},
{
offsets: []uint64{5, 0, 0, 0, 5},
chunkNumber: 0,
expectedStart: 0,
expectedEnd: 0,
},
{
offsets: []uint64{5, 0, 0, 0, 5},
chunkNumber: 1,
expectedStart: 0,
expectedEnd: 0,
},
{
offsets: []uint64{5, 0, 0, 0, 5},
chunkNumber: 3,
expectedStart: 0,
expectedEnd: 5,
},
{
offsets: []uint64{5, 0, 0, 0, 5},
chunkNumber: 4,
expectedStart: 5,
expectedEnd: 10,
},
{
offsets: []uint64{0, 5, 10, 15, 15},
chunkNumber: 0,
expectedStart: 0,
expectedEnd: 5,
},
{
offsets: []uint64{0, 5, 10, 15, 15},
chunkNumber: 1,
expectedStart: 5,
expectedEnd: 10,
},
{
offsets: []uint64{0, 5, 10, 15, 15},
chunkNumber: 2,
expectedStart: 10,
expectedEnd: 15,
},
{
offsets: []uint64{0, 5, 10, 15, 15},
chunkNumber: 3,
expectedStart: 15,
expectedEnd: 15,
},
{
offsets: []uint64{0, 5, 10, 15, 15},
chunkNumber: 4,
expectedStart: 15,
expectedEnd: 15,
},
}
for i, test := range tests {
s, e := readChunkBoundary(test.chunkNumber, test.offsets)
if test.expectedStart != s || test.expectedEnd != e {
t.Errorf("Test: %d failed for chunkNumber: %d got start: %d end: %d,"+
" expected start: %d end: %d", i, test.chunkNumber, s, e,
test.expectedStart, test.expectedEnd)
}
}
}
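
Both table-driven tests above can be run in isolation from the package directory with Go's standard test runner, e.g.:

	go test -run 'TestChunkLengthToOffsets|TestChunkReadBoundaryFromOffsets' -v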

View File

@@ -163,9 +163,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
var numFreqChunks uint64
numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.freqChunkLens = make([]uint64, int(numFreqChunks))
rv.freqChunkOffsets = make([]uint64, int(numFreqChunks))
for i := 0; i < int(numFreqChunks); i++ {
rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.freqChunkStart = p.freqOffset + n
@@ -175,9 +175,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator {
var numLocChunks uint64
numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.locChunkLens = make([]uint64, int(numLocChunks))
rv.locChunkOffsets = make([]uint64, int(numLocChunks))
for i := 0; i < int(numLocChunks); i++ {
rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.locChunkStart = p.locOffset + n
@@ -297,11 +297,11 @@ type PostingsIterator struct {
locDecoder *govarint.Base128Decoder
locReader *bytes.Reader
freqChunkLens []uint64
freqChunkStart uint64
freqChunkOffsets []uint64
freqChunkStart uint64
locChunkLens []uint64
locChunkStart uint64
locChunkOffsets []uint64
locChunkStart uint64
locBitmap *roaring.Bitmap
@@ -317,8 +317,8 @@ func (i *PostingsIterator) Size() int {
sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr +
len(i.currChunkFreqNorm) +
len(i.currChunkLoc) +
len(i.freqChunkLens)*size.SizeOfUint64 +
len(i.locChunkLens)*size.SizeOfUint64 +
len(i.freqChunkOffsets)*size.SizeOfUint64 +
len(i.locChunkOffsets)*size.SizeOfUint64 +
i.next.Size()
if i.locBitmap != nil {
@@ -333,16 +333,14 @@ func (i *PostingsIterator) Size() int {
}
func (i *PostingsIterator) loadChunk(chunk int) error {
if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) {
return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens))
if chunk >= len(i.freqChunkOffsets) || chunk >= len(i.locChunkOffsets) {
return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkOffsets), len(i.locChunkOffsets))
}
// load freq chunk bytes
start := i.freqChunkStart
for j := 0; j < chunk; j++ {
start += i.freqChunkLens[j]
}
end := start + i.freqChunkLens[chunk]
end, start := i.freqChunkStart, i.freqChunkStart
s, e := readChunkBoundary(chunk, i.freqChunkOffsets)
start += s
end += e
i.currChunkFreqNorm = i.postings.sb.mem[start:end]
if i.freqNormReader == nil {
i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm)
@@ -351,12 +349,10 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
i.freqNormReader.Reset(i.currChunkFreqNorm)
}
// load loc chunk bytes
start = i.locChunkStart
for j := 0; j < chunk; j++ {
start += i.locChunkLens[j]
}
end = start + i.locChunkLens[chunk]
end, start = i.locChunkStart, i.locChunkStart
s, e = readChunkBoundary(chunk, i.locChunkOffsets)
start += s
end += e
i.currChunkLoc = i.postings.sb.mem[start:end]
if i.locReader == nil {
i.locReader = bytes.NewReader(i.currChunkLoc)
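
As a sanity check on the encoding, boundaries derived from the offsets array can be compared against the old length-summing approach; a small property-style sketch (assumes the two helpers shown earlier are in scope, plus fmt):

	lens := []uint64{4, 0, 7, 1, 3}
	offsets := append([]uint64(nil), lens...)
	chunkLengthsToOffsets(offsets)
	var start uint64
	for chunk, l := range lens {
		s, e := readChunkBoundary(chunk, offsets)
		if s != start || e != start+l {
			fmt.Printf("mismatch at chunk %d\n", chunk)
		}
		start += l
	}
	// no mismatches: both encodings agree on every [start, end) range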