0
0
Fork 0

docValue space savings

merging the doc value length and loc
slices into a single offset slice, as that
is enough to compute the starting offset and
length of the doc values data for a given
document inside a docValue chunk.
This commit is contained in:
Sreekanth Sivasankaran 2018-03-12 15:36:46 +05:30
parent fa52ff856a
commit aaccf59191
4 changed files with 51 additions and 23 deletions

View File

@ -209,9 +209,7 @@ var docvalueCmd = &cobra.Command{
for i := 0; i < int(numDocs); i++ {
curChunkHeader[i].DocNum, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(nread)
curChunkHeader[i].DocDvLoc, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(nread)
curChunkHeader[i].DocDvLen, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
curChunkHeader[i].DocDvOffset, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(nread)
}
@ -255,7 +253,19 @@ func getDocValueLocs(docNum uint64, metaHeader []zap.MetaData) (uint64, uint64)
return metaHeader[i].DocNum >= docNum
})
if i < len(metaHeader) && metaHeader[i].DocNum == docNum {
return metaHeader[i].DocDvLoc, metaHeader[i].DocDvLen
var start, end uint64
if i > 0 {
start = metaHeader[i].DocDvOffset
}
// single element case
if i == 0 && len(metaHeader) == 1 {
end = metaHeader[i].DocDvOffset
} else if i < len(metaHeader)-1 {
end = metaHeader[i+1].DocDvOffset
} else { // for last element
end = start + metaHeader[0].DocDvOffset
}
return start, end
}
return math.MaxUint64, math.MaxUint64
}

View File

@ -47,9 +47,8 @@ type chunkedContentCoder struct {
// MetaData represents the data information inside a
// chunk.
type MetaData struct {
DocNum uint64 // docNum of the data inside the chunk
DocDvLoc uint64 // starting offset for a given docid
DocDvLen uint64 // length of data inside the chunk for the given docid
DocNum uint64 // docNum of the data inside the chunk
DocDvOffset uint64 // offset of data inside the chunk for the given docid
}
// newChunkedContentCoder returns a new chunk content coder which
@ -94,9 +93,20 @@ func (c *chunkedContentCoder) flushContents() error {
return err
}
// convert the document data lens to data offsets
if len(c.chunkMeta) > 1 {
c.chunkMeta[1].DocDvOffset, c.chunkMeta[0].DocDvOffset =
c.chunkMeta[0].DocDvOffset, c.chunkMeta[1].DocDvOffset
for i := 2; i < len(c.chunkMeta); i++ {
cur := c.chunkMeta[i].DocDvOffset
c.chunkMeta[i].DocDvOffset = c.chunkMeta[i-1].DocDvOffset + c.chunkMeta[0].DocDvOffset
c.chunkMeta[0].DocDvOffset = cur
}
}
// write out the metaData slice
for _, meta := range c.chunkMeta {
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen)
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
if err != nil {
return err
}
@ -130,17 +140,15 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
c.currChunk = chunk
}
// mark the starting offset for this doc
dvOffset := c.chunkBuf.Len()
// mark the data length for this doc
dvSize, err := c.chunkBuf.Write(vals)
if err != nil {
return err
}
c.chunkMeta = append(c.chunkMeta, MetaData{
DocNum: docNum,
DocDvLoc: uint64(dvOffset),
DocDvLen: uint64(dvSize),
DocNum: docNum,
DocDvOffset: uint64(dvSize),
})
return nil
}

View File

@ -35,7 +35,7 @@ func TestChunkContentCoder(t *testing.T) {
docNums: []uint64{0},
vals: [][]byte{[]byte("bleve")},
// 1 chunk, chunk-0 length 11(b), value
expected: string([]byte{0x1, 0xb, 0x1, 0x0, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}),
expected: string([]byte{0x1, 0xa, 0x1, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}),
},
{
maxDocNum: 1,
@ -46,8 +46,8 @@ func TestChunkContentCoder(t *testing.T) {
[]byte("scorch"),
},
expected: string([]byte{0x02, 0x0c, 0x0c, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14,
0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x00, 0x06, 0x06,
expected: string([]byte{0x02, 0x0b, 0x0b, 0x01, 0x00, 0x06, 0x06, 0x14,
0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x06, 0x06,
0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68}),
},
}

View File

@ -117,9 +117,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
for i := 0; i < int(numDocs); i++ {
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
}
@ -133,8 +131,8 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
func (di *docValueIterator) visitDocValues(docNum uint64,
visitor index.DocumentFieldTermVisitor) error {
// binary search the term locations for the docNum
start, length := di.getDocValueLocs(docNum)
if start == math.MaxUint64 || length == math.MaxUint64 {
start, end := di.getDocValueLocs(docNum)
if start == math.MaxUint64 || end == math.MaxUint64 {
return nil
}
// uncompress the already loaded data
@ -144,7 +142,7 @@ func (di *docValueIterator) visitDocValues(docNum uint64,
}
// pick the terms for the given docNum
uncompressed = uncompressed[start : start+length]
uncompressed = uncompressed[start:end]
for {
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
if i < 0 {
@ -163,7 +161,19 @@ func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) {
return di.curChunkHeader[i].DocNum >= docNum
})
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen
var start, end uint64
if i > 0 {
start = di.curChunkHeader[i].DocDvOffset
}
// single element case
if i == 0 && len(di.curChunkHeader) == 1 {
end = di.curChunkHeader[i].DocDvOffset
} else if i < len(di.curChunkHeader)-1 {
end = di.curChunkHeader[i+1].DocDvOffset
} else { // for last element
end = start + di.curChunkHeader[0].DocDvOffset
}
return start, end
}
return math.MaxUint64, math.MaxUint64
}