0
0
Fork 0

docValue space savings

merging the doc value length and loc
slices into a single offset slice, as that
is enough to compute the starting offset and
length of the doc values data for a given
document inside a docValue chunk.
This commit is contained in:
Sreekanth Sivasankaran 2018-03-12 15:36:46 +05:30
parent fa52ff856a
commit aaccf59191
4 changed files with 51 additions and 23 deletions

View File

@ -209,9 +209,7 @@ var docvalueCmd = &cobra.Command{
for i := 0; i < int(numDocs); i++ {
curChunkHeader[i].DocNum, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(nread)
curChunkHeader[i].DocDvLoc, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(nread)
curChunkHeader[i].DocDvLen, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
curChunkHeader[i].DocDvOffset, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(nread)
}
@ -255,7 +253,19 @@ func getDocValueLocs(docNum uint64, metaHeader []zap.MetaData) (uint64, uint64)
return metaHeader[i].DocNum >= docNum
})
if i < len(metaHeader) && metaHeader[i].DocNum == docNum {
return metaHeader[i].DocDvLoc, metaHeader[i].DocDvLen
var start, end uint64
if i > 0 {
start = metaHeader[i].DocDvOffset
}
// single element case
if i == 0 && len(metaHeader) == 1 {
end = metaHeader[i].DocDvOffset
} else if i < len(metaHeader)-1 {
end = metaHeader[i+1].DocDvOffset
} else { // for last element
end = start + metaHeader[0].DocDvOffset
}
return start, end
}
return math.MaxUint64, math.MaxUint64
}

View File

@ -47,9 +47,8 @@ type chunkedContentCoder struct {
// MetaData represents the data information inside a
// chunk.
type MetaData struct {
DocNum uint64 // docNum of the data inside the chunk
DocDvLoc uint64 // starting offset for a given docid
DocDvLen uint64 // length of data inside the chunk for the given docid
DocNum uint64 // docNum of the data inside the chunk
DocDvOffset uint64 // offset of data inside the chunk for the given docid
}
// newChunkedContentCoder returns a new chunk content coder which
@ -94,9 +93,20 @@ func (c *chunkedContentCoder) flushContents() error {
return err
}
// convert the document data lens to data offsets
if len(c.chunkMeta) > 1 {
c.chunkMeta[1].DocDvOffset, c.chunkMeta[0].DocDvOffset =
c.chunkMeta[0].DocDvOffset, c.chunkMeta[1].DocDvOffset
for i := 2; i < len(c.chunkMeta); i++ {
cur := c.chunkMeta[i].DocDvOffset
c.chunkMeta[i].DocDvOffset = c.chunkMeta[i-1].DocDvOffset + c.chunkMeta[0].DocDvOffset
c.chunkMeta[0].DocDvOffset = cur
}
}
// write out the metaData slice
for _, meta := range c.chunkMeta {
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen)
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
if err != nil {
return err
}
@ -130,17 +140,15 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
c.currChunk = chunk
}
// mark the starting offset for this doc
dvOffset := c.chunkBuf.Len()
// mark the data length for this doc
dvSize, err := c.chunkBuf.Write(vals)
if err != nil {
return err
}
c.chunkMeta = append(c.chunkMeta, MetaData{
DocNum: docNum,
DocDvLoc: uint64(dvOffset),
DocDvLen: uint64(dvSize),
DocNum: docNum,
DocDvOffset: uint64(dvSize),
})
return nil
}

View File

@ -35,7 +35,7 @@ func TestChunkContentCoder(t *testing.T) {
docNums: []uint64{0},
vals: [][]byte{[]byte("bleve")},
// 1 chunk, chunk-0 length 11(b), value
expected: string([]byte{0x1, 0xb, 0x1, 0x0, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}),
expected: string([]byte{0x1, 0xa, 0x1, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}),
},
{
maxDocNum: 1,
@ -46,8 +46,8 @@ func TestChunkContentCoder(t *testing.T) {
[]byte("scorch"),
},
expected: string([]byte{0x02, 0x0c, 0x0c, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14,
0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x00, 0x06, 0x06,
expected: string([]byte{0x02, 0x0b, 0x0b, 0x01, 0x00, 0x06, 0x06, 0x14,
0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x06, 0x06,
0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68}),
},
}

View File

@ -117,9 +117,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
for i := 0; i < int(numDocs); i++ {
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read)
}
@ -133,8 +131,8 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
func (di *docValueIterator) visitDocValues(docNum uint64,
visitor index.DocumentFieldTermVisitor) error {
// binary search the term locations for the docNum
start, length := di.getDocValueLocs(docNum)
if start == math.MaxUint64 || length == math.MaxUint64 {
start, end := di.getDocValueLocs(docNum)
if start == math.MaxUint64 || end == math.MaxUint64 {
return nil
}
// uncompress the already loaded data
@ -144,7 +142,7 @@ func (di *docValueIterator) visitDocValues(docNum uint64,
}
// pick the terms for the given docNum
uncompressed = uncompressed[start : start+length]
uncompressed = uncompressed[start:end]
for {
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
if i < 0 {
@ -163,7 +161,19 @@ func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) {
return di.curChunkHeader[i].DocNum >= docNum
})
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen
var start, end uint64
if i > 0 {
start = di.curChunkHeader[i].DocDvOffset
}
// single element case
if i == 0 && len(di.curChunkHeader) == 1 {
end = di.curChunkHeader[i].DocDvOffset
} else if i < len(di.curChunkHeader)-1 {
end = di.curChunkHeader[i+1].DocDvOffset
} else { // for last element
end = start + di.curChunkHeader[0].DocDvOffset
}
return start, end
}
return math.MaxUint64, math.MaxUint64
}