
Merge pull request #821 from blevesearch/minor_docvalue_space_savings

docValue space savings
Sreekanth Sivasankaran 2018-03-16 09:12:13 +05:30 committed by GitHub
commit 53bf29763b
5 changed files with 28 additions and 24 deletions
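
The saving itself: each chunk's docValue metadata used to encode three uvarints per document (DocNum, DocDvLoc, DocDvLen); it now encodes two (DocNum, DocDvOffset), where DocDvOffset is the end offset of that document's data within the chunk. Since values are appended to a chunk contiguously, a document's start offset is recoverable as the previous entry's end. A minimal sketch of that reconstruction (an editor's example with invented lengths, not code from this commit):

package main

import "fmt"

type MetaData struct {
    DocNum      uint64
    DocDvOffset uint64 // cumulative end offset of this doc's data within the chunk
}

func main() {
    // Three docs whose values occupy 5, 6, and 6 bytes, appended back to back.
    meta := []MetaData{
        {DocNum: 0, DocDvOffset: 5},
        {DocNum: 1, DocDvOffset: 11},
        {DocNum: 2, DocDvOffset: 17},
    }
    for i, m := range meta {
        var start uint64
        if i > 0 {
            start = meta[i-1].DocDvOffset // a doc's start is the previous doc's end
        }
        fmt.Printf("doc %d: data[%d:%d]\n", m.DocNum, start, m.DocDvOffset)
    }
    // doc 0: data[0:5]
    // doc 1: data[5:11]
    // doc 2: data[11:17]
}

One uvarint fewer per document per chunk is at least one byte saved per document, which the updated test expectations below confirm.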

View File

@@ -209,9 +209,7 @@ var docvalueCmd = &cobra.Command{
 	for i := 0; i < int(numDocs); i++ {
 		curChunkHeader[i].DocNum, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
 		offset += uint64(nread)
-		curChunkHeader[i].DocDvLoc, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
-		offset += uint64(nread)
-		curChunkHeader[i].DocDvLen, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
+		curChunkHeader[i].DocDvOffset, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
 		offset += uint64(nread)
 	}
@@ -255,7 +253,7 @@ func getDocValueLocs(docNum uint64, metaHeader []zap.MetaData) (uint64, uint64)
 		return metaHeader[i].DocNum >= docNum
 	})
 	if i < len(metaHeader) && metaHeader[i].DocNum == docNum {
-		return metaHeader[i].DocDvLoc, metaHeader[i].DocDvLen
+		return zap.ReadDocValueBoundary(i, metaHeader)
 	}
 	return math.MaxUint64, math.MaxUint64
 }

View File

@@ -22,7 +22,7 @@ import (
 	"github.com/Smerity/govarint"
 )
-const version uint32 = 5
+const version uint32 = 6
 const fieldNotUninverted = math.MaxUint64

View File

@@ -47,9 +47,8 @@ type chunkedContentCoder struct {
 // MetaData represents the data information inside a
 // chunk.
 type MetaData struct {
-	DocNum   uint64 // docNum of the data inside the chunk
-	DocDvLoc uint64 // starting offset for a given docid
-	DocDvLen uint64 // length of data inside the chunk for the given docid
+	DocNum      uint64 // docNum of the data inside the chunk
+	DocDvOffset uint64 // offset of data inside the chunk for the given docid
 }
 // newChunkedContentCoder returns a new chunk content coder which
@@ -96,7 +95,7 @@ func (c *chunkedContentCoder) flushContents() error {
 	// write out the metaData slice
 	for _, meta := range c.chunkMeta {
-		_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen)
+		_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
 		if err != nil {
 			return err
 		}
@@ -130,7 +129,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
 		c.currChunk = chunk
 	}
-	// mark the starting offset for this doc
+	// get the starting offset for this doc
 	dvOffset := c.chunkBuf.Len()
 	dvSize, err := c.chunkBuf.Write(vals)
 	if err != nil {
@@ -138,9 +137,8 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
 	}
 	c.chunkMeta = append(c.chunkMeta, MetaData{
-		DocNum:   docNum,
-		DocDvLoc: uint64(dvOffset),
-		DocDvLen: uint64(dvSize),
+		DocNum:      docNum,
+		DocDvOffset: uint64(dvOffset + dvSize),
 	})
 	return nil
 }
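
On the write side, Add stores dvOffset + dvSize, i.e. the chunk buffer's length immediately after this document's bytes land. That invariant is what makes end-offset-only metadata sufficient. A small self-contained sketch of it (an editor's example; bytes.Buffer stands in for c.chunkBuf, values invented):

package main

import (
    "bytes"
    "fmt"
)

func main() {
    var chunkBuf bytes.Buffer
    for _, val := range []string{"bleve", "scorch"} {
        off := chunkBuf.Len()             // starting offset for this doc
        n, _ := chunkBuf.WriteString(val) // dvSize
        end := off + n                    // what Add now records as DocDvOffset
        fmt.Println(off, end, end == chunkBuf.Len())
        // prints: 0 5 true, then 5 11 true
    }
}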
@@ -175,3 +173,13 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) {
 	}
 	return tw, nil
 }
+
+// ReadDocValueBoundary elicits the start, end offsets from a
+// metaData header slice
+func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) {
+	var start uint64
+	if chunk > 0 {
+		start = metaHeaders[chunk-1].DocDvOffset
+	}
+	return start, metaHeaders[chunk].DocDvOffset
+}
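
A usage sketch for the new helper, written as a hypothetical in-package test (TestReadDocValueBoundary_sketch and its values are invented, not part of this commit):

func TestReadDocValueBoundary_sketch(t *testing.T) {
    metaHeader := []MetaData{
        {DocNum: 0, DocDvOffset: 5},  // doc 0 occupies data[0:5]
        {DocNum: 1, DocDvOffset: 11}, // doc 1 occupies data[5:11]
    }
    start, end := ReadDocValueBoundary(1, metaHeader)
    if start != 5 || end != 11 {
        t.Errorf("got [%d:%d], expected [5:11]", start, end)
    }
}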

View File

@@ -35,7 +35,7 @@ func TestChunkContentCoder(t *testing.T) {
 			docNums: []uint64{0},
 			vals:    [][]byte{[]byte("bleve")},
 			// 1 chunk, chunk-0 length 11(b), value
-			expected: string([]byte{0x1, 0xb, 0x1, 0x0, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}),
+			expected: string([]byte{0x1, 0xa, 0x1, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}),
 		},
 		{
 			maxDocNum: 1,
@@ -46,8 +46,8 @@ func TestChunkContentCoder(t *testing.T) {
 				[]byte("scorch"),
 			},
-			expected: string([]byte{0x02, 0x0c, 0x18, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14,
-				0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x00, 0x06, 0x06,
+			expected: string([]byte{0x02, 0x0b, 0x16, 0x01, 0x00, 0x06, 0x06, 0x14,
+				0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x06, 0x06,
 				0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68}),
 		},
 	}
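
The single-doc expectation above shows exactly where the saved byte went. An editor's annotation of the new byte string, assuming the layout flushContents/Write produce (chunk count, chunk boundaries, then per chunk: doc count, metadata, snappy-compressed values); note the inline comment still reads chunk-0 length 11(b), but the chunk is now 10 bytes:

// 0x01                            number of chunks
// 0x0a                            chunk-0 boundary: 10 bytes (0x0b, 11 bytes, before this change)
// 0x01                            docs in chunk 0
// 0x00                            MetaData.DocNum = 0
// 0x05                            MetaData.DocDvOffset = 5 (previously two uvarints: DocDvLoc=0x00, DocDvLen=0x05)
// 0x05 0x10 'b','l','e','v','e'   snappy block: uncompressed length 5, literal tag, "bleve"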
@@ -69,7 +69,7 @@ func TestChunkContentCoder(t *testing.T) {
 		}
 		if !reflect.DeepEqual(test.expected, string(actual.Bytes())) {
-			t.Errorf("got % s, expected % s", string(actual.Bytes()), test.expected)
+			t.Errorf("got:%s, expected:%s", string(actual.Bytes()), test.expected)
 		}
 	}
 }

View File

@@ -116,9 +116,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
 	for i := 0; i < int(numDocs); i++ {
 		di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
 		offset += uint64(read)
-		di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
-		offset += uint64(read)
-		di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
+		di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
 		offset += uint64(read)
 	}
@@ -132,8 +130,8 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
 func (di *docValueIterator) visitDocValues(docNum uint64,
 	visitor index.DocumentFieldTermVisitor) error {
 	// binary search the term locations for the docNum
-	start, length := di.getDocValueLocs(docNum)
-	if start == math.MaxUint64 || length == math.MaxUint64 {
+	start, end := di.getDocValueLocs(docNum)
+	if start == math.MaxUint64 || end == math.MaxUint64 {
 		return nil
 	}
 	// uncompress the already loaded data
@@ -143,7 +141,7 @@ func (di *docValueIterator) visitDocValues(docNum uint64,
 	}
 	// pick the terms for the given docNum
-	uncompressed = uncompressed[start : start+length]
+	uncompressed = uncompressed[start:end]
 	for {
 		i := bytes.Index(uncompressed, termSeparatorSplitSlice)
 		if i < 0 {
@@ -162,7 +160,7 @@ func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) {
 		return di.curChunkHeader[i].DocNum >= docNum
 	})
 	if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
-		return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen
+		return ReadDocValueBoundary(i, di.curChunkHeader)
 	}
 	return math.MaxUint64, math.MaxUint64
 }