diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 30ae8d77..cd56e3b5 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,7 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 4 +const version uint32 = 5 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index c731f52c..5ba15d69 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -157,13 +157,10 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { return tw, err } - if len(c.chunkLens) > 1 { - chunkLengthsToOffsets(c.chunkLens) - } - - // write out the chunk starting offsets - for _, chunkLen := range c.chunkLens { - n := binary.PutUvarint(buf, uint64(chunkLen)) + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + // write out the chunk offsets + for _, chunkOffset := range chunkOffsets { + n := binary.PutUvarint(buf, chunkOffset) nw, err = w.Write(buf[:n]) tw += nw if err != nil { diff --git a/index/scorch/segment/zap/contentcoder_test.go b/index/scorch/segment/zap/contentcoder_test.go index 0e45b783..da80f947 100644 --- a/index/scorch/segment/zap/contentcoder_test.go +++ b/index/scorch/segment/zap/contentcoder_test.go @@ -46,7 +46,7 @@ func TestChunkContentCoder(t *testing.T) { []byte("scorch"), }, - expected: string([]byte{0x02, 0x0c, 0x0c, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14, + expected: string([]byte{0x02, 0x0c, 0x18, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14, 0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x00, 0x06, 0x06, 0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68}), }, diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 882ff43d..61b83877 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -69,7 +69,7 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string, } // read the number of chunks, chunk lengths - var offset, clen uint64 + var offset, loc uint64 numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) if read <= 0 { return nil, fmt.Errorf("failed to read the field "+ @@ -83,11 +83,11 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string, chunkOffsets: make([]uint64, int(numChunks)), } for i := 0; i < int(numChunks); i++ { - clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) + loc, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) if read <= 0 { - return nil, fmt.Errorf("corrupted chunk length during segment load") + return nil, fmt.Errorf("corrupted chunk offset during segment load") } - fdvIter.chunkOffsets[i] = clen + fdvIter.chunkOffsets[i] = loc offset += uint64(read) } diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index 79fe5156..81ef8bb2 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -111,15 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { } buf := c.buf - // convert the chunk lengths into starting chunk offsets - if len(c.chunkLens) > 1 { - chunkLengthsToOffsets(c.chunkLens) - } + // convert the chunk lengths into chunk offsets + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) - // write out the number of chunks & each chunk starting offsets - n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) - for _, chunkLen := range c.chunkLens { - n += binary.PutUvarint(buf[n:], uint64(chunkLen)) + // write out the number of chunks & each chunk offsets + n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) + for _, chunkOffset := range chunkOffsets { + n += binary.PutUvarint(buf[n:], chunkOffset) } tw, err := w.Write(buf[:n]) @@ -140,41 +138,35 @@ func (c *chunkedIntCoder) FinalSize() int { return len(c.final) } -// chunkLengthsToOffsets converts the chunk length array -// to a chunk starting offset array. The readChunkBoundary +// modifyLengthsToEndOffsets converts the chunk length array +// to a chunk offset array. The readChunkBoundary // will figure out the start and end of every chunk from -// these offsets. The starting offset of the first/single -// array element will always be zero and this position is -// used for storing the size of the current last item in -// the array at any given point. -// For eg: -// Lens -> 5 5 5 5 => 5 5 10 15 -// Lens -> 0 5 0 5 => 5 0 5 5 -// Lens -> 0 0 0 5 => 5 0 0 0 -// Lens -> 5 0 0 0 => 0 5 5 5 -// Lens -> 0 5 0 0 => 0 0 5 5 -// Lens -> 0 0 5 0 => 0 0 0 5 -func chunkLengthsToOffsets(lengths []uint64) { - lengths[1], lengths[0] = lengths[0], lengths[1] - for i := 2; i < len(lengths); i++ { - cur := lengths[i] - lengths[i] = lengths[i-1] + lengths[0] - lengths[0] = cur +// these offsets. Starting offset of i'th index is stored +// in i-1'th position except for 0'th index and ending offset +// is stored at i'th index position. +// For 0'th element, starting position is always zero. +// eg: +// Lens -> 5 5 5 5 => 5 10 15 20 +// Lens -> 0 5 0 5 => 0 5 5 10 +// Lens -> 0 0 0 5 => 0 0 0 5 +// Lens -> 5 0 0 0 => 5 5 5 5 +// Lens -> 0 5 0 0 => 0 5 5 5 +// Lens -> 0 0 5 0 => 0 0 5 5 +func modifyLengthsToEndOffsets(lengths []uint64) []uint64 { + var runningOffset uint64 + var index, i int + for i = 1; i <= len(lengths); i++ { + runningOffset += lengths[i-1] + lengths[index] = runningOffset + index++ } + return lengths } func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { - var start, end uint64 + var start uint64 if chunk > 0 { - start = offsets[chunk] + start = offsets[chunk-1] } - // single element case - if chunk == 0 && len(offsets) == 1 { - end = offsets[chunk] - } else if chunk < len(offsets)-1 { - end = offsets[chunk+1] - } else { // for last element - end = start + offsets[0] - } - return start, end + return start, offsets[chunk] } diff --git a/index/scorch/segment/zap/intcoder_test.go b/index/scorch/segment/zap/intcoder_test.go index 8c77eab6..952e0669 100644 --- a/index/scorch/segment/zap/intcoder_test.go +++ b/index/scorch/segment/zap/intcoder_test.go @@ -46,8 +46,8 @@ func TestChunkIntCoder(t *testing.T) { []uint64{3}, []uint64{7}, }, - // 2 chunks, chunk-0 length 1, chunk-1 length 1, value 3, value 7 - expected: []byte{0x2, 0x1, 0x1, 0x3, 0x7}, + // 2 chunks, chunk-0 offset 1, chunk-1 offset 2, value 3, value 7 + expected: []byte{0x2, 0x1, 0x2, 0x3, 0x7}, }, } @@ -80,40 +80,48 @@ func TestChunkLengthToOffsets(t *testing.T) { }{ { lengths: []uint64{5, 5, 5, 5, 5}, - expectedOffsets: []uint64{5, 5, 10, 15, 20}, + expectedOffsets: []uint64{5, 10, 15, 20, 25}, }, { lengths: []uint64{0, 5, 0, 5, 0}, - expectedOffsets: []uint64{0, 0, 5, 5, 10}, + expectedOffsets: []uint64{0, 5, 5, 10, 10}, }, { lengths: []uint64{0, 0, 0, 0, 5}, - expectedOffsets: []uint64{5, 0, 0, 0, 0}, - }, - { - lengths: []uint64{5, 0, 0, 0, 0}, - expectedOffsets: []uint64{0, 5, 5, 5, 5}, - }, - { - lengths: []uint64{0, 5, 0, 0, 0}, - expectedOffsets: []uint64{0, 0, 5, 5, 5}, - }, - { - lengths: []uint64{0, 0, 0, 5, 0}, expectedOffsets: []uint64{0, 0, 0, 0, 5}, }, + { + lengths: []uint64{5, 0, 0, 0, 0}, + expectedOffsets: []uint64{5, 5, 5, 5, 5}, + }, + { + lengths: []uint64{0, 5, 0, 0, 0}, + expectedOffsets: []uint64{0, 5, 5, 5, 5}, + }, + { + lengths: []uint64{0, 0, 0, 5, 0}, + expectedOffsets: []uint64{0, 0, 0, 5, 5}, + }, { lengths: []uint64{0, 0, 0, 5, 5}, - expectedOffsets: []uint64{5, 0, 0, 0, 5}, + expectedOffsets: []uint64{0, 0, 0, 5, 10}, }, { lengths: []uint64{5, 5, 5, 0, 0}, - expectedOffsets: []uint64{0, 5, 10, 15, 15}, + expectedOffsets: []uint64{5, 10, 15, 15, 15}, + }, + { + lengths: []uint64{5}, + expectedOffsets: []uint64{5}, + }, + { + lengths: []uint64{5, 5}, + expectedOffsets: []uint64{5, 10}, }, } for i, test := range tests { - chunkLengthsToOffsets(test.lengths) + modifyLengthsToEndOffsets(test.lengths) if !reflect.DeepEqual(test.expectedOffsets, test.lengths) { t.Errorf("Test: %d failed, got %+v, expected %+v", i, test.lengths, test.expectedOffsets) } @@ -129,86 +137,80 @@ func TestChunkReadBoundaryFromOffsets(t *testing.T) { expectedEnd uint64 }{ { - offsets: []uint64{5, 5, 10, 15, 20}, + offsets: []uint64{5, 10, 15, 20, 25}, chunkNumber: 4, expectedStart: 20, expectedEnd: 25, }, { - offsets: []uint64{5, 5, 10, 15, 20}, + offsets: []uint64{5, 10, 15, 20, 25}, chunkNumber: 0, expectedStart: 0, expectedEnd: 5, }, { - offsets: []uint64{5, 5, 10, 15, 20}, + offsets: []uint64{5, 10, 15, 20, 25}, chunkNumber: 2, expectedStart: 10, expectedEnd: 15, }, { - offsets: []uint64{0, 0, 5, 5, 10}, + offsets: []uint64{0, 5, 5, 10, 10}, chunkNumber: 4, expectedStart: 10, expectedEnd: 10, }, { - offsets: []uint64{0, 0, 5, 5, 10}, + offsets: []uint64{0, 5, 5, 10, 10}, chunkNumber: 1, expectedStart: 0, expectedEnd: 5, }, { - offsets: []uint64{5, 0, 0, 0, 0}, + offsets: []uint64{5, 5, 5, 5, 5}, chunkNumber: 0, expectedStart: 0, - expectedEnd: 0, - }, - { - offsets: []uint64{5, 0, 0, 0, 0}, - chunkNumber: 4, - expectedStart: 0, expectedEnd: 5, }, { - offsets: []uint64{5, 0, 0, 0, 0}, - chunkNumber: 1, - expectedStart: 0, - expectedEnd: 0, + offsets: []uint64{5, 5, 5, 5, 5}, + chunkNumber: 4, + expectedStart: 5, + expectedEnd: 5, }, { - offsets: []uint64{0, 5, 5, 5, 5}, + offsets: []uint64{5, 5, 5, 5, 5}, chunkNumber: 1, expectedStart: 5, expectedEnd: 5, }, { offsets: []uint64{0, 5, 5, 5, 5}, - chunkNumber: 0, - expectedStart: 0, - expectedEnd: 5, - }, - { - offsets: []uint64{0, 0, 5, 5, 5}, - chunkNumber: 2, - expectedStart: 5, - expectedEnd: 5, - }, - { - offsets: []uint64{0, 0, 5, 5, 5}, chunkNumber: 1, expectedStart: 0, expectedEnd: 5, }, { - offsets: []uint64{0, 0, 0, 0, 5}, - chunkNumber: 4, - expectedStart: 5, - expectedEnd: 5, + offsets: []uint64{0, 5, 5, 5, 5}, + chunkNumber: 0, + expectedStart: 0, + expectedEnd: 0, + }, + { + offsets: []uint64{0, 0, 0, 5, 5}, + chunkNumber: 2, + expectedStart: 0, + expectedEnd: 0, + }, + { + offsets: []uint64{0, 0, 0, 5, 5}, + chunkNumber: 1, + expectedStart: 0, + expectedEnd: 0, }, { offsets: []uint64{0, 0, 0, 0, 5}, - chunkNumber: 3, + chunkNumber: 4, expectedStart: 0, expectedEnd: 5, }, @@ -219,59 +221,41 @@ func TestChunkReadBoundaryFromOffsets(t *testing.T) { expectedEnd: 0, }, { - offsets: []uint64{5, 0, 0, 0, 5}, - chunkNumber: 0, - expectedStart: 0, - expectedEnd: 0, - }, - { - offsets: []uint64{5, 0, 0, 0, 5}, - chunkNumber: 1, - expectedStart: 0, - expectedEnd: 0, - }, - { - offsets: []uint64{5, 0, 0, 0, 5}, - chunkNumber: 3, - expectedStart: 0, - expectedEnd: 5, - }, - { - offsets: []uint64{5, 0, 0, 0, 5}, - chunkNumber: 4, - expectedStart: 5, - expectedEnd: 10, - }, - { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 0, expectedStart: 0, expectedEnd: 5, }, { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 1, expectedStart: 5, expectedEnd: 10, }, { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 2, expectedStart: 10, expectedEnd: 15, }, { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 3, expectedStart: 15, expectedEnd: 15, }, { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 4, expectedStart: 15, expectedEnd: 15, }, + { + offsets: []uint64{5}, + chunkNumber: 0, + expectedStart: 0, + expectedEnd: 5, + }, } for i, test := range tests { diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index c3fc2330..bdbb47e3 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -189,9 +189,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { var numFreqChunks uint64 numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.freqChunkLens = make([]uint64, int(numFreqChunks)) + rv.freqChunkOffsets = make([]uint64, int(numFreqChunks)) for i := 0; i < int(numFreqChunks); i++ { - rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.freqChunkStart = p.freqOffset + n @@ -201,9 +201,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { var numLocChunks uint64 numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.locChunkLens = make([]uint64, int(numLocChunks)) + rv.locChunkOffsets = make([]uint64, int(numLocChunks)) for i := 0; i < int(numLocChunks); i++ { - rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.locChunkStart = p.locOffset + n