initial cut of zap segment merging
This commit is contained in:
parent 927216df8c
commit 665c3c80ff

index/scorch/segment/zap/intcoder.go (new file, 111 lines)
@@ -0,0 +1,111 @@
package zap

import (
	"bytes"
	"encoding/binary"
	"io"

	"github.com/Smerity/govarint"
)

type chunkedIntCoder struct {
	final     []byte
	maxDocNum uint64
	chunkSize uint64
	chunkBuf  bytes.Buffer
	encoder   *govarint.Base128Encoder
	chunkLens []uint64
	currChunk uint64
}

// newChunkedIntCoder returns a new chunked int coder which packs data into
// chunks based on the provided chunkSize and supports up to the specified
// maxDocNum.
func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder {
	total := maxDocNum/chunkSize + 1
	rv := &chunkedIntCoder{
		chunkSize: chunkSize,
		maxDocNum: maxDocNum,
		chunkLens: make([]uint64, total),
	}
	rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf)

	return rv
}

// Reset lets you reuse this chunked int coder. Buffers are reset and reused
// from previous use. You cannot change the chunk size or max doc num.
func (c *chunkedIntCoder) Reset() {
	c.final = c.final[:0]
	c.chunkBuf.Reset()
	c.currChunk = 0
	for i := range c.chunkLens {
		c.chunkLens[i] = 0
	}
}

// Add encodes the provided integers into the correct chunk for the provided
// doc num. You MUST call Add() with increasing docNums.
func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
	chunk := docNum / c.chunkSize
	if chunk != c.currChunk {
		// starting a new chunk
		if c.encoder != nil {
			// close out last
			c.encoder.Close()
			encodingBytes := c.chunkBuf.Bytes()
			c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
			c.final = append(c.final, encodingBytes...)
			c.chunkBuf.Reset()
			c.encoder = govarint.NewU64Base128Encoder(&c.chunkBuf)
		}
		c.currChunk = chunk
	}

	for _, val := range vals {
		_, err := c.encoder.PutU64(val)
		if err != nil {
			return err
		}
	}

	return nil
}

// Close indicates you are done calling Add(). This allows the final chunk
// to be encoded.
func (c *chunkedIntCoder) Close() {
	c.encoder.Close()
	encodingBytes := c.chunkBuf.Bytes()
	c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
	c.final = append(c.final, encodingBytes...)
}

// Write commits all the encoded chunked integers to the provided writer.
func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
	var tw int
	buf := make([]byte, binary.MaxVarintLen64)
	// write out the number of chunks
	n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
	nw, err := w.Write(buf[:n])
	tw += nw
	if err != nil {
		return tw, err
	}
	// write out the chunk lens
	for _, chunkLen := range c.chunkLens {
		n := binary.PutUvarint(buf, uint64(chunkLen))
		nw, err = w.Write(buf[:n])
		tw += nw
		if err != nil {
			return tw, err
		}
	}
	// write out the data
	nw, err = w.Write(c.final)
	tw += nw
	if err != nil {
		return tw, err
	}
	return tw, nil
}
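Taken together, a minimal usage sketch of the coder (a hypothetical illustration, not part of this commit; it assumes it sits alongside the type above). The output layout is a uvarint chunk count, the uvarint per-chunk lengths, then the concatenated varint chunk data:

// exampleChunkedIntCoder is a hypothetical helper, not part of this commit.
// chunkSize=3, maxDocNum=5 yields two chunks: docs {0,1,2} and {3,4,5}.
func exampleChunkedIntCoder() ([]byte, error) {
	coder := newChunkedIntCoder(3, 5)
	for docNum := uint64(0); docNum <= 5; docNum++ {
		if err := coder.Add(docNum, docNum*10); err != nil {
			return nil, err
		}
	}
	coder.Close() // flush the final chunk

	var buf bytes.Buffer
	if _, err := coder.Write(&buf); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}
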
index/scorch/segment/zap/intcoder_test.go (new file, 59 lines)
@@ -0,0 +1,59 @@
package zap

import (
	"bytes"
	"reflect"
	"testing"
)

func TestChunkIntCoder(t *testing.T) {
	tests := []struct {
		maxDocNum uint64
		chunkSize uint64
		docNums   []uint64
		vals      [][]uint64
		expected  []byte
	}{
		{
			maxDocNum: 0,
			chunkSize: 1,
			docNums:   []uint64{0},
			vals: [][]uint64{
				[]uint64{3},
			},
			// 1 chunk, chunk-0 length 1, value 3
			expected: []byte{0x1, 0x1, 0x3},
		},
		{
			maxDocNum: 1,
			chunkSize: 1,
			docNums:   []uint64{0, 1},
			vals: [][]uint64{
				[]uint64{3},
				[]uint64{7},
			},
			// 2 chunks, chunk-0 length 1, chunk-1 length 1, value 3, value 7
			expected: []byte{0x2, 0x1, 0x1, 0x3, 0x7},
		},
	}

	for _, test := range tests {
		cic := newChunkedIntCoder(test.chunkSize, test.maxDocNum)
		for i, docNum := range test.docNums {
			err := cic.Add(docNum, test.vals[i]...)
			if err != nil {
				t.Fatalf("error adding to intcoder: %v", err)
			}
		}
		cic.Close()
		var actual bytes.Buffer
		_, err := cic.Write(&actual)
		if err != nil {
			t.Fatalf("error writing: %v", err)
		}
		if !reflect.DeepEqual(test.expected, actual.Bytes()) {
			t.Errorf("got % x, expected % x", actual.Bytes(), test.expected)
		}
	}
}
index/scorch/segment/zap/merge.go (new file, 526 lines)
@@ -0,0 +1,526 @@
package zap

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"os"

	"github.com/RoaringBitmap/roaring"
	"github.com/Smerity/govarint"
	"github.com/couchbaselabs/vellum"
	"github.com/golang/snappy"
)

// Merge takes a slice of zap segments, bit masks describing which documents
// from them may be dropped, and creates a new segment containing the
// remaining data. This new segment is built at the specified path, with the
// provided chunkFactor.
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, chunkFactor uint32) error {
	flag := os.O_RDWR | os.O_CREATE

	f, err := os.OpenFile(path, flag, 0600)
	if err != nil {
		return err
	}

	// buffer the output
	br := bufio.NewWriter(f)

	// wrap it for counting (tracking offsets)
	cr := NewCountHashWriter(br)

	fieldsInv := mergeFields(segments)
	fieldsMap := mapFields(fieldsInv)

	newSegDocCount := computeNewDocCount(segments, drops)

	var newDocNums [][]uint64
	var storedIndexOffset uint64
	storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
		fieldsMap, fieldsInv, newSegDocCount, cr)
	if err != nil {
		return err
	}

	var dictLocs []uint64
	dictLocs, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap,
		newDocNums, newSegDocCount, cr)
	if err != nil {
		return err
	}

	var fieldsIndexOffset uint64
	fieldsIndexOffset, err = persistMergedFields(fieldsInv, cr, dictLocs)
	if err != nil {
		return err
	}

	err = persistFooter(newSegDocCount, storedIndexOffset,
		fieldsIndexOffset, chunkFactor, cr)
	if err != nil {
		return err
	}

	err = br.Flush()
	if err != nil {
		return err
	}

	err = f.Sync()
	if err != nil {
		return err
	}

	err = f.Close()
	if err != nil {
		return err
	}

	return nil
}
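
For orientation, the write order above implies roughly this file layout (a sketch inferred from this commit; persistFooter's exact encoding is defined elsewhere in the package, not in this diff):

	stored section   per kept doc: uvarint meta len, uvarint data len,
	                 meta bytes, snappy-compressed field data
	stored index     one 8-byte big-endian offset per doc    <- storedIndexOffset
	term sections    per field/term: freq+norm chunks, loc chunks,
	                 loc bitmap, term info, postings bitmap;
	                 then the field's vellum dictionary      <- dictLocs[field]
	fields section   per field: uvarint dict location, uvarint name len, name
	fields index     8-byte big-endian start per field       <- fieldsIndexOffset
	footer           doc count, offsets, chunkFactor (via persistFooter)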

func mapFields(fields []string) map[string]uint16 {
	rv := make(map[string]uint16)
	for i, fieldName := range fields {
		rv[fieldName] = uint16(i)
	}
	return rv
}

func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 {
	var newSegDocCount uint64
	for segI, segment := range segments {
		segIAfterDrop := segment.NumDocs()
		if drops[segI] != nil {
			segIAfterDrop -= drops[segI].GetCardinality()
		}
		newSegDocCount += segIAfterDrop
	}
	return newSegDocCount
}

func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer) (int, error) {
	var buffer bytes.Buffer
	// write out postings list to memory so we know the len
	postingsListLen, err := r.WriteTo(&buffer)
	if err != nil {
		return 0, err
	}
	var tw int
	// write out the length of this postings list
	buf := make([]byte, binary.MaxVarintLen64)
	n := binary.PutUvarint(buf, uint64(postingsListLen))
	nw, err := w.Write(buf[:n])
	tw += nw
	if err != nil {
		return tw, err
	}

	// write out the postings list itself
	nw, err = w.Write(buffer.Bytes())
	tw += nw
	if err != nil {
		return tw, err
	}

	return tw, nil
}

func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
	fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64,
	newSegDocCount uint64, w *CountHashWriter) ([]uint64, error) {

	rv := make([]uint64, len(fieldsInv))

	var vellumBuf bytes.Buffer
	// for each field
	for fieldID, fieldName := range fieldsInv {
		if fieldID != 0 {
			vellumBuf.Reset()
		}
		newVellum, err := vellum.New(&vellumBuf, nil)
		if err != nil {
			return nil, err
		}

		// collect FST iterators from all segments for this field
		var dicts []*Dictionary
		var itrs []vellum.Iterator
		for _, segment := range segments {
			dict, err2 := segment.dictionary(fieldName)
			if err2 != nil {
				return nil, err2
			}
			dicts = append(dicts, dict)

			itr, err2 := dict.fst.Iterator(nil, nil)
			if err2 != nil {
				return nil, err2
			}
			itrs = append(itrs, itr)
		}

		// create merging iterator
		mergeItr, err := vellum.NewMergeIterator(itrs, func(postingOffsets []uint64) uint64 {
			// we don't actually use the merged value
			return 0
		})

		tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1)
		locEncoder := newChunkedIntCoder(1024, newSegDocCount-1)
		for err == nil {
			term, _ := mergeItr.Current()

			newRoaring := roaring.NewBitmap()
			newRoaringLocs := roaring.NewBitmap()
			tfEncoder.Reset()
			locEncoder.Reset()

			// now go back and get the postings list for this term,
			// but pass in the deleted docs for that segment
			for dictI, dict := range dicts {
				postings, err2 := dict.postingsList(string(term), drops[dictI])
				if err2 != nil {
					return nil, err2
				}

				postItr := postings.Iterator()
				next, err2 := postItr.Next()
				for next != nil && err2 == nil {
					hitNewDocNum := newDocNums[dictI][next.Number()]
					if hitNewDocNum == docDropped {
						return nil, fmt.Errorf("saw hit with dropped doc num")
					}
					newRoaring.Add(uint32(hitNewDocNum))
					// encode norm bits
					norm := next.Norm()
					normBits := math.Float32bits(float32(norm))
					err3 := tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits))
					if err3 != nil {
						return nil, err3
					}
					locs := next.Locations()
					if len(locs) > 0 {
						newRoaringLocs.Add(uint32(hitNewDocNum))
						for _, loc := range locs {
							args := make([]uint64, 0, 5+len(loc.ArrayPositions()))
							args = append(args, uint64(fieldsMap[loc.Field()]))
							args = append(args, loc.Pos())
							args = append(args, loc.Start())
							args = append(args, loc.End())
							args = append(args, uint64(len(loc.ArrayPositions())))
							args = append(args, loc.ArrayPositions()...)
							err3 = locEncoder.Add(hitNewDocNum, args...)
							if err3 != nil {
								return nil, err3
							}
						}
					}
					next, err2 = postItr.Next()
				}
				if err2 != nil {
					return nil, err2
				}
			}
			tfEncoder.Close()
			locEncoder.Close()

			if newRoaring.GetCardinality() > 0 {
				// this field/term actually has hits in the new segment, let's write it down
				freqOffset := uint64(w.Count())
				_, err = tfEncoder.Write(w)
				if err != nil {
					return nil, err
				}
				locOffset := uint64(w.Count())
				_, err = locEncoder.Write(w)
				if err != nil {
					return nil, err
				}
				postingLocOffset := uint64(w.Count())
				_, err = writeRoaringWithLen(newRoaringLocs, w)
				if err != nil {
					return nil, err
				}
				postingOffset := uint64(w.Count())
				// write out the start of the term info
				buf := make([]byte, binary.MaxVarintLen64)
				n := binary.PutUvarint(buf, freqOffset)
				_, err = w.Write(buf[:n])
				if err != nil {
					return nil, err
				}

				// write out the start of the loc info
				n = binary.PutUvarint(buf, locOffset)
				_, err = w.Write(buf[:n])
				if err != nil {
					return nil, err
				}

				// write out the start of the loc posting list
				n = binary.PutUvarint(buf, postingLocOffset)
				_, err = w.Write(buf[:n])
				if err != nil {
					return nil, err
				}
				_, err = writeRoaringWithLen(newRoaring, w)
				if err != nil {
					return nil, err
				}

				err = newVellum.Insert(term, postingOffset)
				if err != nil {
					return nil, err
				}
			}

			err = mergeItr.Next()
		}
		if err != nil && err != vellum.ErrIteratorDone {
			return nil, err
		}

		dictOffset := uint64(w.Count())
		err = newVellum.Close()
		if err != nil {
			return nil, err
		}
		vellumData := vellumBuf.Bytes()

		// write out the length of the vellum data
		buf := make([]byte, binary.MaxVarintLen64)
		n := binary.PutUvarint(buf, uint64(len(vellumData)))
		_, err = w.Write(buf[:n])
		if err != nil {
			return nil, err
		}

		// write this vellum to disk
		_, err = w.Write(vellumData)
		if err != nil {
			return nil, err
		}

		rv[fieldID] = dictOffset
	}

	return rv, nil
}
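
To make the term-info layout concrete, here is a minimal decoding sketch (a hypothetical helper, not part of this commit; it assumes the whole segment file is in memory as data and relies only on the imports already present in this file):

// decodeTermRecord reads back what persistMergedRest writes at the
// postingOffset stored in the vellum dictionary: three uvarints (freq/norm
// chunks offset, loc chunks offset, loc-bitmap offset), then a
// uvarint-length-prefixed roaring postings bitmap.
func decodeTermRecord(data []byte, postingOffset uint64) (
	freqOff, locOff, locBitmapOff uint64, postings *roaring.Bitmap, err error) {
	pos := postingOffset
	var n int
	freqOff, n = binary.Uvarint(data[pos:])
	pos += uint64(n)
	locOff, n = binary.Uvarint(data[pos:])
	pos += uint64(n)
	locBitmapOff, n = binary.Uvarint(data[pos:])
	pos += uint64(n)
	postingsLen, n := binary.Uvarint(data[pos:])
	pos += uint64(n)
	postings = roaring.NewBitmap()
	_, err = postings.FromBuffer(data[pos : pos+postingsLen])
	return freqOff, locOff, locBitmapOff, postings, err
}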

const docDropped = math.MaxUint64

func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap,
	fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64,
	w *CountHashWriter) (uint64, [][]uint64, error) {
	var rv [][]uint64
	var newDocNum int

	var curr int
	var metaBuf bytes.Buffer
	var data, compressed []byte

	docNumOffsets := make([]uint64, newSegDocCount)

	// for each segment
	for segI, segment := range segments {
		var segNewDocNums []uint64

		// for each doc num
		for docNum := uint64(0); docNum < segment.numDocs; docNum++ {
			metaBuf.Reset()
			data = data[:0]
			compressed = compressed[:0]
			curr = 0

			metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)

			if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) {
				segNewDocNums = append(segNewDocNums, docDropped)
			} else {
				segNewDocNums = append(segNewDocNums, uint64(newDocNum))
				// collect all the data
				vals := make(map[uint16][][]byte)
				typs := make(map[uint16][]byte)
				poss := make(map[uint16][][]uint64)
				err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
					fieldID := fieldsMap[field]
					vals[fieldID] = append(vals[fieldID], value)
					typs[fieldID] = append(typs[fieldID], typ)
					poss[fieldID] = append(poss[fieldID], pos)
					return true
				})
				if err != nil {
					return 0, nil, err
				}

				// now walk the fields in order
				for fieldID := range fieldsInv {
					if storedFieldValues, ok := vals[uint16(fieldID)]; ok {
						// has stored values for this field
						num := len(storedFieldValues)

						// process each value
						for i := 0; i < num; i++ {
							// encode field
							_, err2 := metaEncoder.PutU64(uint64(fieldID))
							if err2 != nil {
								return 0, nil, err2
							}
							// encode type
							_, err2 = metaEncoder.PutU64(uint64(typs[uint16(fieldID)][i]))
							if err2 != nil {
								return 0, nil, err2
							}
							// encode start offset
							_, err2 = metaEncoder.PutU64(uint64(curr))
							if err2 != nil {
								return 0, nil, err2
							}
							// encode len
							_, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
							if err2 != nil {
								return 0, nil, err2
							}
							// encode number of array pos
							_, err2 = metaEncoder.PutU64(uint64(len(poss[uint16(fieldID)][i])))
							if err2 != nil {
								return 0, nil, err2
							}
							// encode all array positions
							for j := 0; j < len(poss[uint16(fieldID)][i]); j++ {
								_, err2 = metaEncoder.PutU64(poss[uint16(fieldID)][i][j])
								if err2 != nil {
									return 0, nil, err2
								}
							}
							// append data
							data = append(data, storedFieldValues[i]...)
							// update curr
							curr += len(storedFieldValues[i])
						}
					}
				}

				metaEncoder.Close()
				metaBytes := metaBuf.Bytes()
				compressed = snappy.Encode(compressed, data)
				// record where we're about to start writing
				docNumOffsets[newDocNum] = uint64(w.Count())

				buf := make([]byte, binary.MaxVarintLen64)
				// write out the meta length
				n := binary.PutUvarint(buf, uint64(len(metaBytes)))
				_, err = w.Write(buf[:n])
				if err != nil {
					return 0, nil, err
				}
				// write out the compressed data length
				n = binary.PutUvarint(buf, uint64(len(compressed)))
				_, err = w.Write(buf[:n])
				if err != nil {
					return 0, nil, err
				}
				// now write the meta
				_, err = w.Write(metaBytes)
				if err != nil {
					return 0, nil, err
				}
				// now write the compressed data
				_, err = w.Write(compressed)
				if err != nil {
					return 0, nil, err
				}

				newDocNum++
			}
		}
		rv = append(rv, segNewDocNums)
	}

	// return value is the start of the stored index
	offset := uint64(w.Count())
	// now write out the stored doc index
	for docNum := range docNumOffsets {
		err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
		if err != nil {
			return 0, nil, err
		}
	}

	return offset, rv, nil
}
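
A matching sketch for the stored-doc records (again a hypothetical helper, not part of this commit; offset would come from the stored doc index written above):

// readStoredDoc decodes one stored-doc record: uvarint meta length, uvarint
// compressed data length, the meta bytes, then the snappy block. The meta
// slice aliases data rather than copying it.
func readStoredDoc(data []byte, offset uint64) (meta, uncompressed []byte, err error) {
	pos := offset
	metaLen, n := binary.Uvarint(data[pos:])
	pos += uint64(n)
	dataLen, n := binary.Uvarint(data[pos:])
	pos += uint64(n)
	meta = data[pos : pos+metaLen]
	pos += metaLen
	uncompressed, err = snappy.Decode(nil, data[pos:pos+dataLen])
	return meta, uncompressed, err
}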

// mergeFields builds a unified list of fields used across all the input segments
func mergeFields(segments []*Segment) []string {
	fieldsMap := map[string]struct{}{}

	for _, segment := range segments {
		fields := segment.Fields()
		for _, field := range fields {
			fieldsMap[field] = struct{}{}
		}
	}
	rv := make([]string, 0, len(fieldsMap))
	// ensure _id stays first
	rv = append(rv, "_id")
	for k := range fieldsMap {
		if k != "_id" {
			rv = append(rv, k)
		}
	}

	return rv
}

func persistMergedFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) {
	var rv uint64

	var fieldStarts []uint64
	for fieldID, fieldName := range fieldsInv {
		// record start of this field
		fieldStarts = append(fieldStarts, uint64(w.Count()))

		buf := make([]byte, binary.MaxVarintLen64)
		// write out the dict location for this field
		n := binary.PutUvarint(buf, dictLocs[fieldID])
		_, err := w.Write(buf[:n])
		if err != nil {
			return 0, err
		}

		// write out the length of the field name
		n = binary.PutUvarint(buf, uint64(len(fieldName)))
		_, err = w.Write(buf[:n])
		if err != nil {
			return 0, err
		}

		// write out the field name
		_, err = w.Write([]byte(fieldName))
		if err != nil {
			return 0, err
		}
	}

	// now write out the fields index
	rv = uint64(w.Count())
	for fieldID := range fieldsInv {
		err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID])
		if err != nil {
			return 0, err
		}
	}

	return rv, nil
}
index/scorch/segment/zap/merge_test.go (new file, 280 lines)
@@ -0,0 +1,280 @@
package zap

import (
	"os"
	"testing"

	"github.com/RoaringBitmap/roaring"
	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/scorch/segment/mem"
)

func TestMerge(t *testing.T) {
	_ = os.RemoveAll("/tmp/scorch.zap")
	_ = os.RemoveAll("/tmp/scorch2.zap")
	_ = os.RemoveAll("/tmp/scorch3.zap")

	memSegment := buildMemSegmentMulti()
	err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
	if err != nil {
		t.Fatal(err)
	}

	memSegment2 := buildMemSegmentMulti2()
	err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024)
	if err != nil {
		t.Fatal(err)
	}

	segment, err := Open("/tmp/scorch.zap")
	if err != nil {
		t.Fatalf("error opening segment: %v", err)
	}
	defer func() {
		cerr := segment.Close()
		if cerr != nil {
			t.Fatalf("error closing segment: %v", cerr)
		}
	}()

	segment2, err := Open("/tmp/scorch2.zap")
	if err != nil {
		t.Fatalf("error opening segment: %v", err)
	}
	defer func() {
		cerr := segment2.Close()
		if cerr != nil {
			t.Fatalf("error closing segment: %v", cerr)
		}
	}()

	segsToMerge := make([]*Segment, 2)
	segsToMerge[0] = segment.(*Segment)
	segsToMerge[1] = segment2.(*Segment)

	err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024)
	if err != nil {
		t.Fatal(err)
	}
}

func TestMergeAndDrop(t *testing.T) {
	_ = os.RemoveAll("/tmp/scorch.zap")
	_ = os.RemoveAll("/tmp/scorch2.zap")
	_ = os.RemoveAll("/tmp/scorch3.zap")

	memSegment := buildMemSegmentMulti()
	err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
	if err != nil {
		t.Fatal(err)
	}

	memSegment2 := buildMemSegmentMulti2()
	err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024)
	if err != nil {
		t.Fatal(err)
	}

	segment, err := Open("/tmp/scorch.zap")
	if err != nil {
		t.Fatalf("error opening segment: %v", err)
	}
	defer func() {
		cerr := segment.Close()
		if cerr != nil {
			t.Fatalf("error closing segment: %v", cerr)
		}
	}()

	segment2, err := Open("/tmp/scorch2.zap")
	if err != nil {
		t.Fatalf("error opening segment: %v", err)
	}
	defer func() {
		cerr := segment2.Close()
		if cerr != nil {
			t.Fatalf("error closing segment: %v", cerr)
		}
	}()

	segsToMerge := make([]*Segment, 2)
	segsToMerge[0] = segment.(*Segment)
	segsToMerge[1] = segment2.(*Segment)

	docsToDrop := make([]*roaring.Bitmap, 2)
	docsToDrop[0] = roaring.NewBitmap()
	docsToDrop[0].AddInt(1)
	docsToDrop[1] = roaring.NewBitmap()
	docsToDrop[1].AddInt(1)

	err = Merge(segsToMerge, docsToDrop, "/tmp/scorch3.zap", 1024)
	if err != nil {
		t.Fatal(err)
	}
}

func buildMemSegmentMulti2() *mem.Segment {
	doc := &document.Document{
		ID: "c",
		Fields: []document.Field{
			document.NewTextFieldCustom("_id", nil, []byte("c"), document.IndexField|document.StoreField, nil),
			document.NewTextFieldCustom("name", nil, []byte("mat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
		},
		CompositeFields: []*document.CompositeField{
			document.NewCompositeField("_all", true, nil, []string{"_id"}),
		},
	}

	doc2 := &document.Document{
		ID: "d",
		Fields: []document.Field{
			document.NewTextFieldCustom("_id", nil, []byte("d"), document.IndexField|document.StoreField, nil),
			document.NewTextFieldCustom("name", nil, []byte("joa"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
		},
		CompositeFields: []*document.CompositeField{
			document.NewCompositeField("_all", true, nil, []string{"_id"}),
		},
	}

	// forge analyzed docs
	results := []*index.AnalysisResult{
		&index.AnalysisResult{
			Document: doc,
			Analyzed: []analysis.TokenFrequencies{
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      1,
						Position: 1,
						Term:     []byte("c"),
					},
				}, nil, false),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      3,
						Position: 1,
						Term:     []byte("mat"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("some"),
					},
					&analysis.Token{
						Start:    5,
						End:      10,
						Position: 2,
						Term:     []byte("thing"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("cold"),
					},
				}, []uint64{0}, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("dark"),
					},
				}, []uint64{1}, true),
			},
			Length: []int{
				1,
				1,
				2,
				1,
				1,
			},
		},
		&index.AnalysisResult{
			Document: doc2,
			Analyzed: []analysis.TokenFrequencies{
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      1,
						Position: 1,
						Term:     []byte("d"),
					},
				}, nil, false),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      3,
						Position: 1,
						Term:     []byte("joa"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("some"),
					},
					&analysis.Token{
						Start:    5,
						End:      10,
						Position: 2,
						Term:     []byte("thing"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("cold"),
					},
				}, []uint64{0}, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("dark"),
					},
				}, []uint64{1}, true),
			},
			Length: []int{
				1,
				1,
				2,
				1,
				1,
			},
		},
	}

	// fix up composite fields
	for _, ar := range results {
		for i, f := range ar.Document.Fields {
			for _, cf := range ar.Document.CompositeFields {
				cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i])
			}
		}
	}

	segment := mem.NewFromAnalyzedDocs(results)

	return segment
}