Merge pull request #755 from steveyen/optimize-zap-merge-byte-copy-storedDocs
optimize zap merge byte copy stored docs
This commit is contained in:
commit
ea20b1be42
|
@ -95,6 +95,21 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
|
||||||
var numTokenFrequencies int
|
var numTokenFrequencies int
|
||||||
var totLocs int
|
var totLocs int
|
||||||
|
|
||||||
|
// initial scan for all fieldID's to sort them
|
||||||
|
for _, result := range results {
|
||||||
|
for _, field := range result.Document.CompositeFields {
|
||||||
|
s.getOrDefineField(field.Name())
|
||||||
|
}
|
||||||
|
for _, field := range result.Document.Fields {
|
||||||
|
s.getOrDefineField(field.Name())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||||
|
s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
|
||||||
|
for fieldID, fieldName := range s.FieldsInv {
|
||||||
|
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
||||||
|
}
|
||||||
|
|
||||||
processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
||||||
for term, tf := range tfs {
|
for term, tf := range tfs {
|
||||||
pidPlus1, exists := s.Dicts[fieldID][term]
|
pidPlus1, exists := s.Dicts[fieldID][term]
|
||||||
|
|
|
@ -28,7 +28,7 @@ import (
|
||||||
"github.com/golang/snappy"
|
"github.com/golang/snappy"
|
||||||
)
|
)
|
||||||
|
|
||||||
const version uint32 = 2
|
const version uint32 = 3
|
||||||
|
|
||||||
const fieldNotUninverted = math.MaxUint64
|
const fieldNotUninverted = math.MaxUint64
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
|
"sort"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
"github.com/Smerity/govarint"
|
"github.com/Smerity/govarint"
|
||||||
|
@ -28,6 +29,8 @@ import (
|
||||||
"github.com/golang/snappy"
|
"github.com/golang/snappy"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
|
||||||
|
|
||||||
// Merge takes a slice of zap segments and bit masks describing which
|
// Merge takes a slice of zap segments and bit masks describing which
|
||||||
// documents may be dropped, and creates a new segment containing the
|
// documents may be dropped, and creates a new segment containing the
|
||||||
// remaining data. This new segment is built at the specified path,
|
// remaining data. This new segment is built at the specified path,
|
||||||
|
@ -101,13 +104,13 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
|
|
||||||
var dictLocs []uint64
|
var dictLocs []uint64
|
||||||
|
|
||||||
fieldsInv := mergeFields(segments)
|
fieldsSame, fieldsInv := mergeFields(segments)
|
||||||
fieldsMap := mapFields(fieldsInv)
|
fieldsMap := mapFields(fieldsInv)
|
||||||
|
|
||||||
numDocs = computeNewDocCount(segments, drops)
|
numDocs = computeNewDocCount(segments, drops)
|
||||||
if numDocs > 0 {
|
if numDocs > 0 {
|
||||||
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
|
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
|
||||||
fieldsMap, fieldsInv, numDocs, cr)
|
fieldsMap, fieldsInv, fieldsSame, numDocs, cr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, 0, 0, 0, err
|
return nil, 0, 0, 0, 0, err
|
||||||
}
|
}
|
||||||
|
@ -411,10 +414,8 @@ func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
return rv, fieldDvLocsOffset, nil
|
return rv, fieldDvLocsOffset, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
const docDropped = math.MaxUint64
|
|
||||||
|
|
||||||
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64,
|
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64,
|
||||||
w *CountHashWriter) (uint64, [][]uint64, error) {
|
w *CountHashWriter) (uint64, [][]uint64, error) {
|
||||||
var rv [][]uint64 // The remapped or newDocNums for each segment.
|
var rv [][]uint64 // The remapped or newDocNums for each segment.
|
||||||
|
|
||||||
|
@ -436,6 +437,24 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
for segI, segment := range segments {
|
for segI, segment := range segments {
|
||||||
segNewDocNums := make([]uint64, segment.numDocs)
|
segNewDocNums := make([]uint64, segment.numDocs)
|
||||||
|
|
||||||
|
// optimize when the field mapping is the same across all
|
||||||
|
// segments and there are no deletions, via byte-copying
|
||||||
|
// of stored docs bytes directly to the writer
|
||||||
|
if fieldsSame && (drops[segI] == nil || drops[segI].GetCardinality() == 0) {
|
||||||
|
err := segment.copyStoredDocs(newDocNum, docNumOffsets, w)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := uint64(0); i < segment.numDocs; i++ {
|
||||||
|
segNewDocNums[i] = newDocNum
|
||||||
|
newDocNum++
|
||||||
|
}
|
||||||
|
rv = append(rv, segNewDocNums)
|
||||||
|
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
// for each doc num
|
// for each doc num
|
||||||
for docNum := uint64(0); docNum < segment.numDocs; docNum++ {
|
for docNum := uint64(0); docNum < segment.numDocs; docNum++ {
|
||||||
// TODO: roaring's API limits docNums to 32-bits?
|
// TODO: roaring's API limits docNums to 32-bits?
|
||||||
|
@ -527,13 +546,61 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
return storedIndexOffset, rv, nil
|
return storedIndexOffset, rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// mergeFields builds a unified list of fields used across all the input segments
|
// copyStoredDocs writes out a segment's stored doc info, optimized by
|
||||||
func mergeFields(segments []*SegmentBase) []string {
|
// using a single Write() call for the entire set of bytes. The
|
||||||
|
// newDocNumOffsets is filled with the new offsets for each doc.
|
||||||
|
func (s *SegmentBase) copyStoredDocs(newDocNum uint64, newDocNumOffsets []uint64,
|
||||||
|
w *CountHashWriter) error {
|
||||||
|
if s.numDocs <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
indexOffset0, storedOffset0, _, _, _ :=
|
||||||
|
s.getDocStoredOffsets(0) // the segment's first doc
|
||||||
|
|
||||||
|
indexOffsetN, storedOffsetN, readN, metaLenN, dataLenN :=
|
||||||
|
s.getDocStoredOffsets(s.numDocs - 1) // the segment's last doc
|
||||||
|
|
||||||
|
storedOffset0New := uint64(w.Count())
|
||||||
|
|
||||||
|
storedBytes := s.mem[storedOffset0 : storedOffsetN+readN+metaLenN+dataLenN]
|
||||||
|
_, err := w.Write(storedBytes)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// remap the storedOffset's for the docs into new offsets relative
|
||||||
|
// to storedOffset0New, filling the given docNumOffsetsOut array
|
||||||
|
for indexOffset := indexOffset0; indexOffset <= indexOffsetN; indexOffset += 8 {
|
||||||
|
storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8])
|
||||||
|
storedOffsetNew := storedOffset - storedOffset0 + storedOffset0New
|
||||||
|
newDocNumOffsets[newDocNum] = storedOffsetNew
|
||||||
|
newDocNum += 1
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeFields builds a unified list of fields used across all the
|
||||||
|
// input segments, and computes whether the fields are the same across
|
||||||
|
// segments (which depends on fields to be sorted in the same way
|
||||||
|
// across segments)
|
||||||
|
func mergeFields(segments []*SegmentBase) (bool, []string) {
|
||||||
|
fieldsSame := true
|
||||||
|
|
||||||
|
var segment0Fields []string
|
||||||
|
if len(segments) > 0 {
|
||||||
|
segment0Fields = segments[0].Fields()
|
||||||
|
}
|
||||||
|
|
||||||
fieldsMap := map[string]struct{}{}
|
fieldsMap := map[string]struct{}{}
|
||||||
for _, segment := range segments {
|
for _, segment := range segments {
|
||||||
fields := segment.Fields()
|
fields := segment.Fields()
|
||||||
for _, field := range fields {
|
for fieldi, field := range fields {
|
||||||
fieldsMap[field] = struct{}{}
|
fieldsMap[field] = struct{}{}
|
||||||
|
if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field {
|
||||||
|
fieldsSame = false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -545,5 +612,8 @@ func mergeFields(segments []*SegmentBase) []string {
|
||||||
rv = append(rv, k)
|
rv = append(rv, k)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return rv
|
|
||||||
|
sort.Strings(rv[1:]) // leave _id as first
|
||||||
|
|
||||||
|
return fieldsSame, rv
|
||||||
}
|
}
|
||||||
|
|
|
@ -398,6 +398,40 @@ func compareSegments(a, b *Segment) string {
|
||||||
fieldName, next.Term, aloc, bloc))
|
fieldName, next.Term, aloc, bloc))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if fieldName == "_id" {
|
||||||
|
docId := next.Term
|
||||||
|
docNumA := apitrn.Number()
|
||||||
|
docNumB := bpitrn.Number()
|
||||||
|
afields := map[string]interface{}{}
|
||||||
|
err = a.VisitDocument(apitrn.Number(),
|
||||||
|
func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||||
|
afields[field+"-typ"] = typ
|
||||||
|
afields[field+"-value"] = value
|
||||||
|
afields[field+"-pos"] = pos
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
rv = append(rv, fmt.Sprintf("a.VisitDocument err: %v", err))
|
||||||
|
}
|
||||||
|
bfields := map[string]interface{}{}
|
||||||
|
err = b.VisitDocument(bpitrn.Number(),
|
||||||
|
func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||||
|
bfields[field+"-typ"] = typ
|
||||||
|
bfields[field+"-value"] = value
|
||||||
|
bfields[field+"-pos"] = pos
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
rv = append(rv, fmt.Sprintf("b.VisitDocument err: %v", err))
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(afields, bfields) {
|
||||||
|
rv = append(rv, fmt.Sprintf("afields != bfields,"+
|
||||||
|
" id: %s, docNumA: %d, docNumB: %d,"+
|
||||||
|
" afields: %#v, bfields: %#v",
|
||||||
|
docId, docNumA, docNumB, afields, bfields))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,15 +17,27 @@ package zap
|
||||||
import "encoding/binary"
|
import "encoding/binary"
|
||||||
|
|
||||||
func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) {
|
func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) {
|
||||||
docStoredStartAddr := s.storedIndexOffset + (8 * docNum)
|
_, storedOffset, n, metaLen, dataLen := s.getDocStoredOffsets(docNum)
|
||||||
docStoredStart := binary.BigEndian.Uint64(s.mem[docStoredStartAddr : docStoredStartAddr+8])
|
|
||||||
var n uint64
|
meta := s.mem[storedOffset+n : storedOffset+n+metaLen]
|
||||||
metaLen, read := binary.Uvarint(s.mem[docStoredStart : docStoredStart+binary.MaxVarintLen64])
|
data := s.mem[storedOffset+n+metaLen : storedOffset+n+metaLen+dataLen]
|
||||||
n += uint64(read)
|
|
||||||
var dataLen uint64
|
|
||||||
dataLen, read = binary.Uvarint(s.mem[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64])
|
|
||||||
n += uint64(read)
|
|
||||||
meta := s.mem[docStoredStart+n : docStoredStart+n+metaLen]
|
|
||||||
data := s.mem[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen]
|
|
||||||
return meta, data
|
return meta, data
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *SegmentBase) getDocStoredOffsets(docNum uint64) (
|
||||||
|
uint64, uint64, uint64, uint64, uint64) {
|
||||||
|
indexOffset := s.storedIndexOffset + (8 * docNum)
|
||||||
|
|
||||||
|
storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8])
|
||||||
|
|
||||||
|
var n uint64
|
||||||
|
|
||||||
|
metaLen, read := binary.Uvarint(s.mem[storedOffset : storedOffset+binary.MaxVarintLen64])
|
||||||
|
n += uint64(read)
|
||||||
|
|
||||||
|
dataLen, read := binary.Uvarint(s.mem[storedOffset+n : storedOffset+n+binary.MaxVarintLen64])
|
||||||
|
n += uint64(read)
|
||||||
|
|
||||||
|
return indexOffset, storedOffset, n, metaLen, dataLen
|
||||||
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ import (
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"reflect"
|
"reflect"
|
||||||
|
"sort"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
|
@ -574,6 +575,7 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) {
|
||||||
t.Fatalf("segment VisitableDocValueFields err: %v", err)
|
t.Fatalf("segment VisitableDocValueFields err: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sort.Strings(expectedFields[1:]) // keep _id as first field
|
||||||
if !reflect.DeepEqual(fields, expectedFields) {
|
if !reflect.DeepEqual(fields, expectedFields) {
|
||||||
t.Errorf("expected field terms: %#v, got: %#v", expectedFields, fields)
|
t.Errorf("expected field terms: %#v, got: %#v", expectedFields, fields)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue