Merge pull request #822 from steveyen/scorch-reuse-roaring-and-sync-Pool
reuse roaring Bitmaps and also use sync.Pool for interim data
This commit is contained in:
commit
a526fe70f3
|
@ -68,7 +68,19 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap)
|
||||||
if rv == nil {
|
if rv == nil {
|
||||||
rv = &PostingsList{}
|
rv = &PostingsList{}
|
||||||
} else {
|
} else {
|
||||||
|
postings := rv.postings
|
||||||
|
if postings != nil {
|
||||||
|
postings.Clear()
|
||||||
|
}
|
||||||
|
locBitmap := rv.locBitmap
|
||||||
|
if locBitmap != nil {
|
||||||
|
locBitmap.Clear()
|
||||||
|
}
|
||||||
|
|
||||||
*rv = PostingsList{} // clear the struct
|
*rv = PostingsList{} // clear the struct
|
||||||
|
|
||||||
|
rv.postings = postings
|
||||||
|
rv.locBitmap = locBitmap
|
||||||
}
|
}
|
||||||
rv.sb = d.sb
|
rv.sb = d.sb
|
||||||
rv.except = except
|
rv.except = except
|
||||||
|
|
|
@ -183,6 +183,9 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
newRoaring := roaring.NewBitmap()
|
||||||
|
newRoaringLocs := roaring.NewBitmap()
|
||||||
|
|
||||||
// for each field
|
// for each field
|
||||||
for fieldID, fieldName := range fieldsInv {
|
for fieldID, fieldName := range fieldsInv {
|
||||||
|
|
||||||
|
@ -222,8 +225,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
|
|
||||||
var prevTerm []byte
|
var prevTerm []byte
|
||||||
|
|
||||||
newRoaring := roaring.NewBitmap()
|
newRoaring.Clear()
|
||||||
newRoaringLocs := roaring.NewBitmap()
|
newRoaringLocs.Clear()
|
||||||
|
|
||||||
var lastDocNum, lastFreq, lastNorm uint64
|
var lastDocNum, lastFreq, lastNorm uint64
|
||||||
|
|
||||||
|
@ -262,8 +265,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
newRoaring = roaring.NewBitmap()
|
newRoaring.Clear()
|
||||||
newRoaringLocs = roaring.NewBitmap()
|
newRoaringLocs.Clear()
|
||||||
|
|
||||||
tfEncoder.Reset()
|
tfEncoder.Reset()
|
||||||
locEncoder.Reset()
|
locEncoder.Reset()
|
||||||
|
|
|
@ -19,6 +19,7 @@ import (
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"math"
|
"math"
|
||||||
"sort"
|
"sort"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
"github.com/Smerity/govarint"
|
"github.com/Smerity/govarint"
|
||||||
|
@ -35,12 +36,11 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
|
||||||
chunkFactor uint32) (*SegmentBase, error) {
|
chunkFactor uint32) (*SegmentBase, error) {
|
||||||
var br bytes.Buffer
|
var br bytes.Buffer
|
||||||
|
|
||||||
s := interim{
|
s := interimPool.Get().(*interim)
|
||||||
results: results,
|
|
||||||
chunkFactor: chunkFactor,
|
s.results = results
|
||||||
w: NewCountHashWriter(&br),
|
s.chunkFactor = chunkFactor
|
||||||
FieldsMap: map[string]uint16{},
|
s.w = NewCountHashWriter(&br)
|
||||||
}
|
|
||||||
|
|
||||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
|
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
|
||||||
err := s.convert()
|
err := s.convert()
|
||||||
|
@ -52,9 +52,13 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
|
||||||
s.FieldsMap, s.FieldsInv, uint64(len(results)),
|
s.FieldsMap, s.FieldsInv, uint64(len(results)),
|
||||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)
|
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)
|
||||||
|
|
||||||
|
interimPool.Put(s.cleanse())
|
||||||
|
|
||||||
return sb, err
|
return sb, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// interimPool recycles *interim working structs across calls to
// AnalysisResultsToSegmentBase, which takes one with Get and hands it
// back with Put after calling cleanse on it. New supplies a zero-valued
// interim when the pool has nothing to offer.
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}
|
||||||
|
|
||||||
// interim holds temporary working data used while converting from
|
// interim holds temporary working data used while converting from
|
||||||
// analysis results to a zap-encoded segment
|
// analysis results to a zap-encoded segment
|
||||||
type interim struct {
|
type interim struct {
|
||||||
|
@ -91,16 +95,66 @@ type interim struct {
|
||||||
PostingsLocs []*roaring.Bitmap
|
PostingsLocs []*roaring.Bitmap
|
||||||
|
|
||||||
// postings id -> freq/norm's, one for each docNum in postings
|
// postings id -> freq/norm's, one for each docNum in postings
|
||||||
FreqNorms [][]interimFreqNorm
|
FreqNorms [][]interimFreqNorm
|
||||||
|
freqNormsBacking []interimFreqNorm
|
||||||
|
|
||||||
// postings id -> locs, one for each freq
|
// postings id -> locs, one for each freq
|
||||||
Locs [][]interimLoc
|
Locs [][]interimLoc
|
||||||
|
locsBacking []interimLoc
|
||||||
|
|
||||||
|
numTermsPerPostingsList []int // key is postings list id
|
||||||
|
numLocsPerPostingsList []int // key is postings list id
|
||||||
|
|
||||||
buf0 bytes.Buffer
|
buf0 bytes.Buffer
|
||||||
tmp0 []byte
|
tmp0 []byte
|
||||||
tmp1 []byte
|
tmp1 []byte
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *interim) cleanse() *interim {
|
||||||
|
s.results = nil
|
||||||
|
s.chunkFactor = 0
|
||||||
|
s.w = nil
|
||||||
|
s.FieldsMap = nil
|
||||||
|
s.FieldsInv = s.FieldsInv[:0]
|
||||||
|
for i := range s.Dicts {
|
||||||
|
s.Dicts[i] = nil
|
||||||
|
}
|
||||||
|
s.Dicts = s.Dicts[:0]
|
||||||
|
for i := range s.DictKeys {
|
||||||
|
s.DictKeys[i] = s.DictKeys[i][:0]
|
||||||
|
}
|
||||||
|
s.DictKeys = s.DictKeys[:0]
|
||||||
|
for i := range s.IncludeDocValues {
|
||||||
|
s.IncludeDocValues[i] = false
|
||||||
|
}
|
||||||
|
s.IncludeDocValues = s.IncludeDocValues[:0]
|
||||||
|
for _, idn := range s.Postings {
|
||||||
|
idn.Clear()
|
||||||
|
}
|
||||||
|
s.Postings = s.Postings[:0]
|
||||||
|
for _, idn := range s.PostingsLocs {
|
||||||
|
idn.Clear()
|
||||||
|
}
|
||||||
|
s.PostingsLocs = s.PostingsLocs[:0]
|
||||||
|
s.FreqNorms = s.FreqNorms[:0]
|
||||||
|
for i := range s.freqNormsBacking {
|
||||||
|
s.freqNormsBacking[i] = interimFreqNorm{}
|
||||||
|
}
|
||||||
|
s.freqNormsBacking = s.freqNormsBacking[:0]
|
||||||
|
s.Locs = s.Locs[:0]
|
||||||
|
for i := range s.locsBacking {
|
||||||
|
s.locsBacking[i] = interimLoc{}
|
||||||
|
}
|
||||||
|
s.locsBacking = s.locsBacking[:0]
|
||||||
|
s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
|
||||||
|
s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
|
||||||
|
s.buf0.Reset()
|
||||||
|
s.tmp0 = s.tmp0[:0]
|
||||||
|
s.tmp1 = s.tmp1[:0]
|
||||||
|
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
func (s *interim) grabBuf(size int) []byte {
|
func (s *interim) grabBuf(size int) []byte {
|
||||||
buf := s.tmp0
|
buf := s.tmp0
|
||||||
if cap(buf) < size {
|
if cap(buf) < size {
|
||||||
|
@ -130,6 +184,8 @@ type interimLoc struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
|
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
|
||||||
|
s.FieldsMap = map[string]uint16{}
|
||||||
|
|
||||||
s.getOrDefineField("_id") // _id field is fieldID 0
|
s.getOrDefineField("_id") // _id field is fieldID 0
|
||||||
|
|
||||||
for _, result := range s.results {
|
for _, result := range s.results {
|
||||||
|
@ -143,12 +199,15 @@ func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
|
||||||
|
|
||||||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||||
|
|
||||||
s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
|
|
||||||
for fieldID, fieldName := range s.FieldsInv {
|
for fieldID, fieldName := range s.FieldsInv {
|
||||||
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
s.IncludeDocValues = make([]bool, len(s.FieldsInv))
|
if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
|
||||||
|
s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
|
||||||
|
} else {
|
||||||
|
s.IncludeDocValues = make([]bool, len(s.FieldsInv))
|
||||||
|
}
|
||||||
|
|
||||||
s.prepareDicts()
|
s.prepareDicts()
|
||||||
|
|
||||||
|
@ -189,9 +248,18 @@ func (s *interim) getOrDefineField(fieldName string) int {
|
||||||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
||||||
s.FieldsMap[fieldName] = fieldIDPlus1
|
s.FieldsMap[fieldName] = fieldIDPlus1
|
||||||
s.FieldsInv = append(s.FieldsInv, fieldName)
|
s.FieldsInv = append(s.FieldsInv, fieldName)
|
||||||
|
|
||||||
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
||||||
s.DictKeys = append(s.DictKeys, make([]string, 0))
|
|
||||||
|
n := len(s.DictKeys)
|
||||||
|
if n < cap(s.DictKeys) {
|
||||||
|
s.DictKeys = s.DictKeys[:n+1]
|
||||||
|
s.DictKeys[n] = s.DictKeys[n][:0]
|
||||||
|
} else {
|
||||||
|
s.DictKeys = append(s.DictKeys, []string(nil))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return int(fieldIDPlus1 - 1)
|
return int(fieldIDPlus1 - 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -199,9 +267,6 @@ func (s *interim) getOrDefineField(fieldName string) int {
|
||||||
func (s *interim) prepareDicts() {
|
func (s *interim) prepareDicts() {
|
||||||
var pidNext int
|
var pidNext int
|
||||||
|
|
||||||
numTermsPerPostingsList := make([]int, 0, 64) // key is postings list id
|
|
||||||
numLocsPerPostingsList := make([]int, 0, 64) // key is postings list id
|
|
||||||
|
|
||||||
var totTFs int
|
var totTFs int
|
||||||
var totLocs int
|
var totLocs int
|
||||||
|
|
||||||
|
@ -218,14 +283,14 @@ func (s *interim) prepareDicts() {
|
||||||
dict[term] = pidPlus1
|
dict[term] = pidPlus1
|
||||||
dictKeys = append(dictKeys, term)
|
dictKeys = append(dictKeys, term)
|
||||||
|
|
||||||
numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
|
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
|
||||||
numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
|
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
pid := pidPlus1 - 1
|
pid := pidPlus1 - 1
|
||||||
|
|
||||||
numTermsPerPostingsList[pid] += 1
|
s.numTermsPerPostingsList[pid] += 1
|
||||||
numLocsPerPostingsList[pid] += len(tf.Locations)
|
s.numLocsPerPostingsList[pid] += len(tf.Locations)
|
||||||
|
|
||||||
totLocs += len(tf.Locations)
|
totLocs += len(tf.Locations)
|
||||||
}
|
}
|
||||||
|
@ -253,28 +318,64 @@ func (s *interim) prepareDicts() {
|
||||||
|
|
||||||
numPostingsLists := pidNext
|
numPostingsLists := pidNext
|
||||||
|
|
||||||
s.Postings = make([]*roaring.Bitmap, numPostingsLists)
|
if cap(s.Postings) >= numPostingsLists {
|
||||||
for i := 0; i < numPostingsLists; i++ {
|
s.Postings = s.Postings[:numPostingsLists]
|
||||||
s.Postings[i] = roaring.New()
|
} else {
|
||||||
|
postings := make([]*roaring.Bitmap, numPostingsLists)
|
||||||
|
copy(postings, s.Postings[:cap(s.Postings)])
|
||||||
|
for i := 0; i < numPostingsLists; i++ {
|
||||||
|
if postings[i] == nil {
|
||||||
|
postings[i] = roaring.New()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.Postings = postings
|
||||||
}
|
}
|
||||||
|
|
||||||
s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
|
if cap(s.PostingsLocs) >= numPostingsLists {
|
||||||
for i := 0; i < numPostingsLists; i++ {
|
s.PostingsLocs = s.PostingsLocs[:numPostingsLists]
|
||||||
s.PostingsLocs[i] = roaring.New()
|
} else {
|
||||||
|
postingsLocs := make([]*roaring.Bitmap, numPostingsLists)
|
||||||
|
copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)])
|
||||||
|
for i := 0; i < numPostingsLists; i++ {
|
||||||
|
if postingsLocs[i] == nil {
|
||||||
|
postingsLocs[i] = roaring.New()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.PostingsLocs = postingsLocs
|
||||||
}
|
}
|
||||||
|
|
||||||
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
|
if cap(s.FreqNorms) >= numPostingsLists {
|
||||||
|
s.FreqNorms = s.FreqNorms[:numPostingsLists]
|
||||||
|
} else {
|
||||||
|
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
|
||||||
|
}
|
||||||
|
|
||||||
freqNormsBacking := make([]interimFreqNorm, totTFs)
|
if cap(s.freqNormsBacking) >= totTFs {
|
||||||
for pid, numTerms := range numTermsPerPostingsList {
|
s.freqNormsBacking = s.freqNormsBacking[:totTFs]
|
||||||
|
} else {
|
||||||
|
s.freqNormsBacking = make([]interimFreqNorm, totTFs)
|
||||||
|
}
|
||||||
|
|
||||||
|
freqNormsBacking := s.freqNormsBacking
|
||||||
|
for pid, numTerms := range s.numTermsPerPostingsList {
|
||||||
s.FreqNorms[pid] = freqNormsBacking[0:0]
|
s.FreqNorms[pid] = freqNormsBacking[0:0]
|
||||||
freqNormsBacking = freqNormsBacking[numTerms:]
|
freqNormsBacking = freqNormsBacking[numTerms:]
|
||||||
}
|
}
|
||||||
|
|
||||||
s.Locs = make([][]interimLoc, numPostingsLists)
|
if cap(s.Locs) >= numPostingsLists {
|
||||||
|
s.Locs = s.Locs[:numPostingsLists]
|
||||||
|
} else {
|
||||||
|
s.Locs = make([][]interimLoc, numPostingsLists)
|
||||||
|
}
|
||||||
|
|
||||||
locsBacking := make([]interimLoc, totLocs)
|
if cap(s.locsBacking) >= totLocs {
|
||||||
for pid, numLocs := range numLocsPerPostingsList {
|
s.locsBacking = s.locsBacking[:totLocs]
|
||||||
|
} else {
|
||||||
|
s.locsBacking = make([]interimLoc, totLocs)
|
||||||
|
}
|
||||||
|
|
||||||
|
locsBacking := s.locsBacking
|
||||||
|
for pid, numLocs := range s.numLocsPerPostingsList {
|
||||||
s.Locs[pid] = locsBacking[0:0]
|
s.Locs[pid] = locsBacking[0:0]
|
||||||
locsBacking = locsBacking[numLocs:]
|
locsBacking = locsBacking[numLocs:]
|
||||||
}
|
}
|
||||||
|
@ -334,7 +435,7 @@ func (s *interim) processDocument(docNum uint64,
|
||||||
for term, tf := range tfs {
|
for term, tf := range tfs {
|
||||||
pid := dict[term] - 1
|
pid := dict[term] - 1
|
||||||
bs := s.Postings[pid]
|
bs := s.Postings[pid]
|
||||||
bs.AddInt(int(docNum))
|
bs.Add(uint32(docNum))
|
||||||
|
|
||||||
s.FreqNorms[pid] = append(s.FreqNorms[pid],
|
s.FreqNorms[pid] = append(s.FreqNorms[pid],
|
||||||
interimFreqNorm{
|
interimFreqNorm{
|
||||||
|
@ -344,7 +445,7 @@ func (s *interim) processDocument(docNum uint64,
|
||||||
|
|
||||||
if len(tf.Locations) > 0 {
|
if len(tf.Locations) > 0 {
|
||||||
locBS := s.PostingsLocs[pid]
|
locBS := s.PostingsLocs[pid]
|
||||||
locBS.AddInt(int(docNum))
|
locBS.Add(uint32(docNum))
|
||||||
|
|
||||||
locs := s.Locs[pid]
|
locs := s.Locs[pid]
|
||||||
|
|
||||||
|
|
|
@ -266,7 +266,9 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
|
||||||
|
|
||||||
locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen]
|
locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen]
|
||||||
|
|
||||||
rv.locBitmap = roaring.NewBitmap()
|
if rv.locBitmap == nil {
|
||||||
|
rv.locBitmap = roaring.NewBitmap()
|
||||||
|
}
|
||||||
_, err := rv.locBitmap.FromBuffer(locRoaringBytes)
|
_, err := rv.locBitmap.FromBuffer(locRoaringBytes)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err)
|
return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err)
|
||||||
|
@ -278,7 +280,9 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error {
|
||||||
|
|
||||||
roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]
|
roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]
|
||||||
|
|
||||||
rv.postings = roaring.NewBitmap()
|
if rv.postings == nil {
|
||||||
|
rv.postings = roaring.NewBitmap()
|
||||||
|
}
|
||||||
_, err = rv.postings.FromBuffer(roaringBytes)
|
_, err = rv.postings.FromBuffer(roaringBytes)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error loading roaring bitmap: %v", err)
|
return fmt.Errorf("error loading roaring bitmap: %v", err)
|
||||||
|
|
Loading…
Reference in New Issue