0
0
Fork 0

Merge pull request #805 from steveyen/optimize-scorch-mem-processField

Optimize scorch processField() inner loop and writeRoaringWithLen()
This commit is contained in:
Steve Yen 2018-03-07 09:09:57 -08:00 committed by GitHub
commit 15242af465
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 20 additions and 20 deletions

View File

@@ -111,13 +111,15 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
}
processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
dict := s.Dicts[fieldID]
dictKeys := s.DictKeys[fieldID]
for term, tf := range tfs {
pidPlus1, exists := s.Dicts[fieldID][term]
pidPlus1, exists := dict[term]
if !exists {
numPostingsLists++
pidPlus1 = uint64(numPostingsLists)
s.Dicts[fieldID][term] = pidPlus1
s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
dict[term] = pidPlus1
dictKeys = append(dictKeys, term)
numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
}
@@ -127,6 +129,7 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) {
totLocs += len(tf.Locations)
}
numTokenFrequencies += len(tfs)
s.DictKeys[fieldID] = dictKeys
}
for _, result := range results {

View File

@@ -394,13 +394,12 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
rv = make([]uint64, 0, len(memSegment.PostingsLocs))
var reuseBuf bytes.Buffer
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
for postingID := range memSegment.PostingsLocs {
// record where we start this posting loc
rv = append(rv, uint64(w.Count()))
// write out the length and bitmap
_, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint)
_, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, reuseBufVarint)
if err != nil {
return nil, err
}
@@ -411,7 +410,6 @@ func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
rv = make([]uint64, 0, len(memSegment.Postings))
var reuseBuf bytes.Buffer
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
for postingID := range memSegment.Postings {
// record where we start this posting list
@@ -425,7 +423,7 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
}
// write out the length and bitmap
_, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint)
_, err = writeRoaringWithLen(memSegment.Postings[postingID], w, reuseBufVarint)
if err != nil {
return nil, err
}

View File

@@ -160,7 +160,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
newSegDocCount uint64, chunkFactor uint32,
w *CountHashWriter) ([]uint64, uint64, error) {
var bufReuse bytes.Buffer
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
var postings *PostingsList
@@ -247,7 +246,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
return err
}
postingLocOffset := uint64(w.Count())
_, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64)
_, err = writeRoaringWithLen(newRoaringLocs, w, bufMaxVarintLen64)
if err != nil {
return err
}
@@ -271,7 +270,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
if err != nil {
return err
}
_, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64)
_, err = writeRoaringWithLen(newRoaring, w, bufMaxVarintLen64)
if err != nil {
return err
}

View File

@@ -15,7 +15,6 @@
package zap
import (
"bytes"
"encoding/binary"
"io"
@@ -25,28 +24,29 @@ import (
// writeRoaringWithLen writes r to w as a length-prefixed record: first the
// size of the serialized roaring bitmap in bytes, encoded as a uvarint, then
// the serialized bitmap bytes themselves.
//
// reuseBufVarint is caller-supplied scratch space used to encode the length
// prefix; the callers visible in this change allocate it with
// binary.MaxVarintLen64 bytes.
//
// It returns the total number of bytes handed to w (prefix plus bitmap). If
// serializing the bitmap fails, nothing is written and 0 is returned with the
// error; if a write fails, the count written so far is returned with the error.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// removed lines (the reuseBuf parameter, reuseBuf.Reset, r.WriteTo,
// reuseBuf.Bytes) appear next to their replacements (r.ToBytes, len(buf),
// w.Write(buf)). Confirm against the committed file before relying on it.
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer,
reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) {
reuseBuf.Reset()
// write out postings list to memory so we know the len
postingsListLen, err := r.WriteTo(reuseBuf)
reuseBufVarint []byte) (int, error) {
buf, err := r.ToBytes()
if err != nil {
return 0, err
}
var tw int
// write out the length of this postings list
n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen))
// write out the length
n := binary.PutUvarint(reuseBufVarint, uint64(len(buf)))
nw, err := w.Write(reuseBufVarint[:n])
tw += nw
if err != nil {
return tw, err
}
// write out the postings list itself
nw, err = w.Write(reuseBuf.Bytes())
// write out the roaring bytes
nw, err = w.Write(buf)
tw += nw
if err != nil {
return tw, err
}
return tw, nil
}