
major refactoring of posting details
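Replaces the hand-rolled govarint chunk encoding in persistPostingDetails with a
reusable chunkedIntCoder for both the freq/norm and location sections, skips
posting/dictionary persistence entirely when a segment has no stored documents,
and threads chunkFactor through Merge into persistMergedRest in place of the
hard-coded 1024.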

Marty Schoch 2017-12-13 16:10:06 -05:00
parent 6e2207c445
commit 85e15628ee
2 changed files with 63 additions and 148 deletions

index/scorch/segment/zap/build.go

@@ -47,33 +47,39 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e
 	cr := NewCountHashWriter(br)
 
 	var storedIndexOffset uint64
-	storedIndexOffset, err = persistStored(memSegment, cr)
-	if err != nil {
-		return err
-	}
-
-	var freqOffsets, locOffsets []uint64
-	freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor)
-	if err != nil {
-		return err
-	}
-
-	var postingsListLocs []uint64
-	postingsListLocs, err = persistPostingsLocs(memSegment, cr)
-	if err != nil {
-		return err
-	}
-
-	var postingsLocs []uint64
-	postingsLocs, err = persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
-	if err != nil {
-		return err
-	}
-
 	var dictLocs []uint64
-	dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
-	if err != nil {
-		return err
+	if len(memSegment.Stored) > 0 {
+		storedIndexOffset, err = persistStored(memSegment, cr)
+		if err != nil {
+			return err
+		}
+
+		var freqOffsets, locOffsets []uint64
+		freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor)
+		if err != nil {
+			return err
+		}
+
+		var postingsListLocs []uint64
+		postingsListLocs, err = persistPostingsLocs(memSegment, cr)
+		if err != nil {
+			return err
+		}
+
+		var postingsLocs []uint64
+		postingsLocs, err = persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
+		if err != nil {
+			return err
+		}
+
+		dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
+		if err != nil {
+			return err
+		}
+	} else {
+		dictLocs = make([]uint64, len(memSegment.FieldsInv))
 	}
 
 	var fieldIndexStart uint64
@@ -215,40 +221,19 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error)
 func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) {
 	var freqOffsets, locOfffsets []uint64
+	tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
 	for postingID := range memSegment.Postings {
+		if postingID != 0 {
+			tfEncoder.Reset()
+		}
 		postingsListItr := memSegment.Postings[postingID].Iterator()
-		total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1
-		var freqNormBuf []byte
 		var offset int
-		var encodingBuf bytes.Buffer
-		encoder := govarint.NewU64Base128Encoder(&encodingBuf)
-		chunkLens := make([]uint64, total)
-		var currChunk uint64
 		for postingsListItr.HasNext() {
-			docNum := postingsListItr.Next()
-			chunk := uint64(docNum) / uint64(chunkFactor)
-			if chunk != currChunk {
-				// starting a new chunk
-				if encoder != nil {
-					// close out last
-					encoder.Close()
-					encodingBytes := encodingBuf.Bytes()
-					chunkLens[currChunk] = uint64(len(encodingBytes))
-					freqNormBuf = append(freqNormBuf, encodingBytes...)
-					encodingBuf.Reset()
-					encoder = govarint.NewU64Base128Encoder(&encodingBuf)
-				}
-				currChunk = chunk
-			}
+			docNum := uint64(postingsListItr.Next())
 
 			// put freq
-			_, err := encoder.PutU64(memSegment.Freqs[postingID][offset])
+			err := tfEncoder.Add(docNum, memSegment.Freqs[postingID][offset])
 			if err != nil {
 				return nil, nil, err
 			}
@@ -256,7 +241,7 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 			// put norm
 			norm := memSegment.Norms[postingID][offset]
 			normBits := math.Float32bits(norm)
-			_, err = encoder.PutU32(normBits)
+			err = tfEncoder.Add(docNum, uint64(normBits))
 			if err != nil {
 				return nil, nil, err
 			}
@@ -264,35 +249,11 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 			offset++
 		}
 
-		// close out last chunk
-		if encoder != nil {
-			// fix me write freq/norms
-			encoder.Close()
-			encodingBytes := encodingBuf.Bytes()
-			chunkLens[currChunk] = uint64(len(encodingBytes))
-			freqNormBuf = append(freqNormBuf, encodingBytes...)
-		}
-
 		// record where this postings freq info starts
 		freqOffsets = append(freqOffsets, uint64(w.Count()))
 
-		buf := make([]byte, binary.MaxVarintLen64)
-		// write out the number of chunks
-		n := binary.PutUvarint(buf, uint64(total))
-		_, err := w.Write(buf[:n])
-		if err != nil {
-			return nil, nil, err
-		}
-		// write out the chunk lens
-		for _, chunkLen := range chunkLens {
-			n := binary.PutUvarint(buf, uint64(chunkLen))
-			_, err = w.Write(buf[:n])
-			if err != nil {
-				return nil, nil, err
-			}
-		}
-		// write out the data
-		_, err = w.Write(freqNormBuf)
+		tfEncoder.Close()
+		_, err := tfEncoder.Write(w)
 		if err != nil {
 			return nil, nil, err
 		}
@@ -300,61 +261,39 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 	}
 
 	// now do it again for the locations
+	locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
 	for postingID := range memSegment.Postings {
+		if postingID != 0 {
+			locEncoder.Reset()
+		}
 		postingsListItr := memSegment.Postings[postingID].Iterator()
-		total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1
-		var locBuf []byte
 		var offset int
 		var locOffset int
-		var encodingBuf bytes.Buffer
-		encoder := govarint.NewU64Base128Encoder(&encodingBuf)
-		chunkLens := make([]uint64, total)
-		var currChunk uint64
 		for postingsListItr.HasNext() {
-			docNum := postingsListItr.Next()
-			chunk := uint64(docNum) / uint64(chunkFactor)
-			if chunk != currChunk {
-				// starting a new chunk
-				if encoder != nil {
-					// close out last
-					encoder.Close()
-					encodingBytes := encodingBuf.Bytes()
-					chunkLens[currChunk] = uint64(len(encodingBytes))
-					locBuf = append(locBuf, encodingBytes...)
-					encodingBuf.Reset()
-					encoder = govarint.NewU64Base128Encoder(&encodingBuf)
-				}
-				currChunk = chunk
-			}
+			docNum := uint64(postingsListItr.Next())
 			for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ {
 				if len(memSegment.Locfields[postingID]) > 0 {
 					// put field
-					_, err := encoder.PutU64(uint64(memSegment.Locfields[postingID][locOffset]))
+					err := locEncoder.Add(docNum, uint64(memSegment.Locfields[postingID][locOffset]))
 					if err != nil {
 						return nil, nil, err
 					}
 
 					// put pos
-					_, err = encoder.PutU64(memSegment.Locpos[postingID][locOffset])
+					err = locEncoder.Add(docNum, memSegment.Locpos[postingID][locOffset])
 					if err != nil {
 						return nil, nil, err
 					}
 
 					// put start
-					_, err = encoder.PutU64(memSegment.Locstarts[postingID][locOffset])
+					err = locEncoder.Add(docNum, memSegment.Locstarts[postingID][locOffset])
 					if err != nil {
 						return nil, nil, err
 					}
 
 					// put end
-					_, err = encoder.PutU64(memSegment.Locends[postingID][locOffset])
+					err = locEncoder.Add(docNum, memSegment.Locends[postingID][locOffset])
 					if err != nil {
 						return nil, nil, err
 					}
@@ -363,58 +302,31 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
 					num := len(memSegment.Locarraypos[postingID][locOffset])
 
 					// put the number of array positions to follow
-					_, err = encoder.PutU64(uint64(num))
+					err = locEncoder.Add(docNum, uint64(num))
 					if err != nil {
 						return nil, nil, err
 					}
 
 					// put each array position
 					for j := 0; j < num; j++ {
-						_, err = encoder.PutU64(memSegment.Locarraypos[postingID][locOffset][j])
+						err = locEncoder.Add(docNum, memSegment.Locarraypos[postingID][locOffset][j])
 						if err != nil {
 							return nil, nil, err
 						}
 					}
 				}
 				locOffset++
 			}
 			offset++
 		}
 
-		// close out last chunk
-		if encoder != nil {
-			// fix me write freq/norms
-			encoder.Close()
-			encodingBytes := encodingBuf.Bytes()
-			chunkLens[currChunk] = uint64(len(encodingBytes))
-			locBuf = append(locBuf, encodingBytes...)
-		}
-
 		// record where this postings loc info starts
 		locOfffsets = append(locOfffsets, uint64(w.Count()))
 
-		buf := make([]byte, binary.MaxVarintLen64)
-		// write out the number of chunks
-		n := binary.PutUvarint(buf, uint64(total))
-		_, err := w.Write(buf[:n])
+		locEncoder.Close()
+		_, err := locEncoder.Write(w)
 		if err != nil {
 			return nil, nil, err
 		}
-		// write out the chunk lens
-		for _, chunkLen := range chunkLens {
-			n := binary.PutUvarint(buf, uint64(chunkLen))
-			_, err = w.Write(buf[:n])
-			if err != nil {
-				return nil, nil, err
-			}
-		}
-		// write out the data
-		_, err = w.Write(locBuf)
-		if err != nil {
-			return nil, nil, err
-		}
 	}
 	return freqOffsets, locOfffsets, nil
 }
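Both halves of this diff lean on a chunkedIntCoder type that the commit does not
show here. As a reading aid only, below is a minimal Go sketch consistent with
the call sites above (newChunkedIntCoder(chunkSize, maxDocNum), Reset, Add,
Close, Write). The field names and the closeCurrChunk helper are assumptions,
and it uses encoding/binary varints where the removed inline code used govarint;
it is not the actual zap implementation.

package main

import (
	"bytes"
	"encoding/binary"
	"io"
)

// chunkedIntCoder (sketch): varint-encodes values grouped into chunks of
// chunkSize documents, recording the encoded length of each chunk so a
// reader can later seek straight to the chunk containing a given docNum.
type chunkedIntCoder struct {
	chunkSize uint64       // docs per chunk (the chunkFactor)
	chunkLens []uint64     // encoded byte length of each chunk
	currChunk uint64       // chunk the current docNum falls into
	chunkBuf  bytes.Buffer // varint-encoded values for the in-progress chunk
	final     []byte       // closed-out chunks, concatenated
}

// newChunkedIntCoder sizes the chunk-length table so that maxDocNum (the
// highest document number that will be Added) fits, mirroring the removed
// "total := ... / chunkFactor + 1" arithmetic above.
func newChunkedIntCoder(chunkSize, maxDocNum uint64) *chunkedIntCoder {
	return &chunkedIntCoder{
		chunkSize: chunkSize,
		chunkLens: make([]uint64, maxDocNum/chunkSize+1),
	}
}

// Reset prepares the coder for reuse by the next postings list.
func (c *chunkedIntCoder) Reset() {
	c.final = c.final[:0]
	c.chunkBuf.Reset()
	c.currChunk = 0
	for i := range c.chunkLens {
		c.chunkLens[i] = 0
	}
}

// Add varint-encodes vals for docNum, first closing out the previous chunk
// whenever docNum crosses a chunk boundary (docNums must arrive in order).
func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
	chunk := docNum / c.chunkSize
	if chunk != c.currChunk {
		c.closeCurrChunk()
		c.currChunk = chunk
	}
	buf := make([]byte, binary.MaxVarintLen64)
	for _, v := range vals {
		n := binary.PutUvarint(buf, v)
		if _, err := c.chunkBuf.Write(buf[:n]); err != nil {
			return err
		}
	}
	return nil
}

func (c *chunkedIntCoder) closeCurrChunk() {
	encoded := c.chunkBuf.Bytes()
	c.chunkLens[c.currChunk] = uint64(len(encoded))
	c.final = append(c.final, encoded...)
	c.chunkBuf.Reset()
}

// Close flushes the in-progress chunk.
func (c *chunkedIntCoder) Close() {
	c.closeCurrChunk()
}

// Write emits the number of chunks, each chunk's length, then the chunk
// data: the same three-part layout the removed inline code wrote by hand.
func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
	written := 0
	buf := make([]byte, binary.MaxVarintLen64)
	n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
	nw, err := w.Write(buf[:n])
	written += nw
	if err != nil {
		return written, err
	}
	for _, chunkLen := range c.chunkLens {
		n = binary.PutUvarint(buf, chunkLen)
		nw, err = w.Write(buf[:n])
		written += nw
		if err != nil {
			return written, err
		}
	}
	nw, err = w.Write(c.final)
	return written + nw, err
}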

index/scorch/segment/zap/merge.go

@@ -39,7 +39,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
 	var newDocNums [][]uint64
 	var storedIndexOffset uint64
-	dictLocs := make([]uint64, len(fieldsInv))
+	var dictLocs []uint64
 	if newSegDocCount > 0 {
 		storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
 			fieldsMap, fieldsInv, newSegDocCount, cr)
@@ -48,10 +48,12 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
 		}
 
 		dictLocs, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap,
-			newDocNums, newSegDocCount, cr)
+			newDocNums, newSegDocCount, chunkFactor, cr)
 		if err != nil {
 			return nil, err
 		}
+	} else {
+		dictLocs = make([]uint64, len(fieldsInv))
 	}
 
 	var fieldsIndexOffset uint64
@@ -108,7 +110,8 @@ func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 {
 }
 
 func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
-	fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64, newSegDocCount uint64,
+	fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64,
+	newSegDocCount uint64, chunkFactor uint32,
 	w *CountHashWriter) ([]uint64, error) {
 
 	rv := make([]uint64, len(fieldsInv))
@@ -149,8 +152,8 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
 			return 0
 		})
 
-	tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1)
-	locEncoder := newChunkedIntCoder(1024, newSegDocCount-1)
+	tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
+	locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
 
 	for err == nil {
 		term, _ := mergeItr.Current()
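To round out the sketch after build.go, here is a tiny usage example; the two
snippets together form one complete program. The chunk size 1024 matches the
constant this hunk replaces, while the document numbers and the freq value 42
are made up for illustration.

package main

import (
	"bytes"
	"fmt"
)

func main() {
	// 2999 plays the role of newSegDocCount-1 for a hypothetical
	// 3000-doc merged segment, so values land in 3 chunks:
	// docs 0-1023, 1024-2047, and 2048-2999.
	enc := newChunkedIntCoder(1024, 2999)
	for _, docNum := range []uint64{5, 1500, 2500} {
		if err := enc.Add(docNum, 42); err != nil { // 42: made-up freq
			panic(err)
		}
	}
	enc.Close()
	var buf bytes.Buffer
	if _, err := enc.Write(&buf); err != nil {
		panic(err)
	}
	fmt.Printf("wrote %d bytes: chunk count, 3 chunk lengths, then data\n", buf.Len())
}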