
Merge pull request #743 from steveyen/master

zap-based in-memory segment impl & various merge optimizations
Steve Yen 2018-01-29 09:22:12 -08:00 committed by GitHub
commit 5d1a2b0ad7
15 changed files with 536 additions and 380 deletions
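
The gist of the change, pieced together from the hunks below: a batch is still analyzed into a mem.Segment, but that segment is converted right away into a zap.SegmentBase, which holds the zap-encoded bytes in memory, so the persister can later flush it with a plain copy plus footer instead of re-encoding. A rough sketch of that flow in the scorch package context; the helper name indexAndPersist and its arguments are placeholders, not part of the patch:

// Sketch only: mirrors the new Batch/persister flow shown in the hunks below.
func indexAndPersist(analysisResults []*index.AnalysisResult, path string) error {
	memSeg := mem.NewFromAnalyzedDocs(analysisResults)        // in-memory build
	sb, err := zap.NewSegmentBase(memSeg, DefaultChunkFactor) // zap-encoded, still in memory
	if err != nil {
		return err
	}
	// The SegmentBase is searchable immediately; persisting it later is
	// just writing the already-encoded bytes plus a footer.
	return zap.PersistSegmentBase(sb, path)
}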

View File

@ -54,7 +54,6 @@ OUTER:
lastEpochMergePlanned = ourSnapshot.epoch lastEpochMergePlanned = ourSnapshot.epoch
s.fireEvent(EventKindMergerProgress, time.Since(startTime)) s.fireEvent(EventKindMergerProgress, time.Since(startTime))
} }
_ = ourSnapshot.DecRef() _ = ourSnapshot.DecRef()
@ -81,6 +80,7 @@ OUTER:
// lets get started // lets get started
err := s.planMergeAtSnapshot(ourSnapshot) err := s.planMergeAtSnapshot(ourSnapshot)
if err != nil { if err != nil {
s.fireAsyncError(fmt.Errorf("merging err: %v", err))
_ = ourSnapshot.DecRef() _ = ourSnapshot.DecRef()
continue OUTER continue OUTER
} }
@ -141,7 +141,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error {
filename := zapFileName(newSegmentID) filename := zapFileName(newSegmentID)
s.markIneligibleForRemoval(filename) s.markIneligibleForRemoval(filename)
path := s.path + string(os.PathSeparator) + filename path := s.path + string(os.PathSeparator) + filename
newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, DefaultChunkFactor)
if err != nil { if err != nil {
s.unmarkIneligibleForRemoval(filename) s.unmarkIneligibleForRemoval(filename)
return fmt.Errorf("merging failed: %v", err) return fmt.Errorf("merging failed: %v", err)

View File

@ -28,11 +28,12 @@ import (
"github.com/RoaringBitmap/roaring" "github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/mem"
"github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/boltdb/bolt" "github.com/boltdb/bolt"
) )
var DefaultChunkFactor uint32 = 1024
type notificationChan chan struct{} type notificationChan chan struct{}
func (s *Scorch) persisterLoop() { func (s *Scorch) persisterLoop() {
@ -178,11 +179,11 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error {
return err2 return err2
} }
switch seg := segmentSnapshot.segment.(type) { switch seg := segmentSnapshot.segment.(type) {
case *mem.Segment: case *zap.SegmentBase:
// need to persist this to disk // need to persist this to disk
filename := zapFileName(segmentSnapshot.id) filename := zapFileName(segmentSnapshot.id)
path := s.path + string(os.PathSeparator) + filename path := s.path + string(os.PathSeparator) + filename
err2 := zap.PersistSegment(seg, path, 1024) err2 := zap.PersistSegmentBase(seg, path)
if err2 != nil { if err2 != nil {
return fmt.Errorf("error persisting segment: %v", err2) return fmt.Errorf("error persisting segment: %v", err2)
} }

View File

@ -28,6 +28,7 @@ import (
"github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment"
"github.com/blevesearch/bleve/index/scorch/segment/mem" "github.com/blevesearch/bleve/index/scorch/segment/mem"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/registry"
"github.com/boltdb/bolt" "github.com/boltdb/bolt"
@ -217,7 +218,7 @@ func (s *Scorch) Delete(id string) error {
} }
// Batch applies a batch of changes to the index atomically // Batch applies a batch of changes to the index atomically
func (s *Scorch) Batch(batch *index.Batch) error { func (s *Scorch) Batch(batch *index.Batch) (err error) {
start := time.Now() start := time.Now()
defer func() { defer func() {
@ -271,10 +272,13 @@ func (s *Scorch) Batch(batch *index.Batch) error {
var newSegment segment.Segment var newSegment segment.Segment
if len(analysisResults) > 0 { if len(analysisResults) > 0 {
newSegment = mem.NewFromAnalyzedDocs(analysisResults) newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), DefaultChunkFactor)
if err != nil {
return err
}
} }
err := s.prepareSegment(newSegment, ids, batch.InternalOps) err = s.prepareSegment(newSegment, ids, batch.InternalOps)
if err != nil { if err != nil {
if newSegment != nil { if newSegment != nil {
_ = newSegment.Close() _ = newSegment.Close()

View File

@ -1395,7 +1395,7 @@ func TestConcurrentUpdate(t *testing.T) {
// do some concurrent updates // do some concurrent updates
var wg sync.WaitGroup var wg sync.WaitGroup
for i := 0; i < 10; i++ { for i := 0; i < 100; i++ {
wg.Add(1) wg.Add(1)
go func(i int) { go func(i int) {
doc := document.NewDocument("1") doc := document.NewDocument("1")

View File

@ -267,15 +267,15 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
} }
func (s *Segment) getOrDefineField(name string) int { func (s *Segment) getOrDefineField(name string) int {
fieldID, ok := s.FieldsMap[name] fieldIDPlus1, ok := s.FieldsMap[name]
if !ok { if !ok {
fieldID = uint16(len(s.FieldsInv) + 1) fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
s.FieldsMap[name] = fieldID s.FieldsMap[name] = fieldIDPlus1
s.FieldsInv = append(s.FieldsInv, name) s.FieldsInv = append(s.FieldsInv, name)
s.Dicts = append(s.Dicts, make(map[string]uint64)) s.Dicts = append(s.Dicts, make(map[string]uint64))
s.DictKeys = append(s.DictKeys, make([]string, 0)) s.DictKeys = append(s.DictKeys, make([]string, 0))
} }
return int(fieldID - 1) return int(fieldIDPlus1 - 1)
} }
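
The rename to fieldIDPlus1 spells out the existing convention: FieldsMap stores field id + 1 so the map's zero value can mean "not defined yet". A small standalone illustration of that convention (not part of the patch):

package main

import "fmt"

func main() {
	fieldsMap := map[string]uint16{} // name -> field id + 1
	fieldsInv := []string{}          // field id -> name

	define := func(name string) int {
		fieldIDPlus1, ok := fieldsMap[name]
		if !ok {
			fieldIDPlus1 = uint16(len(fieldsInv) + 1)
			fieldsMap[name] = fieldIDPlus1
			fieldsInv = append(fieldsInv, name)
		}
		return int(fieldIDPlus1 - 1)
	}

	fmt.Println(define("_id"))   // 0
	fmt.Println(define("title")) // 1
	fmt.Println(define("_id"))   // 0 again: found in the map, not redefined
}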
func (s *Segment) addDocument() int { func (s *Segment) addDocument() int {

View File

@ -40,35 +40,38 @@ const idFieldID uint16 = 0
// Segment is an in memory implementation of scorch.Segment // Segment is an in memory implementation of scorch.Segment
type Segment struct { type Segment struct {
// FieldsMap name -> id+1 // FieldsMap adds 1 to field id to avoid zero value issues
// name -> field id + 1
FieldsMap map[string]uint16 FieldsMap map[string]uint16
// fields id -> name
// FieldsInv is the inverse of FieldsMap
// field id -> name
FieldsInv []string FieldsInv []string
// term dictionary // Term dictionaries for each field
// field id -> term -> postings list id + 1 // field id -> term -> postings list id + 1
Dicts []map[string]uint64 Dicts []map[string]uint64
// term dictionary keys // Terms for each field, where terms are sorted ascending
// field id -> []dictionary keys // field id -> []term
DictKeys [][]string DictKeys [][]string
// Postings list // Postings list
// postings list id -> Postings bitmap // postings list id -> bitmap by docNum
Postings []*roaring.Bitmap Postings []*roaring.Bitmap
// Postings List has locations // Postings list has locations
PostingsLocs []*roaring.Bitmap PostingsLocs []*roaring.Bitmap
// term frequencies // Term frequencies
// postings list id -> Freqs (one for each hit in bitmap) // postings list id -> Freqs (one for each hit in bitmap)
Freqs [][]uint64 Freqs [][]uint64
// field Norms // Field norms
// postings list id -> Norms (one for each hit in bitmap) // postings list id -> Norms (one for each hit in bitmap)
Norms [][]float32 Norms [][]float32
// field/start/end/pos/locarraypos // Field/start/end/pos/locarraypos
// postings list id -> start/end/pos/locarraypos (one for each freq) // postings list id -> start/end/pos/locarraypos (one for each freq)
Locfields [][]uint16 Locfields [][]uint16
Locstarts [][]uint64 Locstarts [][]uint64
@ -80,18 +83,18 @@ type Segment struct {
// docNum -> field id -> slice of values (each value []byte) // docNum -> field id -> slice of values (each value []byte)
Stored []map[uint16][][]byte Stored []map[uint16][][]byte
// stored field types // Stored field types
// docNum -> field id -> slice of types (each type byte) // docNum -> field id -> slice of types (each type byte)
StoredTypes []map[uint16][]byte StoredTypes []map[uint16][]byte
// stored field array positions // Stored field array positions
// docNum -> field id -> slice of array positions (each is []uint64) // docNum -> field id -> slice of array positions (each is []uint64)
StoredPos []map[uint16][][]uint64 StoredPos []map[uint16][][]uint64
// for storing the docValue persisted fields // For storing the docValue persisted fields
DocValueFields map[uint16]bool DocValueFields map[uint16]bool
// footprint of the segment, updated when analyzed document mutations // Footprint of the segment, updated when analyzed document mutations
// are added into the segment // are added into the segment
sizeInBytes uint64 sizeInBytes uint64
} }

View File

@ -32,10 +32,8 @@ const version uint32 = 2
const fieldNotUninverted = math.MaxUint64 const fieldNotUninverted = math.MaxUint64
// PersistSegment takes the in-memory segment and persists it to the specified // PersistSegmentBase persists SegmentBase in the zap file format.
// path in the zap file format. func PersistSegmentBase(sb *SegmentBase, path string) error {
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (err error) {
flag := os.O_RDWR | os.O_CREATE flag := os.O_RDWR | os.O_CREATE
f, err := os.OpenFile(path, flag, 0600) f, err := os.OpenFile(path, flag, 0600)
@ -43,84 +41,151 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e
return err return err
} }
// bufer the output cleanup := func() {
br := bufio.NewWriter(f) _ = f.Close()
_ = os.Remove(path)
// wrap it for counting (tracking offsets)
cr := NewCountHashWriter(br)
var storedIndexOffset uint64
var dictLocs []uint64
docValueOffset := uint64(fieldNotUninverted)
if len(memSegment.Stored) > 0 {
storedIndexOffset, err = persistStored(memSegment, cr)
if err != nil {
return err
}
var freqOffsets, locOffsets []uint64
freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor)
if err != nil {
return err
}
var postingsListLocs []uint64
postingsListLocs, err = persistPostingsLocs(memSegment, cr)
if err != nil {
return err
}
var postingsLocs []uint64
postingsLocs, err = persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
if err != nil {
return err
}
dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
if err != nil {
return err
}
docValueOffset, err = persistFieldDocValues(cr, chunkFactor, memSegment)
if err != nil {
return err
}
} else {
dictLocs = make([]uint64, len(memSegment.FieldsInv))
} }
var fieldIndexStart uint64 br := bufio.NewWriter(f)
fieldIndexStart, err = persistFields(memSegment.FieldsInv, cr, dictLocs)
_, err = br.Write(sb.mem)
if err != nil { if err != nil {
cleanup()
return err return err
} }
err = persistFooter(uint64(len(memSegment.Stored)), storedIndexOffset, err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.docValueOffset,
fieldIndexStart, docValueOffset, chunkFactor, cr) sb.chunkFactor, sb.memCRC, br)
if err != nil { if err != nil {
cleanup()
return err return err
} }
err = br.Flush() err = br.Flush()
if err != nil { if err != nil {
cleanup()
return err return err
} }
err = f.Sync() err = f.Sync()
if err != nil { if err != nil {
cleanup()
return err return err
} }
err = f.Close() err = f.Close()
if err != nil { if err != nil {
cleanup()
return err return err
} }
return nil return nil
} }
// PersistSegment takes the in-memory segment and persists it to
// the specified path in the zap file format.
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error {
flag := os.O_RDWR | os.O_CREATE
f, err := os.OpenFile(path, flag, 0600)
if err != nil {
return err
}
cleanup := func() {
_ = f.Close()
_ = os.Remove(path)
}
// buffer the output
br := bufio.NewWriter(f)
// wrap it for counting (tracking offsets)
cr := NewCountHashWriter(br)
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err :=
persistBase(memSegment, cr, chunkFactor)
if err != nil {
cleanup()
return err
}
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset,
chunkFactor, cr.Sum32(), cr)
if err != nil {
cleanup()
return err
}
err = br.Flush()
if err != nil {
cleanup()
return err
}
err = f.Sync()
if err != nil {
cleanup()
return err
}
err = f.Close()
if err != nil {
cleanup()
return err
}
return nil
}
func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) (
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
dictLocs []uint64, err error) {
docValueOffset = uint64(fieldNotUninverted)
if len(memSegment.Stored) > 0 {
storedIndexOffset, err = persistStored(memSegment, cr)
if err != nil {
return 0, 0, 0, 0, nil, err
}
freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor)
if err != nil {
return 0, 0, 0, 0, nil, err
}
postingsListLocs, err := persistPostingsLocs(memSegment, cr)
if err != nil {
return 0, 0, 0, 0, nil, err
}
postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
if err != nil {
return 0, 0, 0, 0, nil, err
}
dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
if err != nil {
return 0, 0, 0, 0, nil, err
}
docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor)
if err != nil {
return 0, 0, 0, 0, nil, err
}
} else {
dictLocs = make([]uint64, len(memSegment.FieldsInv))
}
fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs)
if err != nil {
return 0, 0, 0, 0, nil, err
}
return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset,
dictLocs, nil
}
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) { func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) {
var curr int var curr int
@ -356,11 +421,13 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
rv = make([]uint64, 0, len(memSegment.PostingsLocs)) rv = make([]uint64, 0, len(memSegment.PostingsLocs))
var reuseBuf bytes.Buffer
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
for postingID := range memSegment.PostingsLocs { for postingID := range memSegment.PostingsLocs {
// record where we start this posting loc // record where we start this posting loc
rv = append(rv, uint64(w.Count())) rv = append(rv, uint64(w.Count()))
// write out the length and bitmap // write out the length and bitmap
_, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w) _, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -371,6 +438,8 @@ func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) { postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
rv = make([]uint64, 0, len(memSegment.Postings)) rv = make([]uint64, 0, len(memSegment.Postings))
var reuseBuf bytes.Buffer
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
for postingID := range memSegment.Postings { for postingID := range memSegment.Postings {
// record where we start this posting list // record where we start this posting list
rv = append(rv, uint64(w.Count())) rv = append(rv, uint64(w.Count()))
@ -383,7 +452,7 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
} }
// write out the length and bitmap // write out the length and bitmap
_, err = writeRoaringWithLen(memSegment.Postings[postingID], w) _, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -394,6 +463,8 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) { func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
rv := make([]uint64, 0, len(memSegment.DictKeys)) rv := make([]uint64, 0, len(memSegment.DictKeys))
varintBuf := make([]byte, binary.MaxVarintLen64)
var buffer bytes.Buffer var buffer bytes.Buffer
for fieldID, fieldTerms := range memSegment.DictKeys { for fieldID, fieldTerms := range memSegment.DictKeys {
if fieldID != 0 { if fieldID != 0 {
@ -427,10 +498,8 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs
vellumData := buffer.Bytes() vellumData := buffer.Bytes()
// write out the length of the vellum data // write out the length of the vellum data
buf := make([]byte, binary.MaxVarintLen64) n := binary.PutUvarint(varintBuf, uint64(len(vellumData)))
// write out the number of chunks _, err = w.Write(varintBuf[:n])
n := binary.PutUvarint(buf, uint64(len(vellumData)))
_, err = w.Write(buf[:n])
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -521,9 +590,8 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter,
return fieldChunkOffsets, nil return fieldChunkOffsets, nil
} }
func persistFieldDocValues(w *CountHashWriter, chunkFactor uint32, func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter,
memSegment *mem.Segment) (uint64, error) { chunkFactor uint32) (uint64, error) {
fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor) fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor)
if err != nil { if err != nil {
return 0, err return 0, err
@ -548,3 +616,36 @@ func persistFieldDocValues(w *CountHashWriter, chunkFactor uint32,
return fieldDocValuesOffset, nil return fieldDocValuesOffset, nil
} }
func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) {
var br bytes.Buffer
cr := NewCountHashWriter(&br)
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err :=
persistBase(memSegment, cr, chunkFactor)
if err != nil {
return nil, err
}
sb := &SegmentBase{
mem: br.Bytes(),
memCRC: cr.Sum32(),
chunkFactor: chunkFactor,
fieldsMap: memSegment.FieldsMap,
fieldsInv: memSegment.FieldsInv,
numDocs: numDocs,
storedIndexOffset: storedIndexOffset,
fieldsIndexOffset: fieldsIndexOffset,
docValueOffset: docValueOffset,
dictLocs: dictLocs,
fieldDvIterMap: make(map[uint16]*docValueIterator),
}
err = sb.loadDvIterators()
if err != nil {
return nil, err
}
return sb, nil
}
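
With persistBase factored out, the two entry points should produce byte-identical files for the same input: PersistSegment streams the encoding straight to the file, while NewSegmentBase captures the same bytes and their CRC in memory so PersistSegmentBase can copy them out later and append the footer. A sketch of that equivalence; the helper name and paths are hypothetical:

// Sketch only.
func persistBothWays(memSegment *mem.Segment, chunkFactor uint32) error {
	// Path 1: encode straight to a file (the pre-existing entry point).
	if err := PersistSegment(memSegment, "/tmp/a.zap", chunkFactor); err != nil {
		return err
	}
	// Path 2: encode into memory now, flush to disk later.
	sb, err := NewSegmentBase(memSegment, chunkFactor)
	if err != nil {
		return err
	}
	// Both files should hold identical bytes, since both routes go through
	// persistBase and persistFooter with the same CRC.
	return PersistSegmentBase(sb, "/tmp/b.zap")
}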

View File

@ -15,32 +15,28 @@
package zap package zap
import ( import (
"hash"
"hash/crc32" "hash/crc32"
"io" "io"
) )
// CountHashWriter is a wrapper around a Writer which counts the number of // CountHashWriter is a wrapper around a Writer which counts the number of
// bytes which have been written // bytes which have been written and computes a crc32 hash
type CountHashWriter struct { type CountHashWriter struct {
w io.Writer w io.Writer
h hash.Hash32 crc uint32
n int n int
} }
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer // NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
func NewCountHashWriter(w io.Writer) *CountHashWriter { func NewCountHashWriter(w io.Writer) *CountHashWriter {
return &CountHashWriter{ return &CountHashWriter{w: w}
w: w,
h: crc32.NewIEEE(),
}
} }
// Write writes the provided bytes to the wrapped writer and counts the bytes // Write writes the provided bytes to the wrapped writer and counts the bytes
func (c *CountHashWriter) Write(b []byte) (int, error) { func (c *CountHashWriter) Write(b []byte) (int, error) {
n, err := c.w.Write(b) n, err := c.w.Write(b)
c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n])
c.n += n c.n += n
_, _ = c.h.Write(b)
return n, err return n, err
} }
@ -51,5 +47,5 @@ func (c *CountHashWriter) Count() int {
// Sum32 returns the CRC-32 hash of the content written to this writer // Sum32 returns the CRC-32 hash of the content written to this writer
func (c *CountHashWriter) Sum32() uint32 { func (c *CountHashWriter) Sum32() uint32 {
return c.h.Sum32() return c.crc
} }
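
Replacing the hash.Hash32 with a bare uint32 and crc32.Update drops the interface indirection and the per-writer allocation while producing the same checksum, since folding chunks into a running CRC is equivalent to hashing the concatenated bytes in one shot. A quick standalone check of that equivalence:

package main

import (
	"fmt"
	"hash/crc32"
)

func main() {
	data := []byte("zap segment bytes")

	// Streaming, as CountHashWriter now does on every Write call.
	var crc uint32
	crc = crc32.Update(crc, crc32.IEEETable, data[:7])
	crc = crc32.Update(crc, crc32.IEEETable, data[7:])

	// One-shot over the whole buffer.
	fmt.Println(crc == crc32.ChecksumIEEE(data)) // true
}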

View File

@ -27,7 +27,7 @@ import (
// Dictionary is the zap representation of the term dictionary // Dictionary is the zap representation of the term dictionary
type Dictionary struct { type Dictionary struct {
segment *Segment sb *SegmentBase
field string field string
fieldID uint16 fieldID uint16
fst *vellum.FST fst *vellum.FST
@ -35,18 +35,18 @@ type Dictionary struct {
// PostingsList returns the postings list for the specified term // PostingsList returns the postings list for the specified term
func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
return d.postingsList(term, except) return d.postingsList([]byte(term), except)
} }
func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*PostingsList, error) { func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap) (*PostingsList, error) {
rv := &PostingsList{ rv := &PostingsList{
dictionary: d, sb: d.sb,
term: term, term: term,
except: except, except: except,
} }
if d.fst != nil { if d.fst != nil {
postingsOffset, exists, err := d.fst.Get([]byte(term)) postingsOffset, exists, err := d.fst.Get(term)
if err != nil { if err != nil {
return nil, fmt.Errorf("vellum err: %v", err) return nil, fmt.Errorf("vellum err: %v", err)
} }
@ -56,19 +56,19 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting
var n uint64 var n uint64
var read int var read int
rv.freqOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
rv.locOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
var locBitmapOffset uint64 var locBitmapOffset uint64
locBitmapOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
// go ahead and load loc bitmap // go ahead and load loc bitmap
var locBitmapLen uint64 var locBitmapLen uint64
locBitmapLen, read = binary.Uvarint(d.segment.mm[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64])
locRoaringBytes := d.segment.mm[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen]
rv.locBitmap = roaring.NewBitmap() rv.locBitmap = roaring.NewBitmap()
_, err := rv.locBitmap.FromBuffer(locRoaringBytes) _, err := rv.locBitmap.FromBuffer(locRoaringBytes)
if err != nil { if err != nil {
@ -76,10 +76,10 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting
} }
var postingsLen uint64 var postingsLen uint64
postingsLen, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
roaringBytes := d.segment.mm[postingsOffset+n : postingsOffset+n+postingsLen] roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen]
bitmap := roaring.NewBitmap() bitmap := roaring.NewBitmap()
_, err = bitmap.FromBuffer(roaringBytes) _, err = bitmap.FromBuffer(roaringBytes)
@ -96,7 +96,6 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting
// Iterator returns an iterator for this dictionary // Iterator returns an iterator for this dictionary
func (d *Dictionary) Iterator() segment.DictionaryIterator { func (d *Dictionary) Iterator() segment.DictionaryIterator {
rv := &DictionaryIterator{ rv := &DictionaryIterator{
d: d, d: d,
} }
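
Reading everything through SegmentBase.mem rather than Dictionary.segment.mm lets the same dictionary code serve both mmap'ed segments and the new in-memory ones, and keeping the term as []byte internally avoids a string conversion per term during merges. A hedged sketch of reading one term's postings through this API; the helper dumpTerm is hypothetical, and the Posting getters are the ones used by merge.go below:

// Sketch only, given a *Dictionary d for some field.
func dumpTerm(d *Dictionary, term string) error {
	postings, err := d.PostingsList(term, nil) // nil: exclude no documents
	if err != nil {
		return err
	}
	itr := postings.Iterator()
	for {
		next, err := itr.Next()
		if err != nil {
			return err
		}
		if next == nil {
			break
		}
		fmt.Println(next.Number(), next.Frequency(), next.Norm())
	}
	return nil
}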

View File

@ -61,17 +61,17 @@ func (di *docValueIterator) curChunkNumber() uint64 {
return di.curChunkNum return di.curChunkNum
} }
func (s *Segment) loadFieldDocValueIterator(field string, func (s *SegmentBase) loadFieldDocValueIterator(field string,
fieldDvLoc uint64) (*docValueIterator, error) { fieldDvLoc uint64) (*docValueIterator, error) {
// get the docValue offset for the given fields // get the docValue offset for the given fields
if fieldDvLoc == fieldNotUninverted { if fieldDvLoc == fieldNotUninverted {
return nil, fmt.Errorf("loadFieldDocValueConfigs: "+ return nil, fmt.Errorf("loadFieldDocValueIterator: "+
"no docValues found for field: %s", field) "no docValues found for field: %s", field)
} }
// read the number of chunks, chunk lengths // read the number of chunks, chunk lengths
var offset, clen uint64 var offset, clen uint64
numChunks, read := binary.Uvarint(s.mm[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64])
if read <= 0 { if read <= 0 {
return nil, fmt.Errorf("failed to read the field "+ return nil, fmt.Errorf("failed to read the field "+
"doc values for field %s", field) "doc values for field %s", field)
@ -84,7 +84,7 @@ func (s *Segment) loadFieldDocValueIterator(field string,
chunkLens: make([]uint64, int(numChunks)), chunkLens: make([]uint64, int(numChunks)),
} }
for i := 0; i < int(numChunks); i++ { for i := 0; i < int(numChunks); i++ {
clen, read = binary.Uvarint(s.mm[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64])
if read <= 0 { if read <= 0 {
return nil, fmt.Errorf("corrupted chunk length during segment load") return nil, fmt.Errorf("corrupted chunk length during segment load")
} }
@ -97,7 +97,7 @@ func (s *Segment) loadFieldDocValueIterator(field string,
} }
func (di *docValueIterator) loadDvChunk(chunkNumber, func (di *docValueIterator) loadDvChunk(chunkNumber,
localDocNum uint64, s *Segment) error { localDocNum uint64, s *SegmentBase) error {
// advance to the chunk where the docValues // advance to the chunk where the docValues
// reside for the given docID // reside for the given docID
destChunkDataLoc := di.dvDataLoc destChunkDataLoc := di.dvDataLoc
@ -107,7 +107,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
curChunkSize := di.chunkLens[chunkNumber] curChunkSize := di.chunkLens[chunkNumber]
// read the number of docs residing in the chunk // read the number of docs residing in the chunk
numDocs, read := binary.Uvarint(s.mm[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
if read <= 0 { if read <= 0 {
return fmt.Errorf("failed to read the chunk") return fmt.Errorf("failed to read the chunk")
} }
@ -116,17 +116,17 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
offset := uint64(0) offset := uint64(0)
di.curChunkHeader = make([]MetaData, int(numDocs)) di.curChunkHeader = make([]MetaData, int(numDocs))
for i := 0; i < int(numDocs); i++ { for i := 0; i < int(numDocs); i++ {
di.curChunkHeader[i].DocID, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) di.curChunkHeader[i].DocID, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read) offset += uint64(read)
di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read) offset += uint64(read)
di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
offset += uint64(read) offset += uint64(read)
} }
compressedDataLoc := chunkMetaLoc + offset compressedDataLoc := chunkMetaLoc + offset
dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc
di.curChunkData = s.mm[compressedDataLoc : compressedDataLoc+dataLength] di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
di.curChunkNum = chunkNumber di.curChunkNum = chunkNumber
return nil return nil
} }
@ -171,18 +171,18 @@ func (di *docValueIterator) getDocValueLocs(docID uint64) (uint64, uint64) {
// VisitDocumentFieldTerms is an implementation of the // VisitDocumentFieldTerms is an implementation of the
// DocumentFieldTermVisitable interface // DocumentFieldTermVisitable interface
func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
visitor index.DocumentFieldTermVisitor) error { visitor index.DocumentFieldTermVisitor) error {
fieldID := uint16(0) fieldIDPlus1 := uint16(0)
ok := true ok := true
for _, field := range fields { for _, field := range fields {
if fieldID, ok = s.fieldsMap[field]; !ok { if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
continue continue
} }
// find the chunkNumber where the docValues are stored // find the chunkNumber where the docValues are stored
docInChunk := localDocNum / uint64(s.chunkFactor) docInChunk := localDocNum / uint64(s.chunkFactor)
if dvIter, exists := s.fieldDvIterMap[fieldID-1]; exists && if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists &&
dvIter != nil { dvIter != nil {
// check if the chunk is already loaded // check if the chunk is already loaded
if docInChunk != dvIter.curChunkNumber() { if docInChunk != dvIter.curChunkNumber() {

View File

@ -29,10 +29,10 @@ import (
"github.com/golang/snappy" "github.com/golang/snappy"
) )
// Merge takes a slice of zap segments, bit masks describing which documents // Merge takes a slice of zap segments and bit masks describing which
// from the may be dropped, and creates a new segment containing the remaining // documents may be dropped, and creates a new segment containing the
// data. This new segment is built at the specified path, with the provided // remaining data. This new segment is built at the specified path,
// chunkFactor. // with the provided chunkFactor.
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
chunkFactor uint32) ([][]uint64, error) { chunkFactor uint32) ([][]uint64, error) {
flag := os.O_RDWR | os.O_CREATE flag := os.O_RDWR | os.O_CREATE
@ -42,6 +42,11 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
return nil, err return nil, err
} }
cleanup := func() {
_ = f.Close()
_ = os.Remove(path)
}
// buffer the output // buffer the output
br := bufio.NewWriter(f) br := bufio.NewWriter(f)
@ -50,52 +55,59 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
fieldsInv := mergeFields(segments) fieldsInv := mergeFields(segments)
fieldsMap := mapFields(fieldsInv) fieldsMap := mapFields(fieldsInv)
newSegDocCount := computeNewDocCount(segments, drops)
var newDocNums [][]uint64 var newDocNums [][]uint64
var storedIndexOffset uint64 var storedIndexOffset uint64
fieldDvLocsOffset := uint64(fieldNotUninverted) fieldDvLocsOffset := uint64(fieldNotUninverted)
var dictLocs []uint64 var dictLocs []uint64
newSegDocCount := computeNewDocCount(segments, drops)
if newSegDocCount > 0 { if newSegDocCount > 0 {
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
fieldsMap, fieldsInv, newSegDocCount, cr) fieldsMap, fieldsInv, newSegDocCount, cr)
if err != nil { if err != nil {
cleanup()
return nil, err return nil, err
} }
dictLocs, fieldDvLocsOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, dictLocs, fieldDvLocsOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap,
newDocNums, newSegDocCount, chunkFactor, cr) newDocNums, newSegDocCount, chunkFactor, cr)
if err != nil { if err != nil {
cleanup()
return nil, err return nil, err
} }
} else { } else {
dictLocs = make([]uint64, len(fieldsInv)) dictLocs = make([]uint64, len(fieldsInv))
} }
var fieldsIndexOffset uint64 fieldsIndexOffset, err := persistFields(fieldsInv, cr, dictLocs)
fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs)
if err != nil { if err != nil {
cleanup()
return nil, err return nil, err
} }
err = persistFooter(newSegDocCount, storedIndexOffset, err = persistFooter(newSegDocCount, storedIndexOffset,
fieldsIndexOffset, fieldDvLocsOffset, chunkFactor, cr) fieldsIndexOffset, fieldDvLocsOffset, chunkFactor, cr.Sum32(), cr)
if err != nil { if err != nil {
cleanup()
return nil, err return nil, err
} }
err = br.Flush() err = br.Flush()
if err != nil { if err != nil {
cleanup()
return nil, err return nil, err
} }
err = f.Sync() err = f.Sync()
if err != nil { if err != nil {
cleanup()
return nil, err return nil, err
} }
err = f.Close() err = f.Close()
if err != nil { if err != nil {
cleanup()
return nil, err return nil, err
} }
@ -104,7 +116,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
// mapFields takes the fieldsInv list and builds the map // mapFields takes the fieldsInv list and builds the map
func mapFields(fields []string) map[string]uint16 { func mapFields(fields []string) map[string]uint16 {
rv := make(map[string]uint16) rv := make(map[string]uint16, len(fields))
for i, fieldName := range fields { for i, fieldName := range fields {
rv[fieldName] = uint16(i) rv[fieldName] = uint16(i)
} }
@ -114,15 +126,14 @@ func mapFields(fields []string) map[string]uint16 {
// computeNewDocCount determines how many documents will be in the newly // computeNewDocCount determines how many documents will be in the newly
// merged segment when obsoleted docs are dropped // merged segment when obsoleted docs are dropped
func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 { func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 {
var newSegDocCount uint64 var newDocCount uint64
for segI, segment := range segments { for segI, segment := range segments {
segIAfterDrop := segment.NumDocs() newDocCount += segment.NumDocs()
if drops[segI] != nil { if drops[segI] != nil {
segIAfterDrop -= drops[segI].GetCardinality() newDocCount -= drops[segI].GetCardinality()
} }
newSegDocCount += segIAfterDrop
} }
return newSegDocCount return newDocCount
} }
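
As a worked example with hypothetical inputs: merging a 100-doc segment whose drop bitmap has cardinality 10 with an untouched 50-doc segment gives newDocCount = (100 - 10) + (50 - 0) = 140, the count the chunked encoders below are sized around.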
func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
@ -130,14 +141,18 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
newSegDocCount uint64, chunkFactor uint32, newSegDocCount uint64, chunkFactor uint32,
w *CountHashWriter) ([]uint64, uint64, error) { w *CountHashWriter) ([]uint64, uint64, error) {
var bufReuse bytes.Buffer
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
var bufLoc []uint64 var bufLoc []uint64
rv1 := make([]uint64, len(fieldsInv)) rv := make([]uint64, len(fieldsInv))
fieldDvLocs := make([]uint64, len(fieldsInv)) fieldDvLocs := make([]uint64, len(fieldsInv))
fieldDvLocsOffset := uint64(fieldNotUninverted) fieldDvLocsOffset := uint64(fieldNotUninverted)
var docNumbers docIDRange
var vellumBuf bytes.Buffer var vellumBuf bytes.Buffer
// for each field // for each field
for fieldID, fieldName := range fieldsInv { for fieldID, fieldName := range fieldsInv {
if fieldID != 0 { if fieldID != 0 {
@ -177,13 +192,15 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1)
docTermMap := make(map[uint64][]byte, 0) docTermMap := make(map[uint64][]byte, newSegDocCount)
for err == nil { for err == nil {
term, _ := mergeItr.Current() term, _ := mergeItr.Current()
newRoaring := roaring.NewBitmap() newRoaring := roaring.NewBitmap()
newRoaringLocs := roaring.NewBitmap() newRoaringLocs := roaring.NewBitmap()
tfEncoder.Reset() tfEncoder.Reset()
locEncoder.Reset() locEncoder.Reset()
@ -193,7 +210,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
if dict == nil { if dict == nil {
continue continue
} }
postings, err2 := dict.postingsList(string(term), drops[dictI]) postings, err2 := dict.postingsList(term, drops[dictI])
if err2 != nil { if err2 != nil {
return nil, 0, err2 return nil, 0, err2
} }
@ -209,9 +226,9 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
// encode norm bits // encode norm bits
norm := next.Norm() norm := next.Norm()
normBits := math.Float32bits(float32(norm)) normBits := math.Float32bits(float32(norm))
err3 := tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits))
if err3 != nil { if err != nil {
return nil, 0, err3 return nil, 0, err
} }
locs := next.Locations() locs := next.Locations()
if len(locs) > 0 { if len(locs) > 0 {
@ -234,15 +251,16 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
} }
} }
docTermMap[hitNewDocNum] = append(docTermMap[hitNewDocNum], []byte(term)...) docTermMap[hitNewDocNum] =
docTermMap[hitNewDocNum] = append(docTermMap[hitNewDocNum], termSeparator) append(append(docTermMap[hitNewDocNum], term...), termSeparator)
next, err2 = postItr.Next() next, err2 = postItr.Next()
} }
if err != nil { if err2 != nil {
return nil, 0, err return nil, 0, err2
} }
} }
tfEncoder.Close() tfEncoder.Close()
locEncoder.Close() locEncoder.Close()
@ -259,7 +277,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
return nil, 0, err return nil, 0, err
} }
postingLocOffset := uint64(w.Count()) postingLocOffset := uint64(w.Count())
_, err = writeRoaringWithLen(newRoaringLocs, w) _, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64)
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
@ -285,7 +303,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
_, err = writeRoaringWithLen(newRoaring, w) _, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64)
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
@ -303,6 +321,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
} }
dictOffset := uint64(w.Count()) dictOffset := uint64(w.Count())
err = newVellum.Close() err = newVellum.Close()
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
@ -310,10 +329,8 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
vellumData := vellumBuf.Bytes() vellumData := vellumBuf.Bytes()
// write out the length of the vellum data // write out the length of the vellum data
buf := bufMaxVarintLen64 n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData)))
// write out the number of chunks _, err = w.Write(bufMaxVarintLen64[:n])
n := binary.PutUvarint(buf, uint64(len(vellumData)))
_, err = w.Write(buf[:n])
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
@ -324,27 +341,33 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
return nil, 0, err return nil, 0, err
} }
rv1[fieldID] = dictOffset rv[fieldID] = dictOffset
// update the doc value // update the doc nums
var docNumbers docIDRange if cap(docNumbers) < len(docTermMap) {
docNumbers = make(docIDRange, 0, len(docTermMap))
}
docNumbers = docNumbers[:0]
for k := range docTermMap { for k := range docTermMap {
docNumbers = append(docNumbers, k) docNumbers = append(docNumbers, k)
} }
sort.Sort(docNumbers) sort.Sort(docNumbers)
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1)
for _, docNum := range docNumbers { for _, docNum := range docNumbers {
err = fdvEncoder.Add(docNum, docTermMap[docNum]) err = fdvEncoder.Add(docNum, docTermMap[docNum])
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
} }
// get the field doc value offset
fieldDvLocs[fieldID] = uint64(w.Count())
err = fdvEncoder.Close() err = fdvEncoder.Close()
if err != nil { if err != nil {
return nil, 0, err return nil, 0, err
} }
// get the field doc value offset
fieldDvLocs[fieldID] = uint64(w.Count())
// persist the doc value details for this field // persist the doc value details for this field
_, err = fdvEncoder.Write(w) _, err = fdvEncoder.Write(w)
if err != nil { if err != nil {
@ -353,7 +376,8 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
} }
fieldDvLocsOffset = uint64(w.Count()) fieldDvLocsOffset = uint64(w.Count())
buf := make([]byte, binary.MaxVarintLen64)
buf := bufMaxVarintLen64
for _, offset := range fieldDvLocs { for _, offset := range fieldDvLocs {
n := binary.PutUvarint(buf, uint64(offset)) n := binary.PutUvarint(buf, uint64(offset))
_, err := w.Write(buf[:n]) _, err := w.Write(buf[:n])
@ -362,7 +386,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
} }
} }
return rv1, fieldDvLocsOffset, nil return rv, fieldDvLocsOffset, nil
} }
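
Most of the allocations removed above follow one idiom: declare the buffer once, outside the per-field or per-term loop, and reset it with a zero-length reslice instead of reallocating (bufReuse and bufMaxVarintLen64 for writeRoaringWithLen, and the docNumbers slice shared across fields). A generic standalone sketch of that reuse pattern, independent of the zap types:

package main

import "fmt"

func main() {
	var reuse []uint64 // lives across iterations, like docNumbers above

	for round := 1; round <= 3; round++ {
		need := round * 4
		if cap(reuse) < need {
			reuse = make([]uint64, 0, need) // grow only when capacity is short
		}
		reuse = reuse[:0] // reset length, keep the allocation
		for i := 0; i < need; i++ {
			reuse = append(reuse, uint64(i))
		}
		fmt.Println(len(reuse), cap(reuse))
	}
}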
const docDropped = math.MaxUint64 const docDropped = math.MaxUint64
@ -370,13 +394,16 @@ const docDropped = math.MaxUint64
func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap,
fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64, fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64,
w *CountHashWriter) (uint64, [][]uint64, error) { w *CountHashWriter) (uint64, [][]uint64, error) {
var rv [][]uint64 var rv [][]uint64 // The remapped or newDocNums for each segment.
var newDocNum int
var newDocNum uint64
var curr int var curr int
var metaBuf bytes.Buffer var metaBuf bytes.Buffer
var data, compressed []byte var data, compressed []byte
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
vals := make([][][]byte, len(fieldsInv)) vals := make([][][]byte, len(fieldsInv))
typs := make([][]byte, len(fieldsInv)) typs := make([][]byte, len(fieldsInv))
poss := make([][][]uint64, len(fieldsInv)) poss := make([][][]uint64, len(fieldsInv))
@ -385,118 +412,121 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap,
// for each segment // for each segment
for segI, segment := range segments { for segI, segment := range segments {
var segNewDocNums []uint64 segNewDocNums := make([]uint64, segment.numDocs)
// for each doc num // for each doc num
for docNum := uint64(0); docNum < segment.numDocs; docNum++ { for docNum := uint64(0); docNum < segment.numDocs; docNum++ {
// TODO: roaring's API limits docNums to 32-bits?
if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) {
segNewDocNums[docNum] = docDropped
continue
}
segNewDocNums[docNum] = newDocNum
curr = 0
metaBuf.Reset() metaBuf.Reset()
data = data[:0] data = data[:0]
compressed = compressed[:0] compressed = compressed[:0]
curr = 0
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) // collect all the data
for i := 0; i < len(fieldsInv); i++ {
if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) { vals[i] = vals[i][:0]
segNewDocNums = append(segNewDocNums, docDropped) typs[i] = typs[i][:0]
} else { poss[i] = poss[i][:0]
segNewDocNums = append(segNewDocNums, uint64(newDocNum))
// collect all the data
for i := 0; i < len(fieldsInv); i++ {
vals[i] = vals[i][:0]
typs[i] = typs[i][:0]
poss[i] = poss[i][:0]
}
err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
fieldID := int(fieldsMap[field])
vals[fieldID] = append(vals[fieldID], value)
typs[fieldID] = append(typs[fieldID], typ)
poss[fieldID] = append(poss[fieldID], pos)
return true
})
if err != nil {
return 0, nil, err
}
// now walk the fields in order
for fieldID := range fieldsInv {
storedFieldValues := vals[int(fieldID)]
// has stored values for this field
num := len(storedFieldValues)
// process each value
for i := 0; i < num; i++ {
// encode field
_, err2 := metaEncoder.PutU64(uint64(fieldID))
if err2 != nil {
return 0, nil, err2
}
// encode type
_, err2 = metaEncoder.PutU64(uint64(typs[int(fieldID)][i]))
if err2 != nil {
return 0, nil, err2
}
// encode start offset
_, err2 = metaEncoder.PutU64(uint64(curr))
if err2 != nil {
return 0, nil, err2
}
// end len
_, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
if err2 != nil {
return 0, nil, err2
}
// encode number of array pos
_, err2 = metaEncoder.PutU64(uint64(len(poss[int(fieldID)][i])))
if err2 != nil {
return 0, nil, err2
}
// encode all array positions
for j := 0; j < len(poss[int(fieldID)][i]); j++ {
_, err2 = metaEncoder.PutU64(poss[int(fieldID)][i][j])
if err2 != nil {
return 0, nil, err2
}
}
// append data
data = append(data, storedFieldValues[i]...)
// update curr
curr += len(storedFieldValues[i])
}
}
metaEncoder.Close()
metaBytes := metaBuf.Bytes()
compressed = snappy.Encode(compressed, data)
// record where we're about to start writing
docNumOffsets[newDocNum] = uint64(w.Count())
// write out the meta len and compressed data len
_, err = writeUvarints(w,
uint64(len(metaBytes)), uint64(len(compressed)))
if err != nil {
return 0, nil, err
}
// now write the meta
_, err = w.Write(metaBytes)
if err != nil {
return 0, nil, err
}
// now write the compressed data
_, err = w.Write(compressed)
if err != nil {
return 0, nil, err
}
newDocNum++
} }
err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
fieldID := int(fieldsMap[field])
vals[fieldID] = append(vals[fieldID], value)
typs[fieldID] = append(typs[fieldID], typ)
poss[fieldID] = append(poss[fieldID], pos)
return true
})
if err != nil {
return 0, nil, err
}
// now walk the fields in order
for fieldID := range fieldsInv {
storedFieldValues := vals[int(fieldID)]
// has stored values for this field
num := len(storedFieldValues)
// process each value
for i := 0; i < num; i++ {
// encode field
_, err2 := metaEncoder.PutU64(uint64(fieldID))
if err2 != nil {
return 0, nil, err2
}
// encode type
_, err2 = metaEncoder.PutU64(uint64(typs[int(fieldID)][i]))
if err2 != nil {
return 0, nil, err2
}
// encode start offset
_, err2 = metaEncoder.PutU64(uint64(curr))
if err2 != nil {
return 0, nil, err2
}
// end len
_, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
if err2 != nil {
return 0, nil, err2
}
// encode number of array pos
_, err2 = metaEncoder.PutU64(uint64(len(poss[int(fieldID)][i])))
if err2 != nil {
return 0, nil, err2
}
// encode all array positions
for j := 0; j < len(poss[int(fieldID)][i]); j++ {
_, err2 = metaEncoder.PutU64(poss[int(fieldID)][i][j])
if err2 != nil {
return 0, nil, err2
}
}
// append data
data = append(data, storedFieldValues[i]...)
// update curr
curr += len(storedFieldValues[i])
}
}
metaEncoder.Close()
metaBytes := metaBuf.Bytes()
compressed = snappy.Encode(compressed, data)
// record where we're about to start writing
docNumOffsets[newDocNum] = uint64(w.Count())
// write out the meta len and compressed data len
_, err = writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
if err != nil {
return 0, nil, err
}
// now write the meta
_, err = w.Write(metaBytes)
if err != nil {
return 0, nil, err
}
// now write the compressed data
_, err = w.Write(compressed)
if err != nil {
return 0, nil, err
}
newDocNum++
} }
rv = append(rv, segNewDocNums) rv = append(rv, segNewDocNums)
} }
// return value is the start of the stored index // return value is the start of the stored index
offset := uint64(w.Count()) offset := uint64(w.Count())
// now write out the stored doc index // now write out the stored doc index
for docNum := range docNumOffsets { for docNum := range docNumOffsets {
err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum]) err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
@ -511,13 +541,13 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap,
// mergeFields builds a unified list of fields used across all the input segments // mergeFields builds a unified list of fields used across all the input segments
func mergeFields(segments []*Segment) []string { func mergeFields(segments []*Segment) []string {
fieldsMap := map[string]struct{}{} fieldsMap := map[string]struct{}{}
for _, segment := range segments { for _, segment := range segments {
fields := segment.Fields() fields := segment.Fields()
for _, field := range fields { for _, field := range fields {
fieldsMap[field] = struct{}{} fieldsMap[field] = struct{}{}
} }
} }
rv := make([]string, 0, len(fieldsMap)) rv := make([]string, 0, len(fieldsMap))
// ensure _id stays first // ensure _id stays first
rv = append(rv, "_id") rv = append(rv, "_id")
@ -526,6 +556,5 @@ func mergeFields(segments []*Segment) []string {
rv = append(rv, k) rv = append(rv, k)
} }
} }
return rv return rv
} }
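
Tying the file together: the merger (planMergeAtSnapshot, in the first hunk of this commit) drives Merge roughly as below. The helper, its inputs, and the path are placeholders; docDropped is the sentinel defined above for documents that did not survive the merge.

// Sketch only, from the caller's side of the zap package.
func mergeSketch(segmentsToMerge []*zap.Segment, docsToDrop []*roaring.Bitmap, path string) ([][]uint64, error) {
	// 1024 matches the DefaultChunkFactor introduced in persister.go above.
	newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024)
	if err != nil {
		return nil, fmt.Errorf("merging failed: %v", err)
	}
	// newDocNums[i][oldDocNum] is the doc number in the merged segment,
	// or docDropped (math.MaxUint64) when that document was dropped.
	return newDocNums, nil
}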

View File

@ -27,8 +27,8 @@ import (
// PostingsList is an in-memory representation of a postings list // PostingsList is an in-memory representation of a postings list
type PostingsList struct { type PostingsList struct {
dictionary *Dictionary sb *SegmentBase
term string term []byte
postingsOffset uint64 postingsOffset uint64
freqOffset uint64 freqOffset uint64
locOffset uint64 locOffset uint64
@ -48,11 +48,11 @@ func (p *PostingsList) Iterator() segment.PostingsIterator {
var n uint64 var n uint64
var read int var read int
var numFreqChunks uint64 var numFreqChunks uint64
numFreqChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
rv.freqChunkLens = make([]uint64, int(numFreqChunks)) rv.freqChunkLens = make([]uint64, int(numFreqChunks))
for i := 0; i < int(numFreqChunks); i++ { for i := 0; i < int(numFreqChunks); i++ {
rv.freqChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
} }
rv.freqChunkStart = p.freqOffset + n rv.freqChunkStart = p.freqOffset + n
@ -60,11 +60,11 @@ func (p *PostingsList) Iterator() segment.PostingsIterator {
// prepare the loc chunk details // prepare the loc chunk details
n = 0 n = 0
var numLocChunks uint64 var numLocChunks uint64
numLocChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
rv.locChunkLens = make([]uint64, int(numLocChunks)) rv.locChunkLens = make([]uint64, int(numLocChunks))
for i := 0; i < int(numLocChunks); i++ { for i := 0; i < int(numLocChunks); i++ {
rv.locChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
} }
rv.locChunkStart = p.locOffset + n rv.locChunkStart = p.locOffset + n
@ -133,7 +133,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
start += i.freqChunkLens[j] start += i.freqChunkLens[j]
} }
end := start + i.freqChunkLens[chunk] end := start + i.freqChunkLens[chunk]
i.currChunkFreqNorm = i.postings.dictionary.segment.mm[start:end] i.currChunkFreqNorm = i.postings.sb.mem[start:end]
i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm)) i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm))
start = i.locChunkStart start = i.locChunkStart
@ -141,7 +141,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error {
start += i.locChunkLens[j] start += i.locChunkLens[j]
} }
end = start + i.locChunkLens[chunk] end = start + i.locChunkLens[chunk]
i.currChunkLoc = i.postings.dictionary.segment.mm[start:end] i.currChunkLoc = i.postings.sb.mem[start:end]
i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc)) i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc))
i.currChunk = uint32(chunk) i.currChunk = uint32(chunk)
return nil return nil
@ -192,7 +192,7 @@ func (i *PostingsIterator) readLocation(l *Location) error {
// group these together for less branching // group these together for less branching
if l != nil { if l != nil {
l.field = i.postings.dictionary.segment.fieldsInv[fieldID] l.field = i.postings.sb.fieldsInv[fieldID]
l.pos = pos l.pos = pos
l.start = start l.start = start
l.end = end l.end = end
@ -221,9 +221,9 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
return nil, nil return nil, nil
} }
n := i.actual.Next() n := i.actual.Next()
nChunk := n / i.postings.dictionary.segment.chunkFactor nChunk := n / i.postings.sb.chunkFactor
allN := i.all.Next() allN := i.all.Next()
allNChunk := allN / i.postings.dictionary.segment.chunkFactor allNChunk := allN / i.postings.sb.chunkFactor
// n is the next actual hit (excluding some postings) // n is the next actual hit (excluding some postings)
// allN is the next hit in the full postings // allN is the next hit in the full postings

View File

@ -16,16 +16,16 @@ package zap
import "encoding/binary" import "encoding/binary"
func (s *Segment) getStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) { func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) {
docStoredStartAddr := s.storedIndexOffset + (8 * docNum) docStoredStartAddr := s.storedIndexOffset + (8 * docNum)
docStoredStart := binary.BigEndian.Uint64(s.mm[docStoredStartAddr : docStoredStartAddr+8]) docStoredStart := binary.BigEndian.Uint64(s.mem[docStoredStartAddr : docStoredStartAddr+8])
var n uint64 var n uint64
metaLen, read := binary.Uvarint(s.mm[docStoredStart : docStoredStart+binary.MaxVarintLen64]) metaLen, read := binary.Uvarint(s.mem[docStoredStart : docStoredStart+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
var dataLen uint64 var dataLen uint64
dataLen, read = binary.Uvarint(s.mm[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64]) dataLen, read = binary.Uvarint(s.mem[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64])
n += uint64(read) n += uint64(read)
meta := s.mm[docStoredStart+n : docStoredStart+n+metaLen] meta := s.mem[docStoredStart+n : docStoredStart+n+metaLen]
data := s.mm[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen] data := s.mem[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen]
return meta, data return meta, data
} }

View File

@ -44,12 +44,15 @@ func Open(path string) (segment.Segment, error) {
} }
rv := &Segment{ rv := &Segment{
f: f, SegmentBase: SegmentBase{
mm: mm, mem: mm[0 : len(mm)-FooterSize],
path: path, fieldsMap: make(map[string]uint16),
fieldsMap: make(map[string]uint16), fieldDvIterMap: make(map[uint16]*docValueIterator),
fieldDvIterMap: make(map[uint16]*docValueIterator), },
refs: 1, f: f,
mm: mm,
path: path,
refs: 1,
} }
err = rv.loadConfig() err = rv.loadConfig()
@ -73,24 +76,36 @@ func Open(path string) (segment.Segment, error) {
return rv, nil return rv, nil
} }
// Segment implements the segment.Segment inteface over top the zap file format // SegmentBase is a memory only, read-only implementation of the
type Segment struct { // segment.Segment interface, using zap's data representation.
f *os.File type SegmentBase struct {
mm mmap.MMap mem []byte
path string memCRC uint32
crc uint32
version uint32
chunkFactor uint32 chunkFactor uint32
fieldsMap map[string]uint16 // fieldName -> fieldID+1
fieldsInv []string // fieldID -> fieldName
numDocs uint64 numDocs uint64
storedIndexOffset uint64 storedIndexOffset uint64
fieldsIndexOffset uint64 fieldsIndexOffset uint64
docValueOffset uint64
dictLocs []uint64
fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field
}
fieldsMap map[string]uint16 func (sb *SegmentBase) AddRef() {}
fieldsInv []string func (sb *SegmentBase) DecRef() (err error) { return nil }
fieldsOffsets []uint64 func (sb *SegmentBase) Close() (err error) { return nil }
docValueOffset uint64 // Segment implements a persisted segment.Segment interface, by
fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field // embedding an mmap()'ed SegmentBase.
type Segment struct {
SegmentBase
f *os.File
mm mmap.MMap
path string
version uint32
crc uint32
m sync.Mutex // Protects the fields that follow. m sync.Mutex // Protects the fields that follow.
refs int64 refs int64
@ -98,17 +113,29 @@ type Segment struct {
func (s *Segment) SizeInBytes() uint64 { func (s *Segment) SizeInBytes() uint64 {
// 8 /* size of file pointer */ // 8 /* size of file pointer */
// 4 /* size of crc -> uint32 */
// 4 /* size of version -> uint32 */ // 4 /* size of version -> uint32 */
// 4 /* size of crc -> uint32 */
sizeOfUints := 16
sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints
// mutex, refs -> int64
sizeInBytes += 16
// do not include the mmap'ed part
return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem))
}
func (s *SegmentBase) SizeInBytes() uint64 {
// 4 /* size of memCRC -> uint32 */
// 4 /* size of chunkFactor -> uint32 */ // 4 /* size of chunkFactor -> uint32 */
// 8 /* size of numDocs -> uint64 */ // 8 /* size of numDocs -> uint64 */
// 8 /* size of storedIndexOffset -> uint64 */ // 8 /* size of storedIndexOffset -> uint64 */
// 8 /* size of fieldsIndexOffset -> uint64 */ // 8 /* size of fieldsIndexOffset -> uint64 */
// 8 /* size of docValueOffset -> uint64 */ // 8 /* size of docValueOffset -> uint64 */
sizeOfUints := 52 sizeInBytes := 40
// Do not include the mmap'ed part sizeInBytes += len(s.mem) + int(segment.SizeOfSlice)
sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints
// fieldsMap // fieldsMap
for k, _ := range s.fieldsMap { for k, _ := range s.fieldsMap {
@ -116,12 +143,12 @@ func (s *Segment) SizeInBytes() uint64 {
} }
sizeInBytes += int(segment.SizeOfMap) /* overhead from map */ sizeInBytes += int(segment.SizeOfMap) /* overhead from map */
// fieldsInv, fieldsOffsets // fieldsInv, dictLocs
for _, entry := range s.fieldsInv { for _, entry := range s.fieldsInv {
sizeInBytes += (len(entry) + int(segment.SizeOfString)) sizeInBytes += (len(entry) + int(segment.SizeOfString))
} }
sizeInBytes += len(s.fieldsOffsets) * 8 /* size of uint64 */ sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */
sizeInBytes += int(segment.SizeOfSlice) * 2 /* overhead from slices */ sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */
// fieldDvIterMap // fieldDvIterMap
sizeInBytes += len(s.fieldDvIterMap) * sizeInBytes += len(s.fieldDvIterMap) *
@ -133,9 +160,6 @@ func (s *Segment) SizeInBytes() uint64 {
} }
sizeInBytes += int(segment.SizeOfMap) sizeInBytes += int(segment.SizeOfMap)
// mutex, refs -> int64
sizeInBytes += 16
return uint64(sizeInBytes) return uint64(sizeInBytes)
} }
@ -158,47 +182,50 @@ func (s *Segment) DecRef() (err error) {
func (s *Segment) loadConfig() error { func (s *Segment) loadConfig() error {
crcOffset := len(s.mm) - 4 crcOffset := len(s.mm) - 4
s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4]) s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4])
verOffset := crcOffset - 4 verOffset := crcOffset - 4
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
if s.version != version { if s.version != version {
return fmt.Errorf("unsupported version %d", s.version) return fmt.Errorf("unsupported version %d", s.version)
} }
chunkOffset := verOffset - 4 chunkOffset := verOffset - 4
s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4]) s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4])
docValueOffset := chunkOffset - 8 docValueOffset := chunkOffset - 8
s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8]) s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8])
fieldsOffset := docValueOffset - 8
s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsOffset : fieldsOffset+8]) fieldsIndexOffset := docValueOffset - 8
storedOffset := fieldsOffset - 8 s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8])
s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedOffset : storedOffset+8])
docNumOffset := storedOffset - 8 storedIndexOffset := fieldsIndexOffset - 8
s.numDocs = binary.BigEndian.Uint64(s.mm[docNumOffset : docNumOffset+8]) s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8])
numDocsOffset := storedIndexOffset - 8
s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8])
return nil return nil
} }
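
loadConfig walks the footer backwards from the end of the mapping. A sketch of the same walk over a plain byte slice, assuming the footer layout declared in this change (4-byte CRC, version, and chunk factor, then four 8-byte values); parseFooter and its package are hypothetical:

package zapsketch

import "encoding/binary"

type footer struct {
	crc, version, chunkFactor                            uint32
	docValueOffset, fieldsIndexOffset, storedIndexOffset uint64
	numDocs                                              uint64
}

// parseFooter mirrors the backward walk in loadConfig: the CRC is the
// last 4 bytes, and each earlier field sits just before the one already read.
func parseFooter(mm []byte) footer {
	var f footer
	p := len(mm)
	p -= 4
	f.crc = binary.BigEndian.Uint32(mm[p:])
	p -= 4
	f.version = binary.BigEndian.Uint32(mm[p:])
	p -= 4
	f.chunkFactor = binary.BigEndian.Uint32(mm[p:])
	p -= 8
	f.docValueOffset = binary.BigEndian.Uint64(mm[p:])
	p -= 8
	f.fieldsIndexOffset = binary.BigEndian.Uint64(mm[p:])
	p -= 8
	f.storedIndexOffset = binary.BigEndian.Uint64(mm[p:])
	p -= 8
	f.numDocs = binary.BigEndian.Uint64(mm[p:])
	return f
}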
func (s *Segment) loadFields() error { func (s *SegmentBase) loadFields() error {
// NOTE for now we assume the fields index immediately precedes the footer // NOTE for now we assume the fields index immediately precedes
// if this changes, need to adjust accordingly (or store explicit length) // the footer, and if this changes, need to adjust accordingly (or
fieldsIndexEnd := uint64(len(s.mm) - FooterSize) // store explicit length), where s.mem was sliced from s.mm in Open().
fieldsIndexEnd := uint64(len(s.mem))
// iterate through fields index // iterate through fields index
var fieldID uint64 var fieldID uint64
for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
addr := binary.BigEndian.Uint64(s.mm[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8])
var n uint64
dictLoc, read := binary.Uvarint(s.mm[addr+n : fieldsIndexEnd]) dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd])
n += uint64(read) n := uint64(read)
s.fieldsOffsets = append(s.fieldsOffsets, dictLoc) s.dictLocs = append(s.dictLocs, dictLoc)
var nameLen uint64 var nameLen uint64
nameLen, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd]) nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd])
n += uint64(read) n += uint64(read)
name := string(s.mm[addr+n : addr+n+nameLen]) name := string(s.mem[addr+n : addr+n+nameLen])
s.fieldsInv = append(s.fieldsInv, name) s.fieldsInv = append(s.fieldsInv, name)
s.fieldsMap[name] = uint16(fieldID + 1) s.fieldsMap[name] = uint16(fieldID + 1)
@ -208,7 +235,7 @@ func (s *Segment) loadFields() error {
} }
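
Each field record read here, and written by persistFields later in this change, is a uvarint dictionary location, a uvarint name length, and the name bytes, followed at the end of the section by one fixed 8-byte offset per field. A hedged encoding sketch of a single record (the helper and package names are made up):

package zapsketch

import (
	"bytes"
	"encoding/binary"
)

// appendFieldRecord writes one field record in the layout loadFields
// expects: uvarint dictLoc, uvarint len(name), then the name bytes.
func appendFieldRecord(buf *bytes.Buffer, dictLoc uint64, name string) {
	var tmp [binary.MaxVarintLen64]byte
	n := binary.PutUvarint(tmp[:], dictLoc)
	buf.Write(tmp[:n])
	n = binary.PutUvarint(tmp[:], uint64(len(name)))
	buf.Write(tmp[:n])
	buf.WriteString(name)
}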
// Dictionary returns the term dictionary for the specified field // Dictionary returns the term dictionary for the specified field
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) {
dict, err := s.dictionary(field) dict, err := s.dictionary(field)
if err == nil && dict == nil { if err == nil && dict == nil {
return &segment.EmptyDictionary{}, nil return &segment.EmptyDictionary{}, nil
@ -216,21 +243,20 @@ func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
return dict, err return dict, err
} }
func (s *Segment) dictionary(field string) (rv *Dictionary, err error) { func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
rv = &Dictionary{ fieldIDPlus1 := sb.fieldsMap[field]
segment: s, if fieldIDPlus1 > 0 {
field: field, rv = &Dictionary{
} sb: sb,
field: field,
fieldID: fieldIDPlus1 - 1,
}
rv.fieldID = s.fieldsMap[field] dictStart := sb.dictLocs[rv.fieldID]
if rv.fieldID > 0 {
rv.fieldID = rv.fieldID - 1
dictStart := s.fieldsOffsets[rv.fieldID]
if dictStart > 0 { if dictStart > 0 {
// read the length of the vellum data // read the length of the vellum data
vellumLen, read := binary.Uvarint(s.mm[dictStart : dictStart+binary.MaxVarintLen64]) vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64])
fstBytes := s.mm[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
if fstBytes != nil { if fstBytes != nil {
rv.fst, err = vellum.Load(fstBytes) rv.fst, err = vellum.Load(fstBytes)
if err != nil { if err != nil {
@ -238,9 +264,6 @@ func (s *Segment) dictionary(field string) (rv *Dictionary, err error) {
} }
} }
} }
} else {
return nil, nil
} }
return rv, nil return rv, nil
@ -248,10 +271,10 @@ func (s *Segment) dictionary(field string) (rv *Dictionary, err error) {
// VisitDocument invokes the DocumentFieldValueVisitor for each stored field // VisitDocument invokes the DocumentFieldValueVisitor for each stored field
// for the specified doc number // for the specified doc number
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
// first make sure this is a valid number in this segment // first make sure this is a valid number in this segment
if num < s.numDocs { if num < s.numDocs {
meta, compressed := s.getStoredMetaAndCompressed(num) meta, compressed := s.getDocStoredMetaAndCompressed(num)
uncompressed, err := snappy.Decode(nil, compressed) uncompressed, err := snappy.Decode(nil, compressed)
if err != nil { if err != nil {
return err return err
@ -305,13 +328,13 @@ func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVi
} }
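
A small usage sketch for the visitor callback, assuming the DocumentFieldValueVisitor signature of field name, type byte, value bytes, and positions, and an already-opened segment:

package zapsketch

import (
	"fmt"

	"github.com/blevesearch/bleve/index/scorch/segment"
)

// dumpStoredFields prints every stored field of one document; the
// visitor returning true asks VisitDocument to keep going.
func dumpStoredFields(seg segment.Segment, docNum uint64) error {
	return seg.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
		fmt.Printf("%s (%c): %s\n", field, typ, value)
		return true
	})
}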
// Count returns the number of documents in this segment. // Count returns the number of documents in this segment.
func (s *Segment) Count() uint64 { func (s *SegmentBase) Count() uint64 {
return s.numDocs return s.numDocs
} }
// DocNumbers returns a bitset corresponding to the doc numbers of all the // DocNumbers returns a bitset corresponding to the doc numbers of all the
// provided _id strings // provided _id strings
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) {
rv := roaring.New() rv := roaring.New()
if len(s.fieldsMap) > 0 { if len(s.fieldsMap) > 0 {
@ -321,7 +344,7 @@ func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
} }
for _, id := range ids { for _, id := range ids {
postings, err := idDict.postingsList(id, nil) postings, err := idDict.postingsList([]byte(id), nil)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -335,7 +358,7 @@ func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
} }
// Fields returns the field names used in this segment // Fields returns the field names used in this segment
func (s *Segment) Fields() []string { func (s *SegmentBase) Fields() []string {
return s.fieldsInv return s.fieldsInv
} }
@ -409,23 +432,22 @@ func (s *Segment) NumDocs() uint64 {
// DictAddr is a helper function to compute the file offset where the // DictAddr is a helper function to compute the file offset where the
// dictionary is stored for the specified field. // dictionary is stored for the specified field.
func (s *Segment) DictAddr(field string) (uint64, error) { func (s *Segment) DictAddr(field string) (uint64, error) {
var fieldID uint16 fieldIDPlus1, ok := s.fieldsMap[field]
var ok bool if !ok {
if fieldID, ok = s.fieldsMap[field]; !ok {
return 0, fmt.Errorf("no such field '%s'", field) return 0, fmt.Errorf("no such field '%s'", field)
} }
return s.fieldsOffsets[fieldID-1], nil return s.dictLocs[fieldIDPlus1-1], nil
} }
func (s *Segment) loadDvIterators() error { func (s *SegmentBase) loadDvIterators() error {
if s.docValueOffset == fieldNotUninverted { if s.docValueOffset == fieldNotUninverted {
return nil return nil
} }
var read uint64 var read uint64
for fieldID, field := range s.fieldsInv { for fieldID, field := range s.fieldsInv {
fieldLoc, n := binary.Uvarint(s.mm[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
if n <= 0 { if n <= 0 {
return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID)
} }

View File

@ -23,42 +23,40 @@ import (
) )
// writes out the length of the roaring bitmap in bytes as varint // writes out the length of the roaring bitmap in bytes as varint
// then writs out the roaring bitmap itself // then writes out the roaring bitmap itself
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer) (int, error) { func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer,
var buffer bytes.Buffer reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) {
reuseBuf.Reset()
// write out postings list to memory so we know the len // write out postings list to memory so we know the len
postingsListLen, err := r.WriteTo(&buffer) postingsListLen, err := r.WriteTo(reuseBuf)
if err != nil { if err != nil {
return 0, err return 0, err
} }
var tw int var tw int
// write out the length of this postings list // write out the length of this postings list
buf := make([]byte, binary.MaxVarintLen64) n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen))
n := binary.PutUvarint(buf, uint64(postingsListLen)) nw, err := w.Write(reuseBufVarint[:n])
nw, err := w.Write(buf[:n])
tw += nw tw += nw
if err != nil { if err != nil {
return tw, err return tw, err
} }
// write out the postings list itself // write out the postings list itself
nw, err = w.Write(buffer.Bytes()) nw, err = w.Write(reuseBuf.Bytes())
tw += nw tw += nw
if err != nil { if err != nil {
return tw, err return tw, err
} }
return tw, nil return tw, nil
} }
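
The new signature exists so a merge can allocate its scratch space once and reuse it for every postings list it writes. A minimal usage sketch, assuming it lives alongside the unexported writeRoaringWithLen; the caller-side helper is illustrative:

package zap // illustrative only: writeRoaringWithLen is unexported, so this sketch assumes the same package

import (
	"bytes"
	"encoding/binary"
	"io"

	"github.com/RoaringBitmap/roaring"
)

// writeAllPostings reuses one scratch buffer and one varint slice
// across every postings list, which is the allocation win the new
// writeRoaringWithLen signature is after.
func writeAllPostings(lists []*roaring.Bitmap, w io.Writer) error {
	var reuseBuf bytes.Buffer
	reuseBufVarint := make([]byte, binary.MaxVarintLen64)
	for _, pl := range lists {
		if _, err := writeRoaringWithLen(pl, w, &reuseBuf, reuseBufVarint); err != nil {
			return err
		}
	}
	return nil
}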
func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) { func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) {
var rv uint64 var rv uint64
var fieldsOffsets []uint64
var fieldStarts []uint64
for fieldID, fieldName := range fieldsInv { for fieldID, fieldName := range fieldsInv {
// record start of this field // record start of this field
fieldStarts = append(fieldStarts, uint64(w.Count())) fieldsOffsets = append(fieldsOffsets, uint64(w.Count()))
// write out the dict location and field name length // write out the dict location and field name length
_, err := writeUvarints(w, dictLocs[fieldID], uint64(len(fieldName))) _, err := writeUvarints(w, dictLocs[fieldID], uint64(len(fieldName)))
@ -76,7 +74,7 @@ func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (u
// now write out the fields index // now write out the fields index
rv = uint64(w.Count()) rv = uint64(w.Count())
for fieldID := range fieldsInv { for fieldID := range fieldsInv {
err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID]) err := binary.Write(w, binary.BigEndian, fieldsOffsets[fieldID])
if err != nil { if err != nil {
return 0, err return 0, err
} }
@ -89,8 +87,11 @@ func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (u
// crc + ver + chunk + field offset + stored offset + num docs + docValueOffset // crc + ver + chunk + field offset + stored offset + num docs + docValueOffset
const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8 const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8
func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset, docValueOffset uint64, func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
chunkFactor uint32, w *CountHashWriter) error { chunkFactor uint32, crcBeforeFooter uint32, writerIn io.Writer) error {
w := NewCountHashWriter(writerIn)
w.crc = crcBeforeFooter
// write out the number of docs // write out the number of docs
err := binary.Write(w, binary.BigEndian, numDocs) err := binary.Write(w, binary.BigEndian, numDocs)
if err != nil { if err != nil {
@ -102,7 +103,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset, docValueOffset
return err return err
} }
// write out the field index location // write out the field index location
err = binary.Write(w, binary.BigEndian, fieldIndexOffset) err = binary.Write(w, binary.BigEndian, fieldsIndexOffset)
if err != nil { if err != nil {
return err return err
} }
@ -122,7 +123,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset, docValueOffset
return err return err
} }
// write out CRC-32 of everything up to but not including this CRC // write out CRC-32 of everything up to but not including this CRC
err = binary.Write(w, binary.BigEndian, w.Sum32()) err = binary.Write(w, binary.BigEndian, w.crc)
if err != nil { if err != nil {
return err return err
} }
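
Carrying crcBeforeFooter into a fresh CountHashWriter means the CRC written at the very end covers the body already on disk plus the footer fields before it. A rough sketch of that idea using hash/crc32 only; it is not the CountHashWriter implementation:

package zapsketch

import "hash/crc32"

// footerCRC extends a running CRC of the segment body over the footer
// bytes written before the CRC field, which is what threading the
// pre-footer CRC into the footer writer accomplishes.
func footerCRC(bodyCRC uint32, footerBeforeCRC []byte) uint32 {
	return crc32.Update(bodyCRC, crc32.IEEETable, footerBeforeCRC)
}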