// Copyright (c) 2017 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package zap import ( "bytes" "encoding/binary" "fmt" "math" "reflect" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/size" ) var reflectStaticSizePostingsList int var reflectStaticSizePostingsIterator int var reflectStaticSizePosting int var reflectStaticSizeLocation int func init() { var pl PostingsList reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) var pi PostingsIterator reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) var p Posting reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) var l Location reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) } // FST or vellum value (uint64) encoding is determined by the top two // highest-order or most significant bits... // // encoding : MSB // name : 63 62 61...to...bit #0 (LSB) // ----------+---+---+--------------------------------------------------- // general : 0 | 0 | 62-bits of postingsOffset. // ~ : 0 | 1 | reserved for future. // 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum. // ~ : 1 | 1 | reserved for future. // // Encoding "general" is able to handle all cases, where the // postingsOffset points to more information about the postings for // the term. // // Encoding "1-hit" is used to optimize a commonly seen case when a // term has only a single hit. For example, a term in the _id field // will have only 1 hit. The "1-hit" encoding is used for a term // in a field when... // // - term vector info is disabled for that field; // - and, the term appears in only a single doc for that field; // - and, the term's freq is exactly 1 in that single doc for that field; // - and, the docNum must fit into 31-bits; // // Otherwise, the "general" encoding is used instead. // // In the "1-hit" encoding, the field in that single doc may have // other terms, which is supported in the "1-hit" encoding by the // positive float31 norm. const FSTValEncodingMask = uint64(0xc000000000000000) const FSTValEncodingGeneral = uint64(0x0000000000000000) const FSTValEncoding1Hit = uint64(0x8000000000000000) func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 { return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum) } func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) { return (mask31Bits & v), (mask31Bits & (v >> 31)) } const mask31Bits = uint64(0x000000007fffffff) func under32Bits(x uint64) bool { return x <= mask31Bits } const docNum1HitFinished = math.MaxUint64 // PostingsList is an in-memory represenation of a postings list type PostingsList struct { sb *SegmentBase postingsOffset uint64 freqOffset uint64 locOffset uint64 postings *roaring.Bitmap except *roaring.Bitmap // when normBits1Hit != 0, then this postings list came from a // 1-hit encoding, and only the docNum1Hit & normBits1Hit apply docNum1Hit uint64 normBits1Hit uint64 } func (p *PostingsList) Size() int { sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr if p.except != nil { sizeInBytes += int(p.except.GetSizeInBytes()) } return sizeInBytes } func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { if p.normBits1Hit != 0 { receiver.Add(uint32(p.docNum1Hit)) return } if p.postings != nil { receiver.Or(p.postings) } } // Iterator returns an iterator for this postings list func (p *PostingsList) Iterator() segment.PostingsIterator { return p.iterator(nil) } func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { if rv == nil { rv = &PostingsIterator{} } else { freqNormReader := rv.freqNormReader if freqNormReader != nil { freqNormReader.Reset([]byte(nil)) } freqNormDecoder := rv.freqNormDecoder locReader := rv.locReader if locReader != nil { locReader.Reset([]byte(nil)) } locDecoder := rv.locDecoder freqChunkOffsets := rv.freqChunkOffsets[:0] locChunkOffsets := rv.locChunkOffsets[:0] nextLocs := rv.nextLocs[:0] nextSegmentLocs := rv.nextSegmentLocs[:0] buf := rv.buf *rv = PostingsIterator{} // clear the struct rv.freqNormReader = freqNormReader rv.freqNormDecoder = freqNormDecoder rv.locReader = locReader rv.locDecoder = locDecoder rv.freqChunkOffsets = freqChunkOffsets rv.locChunkOffsets = locChunkOffsets rv.nextLocs = nextLocs rv.nextSegmentLocs = nextSegmentLocs rv.buf = buf } rv.postings = p if p.normBits1Hit != 0 { // "1-hit" encoding rv.docNum1Hit = p.docNum1Hit rv.normBits1Hit = p.normBits1Hit if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) { rv.docNum1Hit = docNum1HitFinished } return rv } // "general" encoding, check if empty if p.postings == nil { return rv } // prepare the freq chunk details var n uint64 var read int var numFreqChunks uint64 numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) if cap(rv.freqChunkOffsets) >= int(numFreqChunks) { rv.freqChunkOffsets = rv.freqChunkOffsets[:int(numFreqChunks)] } else { rv.freqChunkOffsets = make([]uint64, int(numFreqChunks)) } for i := 0; i < int(numFreqChunks); i++ { rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.freqChunkStart = p.freqOffset + n // prepare the loc chunk details n = 0 var numLocChunks uint64 numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) if cap(rv.locChunkOffsets) >= int(numLocChunks) { rv.locChunkOffsets = rv.locChunkOffsets[:int(numLocChunks)] } else { rv.locChunkOffsets = make([]uint64, int(numLocChunks)) } for i := 0; i < int(numLocChunks); i++ { rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.locChunkStart = p.locOffset + n rv.all = p.postings.Iterator() if p.except != nil { allExcept := roaring.AndNot(p.postings, p.except) rv.actual = allExcept.Iterator() } else { rv.actual = p.postings.Iterator() } return rv } // Count returns the number of items on this postings list func (p *PostingsList) Count() uint64 { var n uint64 if p.normBits1Hit != 0 { n = 1 } else if p.postings != nil { n = p.postings.GetCardinality() } var e uint64 if p.except != nil { e = p.except.GetCardinality() } if n <= e { return 0 } return n - e } func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { rv.postingsOffset = postingsOffset // handle "1-hit" encoding special case if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit { return rv.init1Hit(postingsOffset) } // read the location of the freq/norm details var n uint64 var read int rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) n += uint64(read) rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) var postingsLen uint64 postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] if rv.postings == nil { rv.postings = roaring.NewBitmap() } _, err := rv.postings.FromBuffer(roaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap: %v", err) } return nil } func (rv *PostingsList) init1Hit(fstVal uint64) error { docNum, normBits := FSTValDecode1Hit(fstVal) rv.docNum1Hit = docNum rv.normBits1Hit = normBits return nil } // PostingsIterator provides a way to iterate through the postings list type PostingsIterator struct { postings *PostingsList all roaring.IntIterable actual roaring.IntIterable currChunk uint32 currChunkFreqNorm []byte currChunkLoc []byte freqNormDecoder *govarint.Base128Decoder freqNormReader *bytes.Reader locDecoder *govarint.Base128Decoder locReader *bytes.Reader freqChunkOffsets []uint64 freqChunkStart uint64 locChunkOffsets []uint64 locChunkStart uint64 next Posting // reused across Next() calls nextLocs []Location // reused across Next() calls nextSegmentLocs []segment.Location // reused across Next() calls docNum1Hit uint64 normBits1Hit uint64 buf []byte } func (i *PostingsIterator) Size() int { sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + len(i.currChunkFreqNorm) + len(i.currChunkLoc) + len(i.freqChunkOffsets)*size.SizeOfUint64 + len(i.locChunkOffsets)*size.SizeOfUint64 + i.next.Size() for _, entry := range i.nextLocs { sizeInBytes += entry.Size() } return sizeInBytes } func (i *PostingsIterator) loadChunk(chunk int) error { if chunk >= len(i.freqChunkOffsets) || chunk >= len(i.locChunkOffsets) { return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkOffsets), len(i.locChunkOffsets)) } end, start := i.freqChunkStart, i.freqChunkStart s, e := readChunkBoundary(chunk, i.freqChunkOffsets) start += s end += e i.currChunkFreqNorm = i.postings.sb.mem[start:end] if i.freqNormReader == nil { i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm) i.freqNormDecoder = govarint.NewU64Base128Decoder(i.freqNormReader) } else { i.freqNormReader.Reset(i.currChunkFreqNorm) } end, start = i.locChunkStart, i.locChunkStart s, e = readChunkBoundary(chunk, i.locChunkOffsets) start += s end += e i.currChunkLoc = i.postings.sb.mem[start:end] if i.locReader == nil { i.locReader = bytes.NewReader(i.currChunkLoc) i.locDecoder = govarint.NewU64Base128Decoder(i.locReader) } else { i.locReader.Reset(i.currChunkLoc) } i.currChunk = uint32(chunk) return nil } func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { if i.normBits1Hit != 0 { return 1, i.normBits1Hit, false, nil } freqHasLocs, err := i.freqNormDecoder.GetU64() if err != nil { return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) } freq, hasLocs := decodeFreqHasLocs(freqHasLocs) normBits, err := i.freqNormDecoder.GetU64() if err != nil { return 0, 0, false, fmt.Errorf("error reading norm: %v", err) } return freq, normBits, hasLocs, err } func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { rv := freq << 1 if hasLocs { rv = rv | 0x01 // 0'th LSB encodes whether there are locations } return rv } func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { freq := freqHasLocs >> 1 hasLocs := freqHasLocs&0x01 != 0 return freq, hasLocs } // readLocation processes all the integers on the stream representing a single // location. if you care about it, pass in a non-nil location struct, and we // will fill it. if you don't care about it, pass in nil and we safely consume // the contents. func (i *PostingsIterator) readLocation(l *Location) error { // read off field fieldID, err := i.locDecoder.GetU64() if err != nil { return fmt.Errorf("error reading location field: %v", err) } // read off pos pos, err := i.locDecoder.GetU64() if err != nil { return fmt.Errorf("error reading location pos: %v", err) } // read off start start, err := i.locDecoder.GetU64() if err != nil { return fmt.Errorf("error reading location start: %v", err) } // read off end end, err := i.locDecoder.GetU64() if err != nil { return fmt.Errorf("error reading location end: %v", err) } // read off num array pos numArrayPos, err := i.locDecoder.GetU64() if err != nil { return fmt.Errorf("error reading location num array pos: %v", err) } // group these together for less branching if l != nil { l.field = i.postings.sb.fieldsInv[fieldID] l.pos = pos l.start = start l.end = end if cap(l.ap) < int(numArrayPos) { l.ap = make([]uint64, int(numArrayPos)) } else { l.ap = l.ap[:int(numArrayPos)] } } // read off array positions for k := 0; k < int(numArrayPos); k++ { ap, err := i.locDecoder.GetU64() if err != nil { return fmt.Errorf("error reading array position: %v", err) } if l != nil { l.ap[k] = ap } } return nil } // Next returns the next posting on the postings list, or nil at the end func (i *PostingsIterator) Next() (segment.Posting, error) { docNum, exists, err := i.nextDocNum() if err != nil || !exists { return nil, err } i.next = Posting{} // clear the struct rv := &i.next rv.docNum = docNum var normBits uint64 var hasLocs bool rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() if err != nil { return nil, err } rv.norm = math.Float32frombits(uint32(normBits)) if hasLocs { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] } else { i.nextLocs = make([]Location, rv.freq) } if cap(i.nextSegmentLocs) < int(rv.freq) { i.nextSegmentLocs = make([]segment.Location, rv.freq) } rv.locs = i.nextSegmentLocs[0:rv.freq] for j := 0; j < int(rv.freq); j++ { err := i.readLocation(&i.nextLocs[j]) if err != nil { return nil, err } rv.locs[j] = &i.nextLocs[j] } } return rv, nil } var freqHasLocs1Hit = encodeFreqHasLocs(1, false) // nextBytes returns the docNum and the encoded freq & loc bytes for // the next posting func (i *PostingsIterator) nextBytes() ( docNumOut uint64, freq uint64, normBits uint64, bytesFreqNorm []byte, bytesLoc []byte, err error) { docNum, exists, err := i.nextDocNum() if err != nil || !exists { return 0, 0, 0, nil, nil, err } if i.normBits1Hit != 0 { if i.buf == nil { i.buf = make([]byte, binary.MaxVarintLen64*2) } n := binary.PutUvarint(i.buf, freqHasLocs1Hit) n += binary.PutUvarint(i.buf[n:], i.normBits1Hit) return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil } startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() var hasLocs bool freq, normBits, hasLocs, err = i.readFreqNormHasLocs() if err != nil { return 0, 0, 0, nil, nil, err } endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] if hasLocs { startLoc := len(i.currChunkLoc) - i.locReader.Len() for j := uint64(0); j < freq; j++ { err := i.readLocation(nil) if err != nil { return 0, 0, 0, nil, nil, err } } endLoc := len(i.currChunkLoc) - i.locReader.Len() bytesLoc = i.currChunkLoc[startLoc:endLoc] } return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil } // nextDocNum returns the next docNum on the postings list, and also // sets up the currChunk / loc related fields of the iterator. func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { if i.normBits1Hit != 0 { if i.docNum1Hit == docNum1HitFinished { return 0, false, nil } docNum := i.docNum1Hit i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum return docNum, true, nil } if i.actual == nil || !i.actual.HasNext() { return 0, false, nil } n := i.actual.Next() allN := i.all.Next() nChunk := n / i.postings.sb.chunkFactor allNChunk := allN / i.postings.sb.chunkFactor // n is the next actual hit (excluding some postings), and // allN is the next hit in the full postings, and // if they don't match, move 'all' forwards until they do for allN != n { // in the same chunk, so move the freq/norm/loc decoders forward if allNChunk == nChunk { if i.currChunk != nChunk || i.currChunkFreqNorm == nil { err := i.loadChunk(int(nChunk)) if err != nil { return 0, false, fmt.Errorf("error loading chunk: %v", err) } } // read off freq/offsets even though we don't care about them freq, _, hasLocs, err := i.readFreqNormHasLocs() if err != nil { return 0, false, err } if hasLocs { for j := 0; j < int(freq); j++ { err := i.readLocation(nil) if err != nil { return 0, false, err } } } } allN = i.all.Next() allNChunk = allN / i.postings.sb.chunkFactor } if i.currChunk != nChunk || i.currChunkFreqNorm == nil { err := i.loadChunk(int(nChunk)) if err != nil { return 0, false, fmt.Errorf("error loading chunk: %v", err) } } return uint64(n), true, nil } // Posting is a single entry in a postings list type Posting struct { docNum uint64 freq uint64 norm float32 locs []segment.Location } func (p *Posting) Size() int { sizeInBytes := reflectStaticSizePosting for _, entry := range p.locs { sizeInBytes += entry.Size() } return sizeInBytes } // Number returns the document number of this posting in this segment func (p *Posting) Number() uint64 { return p.docNum } // Frequency returns the frequence of occurance of this term in this doc/field func (p *Posting) Frequency() uint64 { return p.freq } // Norm returns the normalization factor for this posting func (p *Posting) Norm() float64 { return float64(p.norm) } // Locations returns the location information for each occurance func (p *Posting) Locations() []segment.Location { return p.locs } // Location represents the location of a single occurance type Location struct { field string pos uint64 start uint64 end uint64 ap []uint64 } func (l *Location) Size() int { return reflectStaticSizeLocation + len(l.field) + len(l.ap)*size.SizeOfUint64 } // Field returns the name of the field (useful in composite fields to know // which original field the value came from) func (l *Location) Field() string { return l.field } // Start returns the start byte offset of this occurance func (l *Location) Start() uint64 { return l.start } // End returns the end byte offset of this occurance func (l *Location) End() uint64 { return l.end } // Pos returns the 1-based phrase position of this occurance func (l *Location) Pos() uint64 { return l.pos } // ArrayPositions returns the array position vector associated with this occurance func (l *Location) ArrayPositions() []uint64 { return l.ap }