2017-12-09 20:28:33 +01:00
|
|
|
// Copyright (c) 2017 Couchbase, Inc.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package zap
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
|
|
|
"encoding/binary"
|
|
|
|
"math"
|
|
|
|
"os"
|
2017-12-28 07:35:33 +01:00
|
|
|
"sort"
|
2017-12-09 20:28:33 +01:00
|
|
|
|
|
|
|
"github.com/Smerity/govarint"
|
|
|
|
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
2017-12-19 19:49:57 +01:00
|
|
|
"github.com/couchbase/vellum"
|
2017-12-09 20:28:33 +01:00
|
|
|
"github.com/golang/snappy"
|
|
|
|
)
|
|
|
|
|
2018-02-06 01:03:17 +01:00
|
|
|
// version is the zap file format version number written into (and
// verified against) the segment footer.
const version uint32 = 3

// fieldNotUninverted is the sentinel doc-value offset recorded for a
// field that was not uninverted (i.e. has no persisted doc values).
const fieldNotUninverted = math.MaxUint64
|
2017-12-09 20:28:33 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
// PersistSegmentBase persists SegmentBase in the zap file format.
|
|
|
|
func PersistSegmentBase(sb *SegmentBase, path string) error {
|
2017-12-09 20:28:33 +01:00
|
|
|
flag := os.O_RDWR | os.O_CREATE
|
|
|
|
|
|
|
|
f, err := os.OpenFile(path, flag, 0600)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
cleanup := func() {
|
|
|
|
_ = f.Close()
|
|
|
|
_ = os.Remove(path)
|
|
|
|
}
|
|
|
|
|
2017-12-09 20:28:33 +01:00
|
|
|
br := bufio.NewWriter(f)
|
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
_, err = br.Write(sb.mem)
|
|
|
|
if err != nil {
|
|
|
|
cleanup()
|
|
|
|
return err
|
|
|
|
}
|
2017-12-09 20:28:33 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.docValueOffset,
|
|
|
|
sb.chunkFactor, sb.memCRC, br)
|
|
|
|
if err != nil {
|
|
|
|
cleanup()
|
|
|
|
return err
|
|
|
|
}
|
2017-12-09 20:28:33 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
err = br.Flush()
|
|
|
|
if err != nil {
|
|
|
|
cleanup()
|
|
|
|
return err
|
|
|
|
}
|
2017-12-09 20:28:33 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
err = f.Sync()
|
|
|
|
if err != nil {
|
|
|
|
cleanup()
|
|
|
|
return err
|
|
|
|
}
|
2017-12-11 21:47:41 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
err = f.Close()
|
|
|
|
if err != nil {
|
|
|
|
cleanup()
|
|
|
|
return err
|
|
|
|
}
|
2017-12-09 20:28:33 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
return nil
|
|
|
|
}
|
2017-12-13 22:10:06 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
// PersistSegment takes the in-memory segment and persists it to
|
|
|
|
// the specified path in the zap file format.
|
|
|
|
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error {
|
|
|
|
flag := os.O_RDWR | os.O_CREATE
|
2017-12-13 22:10:06 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
f, err := os.OpenFile(path, flag, 0600)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-12-28 07:35:33 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
cleanup := func() {
|
|
|
|
_ = f.Close()
|
|
|
|
_ = os.Remove(path)
|
2017-12-09 20:28:33 +01:00
|
|
|
}
|
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
// buffer the output
|
|
|
|
br := bufio.NewWriter(f)
|
|
|
|
|
|
|
|
// wrap it for counting (tracking offsets)
|
|
|
|
cr := NewCountHashWriter(br)
|
|
|
|
|
|
|
|
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err :=
|
|
|
|
persistBase(memSegment, cr, chunkFactor)
|
2017-12-09 20:28:33 +01:00
|
|
|
if err != nil {
|
2018-01-18 03:46:57 +01:00
|
|
|
cleanup()
|
2017-12-09 20:28:33 +01:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset,
|
|
|
|
chunkFactor, cr.Sum32(), cr)
|
2017-12-09 20:28:33 +01:00
|
|
|
if err != nil {
|
2018-01-18 03:46:57 +01:00
|
|
|
cleanup()
|
2017-12-09 20:28:33 +01:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
err = br.Flush()
|
|
|
|
if err != nil {
|
2018-01-18 03:46:57 +01:00
|
|
|
cleanup()
|
2017-12-09 20:28:33 +01:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2017-12-10 02:52:01 +01:00
|
|
|
err = f.Sync()
|
|
|
|
if err != nil {
|
2018-01-18 03:46:57 +01:00
|
|
|
cleanup()
|
2017-12-10 02:52:01 +01:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2017-12-09 20:28:33 +01:00
|
|
|
err = f.Close()
|
|
|
|
if err != nil {
|
2018-01-18 03:46:57 +01:00
|
|
|
cleanup()
|
2017-12-09 20:28:33 +01:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) (
|
|
|
|
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
|
|
|
|
dictLocs []uint64, err error) {
|
|
|
|
docValueOffset = uint64(fieldNotUninverted)
|
|
|
|
|
|
|
|
if len(memSegment.Stored) > 0 {
|
|
|
|
storedIndexOffset, err = persistStored(memSegment, cr)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, 0, 0, nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, 0, 0, nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
postingsListLocs, err := persistPostingsLocs(memSegment, cr)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, 0, 0, nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, 0, 0, nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, 0, 0, nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, 0, 0, nil, err
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
dictLocs = make([]uint64, len(memSegment.FieldsInv))
|
|
|
|
}
|
|
|
|
|
|
|
|
fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, 0, 0, nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset,
|
|
|
|
dictLocs, nil
|
|
|
|
}
|
|
|
|
|
2017-12-09 20:28:33 +01:00
|
|
|
// persistStored writes each document's stored field values through w as
// a per-document pair of (varint metadata, snappy-compressed data),
// followed by a fixed-width big-endian index of per-document start
// offsets. It returns the file offset where that stored doc index begins.
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) {
	var curr int // running byte offset into the current doc's uncompressed data
	var metaBuf bytes.Buffer
	var data, compressed []byte

	// the encoder writes varint metadata entries into metaBuf
	metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)

	// docNum -> file offset of that doc's stored section
	docNumOffsets := make(map[int]uint64, len(memSegment.Stored))

	for docNum, storedValues := range memSegment.Stored {
		if docNum != 0 {
			// reset buffer if necessary
			// (the first iteration starts with freshly zeroed state)
			curr = 0
			metaBuf.Reset()
			data = data[:0]
			compressed = compressed[:0]
		}

		// parallel per-doc slices: stored value types and array positions
		st := memSegment.StoredTypes[docNum]
		sp := memSegment.StoredPos[docNum]

		// encode fields in order
		for fieldID := range memSegment.FieldsInv {
			if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
				stf := st[uint16(fieldID)]
				spf := sp[uint16(fieldID)]

				var err2 error
				curr, data, err2 = persistStoredFieldValues(fieldID,
					storedFieldValues, stf, spf, curr, metaEncoder, data)
				if err2 != nil {
					return 0, err2
				}
			}
		}

		// flush any pending encoder state into metaBuf before reading it
		metaEncoder.Close()
		metaBytes := metaBuf.Bytes()

		// compress the data
		compressed = snappy.Encode(compressed, data)

		// record where we're about to start writing
		docNumOffsets[docNum] = uint64(w.Count())

		// write out the meta len and compressed data len
		_, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
		if err != nil {
			return 0, err
		}

		// now write the meta
		_, err = w.Write(metaBytes)
		if err != nil {
			return 0, err
		}
		// now write the compressed data
		_, err = w.Write(compressed)
		if err != nil {
			return 0, err
		}
	}

	// return value is the start of the stored index
	rv := uint64(w.Count())
	// now write out the stored doc index
	for docNum := range memSegment.Stored {
		err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
		if err != nil {
			return 0, err
		}
	}

	return rv, nil
}
|
|
|
|
|
2018-02-01 00:08:31 +01:00
|
|
|
func persistStoredFieldValues(fieldID int,
|
|
|
|
storedFieldValues [][]byte, stf []byte, spf [][]uint64,
|
|
|
|
curr int, metaEncoder *govarint.Base128Encoder, data []byte) (
|
|
|
|
int, []byte, error) {
|
|
|
|
for i := 0; i < len(storedFieldValues); i++ {
|
|
|
|
// encode field
|
|
|
|
_, err := metaEncoder.PutU64(uint64(fieldID))
|
|
|
|
if err != nil {
|
|
|
|
return 0, nil, err
|
|
|
|
}
|
|
|
|
// encode type
|
|
|
|
_, err = metaEncoder.PutU64(uint64(stf[i]))
|
|
|
|
if err != nil {
|
|
|
|
return 0, nil, err
|
|
|
|
}
|
|
|
|
// encode start offset
|
|
|
|
_, err = metaEncoder.PutU64(uint64(curr))
|
|
|
|
if err != nil {
|
|
|
|
return 0, nil, err
|
|
|
|
}
|
|
|
|
// end len
|
|
|
|
_, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
|
|
|
|
if err != nil {
|
|
|
|
return 0, nil, err
|
|
|
|
}
|
|
|
|
// encode number of array pos
|
|
|
|
_, err = metaEncoder.PutU64(uint64(len(spf[i])))
|
|
|
|
if err != nil {
|
|
|
|
return 0, nil, err
|
|
|
|
}
|
|
|
|
// encode all array positions
|
|
|
|
for _, pos := range spf[i] {
|
|
|
|
_, err = metaEncoder.PutU64(pos)
|
|
|
|
if err != nil {
|
|
|
|
return 0, nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
data = append(data, storedFieldValues[i]...)
|
|
|
|
curr += len(storedFieldValues[i])
|
|
|
|
}
|
|
|
|
|
|
|
|
return curr, data, nil
|
|
|
|
}
|
|
|
|
|
2017-12-09 20:28:33 +01:00
|
|
|
// persistPostingDetails writes, for every posting list, first its chunked
// freq/norm data and then (in a second pass) its chunked location data.
// It returns, per posting list, the file offset where its freq section
// starts and the file offset where its loc section starts.
//
// NOTE(review): the -1 in the chunked coder's max-docNum argument assumes
// len(memSegment.Stored) > 0; persistBase only calls this when stored
// docs exist — confirm if adding new callers.
func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) {
	freqOffsets := make([]uint64, 0, len(memSegment.Postings))
	tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
	for postingID := range memSegment.Postings {
		if postingID != 0 {
			// reuse the encoder; the first iteration starts fresh
			tfEncoder.Reset()
		}
		// parallel slices, indexed by position within this posting list
		freqs := memSegment.Freqs[postingID]
		norms := memSegment.Norms[postingID]
		postingsListItr := memSegment.Postings[postingID].Iterator()
		var offset int
		for postingsListItr.HasNext() {
			docNum := uint64(postingsListItr.Next())

			// put freq & norm (norm stored as the raw float32 bit pattern)
			err := tfEncoder.Add(docNum, freqs[offset], uint64(math.Float32bits(norms[offset])))
			if err != nil {
				return nil, nil, err
			}

			offset++
		}

		// record where this postings freq info starts
		freqOffsets = append(freqOffsets, uint64(w.Count()))

		tfEncoder.Close()
		_, err := tfEncoder.Write(w)
		if err != nil {
			return nil, nil, err
		}
	}

	// now do it again for the locations
	locOffsets := make([]uint64, 0, len(memSegment.Postings))
	locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
	for postingID := range memSegment.Postings {
		if postingID != 0 {
			locEncoder.Reset()
		}
		// parallel location slices; locOffset advances once per term
		// occurrence (freq), not once per document
		freqs := memSegment.Freqs[postingID]
		locfields := memSegment.Locfields[postingID]
		locpos := memSegment.Locpos[postingID]
		locstarts := memSegment.Locstarts[postingID]
		locends := memSegment.Locends[postingID]
		locarraypos := memSegment.Locarraypos[postingID]
		postingsListItr := memSegment.Postings[postingID].Iterator()
		var offset int
		var locOffset int
		for postingsListItr.HasNext() {
			docNum := uint64(postingsListItr.Next())
			n := int(freqs[offset])
			for i := 0; i < n; i++ {
				if len(locfields) > 0 {
					err := locEncoder.Add(docNum, uint64(locfields[locOffset]),
						locpos[locOffset], locstarts[locOffset], locends[locOffset],
						uint64(len(locarraypos[locOffset])))
					if err != nil {
						return nil, nil, err
					}

					// put each array position
					err = locEncoder.Add(docNum, locarraypos[locOffset]...)
					if err != nil {
						return nil, nil, err
					}
				}
				locOffset++
			}
			offset++
		}

		// record where this postings loc info starts
		locOffsets = append(locOffsets, uint64(w.Count()))

		locEncoder.Close()
		_, err := locEncoder.Write(w)
		if err != nil {
			return nil, nil, err
		}
	}

	return freqOffsets, locOffsets, nil
}
|
|
|
|
|
2017-12-13 20:41:20 +01:00
|
|
|
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
|
2018-01-16 07:43:08 +01:00
|
|
|
rv = make([]uint64, 0, len(memSegment.PostingsLocs))
|
2018-01-21 19:53:58 +01:00
|
|
|
var reuseBuf bytes.Buffer
|
|
|
|
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
|
2017-12-11 21:47:41 +01:00
|
|
|
for postingID := range memSegment.PostingsLocs {
|
|
|
|
// record where we start this posting loc
|
|
|
|
rv = append(rv, uint64(w.Count()))
|
2017-12-13 20:41:20 +01:00
|
|
|
// write out the length and bitmap
|
2018-01-21 19:53:58 +01:00
|
|
|
_, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint)
|
2017-12-11 21:47:41 +01:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
|
2017-12-13 20:41:20 +01:00
|
|
|
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
|
|
|
|
postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
|
2018-01-16 07:43:08 +01:00
|
|
|
rv = make([]uint64, 0, len(memSegment.Postings))
|
2018-01-21 19:53:58 +01:00
|
|
|
var reuseBuf bytes.Buffer
|
|
|
|
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
|
2017-12-09 20:28:33 +01:00
|
|
|
for postingID := range memSegment.Postings {
|
|
|
|
// record where we start this posting list
|
|
|
|
rv = append(rv, uint64(w.Count()))
|
2017-12-13 21:22:13 +01:00
|
|
|
|
|
|
|
// write out the term info, loc info, and loc posting list offset
|
|
|
|
_, err = writeUvarints(w, freqOffsets[postingID],
|
|
|
|
locOffsets[postingID], postingsListLocs[postingID])
|
2017-12-11 21:47:41 +01:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-12-13 21:22:13 +01:00
|
|
|
|
2017-12-13 20:41:20 +01:00
|
|
|
// write out the length and bitmap
|
2018-01-21 19:53:58 +01:00
|
|
|
_, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint)
|
2017-12-09 20:28:33 +01:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return rv, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
|
2018-01-16 07:43:08 +01:00
|
|
|
rv := make([]uint64, 0, len(memSegment.DictKeys))
|
2017-12-09 20:28:33 +01:00
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
varintBuf := make([]byte, binary.MaxVarintLen64)
|
|
|
|
|
2017-12-09 20:28:33 +01:00
|
|
|
var buffer bytes.Buffer
|
2018-03-01 16:31:53 +01:00
|
|
|
builder, err := vellum.New(&buffer, nil)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-12-09 20:28:33 +01:00
|
|
|
for fieldID, fieldTerms := range memSegment.DictKeys {
|
|
|
|
|
|
|
|
dict := memSegment.Dicts[fieldID]
|
|
|
|
// now walk the dictionary in order of fieldTerms (already sorted)
|
2018-01-16 07:43:08 +01:00
|
|
|
for _, fieldTerm := range fieldTerms {
|
|
|
|
postingID := dict[fieldTerm] - 1
|
2017-12-09 20:28:33 +01:00
|
|
|
postingsAddr := postingsLocs[postingID]
|
2018-01-16 07:43:08 +01:00
|
|
|
err = builder.Insert([]byte(fieldTerm), postingsAddr)
|
2017-12-09 20:28:33 +01:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
err = builder.Close()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// record where this dictionary starts
|
|
|
|
rv = append(rv, uint64(w.Count()))
|
|
|
|
|
|
|
|
vellumData := buffer.Bytes()
|
|
|
|
|
|
|
|
// write out the length of the vellum data
|
2018-01-18 03:46:57 +01:00
|
|
|
n := binary.PutUvarint(varintBuf, uint64(len(vellumData)))
|
|
|
|
_, err = w.Write(varintBuf[:n])
|
2017-12-09 20:28:33 +01:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// write this vellum to disk
|
|
|
|
_, err = w.Write(vellumData)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2018-03-01 16:31:53 +01:00
|
|
|
|
|
|
|
// reset buffer and vellum builder
|
|
|
|
buffer.Reset()
|
|
|
|
err = builder.Reset(&buffer)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-12-09 20:28:33 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return rv, nil
|
|
|
|
}
|
2017-12-28 07:35:33 +01:00
|
|
|
|
|
|
|
// docIDRange is a sortable slice of document numbers; it implements
// sort.Interface so doc values can be emitted in ascending docID order.
type docIDRange []uint64

func (a docIDRange) Len() int           { return len(a) }
func (a docIDRange) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] }
|
|
|
|
|
|
|
|
func persistDocValues(memSegment *mem.Segment, w *CountHashWriter,
|
|
|
|
chunkFactor uint32) (map[uint16]uint64, error) {
|
2017-12-29 17:09:29 +01:00
|
|
|
fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv))
|
2017-12-28 07:35:33 +01:00
|
|
|
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
|
|
|
|
2018-03-04 21:03:02 +01:00
|
|
|
var postings *mem.PostingsList
|
|
|
|
var postingsItr *mem.PostingsIterator
|
|
|
|
|
2018-01-04 11:04:55 +01:00
|
|
|
for fieldID := range memSegment.DocValueFields {
|
2017-12-28 07:35:33 +01:00
|
|
|
field := memSegment.FieldsInv[fieldID]
|
|
|
|
docTermMap := make(map[uint64][]byte, 0)
|
|
|
|
dict, err := memSegment.Dictionary(field)
|
|
|
|
if err != nil {
|
2017-12-29 17:09:29 +01:00
|
|
|
return nil, err
|
2017-12-28 07:35:33 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
dictItr := dict.Iterator()
|
|
|
|
next, err := dictItr.Next()
|
|
|
|
for err == nil && next != nil {
|
2018-03-04 21:03:02 +01:00
|
|
|
var err1 error
|
|
|
|
postings, err1 = dict.(*mem.Dictionary).InitPostingsList(next.Term, nil, postings)
|
2017-12-28 07:35:33 +01:00
|
|
|
if err1 != nil {
|
2017-12-29 17:09:29 +01:00
|
|
|
return nil, err
|
2017-12-28 07:35:33 +01:00
|
|
|
}
|
|
|
|
|
2018-03-04 21:03:02 +01:00
|
|
|
postingsItr = postings.InitIterator(postingsItr)
|
2017-12-28 07:35:33 +01:00
|
|
|
nextPosting, err2 := postingsItr.Next()
|
|
|
|
for err2 == nil && nextPosting != nil {
|
|
|
|
docNum := nextPosting.Number()
|
2018-03-03 19:59:53 +01:00
|
|
|
docTermMap[docNum] = append(append(docTermMap[docNum], []byte(next.Term)...), termSeparator)
|
2017-12-28 07:35:33 +01:00
|
|
|
nextPosting, err2 = postingsItr.Next()
|
|
|
|
}
|
|
|
|
if err2 != nil {
|
2017-12-29 17:09:29 +01:00
|
|
|
return nil, err2
|
2017-12-28 07:35:33 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
next, err = dictItr.Next()
|
|
|
|
}
|
|
|
|
if err != nil {
|
2017-12-29 17:09:29 +01:00
|
|
|
return nil, err
|
2017-12-28 07:35:33 +01:00
|
|
|
}
|
2018-03-03 19:59:53 +01:00
|
|
|
|
2017-12-28 07:35:33 +01:00
|
|
|
// sort wrt to docIDs
|
2018-03-04 21:06:45 +01:00
|
|
|
docNumbers := make(docIDRange, 0, len(docTermMap))
|
2017-12-28 07:35:33 +01:00
|
|
|
for k := range docTermMap {
|
|
|
|
docNumbers = append(docNumbers, k)
|
|
|
|
}
|
|
|
|
sort.Sort(docNumbers)
|
|
|
|
|
|
|
|
for _, docNum := range docNumbers {
|
|
|
|
err = fdvEncoder.Add(docNum, docTermMap[docNum])
|
|
|
|
if err != nil {
|
2017-12-29 17:09:29 +01:00
|
|
|
return nil, err
|
2017-12-28 07:35:33 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fieldChunkOffsets[fieldID] = uint64(w.Count())
|
2017-12-29 17:09:29 +01:00
|
|
|
err = fdvEncoder.Close()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-12-28 07:35:33 +01:00
|
|
|
// persist the doc value details for this field
|
|
|
|
_, err = fdvEncoder.Write(w)
|
|
|
|
if err != nil {
|
2017-12-29 17:09:29 +01:00
|
|
|
return nil, err
|
2017-12-28 07:35:33 +01:00
|
|
|
}
|
2018-02-03 19:51:24 +01:00
|
|
|
// reseting encoder for the next field
|
2017-12-28 07:35:33 +01:00
|
|
|
fdvEncoder.Reset()
|
|
|
|
}
|
|
|
|
|
|
|
|
return fieldChunkOffsets, nil
|
|
|
|
}
|
|
|
|
|
2018-01-18 03:46:57 +01:00
|
|
|
func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter,
|
|
|
|
chunkFactor uint32) (uint64, error) {
|
2017-12-28 07:35:33 +01:00
|
|
|
fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor)
|
|
|
|
if err != nil {
|
2017-12-29 17:09:29 +01:00
|
|
|
return 0, err
|
2017-12-28 07:35:33 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
fieldDocValuesOffset := uint64(w.Count())
|
|
|
|
buf := make([]byte, binary.MaxVarintLen64)
|
2017-12-29 17:09:29 +01:00
|
|
|
offset := uint64(0)
|
2017-12-28 07:35:33 +01:00
|
|
|
ok := true
|
|
|
|
for fieldID := range memSegment.FieldsInv {
|
|
|
|
// if the field isn't configured for docValue, then mark
|
|
|
|
// the offset accordingly
|
|
|
|
if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok {
|
2017-12-29 17:09:29 +01:00
|
|
|
offset = fieldNotUninverted
|
2017-12-28 07:35:33 +01:00
|
|
|
}
|
|
|
|
n := binary.PutUvarint(buf, uint64(offset))
|
|
|
|
_, err := w.Write(buf[:n])
|
|
|
|
if err != nil {
|
2017-12-29 17:09:29 +01:00
|
|
|
return 0, err
|
2017-12-28 07:35:33 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return fieldDocValuesOffset, nil
|
|
|
|
}
|
2018-01-18 03:46:57 +01:00
|
|
|
|
|
|
|
func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) {
|
|
|
|
var br bytes.Buffer
|
|
|
|
|
|
|
|
cr := NewCountHashWriter(&br)
|
|
|
|
|
|
|
|
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err :=
|
|
|
|
persistBase(memSegment, cr, chunkFactor)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2018-02-14 23:50:30 +01:00
|
|
|
return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor,
|
|
|
|
memSegment.FieldsMap, memSegment.FieldsInv, numDocs,
|
|
|
|
storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs)
|
|
|
|
}
|
|
|
|
|
|
|
|
func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
|
|
|
|
fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64,
|
|
|
|
storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64,
|
|
|
|
dictLocs []uint64) (*SegmentBase, error) {
|
2018-01-18 03:46:57 +01:00
|
|
|
sb := &SegmentBase{
|
2018-02-14 23:50:30 +01:00
|
|
|
mem: mem,
|
|
|
|
memCRC: memCRC,
|
2018-01-18 03:46:57 +01:00
|
|
|
chunkFactor: chunkFactor,
|
2018-02-14 23:50:30 +01:00
|
|
|
fieldsMap: fieldsMap,
|
|
|
|
fieldsInv: fieldsInv,
|
2018-01-18 03:46:57 +01:00
|
|
|
numDocs: numDocs,
|
|
|
|
storedIndexOffset: storedIndexOffset,
|
|
|
|
fieldsIndexOffset: fieldsIndexOffset,
|
|
|
|
docValueOffset: docValueOffset,
|
|
|
|
dictLocs: dictLocs,
|
|
|
|
fieldDvIterMap: make(map[uint16]*docValueIterator),
|
|
|
|
}
|
|
|
|
|
2018-02-14 23:50:30 +01:00
|
|
|
err := sb.loadDvIterators()
|
2018-01-18 03:46:57 +01:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return sb, nil
|
|
|
|
}
|