add initial version of zap file format
This commit is contained in:
parent
ff2e6b98e4
commit
9781d9b089
|
@ -0,0 +1,120 @@
|
|||
# zap file format
|
||||
|
||||
## stored fields section
|
||||
|
||||
- for each document
|
||||
- preparation phase:
|
||||
- produce a slice of metadata bytes and data bytes
|
||||
- produce these slices in field id order
|
||||
- field value is appended to the data slice
|
||||
- metadata slice is govarint encoded with the following values for each field value
|
||||
- field id (uint16)
|
||||
- field type (byte)
|
||||
- field value start offset in uncompressed data slice (uint64)
|
||||
- field value length (uint64)
|
||||
- field number of array positions (uint64)
|
||||
- one additional value for each array position (uint64)
|
||||
- compress the data slice using snappy
|
||||
- file writing phase:
|
||||
- remember the start offset for this document
|
||||
- write out metadata length (varint uint64)
|
||||
- write out compressed data length (varint uint64)
|
||||
- write out the metadata bytes
|
||||
- write out the compressed data bytes
|
||||
|
||||
## stored fields idx
|
||||
|
||||
- for each document
|
||||
- write start offset (remembered from previous section) of stored data (big endian uint64)
|
||||
|
||||
With this index and a known document number, we have direct access to all the stored field data.
|
||||
|
||||
## posting details (freq/norm) section
|
||||
|
||||
- for each posting list
|
||||
- produce a slice containing multiple consecutive chunks (each chunk is govarint stream)
|
||||
- produce a slice remembering offsets of where each chunk starts
|
||||
- preparation phase:
|
||||
- for each hit in the posting list
|
||||
- if this hit is in the next chunk, close out the encoding of the last chunk and record the start offset of the next
|
||||
- encode term frequency (uint64)
|
||||
- encode norm factor (float32)
|
||||
- file writing phase:
|
||||
- remember start position for this posting list details
|
||||
- write out number of chunks that follow (varint uint64)
|
||||
- write out length of each chunk (each a varint uint64)
|
||||
- write out the byte slice containing all the chunk data
|
||||
|
||||
If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
|
||||
|
||||
## posting details (location) section
|
||||
|
||||
- for each posting list
|
||||
- produce a slice containing multiple consecutive chunks (each chunk is govarint stream)
|
||||
- produce a slice remembering offsets of where each chunk starts
|
||||
- preparation phase:
|
||||
- for each hit in the posting list
|
||||
- if this hit is in the next chunk, close out the encoding of the last chunk and record the start offset of the next
|
||||
- encode field (uint16)
|
||||
- encode field pos (uint64)
|
||||
- encode field start (uint64)
|
||||
- encode field end (uint64)
|
||||
- encode number of array positions to follow (uint64)
|
||||
- encode each array position (each uint64)
|
||||
- file writing phase:
|
||||
- remember start position for this posting list details
|
||||
- write out number of chunks that follow (varint uint64)
|
||||
- write out length of each chunk (each a varint uint64)
|
||||
- write out the byte slice containing all the chunk data
|
||||
|
||||
If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
|
||||
|
||||
## postings list section
|
||||
|
||||
- for each posting list
|
||||
- preparation phase:
|
||||
- encode roaring bitmap posting list to bytes (so we know the length)
|
||||
- file writing phase:
|
||||
- remember the start position for this posting list
|
||||
- write freq/norm details offset (remembered from previous, as varint uint64)
|
||||
- write location details offset (remembered from previous, as varint uint64)
|
||||
- write length of encoded roaring bitmap
|
||||
- write the serialized roaring bitmap data
|
||||
|
||||
## dictionary
|
||||
|
||||
- for each field
|
||||
- preparation phase:
|
||||
- encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous)
|
||||
- file writing phase:
|
||||
- remember the start position of this persistDictionary
|
||||
- write length of vellum data (varint uint64)
|
||||
- write out vellum data
|
||||
|
||||
## fields section
|
||||
|
||||
- for each field
|
||||
- file writing phase:
|
||||
- remember start offset for each field
|
||||
- write 1 if field has location info indexed, 0 if not (varint uint64)
|
||||
- write dictionary address (remembered from previous) (varint uint64)
|
||||
- write length of field name (varint uint64)
|
||||
- write field name bytes
|
||||
|
||||
## fields idx
|
||||
|
||||
- for each field
|
||||
- file writing phase:
|
||||
- write big endian uint64 of start offset for each field
|
||||
|
||||
NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size.
|
||||
|
||||
## footer
|
||||
|
||||
- file writing phase
|
||||
- write number of docs (big endian uint64)
|
||||
- write stored field index location (big endian uint64)
|
||||
- write field index location (big endian uint64)
|
||||
- write out chunk factor (big endian uint32)
|
||||
- write out version (big endian uint32)
|
||||
- write out file CRC of everything preceding this (big endian uint32)
|
|
@ -0,0 +1,615 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"math"
|
||||
"os"
|
||||
|
||||
"github.com/Smerity/govarint"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
||||
"github.com/couchbaselabs/vellum"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var version uint32
|
||||
|
||||
// PersistSegment takes the in-memory segment and persists it to the specified
|
||||
// path in the zap file format.
|
||||
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (err error) {
|
||||
|
||||
flag := os.O_RDWR | os.O_CREATE
|
||||
|
||||
f, err := os.OpenFile(path, flag, 0600)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// bufer the output
|
||||
br := bufio.NewWriter(f)
|
||||
|
||||
// wrap it for counting (tracking offsets)
|
||||
cr := NewCountHashWriter(br)
|
||||
|
||||
var storedIndexOffset uint64
|
||||
storedIndexOffset, err = persistStored(memSegment, cr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var freqOffsets, locOffsets []uint64
|
||||
freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var postingsLocs []uint64
|
||||
postingsLocs, err = persistPostingsLists(memSegment, cr, freqOffsets, locOffsets)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var dictLocs []uint64
|
||||
dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var fieldIndexStart uint64
|
||||
fieldIndexStart, err = persistFields(memSegment, cr, dictLocs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = persistFooter(uint64(len(memSegment.Stored)), storedIndexOffset,
|
||||
fieldIndexStart, chunkFactor, cr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = br.Flush()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = f.Close()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// persistStored writes the stored fields section followed by the stored
// fields index, returning the file offset at which that index begins.
//
// Per document: field values are appended to an uncompressed data slice
// while a parallel govarint metadata stream records, for each value, the
// field id, type, start offset, length, and array positions.  The data
// slice is snappy-compressed, then the document is written as:
// meta length (uvarint), compressed length (uvarint), meta bytes,
// compressed bytes.  The index is one big-endian uint64 start offset per
// document, in document-number order.
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) {

	// curr tracks the next value's start offset in the uncompressed data slice
	var curr int
	var metaBuf bytes.Buffer
	var data, compressed []byte

	// start offset of each document's stored entry, keyed by doc number
	docNumOffsets := make(map[int]uint64, len(memSegment.Stored))

	for docNum, storedValues := range memSegment.Stored {
		if docNum != 0 {
			// reset the per-document scratch state, keeping backing storage
			metaBuf.Reset()
			data = data[:0]
			compressed = compressed[:0]
			curr = 0
		}

		metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)

		// encode fields in order (FieldsInv index order == field id order)
		for fieldID := range memSegment.FieldsInv {
			if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
				// has stored values for this field
				num := len(storedFieldValues)

				// process each value
				for i := 0; i < num; i++ {
					// encode field id
					_, err2 := metaEncoder.PutU64(uint64(fieldID))
					if err2 != nil {
						return 0, err2
					}
					// encode field value type
					_, err2 = metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i]))
					if err2 != nil {
						return 0, err2
					}
					// encode start offset into the uncompressed data slice
					_, err2 = metaEncoder.PutU64(uint64(curr))
					if err2 != nil {
						return 0, err2
					}
					// encode value length
					_, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
					if err2 != nil {
						return 0, err2
					}
					// encode number of array positions
					_, err2 = metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i])))
					if err2 != nil {
						return 0, err2
					}
					// encode all array positions
					for j := 0; j < len(memSegment.StoredPos[docNum][uint16(fieldID)][i]); j++ {
						_, err2 = metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j])
						if err2 != nil {
							return 0, err2
						}
					}
					// append the raw value bytes to the data slice
					data = append(data, storedFieldValues[i]...)
					// advance the running start offset
					curr += len(storedFieldValues[i])
				}
			}
		}
		metaEncoder.Close()

		metaBytes := metaBuf.Bytes()

		// compress the data (note: len(compressed) is 0 here, so snappy
		// allocates its own output buffer each document — TODO confirm
		// whether passing compressed[:cap(compressed)] would allow reuse)
		compressed = snappy.Encode(compressed, data)

		// record where we're about to start writing this document's entry
		docNumOffsets[docNum] = uint64(w.Count())

		buf := make([]byte, binary.MaxVarintLen64)
		// write out the meta length
		n := binary.PutUvarint(buf, uint64(len(metaBytes)))
		_, err := w.Write(buf[:n])
		if err != nil {
			return 0, err
		}
		// write out the compressed data length
		n = binary.PutUvarint(buf, uint64(len(compressed)))
		_, err = w.Write(buf[:n])
		if err != nil {
			return 0, err
		}
		// now write the meta
		_, err = w.Write(metaBytes)
		if err != nil {
			return 0, err
		}
		// now write the compressed data
		_, err = w.Write(compressed)
		if err != nil {
			return 0, err
		}
	}

	// return value is the start of the stored index
	rv := uint64(w.Count())
	// now write out the stored doc index: one big-endian uint64 per doc,
	// in doc-number order so lookup is a simple multiply
	for docNum := range memSegment.Stored {
		err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
		if err != nil {
			return 0, err
		}
	}

	return rv, nil
}
|
||||
|
||||
// persistPostingDetails writes two sections: the freq/norm details, then the
// location details, one entry per posting list.  It returns, per posting
// list, the start offset of its freq/norm entry and of its location entry.
//
// Each entry is a sequence of govarint-encoded chunks, partitioned by
// docNum/chunkFactor, preceded by the chunk count (uvarint) and each chunk's
// length (uvarint).  Note: total over-counts by one when the doc count is an
// exact multiple of chunkFactor, producing one trailing zero-length chunk.
func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) {
	var freqOffsets, locOfffsets []uint64
	for postingID := range memSegment.Postings {
		postingsListItr := memSegment.Postings[postingID].Iterator()

		// upper bound on the number of chunks for this segment
		total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1

		var freqNormBuf []byte
		// offset walks Freqs/Norms in step with the bitmap iterator —
		// assumes those slices are in docNum order (per mem.Segment
		// construction); confirm against the mem package
		var offset int

		var encodingBuf bytes.Buffer
		encoder := govarint.NewU64Base128Encoder(&encodingBuf)

		chunkLens := make([]uint64, total)
		var currChunk uint64
		for postingsListItr.HasNext() {
			docNum := postingsListItr.Next()
			chunk := uint64(docNum) / uint64(chunkFactor)

			if chunk != currChunk {
				// starting a new chunk
				if encoder != nil {
					// close out the previous chunk and record its length
					encoder.Close()
					encodingBytes := encodingBuf.Bytes()
					chunkLens[currChunk] = uint64(len(encodingBytes))
					freqNormBuf = append(freqNormBuf, encodingBytes...)
					encodingBuf.Reset()
					encoder = govarint.NewU64Base128Encoder(&encodingBuf)
				}

				currChunk = chunk
			}

			// put freq
			_, err := encoder.PutU64(memSegment.Freqs[postingID][offset])
			if err != nil {
				return nil, nil, err
			}

			// put norm (float32 bits packed into the varint stream)
			norm := memSegment.Norms[postingID][offset]
			normBits := math.Float32bits(norm)
			_, err = encoder.PutU32(normBits)
			if err != nil {
				return nil, nil, err
			}

			offset++
		}

		// close out the final chunk (may be empty for an empty posting list)
		if encoder != nil {
			encoder.Close()
			encodingBytes := encodingBuf.Bytes()
			chunkLens[currChunk] = uint64(len(encodingBytes))
			freqNormBuf = append(freqNormBuf, encodingBytes...)
		}

		// record where this postings freq info starts
		freqOffsets = append(freqOffsets, uint64(w.Count()))

		buf := make([]byte, binary.MaxVarintLen64)
		// write out the number of chunks
		n := binary.PutUvarint(buf, uint64(total))
		_, err := w.Write(buf[:n])
		if err != nil {
			return nil, nil, err
		}
		// write out the chunk lens
		for _, chunkLen := range chunkLens {
			n := binary.PutUvarint(buf, uint64(chunkLen))
			_, err = w.Write(buf[:n])
			if err != nil {
				return nil, nil, err
			}
		}
		// write out the concatenated chunk data
		_, err = w.Write(freqNormBuf)
		if err != nil {
			return nil, nil, err
		}

	}

	// now do it again for the locations
	for postingID := range memSegment.Postings {
		postingsListItr := memSegment.Postings[postingID].Iterator()

		total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1

		var locBuf []byte
		// offset indexes Freqs (per-doc hit counts); locOffset indexes the
		// flat per-hit location slices (Locfields/Locpos/...)
		var offset int
		var locOffset int

		var encodingBuf bytes.Buffer
		encoder := govarint.NewU64Base128Encoder(&encodingBuf)

		chunkLens := make([]uint64, total)
		var currChunk uint64
		for postingsListItr.HasNext() {
			docNum := postingsListItr.Next()
			chunk := uint64(docNum) / uint64(chunkFactor)

			if chunk != currChunk {
				// starting a new chunk
				if encoder != nil {
					// close out the previous chunk and record its length
					encoder.Close()
					encodingBytes := encodingBuf.Bytes()
					chunkLens[currChunk] = uint64(len(encodingBytes))
					locBuf = append(locBuf, encodingBytes...)
					encodingBuf.Reset()
					encoder = govarint.NewU64Base128Encoder(&encodingBuf)
				}
				currChunk = chunk
			}

			// one location record per term occurrence (freq) in this doc
			for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ {

				// posting lists with no location info have empty Locfields
				if len(memSegment.Locfields[postingID]) > 0 {
					// put field
					_, err := encoder.PutU64(uint64(memSegment.Locfields[postingID][locOffset]))
					if err != nil {
						return nil, nil, err
					}

					// put pos
					_, err = encoder.PutU64(memSegment.Locpos[postingID][locOffset])
					if err != nil {
						return nil, nil, err
					}

					// put start
					_, err = encoder.PutU64(memSegment.Locstarts[postingID][locOffset])
					if err != nil {
						return nil, nil, err
					}

					// put end
					_, err = encoder.PutU64(memSegment.Locends[postingID][locOffset])
					if err != nil {
						return nil, nil, err
					}

					// put array positions
					num := len(memSegment.Locarraypos[postingID][locOffset])

					// put the number of array positions to follow
					_, err = encoder.PutU64(uint64(num))
					if err != nil {
						return nil, nil, err
					}

					// put each array position
					for j := 0; j < num; j++ {
						_, err = encoder.PutU64(memSegment.Locarraypos[postingID][locOffset][j])
						if err != nil {
							return nil, nil, err
						}
					}
				}

				locOffset++
			}
			offset++
		}

		// close out the final chunk
		if encoder != nil {
			encoder.Close()
			encodingBytes := encodingBuf.Bytes()
			chunkLens[currChunk] = uint64(len(encodingBytes))
			locBuf = append(locBuf, encodingBytes...)
		}

		// record where this postings loc info starts
		locOfffsets = append(locOfffsets, uint64(w.Count()))

		buf := make([]byte, binary.MaxVarintLen64)
		// write out the number of chunks
		n := binary.PutUvarint(buf, uint64(total))
		_, err := w.Write(buf[:n])
		if err != nil {
			return nil, nil, err
		}
		// write out the chunk lens
		for _, chunkLen := range chunkLens {
			n := binary.PutUvarint(buf, uint64(chunkLen))
			_, err = w.Write(buf[:n])
			if err != nil {
				return nil, nil, err
			}
		}
		// write out the concatenated chunk data
		_, err = w.Write(locBuf)
		if err != nil {
			return nil, nil, err
		}

	}
	return freqOffsets, locOfffsets, nil
}
|
||||
|
||||
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, freqOffsets, locOffsets []uint64) ([]uint64, error) {
|
||||
var rv []uint64
|
||||
|
||||
var postingsBuf bytes.Buffer
|
||||
for postingID := range memSegment.Postings {
|
||||
if postingID != 0 {
|
||||
postingsBuf.Reset()
|
||||
}
|
||||
|
||||
// record where we start this posting list
|
||||
rv = append(rv, uint64(w.Count()))
|
||||
|
||||
// write out postings list to memory so we know the len
|
||||
postingsListLen, err := memSegment.Postings[postingID].WriteTo(&postingsBuf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// write out the start of the term info
|
||||
buf := make([]byte, binary.MaxVarintLen64)
|
||||
n := binary.PutUvarint(buf, freqOffsets[postingID])
|
||||
_, err = w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// write out the start of the loc info
|
||||
n = binary.PutUvarint(buf, locOffsets[postingID])
|
||||
_, err = w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// write out the length of this postings list
|
||||
n = binary.PutUvarint(buf, uint64(postingsListLen))
|
||||
_, err = w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// write out the postings list itself
|
||||
_, err = w.Write(postingsBuf.Bytes())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// persistDictionary writes one vellum FST per field, mapping each term to
// the file offset of its posting list, and returns the start offset of each
// field's dictionary.  Each dictionary is written as: vellum data length
// (uvarint) followed by the vellum bytes.
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
	var rv []uint64

	var buffer bytes.Buffer
	for fieldID, fieldTerms := range memSegment.DictKeys {
		if fieldID != 0 {
			// reuse the buffer's backing storage between fields
			buffer.Reset()
		}

		// start a new vellum for this field
		builder, err := vellum.New(&buffer, nil)
		if err != nil {
			return nil, err
		}

		dict := memSegment.Dicts[fieldID]
		// now walk the dictionary in order of fieldTerms (already sorted,
		// as vellum requires keys inserted in lexicographic order)
		for i := range fieldTerms {
			// dict values appear to be 1-based posting ids (hence the -1)
			// — assumed from mem.Segment's convention; confirm there
			postingID := dict[fieldTerms[i]] - 1
			postingsAddr := postingsLocs[postingID]
			err = builder.Insert([]byte(fieldTerms[i]), postingsAddr)
			if err != nil {
				return nil, err
			}
		}
		err = builder.Close()
		if err != nil {
			return nil, err
		}

		// record where this dictionary starts
		rv = append(rv, uint64(w.Count()))

		vellumData := buffer.Bytes()

		// write out the length of the vellum data
		buf := make([]byte, binary.MaxVarintLen64)
		n := binary.PutUvarint(buf, uint64(len(vellumData)))
		_, err = w.Write(buf[:n])
		if err != nil {
			return nil, err
		}

		// write this vellum to disk
		_, err = w.Write(vellumData)
		if err != nil {
			return nil, err
		}
	}

	return rv, nil
}
|
||||
|
||||
func persistFields(memSegment *mem.Segment, w *CountHashWriter, dictLocs []uint64) (uint64, error) {
|
||||
var rv uint64
|
||||
|
||||
var fieldStarts []uint64
|
||||
for fieldID, fieldName := range memSegment.FieldsInv {
|
||||
|
||||
// record start of this field
|
||||
fieldStarts = append(fieldStarts, uint64(w.Count()))
|
||||
|
||||
buf := make([]byte, binary.MaxVarintLen64)
|
||||
// write out if the field has indexed locs (0 or 1)
|
||||
var indexedLoc uint64
|
||||
if memSegment.FieldsLoc[fieldID] {
|
||||
indexedLoc = 1
|
||||
}
|
||||
n := binary.PutUvarint(buf, indexedLoc)
|
||||
_, err := w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// write out dict location for this field
|
||||
n = binary.PutUvarint(buf, dictLocs[fieldID])
|
||||
_, err = w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// write out the length of the field name
|
||||
n = binary.PutUvarint(buf, uint64(len(fieldName)))
|
||||
_, err = w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// write out the field name
|
||||
_, err = w.Write([]byte(fieldName))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
// now write out the fields index
|
||||
rv = uint64(w.Count())
|
||||
|
||||
// now write out the stored doc index
|
||||
for fieldID := range memSegment.FieldsInv {
|
||||
err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// NOTE: update if you make the footer bigger
|
||||
// crc + ver + chunk + field offset + stored offset + num docs
|
||||
const footerSize = 4 + 4 + 4 + 8 + 8 + 8
|
||||
|
||||
func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64,
|
||||
chunkFactor uint32, w *CountHashWriter) error {
|
||||
// write out the number of docs
|
||||
err := binary.Write(w, binary.BigEndian, numDocs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write out the stored field index location:
|
||||
err = binary.Write(w, binary.BigEndian, storedIndexOffset)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write out the field index location
|
||||
err = binary.Write(w, binary.BigEndian, fieldIndexOffset)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write out 32-bit chunk factor
|
||||
err = binary.Write(w, binary.BigEndian, chunkFactor)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write out 32-bit version
|
||||
err = binary.Write(w, binary.BigEndian, version)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write out CRC-32 of everything upto but not including this CRC
|
||||
err = binary.Write(w, binary.BigEndian, w.Sum32())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,288 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
||||
)
|
||||
|
||||
func TestBuild(t *testing.T) {
|
||||
_ = os.RemoveAll("/tmp/scorch.zap")
|
||||
|
||||
memSegment := buildMemSegment()
|
||||
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// buildMemSegment forges a single analyzed document (id "a", with _id,
// name, desc, and a two-valued "tag" array field) directly into an
// in-memory segment, bypassing the analysis pipeline, for persistence tests.
func buildMemSegment() *mem.Segment {
	doc := &document.Document{
		ID: "a",
		Fields: []document.Field{
			document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil),
			document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
		},
		CompositeFields: []*document.CompositeField{
			document.NewCompositeField("_all", true, nil, []string{"_id"}),
		},
	}

	// forge analyzed docs: hand-built token streams, one per field above
	results := []*index.AnalysisResult{
		&index.AnalysisResult{
			Document: doc,
			Analyzed: []analysis.TokenFrequencies{
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      1,
						Position: 1,
						Term:     []byte("a"),
					},
				}, nil, false),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      3,
						Position: 1,
						Term:     []byte("wow"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("some"),
					},
					&analysis.Token{
						Start:    5,
						End:      10,
						Position: 2,
						Term:     []byte("thing"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("cold"),
					},
				}, []uint64{0}, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("dark"),
					},
				}, []uint64{1}, true),
			},
			// token counts per field, parallel to Analyzed
			Length: []int{
				1,
				1,
				2,
				1,
				1,
			},
		},
	}

	// fix up composite fields
	for _, ar := range results {
		for i, f := range ar.Document.Fields {
			for _, cf := range ar.Document.CompositeFields {
				cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i])
			}
		}
	}

	return mem.NewFromAnalyzedDocs(results)
}
|
||||
|
||||
// buildMemSegmentMulti forges two analyzed documents ("a" and "b", with the
// same field layout) into an in-memory segment, for multi-document
// persistence tests.
func buildMemSegmentMulti() *mem.Segment {

	doc := &document.Document{
		ID: "a",
		Fields: []document.Field{
			document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil),
			document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
		},
		CompositeFields: []*document.CompositeField{
			document.NewCompositeField("_all", true, nil, []string{"_id"}),
		},
	}

	doc2 := &document.Document{
		ID: "b",
		Fields: []document.Field{
			document.NewTextFieldCustom("_id", nil, []byte("b"), document.IndexField|document.StoreField, nil),
			document.NewTextFieldCustom("name", nil, []byte("who"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
			document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
		},
		CompositeFields: []*document.CompositeField{
			document.NewCompositeField("_all", true, nil, []string{"_id"}),
		},
	}

	// forge analyzed docs: hand-built token streams, one per field per doc
	results := []*index.AnalysisResult{
		&index.AnalysisResult{
			Document: doc,
			Analyzed: []analysis.TokenFrequencies{
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      1,
						Position: 1,
						Term:     []byte("a"),
					},
				}, nil, false),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      3,
						Position: 1,
						Term:     []byte("wow"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("some"),
					},
					&analysis.Token{
						Start:    5,
						End:      10,
						Position: 2,
						Term:     []byte("thing"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("cold"),
					},
				}, []uint64{0}, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("dark"),
					},
				}, []uint64{1}, true),
			},
			// token counts per field, parallel to Analyzed
			Length: []int{
				1,
				1,
				2,
				1,
				1,
			},
		},
		&index.AnalysisResult{
			Document: doc2,
			Analyzed: []analysis.TokenFrequencies{
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      1,
						Position: 1,
						Term:     []byte("b"),
					},
				}, nil, false),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      3,
						Position: 1,
						Term:     []byte("who"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("some"),
					},
					&analysis.Token{
						Start:    5,
						End:      10,
						Position: 2,
						Term:     []byte("thing"),
					},
				}, nil, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("cold"),
					},
				}, []uint64{0}, true),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      4,
						Position: 1,
						Term:     []byte("dark"),
					},
				}, []uint64{1}, true),
			},
			// token counts per field, parallel to Analyzed
			Length: []int{
				1,
				1,
				2,
				1,
				1,
			},
		},
	}

	// fix up composite fields
	for _, ar := range results {
		for i, f := range ar.Document.Fields {
			for _, cf := range ar.Document.CompositeFields {
				cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i])
			}
		}
	}

	segment := mem.NewFromAnalyzedDocs(results)

	return segment
}
|
|
@ -0,0 +1,3 @@
|
|||
# zap command line utility
|
||||
|
||||
Kind of a hack, put together quickly to help debug issues in zap files.
|
|
@ -0,0 +1,72 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
|
||||
"github.com/couchbaselabs/vellum"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// dictCmd represents the dict command
|
||||
var dictCmd = &cobra.Command{
|
||||
Use: "dict [path] [field]",
|
||||
Short: "dict prints the term dictionary for the specified field",
|
||||
Long: `The dict command lets you print the term dictionary for the specified field.`,
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
if len(args) < 2 {
|
||||
return fmt.Errorf("must specify field")
|
||||
}
|
||||
|
||||
data := segment.Data()
|
||||
|
||||
addr, err := segment.DictAddr(args[1])
|
||||
if err != nil {
|
||||
return fmt.Errorf("error determing address: %v", err)
|
||||
}
|
||||
fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr)
|
||||
|
||||
vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64])
|
||||
fmt.Printf("vellum length: %d\n", vellumLen)
|
||||
fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen]
|
||||
fmt.Printf("raw vellum data % x\n", fstBytes)
|
||||
fmt.Printf("dictionary:\n\n")
|
||||
if fstBytes != nil {
|
||||
fst, err := vellum.Load(fstBytes)
|
||||
if err != nil {
|
||||
return fmt.Errorf("dictionary field %s vellum err: %v", args[1], err)
|
||||
}
|
||||
|
||||
itr, err := fst.Iterator(nil, nil)
|
||||
for err == nil {
|
||||
currTerm, currVal := itr.Current()
|
||||
fmt.Printf("%s - %d (%x)\n", currTerm, currVal, currVal)
|
||||
err = itr.Next()
|
||||
}
|
||||
if err != nil && err != vellum.ErrIteratorDone {
|
||||
return fmt.Errorf("error iterating dictionary: %v", err)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
RootCmd.AddCommand(dictCmd)
|
||||
}
|
|
@ -0,0 +1,124 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/couchbaselabs/vellum"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// exploreCmd represents the explore command
|
||||
var exploreCmd = &cobra.Command{
|
||||
Use: "explore [path] [field] <term> <docNum>",
|
||||
Short: "explores the index by field, then term (optional), and then docNum (optional)",
|
||||
Long: `The explore command lets you explore the index in order of field, then optionally by term, then optionally again by doc number.`,
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
if len(args) < 2 {
|
||||
return fmt.Errorf("must specify field")
|
||||
}
|
||||
|
||||
data := segment.Data()
|
||||
|
||||
addr, err := segment.DictAddr(args[1])
|
||||
if err != nil {
|
||||
return fmt.Errorf("error determing address: %v", err)
|
||||
}
|
||||
fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr)
|
||||
|
||||
vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64])
|
||||
fmt.Printf("vellum length: %d\n", vellumLen)
|
||||
fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen]
|
||||
fmt.Printf("raw vellum data % x\n", fstBytes)
|
||||
|
||||
if len(args) >= 3 {
|
||||
if fstBytes != nil {
|
||||
fst, err := vellum.Load(fstBytes)
|
||||
if err != nil {
|
||||
return fmt.Errorf("dictionary field %s vellum err: %v", args[1], err)
|
||||
}
|
||||
postingsAddr, exists, err := fst.Get([]byte(args[2]))
|
||||
if err != nil {
|
||||
return fmt.Errorf("error looking for term : %v", err)
|
||||
}
|
||||
if exists {
|
||||
fmt.Printf("postings list begins at %d (%x)\n", postingsAddr, postingsAddr)
|
||||
|
||||
var n uint64
|
||||
freqAddr, read := binary.Uvarint(data[postingsAddr : postingsAddr+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
|
||||
var locAddr uint64
|
||||
locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
|
||||
var postingListLen uint64
|
||||
postingListLen, _ = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
|
||||
|
||||
fmt.Printf("Posting List Length: %d\n", postingListLen)
|
||||
|
||||
fmt.Printf("Freq details at: %d (%x)\n", freqAddr, freqAddr)
|
||||
numChunks, r2 := binary.Uvarint(data[freqAddr : freqAddr+binary.MaxVarintLen64])
|
||||
n = uint64(r2)
|
||||
|
||||
var freqOffsets []uint64
|
||||
for j := uint64(0); j < numChunks; j++ {
|
||||
chunkLen, r3 := binary.Uvarint(data[freqAddr+n : freqAddr+n+binary.MaxVarintLen64])
|
||||
n += uint64(r3)
|
||||
freqOffsets = append(freqOffsets, chunkLen)
|
||||
}
|
||||
running := freqAddr + n
|
||||
for k, offset := range freqOffsets {
|
||||
fmt.Printf("freq chunk: %d, len %d, start at %d (%x) end %d (%x)\n", k, offset, running, running, running+offset, running+offset)
|
||||
running += offset
|
||||
}
|
||||
|
||||
fmt.Printf("Loc details at: %d (%x)\n", locAddr, locAddr)
|
||||
numLChunks, r4 := binary.Uvarint(data[locAddr : locAddr+binary.MaxVarintLen64])
|
||||
n = uint64(r4)
|
||||
fmt.Printf("there are %d loc chunks\n", numLChunks)
|
||||
|
||||
var locOffsets []uint64
|
||||
for j := uint64(0); j < numLChunks; j++ {
|
||||
log.Printf("reading from %d(%x)\n", locAddr+n, locAddr+n)
|
||||
log.Printf("data i see here: % x\n", data[locAddr+n:locAddr+n+binary.MaxVarintLen64])
|
||||
lchunkLen, r4 := binary.Uvarint(data[locAddr+n : locAddr+n+binary.MaxVarintLen64])
|
||||
n += uint64(r4)
|
||||
log.Printf("see chunk len %d(%x)\n", lchunkLen, lchunkLen)
|
||||
locOffsets = append(locOffsets, lchunkLen)
|
||||
}
|
||||
|
||||
running2 := locAddr + n
|
||||
for k, offset := range locOffsets {
|
||||
fmt.Printf("loc chunk: %d, len %d(%x), start at %d (%x) end %d (%x)\n", k, offset, offset, running2, running2, running2+offset, running2+offset)
|
||||
running2 += offset
|
||||
}
|
||||
|
||||
} else {
|
||||
fmt.Printf("dictionary does not contain term '%s'\n", args[2])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
RootCmd.AddCommand(exploreCmd)
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// footerCmd represents the footer command
|
||||
var footerCmd = &cobra.Command{
|
||||
Use: "footer [path]",
|
||||
Short: "prints the contents of the zap footer",
|
||||
Long: `The footer command will print the contents of the footer.`,
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
data := segment.Data()
|
||||
fmt.Printf("Length: %d\n", len(data))
|
||||
fmt.Printf("CRC: %#x\n", segment.CRC())
|
||||
fmt.Printf("Version: %d\n", segment.Version())
|
||||
fmt.Printf("Chunk Factor: %d\n", segment.ChunkFactor())
|
||||
fmt.Printf("Fields Idx: %d (%#x)\n", segment.FieldsIndexOffset(), segment.FieldsIndexOffset())
|
||||
fmt.Printf("Stored Idx: %d (%#x)\n", segment.StoredIndexOffset(), segment.StoredIndexOffset())
|
||||
fmt.Printf("Num Docs: %d\n", segment.NumDocs())
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
RootCmd.AddCommand(footerCmd)
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
var segment *zap.Segment
|
||||
|
||||
// RootCmd represents the base command when called without any subcommands
|
||||
var RootCmd = &cobra.Command{
|
||||
Use: "zap",
|
||||
Short: "command-line tool to interact with a zap file",
|
||||
Long: `Zap is a command-line tool to interact with a zap file.`,
|
||||
PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
|
||||
|
||||
if len(args) < 1 {
|
||||
return fmt.Errorf("must specify path to zap file")
|
||||
}
|
||||
|
||||
segInf, err := zap.Open(args[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("error opening zap file: %v", err)
|
||||
}
|
||||
segment = segInf.(*zap.Segment)
|
||||
|
||||
return nil
|
||||
},
|
||||
PersistentPostRunE: func(cmd *cobra.Command, args []string) error {
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
// Execute adds all child commands to the root command sets flags appropriately.
|
||||
// This is called by main.main(). It only needs to happen once to the rootCmd.
|
||||
func Execute() {
|
||||
if err := RootCmd.Execute(); err != nil {
|
||||
fmt.Println(err)
|
||||
os.Exit(-1)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,73 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
"github.com/spf13/cobra"
|
||||
)
|
||||
|
||||
// storedCmd represents the stored command
|
||||
var storedCmd = &cobra.Command{
|
||||
Use: "stored [path] [docNum]",
|
||||
Short: "prints the stored section for a doc number",
|
||||
Long: `The stored command will print the raw stored data bytes for the specified document number.`,
|
||||
RunE: func(cmd *cobra.Command, args []string) error {
|
||||
if len(args) < 2 {
|
||||
return fmt.Errorf("must specify doc number")
|
||||
}
|
||||
docNum, err := strconv.Atoi(args[1])
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to parse doc number: %v", err)
|
||||
}
|
||||
if docNum >= int(segment.NumDocs()) {
|
||||
return fmt.Errorf("invalid doc number %d (valid 0 - %d)", docNum, segment.NumDocs()-1)
|
||||
}
|
||||
data := segment.Data()
|
||||
storedIdx := segment.StoredIndexOffset()
|
||||
// read docNum entry in the index
|
||||
indexPos := storedIdx + (8 * uint64(docNum))
|
||||
storedStartAddr := binary.BigEndian.Uint64(data[indexPos : indexPos+8])
|
||||
fmt.Printf("Stored field starts at %d (%#x)\n", storedStartAddr, storedStartAddr)
|
||||
|
||||
var n uint64
|
||||
metaLen, read := binary.Uvarint(data[storedStartAddr : storedStartAddr+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
fmt.Printf("Meta Len: %d\n", metaLen)
|
||||
var dataLen uint64
|
||||
dataLen, read = binary.Uvarint(data[storedStartAddr+n : storedStartAddr+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
fmt.Printf("Data Len: %d\n", dataLen)
|
||||
meta := data[storedStartAddr+n : storedStartAddr+n+metaLen]
|
||||
fmt.Printf("Raw meta: % x\n", meta)
|
||||
raw := data[storedStartAddr+n+metaLen : storedStartAddr+n+metaLen+dataLen]
|
||||
fmt.Printf("Raw data (len %d): % x\n", len(raw), raw)
|
||||
uncompressed, err := snappy.Decode(nil, raw)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
fmt.Printf("Uncompressed data (len %d): % x\n", len(uncompressed), uncompressed)
|
||||
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
func init() {
|
||||
RootCmd.AddCommand(storedCmd)
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap/cmd/zap/cmd"
|
||||
)
|
||||
|
||||
// main is the entry point for the zap command-line utility; all of the
// real work happens in the cmd package's cobra commands.
func main() {
	cmd.Execute()
}
|
|
@ -0,0 +1,55 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"hash"
|
||||
"hash/crc32"
|
||||
"io"
|
||||
)
|
||||
|
||||
// CountHashWriter is a wrapper around a Writer which counts the number of
|
||||
// bytes which have been written
|
||||
type CountHashWriter struct {
|
||||
w io.Writer
|
||||
h hash.Hash32
|
||||
n int
|
||||
}
|
||||
|
||||
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
|
||||
func NewCountHashWriter(w io.Writer) *CountHashWriter {
|
||||
return &CountHashWriter{
|
||||
w: w,
|
||||
h: crc32.NewIEEE(),
|
||||
}
|
||||
}
|
||||
|
||||
// Write writes the provided bytes to the wrapped writer and counts the bytes
|
||||
func (c *CountHashWriter) Write(b []byte) (int, error) {
|
||||
n, err := c.w.Write(b)
|
||||
c.n += n
|
||||
_, _ = c.h.Write(b)
|
||||
return n, err
|
||||
}
|
||||
|
||||
// Count returns the number of bytes written
|
||||
func (c *CountHashWriter) Count() int {
|
||||
return c.n
|
||||
}
|
||||
|
||||
// Sum32 returns the CRC-32 hash of the content written to this writer
|
||||
func (c *CountHashWriter) Sum32() uint32 {
|
||||
return c.h.Sum32()
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/couchbaselabs/vellum"
|
||||
"github.com/couchbaselabs/vellum/regexp"
|
||||
)
|
||||
|
||||
// Dictionary is the zap representation of the term dictionary
type Dictionary struct {
	segment *Segment    // owning segment; supplies the mmap'd file data
	field   string      // name of the field this dictionary covers
	fieldID uint16      // numeric id assigned to the field
	fst     *vellum.FST // term -> postings-offset FST; nil when the field has no terms
}
|
||||
|
||||
// PostingsList returns the postings list for the specified term
func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
	// delegate to the unexported variant, which returns the concrete
	// *PostingsList type
	return d.postingsList(term, except)
}
|
||||
|
||||
func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*PostingsList, error) {
|
||||
rv := &PostingsList{
|
||||
dictionary: d,
|
||||
term: term,
|
||||
except: except,
|
||||
}
|
||||
|
||||
if d.fst != nil {
|
||||
postingsOffset, exists, err := d.fst.Get([]byte(term))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vellum err: %v", err)
|
||||
}
|
||||
if exists {
|
||||
rv.postingsOffset = postingsOffset
|
||||
// read the location of the freq/norm details
|
||||
var n uint64
|
||||
var read int
|
||||
|
||||
rv.freqOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
rv.locOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
var postingsLen uint64
|
||||
postingsLen, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
|
||||
roaringBytes := d.segment.mm[postingsOffset+n : postingsOffset+n+postingsLen]
|
||||
|
||||
bitmap := roaring.NewBitmap()
|
||||
_, err = bitmap.FromBuffer(roaringBytes)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error loading roaring bitmap: %v", err)
|
||||
}
|
||||
|
||||
rv.postings = bitmap
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// Iterator returns an iterator for this dictionary
|
||||
func (d *Dictionary) Iterator() segment.DictionaryIterator {
|
||||
|
||||
rv := &DictionaryIterator{
|
||||
d: d,
|
||||
}
|
||||
|
||||
if d.fst != nil {
|
||||
itr, err := d.fst.Iterator(nil, nil)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// PrefixIterator returns an iterator which only visits terms having the
// the specified prefix
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
	rv := &DictionaryIterator{
		d: d,
	}

	if d.fst != nil {
		// NOTE(review): prefix is spliced into a regular expression
		// unescaped, so regex metacharacters in a prefix (".", "*",
		// "[", ...) would alter the match — confirm callers only pass
		// literal prefixes.
		r, err := regexp.New(prefix + ".*")
		if err == nil {
			itr, err := d.fst.Search(r, nil, nil)
			if err == nil {
				rv.itr = itr
			}
		}
		// errors are swallowed here: rv.itr stays nil and the iterator
		// behaves as empty
	}

	return rv
}
|
||||
|
||||
// RangeIterator returns an iterator which only visits terms between the
|
||||
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
|
||||
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
|
||||
rv := &DictionaryIterator{
|
||||
d: d,
|
||||
}
|
||||
|
||||
// need to increment the end position to be inclusive
|
||||
endBytes := []byte(end)
|
||||
if endBytes[len(endBytes)-1] < 0xff {
|
||||
endBytes[len(endBytes)-1]++
|
||||
} else {
|
||||
endBytes = append(endBytes, 0xff)
|
||||
}
|
||||
|
||||
if d.fst != nil {
|
||||
itr, err := d.fst.Iterator([]byte(start), endBytes)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// DictionaryIterator is an iterator for term dictionary
type DictionaryIterator struct {
	d   *Dictionary     // dictionary being iterated
	itr vellum.Iterator // underlying FST iterator; nil means iterate nothing
	err error           // sticky result of the most recent itr.Next call
}
|
||||
|
||||
// Next returns the next entry in the dictionary
|
||||
func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
if i.itr == nil || i.err == vellum.ErrIteratorDone {
|
||||
return nil, nil
|
||||
} else if i.err != nil {
|
||||
return nil, i.err
|
||||
}
|
||||
term, count := i.itr.Current()
|
||||
rv := &index.DictEntry{
|
||||
Term: string(term),
|
||||
Count: count,
|
||||
}
|
||||
i.err = i.itr.Next()
|
||||
return rv, nil
|
||||
}
|
|
@ -0,0 +1,183 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"os"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
||||
)
|
||||
|
||||
func buildMemSegmentForDict() *mem.Segment {
|
||||
doc := &document.Document{
|
||||
ID: "a",
|
||||
Fields: []document.Field{
|
||||
document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil),
|
||||
document.NewTextFieldCustom("desc", nil, []byte("apple ball cat dog egg fish bat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
|
||||
},
|
||||
}
|
||||
|
||||
// forge analyzed docs
|
||||
results := []*index.AnalysisResult{
|
||||
&index.AnalysisResult{
|
||||
Document: doc,
|
||||
Analyzed: []analysis.TokenFrequencies{
|
||||
analysis.TokenFrequency(analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
Term: []byte("a"),
|
||||
},
|
||||
}, nil, false),
|
||||
analysis.TokenFrequency(analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Position: 1,
|
||||
Term: []byte("apple"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Start: 6,
|
||||
End: 10,
|
||||
Position: 2,
|
||||
Term: []byte("ball"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Start: 11,
|
||||
End: 14,
|
||||
Position: 3,
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Start: 15,
|
||||
End: 18,
|
||||
Position: 4,
|
||||
Term: []byte("dog"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Start: 19,
|
||||
End: 22,
|
||||
Position: 5,
|
||||
Term: []byte("egg"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Start: 20,
|
||||
End: 24,
|
||||
Position: 6,
|
||||
Term: []byte("fish"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Start: 25,
|
||||
End: 28,
|
||||
Position: 7,
|
||||
Term: []byte("bat"),
|
||||
},
|
||||
}, nil, true),
|
||||
},
|
||||
Length: []int{
|
||||
1,
|
||||
7,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
segment := mem.NewFromAnalyzedDocs(results)
|
||||
|
||||
return segment
|
||||
}
|
||||
|
||||
func TestDictionary(t *testing.T) {
|
||||
|
||||
_ = os.RemoveAll("/tmp/scorch.zap")
|
||||
|
||||
memSegment := buildMemSegmentForDict()
|
||||
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
|
||||
if err != nil {
|
||||
t.Fatalf("error persisting segment: %v", err)
|
||||
}
|
||||
|
||||
segment, err := Open("/tmp/scorch.zap")
|
||||
if err != nil {
|
||||
t.Fatalf("error opening segment: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
cerr := segment.Close()
|
||||
if cerr != nil {
|
||||
t.Fatalf("error closing segment: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
dict, err := segment.Dictionary("desc")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// test basic full iterator
|
||||
expected := []string{"apple", "ball", "bat", "cat", "dog", "egg", "fish"}
|
||||
var got []string
|
||||
itr := dict.Iterator()
|
||||
next, err := itr.Next()
|
||||
for next != nil && err == nil {
|
||||
got = append(got, next.Term)
|
||||
next, err = itr.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("dict itr error: %v", err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(expected, got) {
|
||||
t.Errorf("expected: %v, got: %v", expected, got)
|
||||
}
|
||||
|
||||
// test prefix iterator
|
||||
expected = []string{"ball", "bat"}
|
||||
got = got[:0]
|
||||
itr = dict.PrefixIterator("b")
|
||||
next, err = itr.Next()
|
||||
for next != nil && err == nil {
|
||||
got = append(got, next.Term)
|
||||
next, err = itr.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("dict itr error: %v", err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(expected, got) {
|
||||
t.Errorf("expected: %v, got: %v", expected, got)
|
||||
}
|
||||
|
||||
// test range iterator
|
||||
expected = []string{"cat", "dog", "egg"}
|
||||
got = got[:0]
|
||||
itr = dict.RangeIterator("cat", "egg")
|
||||
next, err = itr.Next()
|
||||
for next != nil && err == nil {
|
||||
got = append(got, next.Term)
|
||||
next, err = itr.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("dict itr error: %v", err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(expected, got) {
|
||||
t.Errorf("expected: %v, got: %v", expected, got)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,362 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/Smerity/govarint"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
	dictionary     *Dictionary     // dictionary (and thus segment) this list was read from
	term           string          // the term this list belongs to
	postingsOffset uint64          // where this term's postings entry begins in the file
	freqOffset     uint64          // offset of the freq/norm chunk table
	locOffset      uint64          // offset of the location chunk table
	postings       *roaring.Bitmap // doc numbers containing the term
	except         *roaring.Bitmap // doc numbers excluded from iteration and counts
	postingKey     []byte
}
||||
|
||||
// Iterator returns an iterator for this postings list. It eagerly reads
// the freq/norm and location chunk-length tables from the segment's
// mmap'd data so the iterator can later seek directly to any chunk.
func (p *PostingsList) Iterator() segment.PostingsIterator {
	rv := &PostingsIterator{
		postings: p,
	}
	if p.postings != nil {
		// prepare the freq chunk details: a varint chunk count followed
		// by one varint length per chunk
		var n uint64
		var read int
		var numFreqChunks uint64
		numFreqChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
		n += uint64(read)
		rv.freqChunkLens = make([]uint64, int(numFreqChunks))
		for i := 0; i < int(numFreqChunks); i++ {
			rv.freqChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
			n += uint64(read)
		}
		// the chunk data itself begins right after the length table
		rv.freqChunkStart = p.freqOffset + n

		// prepare the loc chunk details (same layout as freq/norm)
		n = 0
		var numLocChunks uint64
		numLocChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
		n += uint64(read)
		rv.locChunkLens = make([]uint64, int(numLocChunks))
		for i := 0; i < int(numLocChunks); i++ {
			rv.locChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
			n += uint64(read)
		}
		rv.locChunkStart = p.locOffset + n

		// "all" visits every doc; "actual" additionally removes the
		// excepted docs when an except bitmap is present
		rv.all = p.postings.Iterator()
		if p.except != nil {
			allExcept := p.postings.Clone()
			allExcept.AndNot(p.except)
			rv.actual = allExcept.Iterator()
		} else {
			rv.actual = p.postings.Iterator()
		}
	}

	return rv
}
|
||||
|
||||
// Count returns the number of items on this postings list
|
||||
func (p *PostingsList) Count() uint64 {
|
||||
var rv uint64
|
||||
if p.postings != nil {
|
||||
rv = p.postings.GetCardinality()
|
||||
if p.except != nil {
|
||||
except := p.except.GetCardinality()
|
||||
if except > rv {
|
||||
// avoid underflow
|
||||
except = rv
|
||||
}
|
||||
rv -= except
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
	postings  *PostingsList
	all       roaring.IntIterable // every doc in the postings bitmap
	offset    int                 // NOTE(review): appears to track position within the current chunk — confirm against the Next implementation
	locoffset int                 // NOTE(review): appears to track location position within the current chunk — confirm against the Next implementation
	actual    roaring.IntIterable // docs remaining after removing the except bitmap

	currChunk         uint32                   // index of the currently loaded chunk
	currChunkFreqNorm []byte                   // raw bytes of the current freq/norm chunk
	currChunkLoc      []byte                   // raw bytes of the current location chunk
	freqNormDecoder   *govarint.Base128Decoder // varint decoder over currChunkFreqNorm
	locDecoder        *govarint.Base128Decoder // varint decoder over currChunkLoc

	freqChunkLens  []uint64 // byte length of each freq/norm chunk
	freqChunkStart uint64   // file offset where the first freq/norm chunk begins

	locChunkLens  []uint64 // byte length of each location chunk
	locChunkStart uint64   // file offset where the first location chunk begins
}
|
||||
|
||||
// loadChunk positions the iterator's freq/norm and location decoders at
// the start of the requested chunk, slicing the chunk bytes directly
// out of the segment's mmap'd data.
func (i *PostingsIterator) loadChunk(chunk int) error {
	if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) {
		return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens))
	}
	// load correct chunk bytes
	// a chunk's start is the sum of the lengths of all prior chunks
	start := i.freqChunkStart
	for j := 0; j < chunk; j++ {
		start += i.freqChunkLens[j]
	}
	end := start + i.freqChunkLens[chunk]
	i.currChunkFreqNorm = i.postings.dictionary.segment.mm[start:end]
	i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm))

	// same computation for the location chunk
	start = i.locChunkStart
	for j := 0; j < chunk; j++ {
		start += i.locChunkLens[j]
	}
	end = start + i.locChunkLens[chunk]
	i.currChunkLoc = i.postings.dictionary.segment.mm[start:end]
	i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc))
	i.currChunk = uint32(chunk)
	return nil
}
|
||||
|
||||
func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) {
|
||||
freq, err := i.freqNormDecoder.GetU64()
|
||||
if err != nil {
|
||||
return 0, 0, fmt.Errorf("error reading frequency: %v", err)
|
||||
}
|
||||
normBits, err := i.freqNormDecoder.GetU64()
|
||||
if err != nil {
|
||||
return 0, 0, fmt.Errorf("error reading norm: %v", err)
|
||||
}
|
||||
return freq, normBits, err
|
||||
}
|
||||
|
||||
// readLocation processes all the integers on the stream representing a single
// location. if you care about it, pass in a non-nil location struct, and we
// will fill it. if you don't care about it, pass in nil and we safely consume
// the contents.
//
// The stream layout per location is: field, pos, start, end, numArrayPos,
// then numArrayPos array-position values — all govarint uint64s. Every value
// is always consumed, even when l is nil, so the decoder stays aligned on
// the next location.
func (i *PostingsIterator) readLocation(l *Location) error {
	// read off field
	fieldID, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location field: %v", err)
	}
	// read off pos
	pos, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location pos: %v", err)
	}
	// read off start
	start, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location start: %v", err)
	}
	// read off end
	end, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location end: %v", err)
	}
	// read off num array pos
	numArrayPos, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location num array pos: %v", err)
	}

	// group these together for less branching
	if l != nil {
		// fieldID indexes fieldsInv to recover the field name
		l.field = i.postings.dictionary.segment.fieldsInv[fieldID]
		l.pos = pos
		l.start = start
		l.end = end
		if numArrayPos > 0 {
			l.ap = make([]uint64, int(numArrayPos))
		}
	}

	// read off array positions
	for k := 0; k < int(numArrayPos); k++ {
		ap, err := i.locDecoder.GetU64()
		if err != nil {
			return fmt.Errorf("error reading array position: %v", err)
		}
		if l != nil {
			l.ap[k] = ap
		}
	}

	return nil
}
|
||||
|
||||
// Next returns the next posting on the postings list, or nil at the end
//
// Because freq/norm and location data are stored as consecutive varint
// streams per chunk, skipping an excluded hit still requires decoding (and
// discarding) its stream entries to stay aligned.
func (i *PostingsIterator) Next() (segment.Posting, error) {
	if i.actual == nil || !i.actual.HasNext() {
		return nil, nil
	}
	n := i.actual.Next()
	nChunk := n / i.postings.dictionary.segment.chunkFactor
	allN := i.all.Next()
	allNChunk := allN / i.postings.dictionary.segment.chunkFactor

	// n is the next actual hit (excluding some postings)
	// allN is the next hit in the full postings
	// if they don't match, adjust offsets to factor in item we're skipping over
	// incr the all iterator, and check again
	//
	// NOTE(review): allNChunk is computed once above but not recomputed after
	// i.all.Next() at the bottom of this loop — confirm that skipping hits
	// that cross a chunk boundary behaves as intended.
	for allN != n {

		// in different chunks, reset offsets
		if allNChunk != nChunk {
			i.locoffset = 0
			i.offset = 0
		} else {

			// same chunk: must decode and discard the skipped hit's data
			if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
				err := i.loadChunk(int(nChunk))
				if err != nil {
					return nil, fmt.Errorf("error loading chunk: %v", err)
				}
			}

			// read off freq/offsets even though we don't care about them
			freq, _, err := i.readFreqNorm()
			if err != nil {
				return nil, err
			}
			if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
				// consume 'freq' locations to keep the loc decoder aligned
				for j := 0; j < int(freq); j++ {
					err := i.readLocation(nil)
					if err != nil {
						return nil, err
					}
				}
			}

			// in same chunk, need to account for offsets
			i.offset++
		}

		allN = i.all.Next()
	}

	// ensure the chunk containing the returned hit is loaded
	if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
		err := i.loadChunk(int(nChunk))
		if err != nil {
			return nil, fmt.Errorf("error loading chunk: %v", err)
		}
	}

	rv := &Posting{
		iterator: i,
		docNum:   uint64(n),
	}

	var err error
	var normBits uint64
	rv.freq, normBits, err = i.readFreqNorm()
	if err != nil {
		return nil, err
	}
	// norm is stored as the raw bits of a float32
	rv.norm = math.Float32frombits(uint32(normBits))
	if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
		// read off 'freq' locations
		rv.locs = make([]segment.Location, rv.freq)
		locs := make([]Location, rv.freq)
		for j := 0; j < int(rv.freq); j++ {
			err := i.readLocation(&locs[j])
			if err != nil {
				return nil, err
			}
			rv.locs[j] = &locs[j]
		}
	}

	return rv, nil
}
|
||||
|
||||
// Posting is a single entry in a postings list
type Posting struct {
	iterator *PostingsIterator // iterator that produced this posting
	docNum   uint64            // document number within the segment

	freq uint64             // term frequency in this doc/field
	norm float32            // normalization factor
	locs []segment.Location // populated only when the field stores locations
}
|
||||
|
||||
// Number returns the document number of this posting in this segment
func (p *Posting) Number() uint64 {
	return p.docNum
}
|
||||
|
||||
// Frequency returns the frequency of occurrence of this term in this doc/field
func (p *Posting) Frequency() uint64 {
	return p.freq
}
|
||||
|
||||
// Norm returns the normalization factor for this posting
func (p *Posting) Norm() float64 {
	return float64(p.norm)
}
|
||||
|
||||
// Locations returns the location information for each occurrence
func (p *Posting) Locations() []segment.Location {
	return p.locs
}
|
||||
|
||||
// Location represents the location of a single occurrence
type Location struct {
	field string   // name of the field the occurrence appeared in
	pos   uint64   // 1-based phrase position
	start uint64   // start byte offset
	end   uint64   // end byte offset
	ap    []uint64 // array positions; nil when there are none
}
|
||||
|
||||
// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
	return l.field
}
|
||||
|
||||
// Start returns the start byte offset of this occurrence
func (l *Location) Start() uint64 {
	return l.start
}
|
||||
|
||||
// End returns the end byte offset of this occurrence
func (l *Location) End() uint64 {
	return l.end
}
|
||||
|
||||
// Pos returns the 1-based phrase position of this occurrence
func (l *Location) Pos() uint64 {
	return l.pos
}
|
||||
|
||||
// ArrayPositions returns the array position vector associated with this occurrence
func (l *Location) ArrayPositions() []uint64 {
	return l.ap
}
|
|
@ -0,0 +1,352 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/Smerity/govarint"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/couchbaselabs/vellum"
|
||||
mmap "github.com/edsrzf/mmap-go"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
// Open returns a zap impl of a segment
|
||||
func Open(path string) (segment.Segment, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
mm, err := mmap.Map(f, mmap.RDONLY, 0)
|
||||
if err != nil {
|
||||
// mmap failed, try to close the file
|
||||
_ = f.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
rv := &Segment{
|
||||
f: f,
|
||||
mm: mm,
|
||||
path: path,
|
||||
fieldsMap: make(map[string]uint16),
|
||||
}
|
||||
|
||||
err = rv.loadConfig()
|
||||
if err != nil {
|
||||
_ = rv.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
err = rv.loadFields()
|
||||
if err != nil {
|
||||
_ = rv.Close()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// Segment implements the segment.Segment interface over top the zap file format
type Segment struct {
	f                 *os.File  // underlying file handle
	mm                mmap.MMap // read-only memory map of the whole file
	path              string    // path the segment was opened from
	crc               uint32    // footer: CRC value
	version           uint32    // footer: file format version
	chunkFactor       uint32    // footer: docs per posting-details chunk
	numDocs           uint64    // footer: number of documents
	storedIndexOffset uint64    // footer: offset of the stored fields index
	fieldsIndexOffset uint64    // footer: offset of the fields index

	fieldsMap     map[string]uint16 // field name -> fieldID+1 (0 means unknown)
	fieldsInv     []string          // fieldID -> field name
	fieldsLoc     []bool            // fieldID -> whether location data is stored
	fieldsOffsets []uint64          // fieldID -> dictionary offset
}
|
||||
|
||||
func (s *Segment) loadConfig() error {
|
||||
crcOffset := len(s.mm) - 4
|
||||
s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4])
|
||||
verOffset := crcOffset - 4
|
||||
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
|
||||
if s.version != version {
|
||||
return fmt.Errorf("unsupported version %d", s.version)
|
||||
}
|
||||
chunkOffset := verOffset - 4
|
||||
s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4])
|
||||
fieldsOffset := chunkOffset - 8
|
||||
s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsOffset : fieldsOffset+8])
|
||||
storedOffset := fieldsOffset - 8
|
||||
s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedOffset : storedOffset+8])
|
||||
docNumOffset := storedOffset - 8
|
||||
s.numDocs = binary.BigEndian.Uint64(s.mm[docNumOffset : docNumOffset+8])
|
||||
return nil
|
||||
|
||||
}
|
||||
|
||||
// loadFields parses the fields index and per-field metadata. For each field
// it records whether locations are stored, the dictionary offset, and the
// name (populating fieldsLoc, fieldsOffsets, fieldsInv, and fieldsMap).
func (s *Segment) loadFields() error {
	// NOTE for now we assume the fields index immediately precedes the footer
	// if this changes, need to adjust accordingly (or store explicit length)
	fieldsIndexEnd := uint64(len(s.mm) - footerSize)

	// iterate through fields index
	var fieldID uint64
	for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
		// the index holds one big-endian uint64 address per field
		addr := binary.BigEndian.Uint64(s.mm[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8])
		var n uint64
		// varint flag: 1 means location data is stored for this field
		hasStoredLoc, read := binary.Uvarint(s.mm[addr:fieldsIndexEnd])
		n += uint64(read)
		if hasStoredLoc == 1 {
			s.fieldsLoc = append(s.fieldsLoc, true)
		} else {
			s.fieldsLoc = append(s.fieldsLoc, false)
		}

		// varint offset of this field's term dictionary
		var dictLoc uint64
		dictLoc, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
		n += uint64(read)
		s.fieldsOffsets = append(s.fieldsOffsets, dictLoc)

		// varint length of the field name, followed by the name bytes
		var nameLen uint64
		nameLen, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
		n += uint64(read)

		name := string(s.mm[addr+n : addr+n+nameLen])
		s.fieldsInv = append(s.fieldsInv, name)
		// map stores fieldID+1 so the zero value means "unknown field"
		s.fieldsMap[name] = uint16(fieldID + 1)

		fieldID++
	}
	return nil
}
|
||||
|
||||
// Dictionary returns the term dictionary for the specified field
|
||||
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
|
||||
dict, err := s.dictionary(field)
|
||||
if err == nil && dict == nil {
|
||||
return &segment.EmptyDictionary{}, nil
|
||||
}
|
||||
return dict, err
|
||||
}
|
||||
|
||||
func (s *Segment) dictionary(field string) (*Dictionary, error) {
|
||||
rv := &Dictionary{
|
||||
segment: s,
|
||||
field: field,
|
||||
}
|
||||
|
||||
rv.fieldID = s.fieldsMap[field]
|
||||
if rv.fieldID > 0 {
|
||||
rv.fieldID = rv.fieldID - 1
|
||||
|
||||
dictStart := s.fieldsOffsets[rv.fieldID]
|
||||
|
||||
// read the length of the vellum data
|
||||
vellumLen, read := binary.Uvarint(s.mm[dictStart : dictStart+binary.MaxVarintLen64])
|
||||
fstBytes := s.mm[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
|
||||
if fstBytes != nil {
|
||||
fst, err := vellum.Load(fstBytes)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
|
||||
}
|
||||
if err == nil {
|
||||
rv.fst = fst
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
||||
// for the specified doc number
|
||||
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
||||
// first make sure this is a valid number in this segment
|
||||
if num < s.numDocs {
|
||||
docStoredStartAddr := s.storedIndexOffset + (8 * num)
|
||||
docStoredStart := binary.BigEndian.Uint64(s.mm[docStoredStartAddr : docStoredStartAddr+8])
|
||||
var n uint64
|
||||
metaLen, read := binary.Uvarint(s.mm[docStoredStart : docStoredStart+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
var dataLen uint64
|
||||
dataLen, read = binary.Uvarint(s.mm[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64])
|
||||
n += uint64(read)
|
||||
meta := s.mm[docStoredStart+n : docStoredStart+n+metaLen]
|
||||
data := s.mm[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen]
|
||||
uncompressed, err := snappy.Decode(nil, data)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
// now decode meta and process
|
||||
reader := bytes.NewReader(meta)
|
||||
decoder := govarint.NewU64Base128Decoder(reader)
|
||||
|
||||
keepGoing := true
|
||||
for keepGoing {
|
||||
field, err := decoder.GetU64()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
typ, err := decoder.GetU64()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
offset, err := decoder.GetU64()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
l, err := decoder.GetU64()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
numap, err := decoder.GetU64()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var arrayPos []uint64
|
||||
if numap > 0 {
|
||||
arrayPos = make([]uint64, numap)
|
||||
for i := 0; i < int(numap); i++ {
|
||||
ap, err := decoder.GetU64()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
arrayPos[i] = ap
|
||||
}
|
||||
}
|
||||
|
||||
value := uncompressed[offset : offset+l]
|
||||
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Count returns the number of documents in this segment.
func (s *Segment) Count() uint64 {
	return s.numDocs
}
|
||||
|
||||
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
||||
// provided _id strings
|
||||
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
||||
rv := roaring.New()
|
||||
|
||||
if len(s.fieldsMap) > 0 {
|
||||
idDict, err := s.dictionary("_id")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, id := range ids {
|
||||
postings, err := idDict.postingsList(id, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if postings.postings != nil {
|
||||
rv.Or(postings.postings)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// Fields returns the field names used in this segment
func (s *Segment) Fields() []string {
	return s.fieldsInv
}
|
||||
|
||||
// Path returns the path of this segment on disk
func (s *Segment) Path() string {
	return s.path
}
|
||||
|
||||
// Close releases all resources associated with this segment
|
||||
func (s *Segment) Close() (err error) {
|
||||
if s.mm != nil {
|
||||
err = s.mm.Unmap()
|
||||
}
|
||||
// try to close file even if unmap failed
|
||||
if s.f != nil {
|
||||
err2 := s.f.Close()
|
||||
if err == nil {
|
||||
// try to return first error
|
||||
err = err2
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// some helpers i started adding for the command-line utility

// Data returns the underlying mmap'd data slice
func (s *Segment) Data() []byte {
	return s.mm
}
|
||||
|
||||
// CRC returns the CRC value stored in the file footer
func (s *Segment) CRC() uint32 {
	return s.crc
}
|
||||
|
||||
// Version returns the file version in the file footer
func (s *Segment) Version() uint32 {
	return s.version
}
|
||||
|
||||
// ChunkFactor returns the chunk factor in the file footer
func (s *Segment) ChunkFactor() uint32 {
	return s.chunkFactor
}
|
||||
|
||||
// FieldsIndexOffset returns the fields index offset in the file footer
func (s *Segment) FieldsIndexOffset() uint64 {
	return s.fieldsIndexOffset
}
|
||||
|
||||
// StoredIndexOffset returns the stored value index offset in the file footer
func (s *Segment) StoredIndexOffset() uint64 {
	return s.storedIndexOffset
}
|
||||
|
||||
// NumDocs returns the number of documents in the file footer
func (s *Segment) NumDocs() uint64 {
	return s.numDocs
}
|
||||
|
||||
// DictAddr is a helper function to compute the file offset where the
|
||||
// dictionary is stored for the specified field.
|
||||
func (s *Segment) DictAddr(field string) (uint64, error) {
|
||||
var fieldID uint16
|
||||
var ok bool
|
||||
if fieldID, ok = s.fieldsMap[field]; !ok {
|
||||
return 0, fmt.Errorf("no such field '%s'", field)
|
||||
}
|
||||
|
||||
return s.fieldsOffsets[fieldID-1], nil
|
||||
}
|
|
@ -0,0 +1,517 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"math"
|
||||
"os"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestOpen(t *testing.T) {
|
||||
_ = os.RemoveAll("/tmp/scorch.zap")
|
||||
|
||||
memSegment := buildMemSegment()
|
||||
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
|
||||
if err != nil {
|
||||
t.Fatalf("error persisting segment: %v", err)
|
||||
}
|
||||
|
||||
segment, err := Open("/tmp/scorch.zap")
|
||||
if err != nil {
|
||||
t.Fatalf("error opening segment: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
cerr := segment.Close()
|
||||
if cerr != nil {
|
||||
t.Fatalf("error closing segment: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
expectFields := map[string]struct{}{
|
||||
"_id": struct{}{},
|
||||
"_all": struct{}{},
|
||||
"name": struct{}{},
|
||||
"desc": struct{}{},
|
||||
"tag": struct{}{},
|
||||
}
|
||||
fields := segment.Fields()
|
||||
if len(fields) != len(expectFields) {
|
||||
t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields))
|
||||
}
|
||||
for _, field := range fields {
|
||||
if _, ok := expectFields[field]; !ok {
|
||||
t.Errorf("got unexpected field: %s", field)
|
||||
}
|
||||
}
|
||||
|
||||
docCount := segment.Count()
|
||||
if docCount != 1 {
|
||||
t.Errorf("expected count 1, got %d", docCount)
|
||||
}
|
||||
|
||||
// check the _id field
|
||||
dict, err := segment.Dictionary("_id")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if dict == nil {
|
||||
t.Fatal("got nil dict, expected non-nil")
|
||||
}
|
||||
|
||||
postingsList, err := dict.PostingsList("a", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if postingsList == nil {
|
||||
t.Fatal("got nil postings list, expected non-nil")
|
||||
}
|
||||
|
||||
postingsItr := postingsList.Iterator()
|
||||
if postingsItr == nil {
|
||||
t.Fatal("got nil iterator, expected non-nil")
|
||||
}
|
||||
|
||||
count := 0
|
||||
nextPosting, err := postingsItr.Next()
|
||||
for nextPosting != nil && err == nil {
|
||||
count++
|
||||
if nextPosting.Frequency() != 1 {
|
||||
t.Errorf("expected frequency 1, got %d", nextPosting.Frequency())
|
||||
}
|
||||
if nextPosting.Number() != 0 {
|
||||
t.Errorf("expected doc number 0, got %d", nextPosting.Number())
|
||||
}
|
||||
if nextPosting.Norm() != 1.0 {
|
||||
t.Errorf("expected norm 1.0, got %f", nextPosting.Norm())
|
||||
}
|
||||
|
||||
nextPosting, err = postingsItr.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if count != 1 {
|
||||
t.Errorf("expected count to be 1, got %d", count)
|
||||
}
|
||||
|
||||
// check the name field
|
||||
dict, err = segment.Dictionary("name")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if dict == nil {
|
||||
t.Fatal("got nil dict, expected non-nil")
|
||||
}
|
||||
|
||||
postingsList, err = dict.PostingsList("wow", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if postingsList == nil {
|
||||
t.Fatal("got nil postings list, expected non-nil")
|
||||
}
|
||||
|
||||
postingsItr = postingsList.Iterator()
|
||||
if postingsItr == nil {
|
||||
t.Fatal("got nil iterator, expected non-nil")
|
||||
}
|
||||
|
||||
count = 0
|
||||
nextPosting, err = postingsItr.Next()
|
||||
for nextPosting != nil && err == nil {
|
||||
count++
|
||||
if nextPosting.Frequency() != 1 {
|
||||
t.Errorf("expected frequency 1, got %d", nextPosting.Frequency())
|
||||
}
|
||||
if nextPosting.Number() != 0 {
|
||||
t.Errorf("expected doc number 0, got %d", nextPosting.Number())
|
||||
}
|
||||
if nextPosting.Norm() != 1.0 {
|
||||
t.Errorf("expected norm 1.0, got %f", nextPosting.Norm())
|
||||
}
|
||||
var numLocs uint64
|
||||
for _, loc := range nextPosting.Locations() {
|
||||
numLocs++
|
||||
if loc.Field() != "name" {
|
||||
t.Errorf("expected loc field to be 'name', got '%s'", loc.Field())
|
||||
}
|
||||
if loc.Start() != 0 {
|
||||
t.Errorf("expected loc start to be 0, got %d", loc.Start())
|
||||
}
|
||||
if loc.End() != 3 {
|
||||
t.Errorf("expected loc end to be 3, got %d", loc.End())
|
||||
}
|
||||
if loc.Pos() != 1 {
|
||||
t.Errorf("expected loc pos to be 1, got %d", loc.Pos())
|
||||
}
|
||||
if loc.ArrayPositions() != nil {
|
||||
t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions())
|
||||
}
|
||||
}
|
||||
if numLocs != nextPosting.Frequency() {
|
||||
t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs)
|
||||
}
|
||||
|
||||
nextPosting, err = postingsItr.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if count != 1 {
|
||||
t.Errorf("expected count to be 1, got %d", count)
|
||||
}
|
||||
|
||||
// check the _all field (composite)
|
||||
dict, err = segment.Dictionary("_all")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if dict == nil {
|
||||
t.Fatal("got nil dict, expected non-nil")
|
||||
}
|
||||
|
||||
postingsList, err = dict.PostingsList("wow", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if postingsList == nil {
|
||||
t.Fatal("got nil postings list, expected non-nil")
|
||||
}
|
||||
|
||||
postingsItr = postingsList.Iterator()
|
||||
if postingsItr == nil {
|
||||
t.Fatal("got nil iterator, expected non-nil")
|
||||
}
|
||||
|
||||
count = 0
|
||||
nextPosting, err = postingsItr.Next()
|
||||
for nextPosting != nil && err == nil {
|
||||
count++
|
||||
if nextPosting.Frequency() != 1 {
|
||||
t.Errorf("expected frequency 1, got %d", nextPosting.Frequency())
|
||||
}
|
||||
if nextPosting.Number() != 0 {
|
||||
t.Errorf("expected doc number 0, got %d", nextPosting.Number())
|
||||
}
|
||||
expectedNorm := float32(1.0 / math.Sqrt(float64(5)))
|
||||
if nextPosting.Norm() != float64(expectedNorm) {
|
||||
t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm())
|
||||
}
|
||||
var numLocs uint64
|
||||
for _, loc := range nextPosting.Locations() {
|
||||
numLocs++
|
||||
if loc.Field() != "name" {
|
||||
t.Errorf("expected loc field to be 'name', got '%s'", loc.Field())
|
||||
}
|
||||
if loc.Start() != 0 {
|
||||
t.Errorf("expected loc start to be 0, got %d", loc.Start())
|
||||
}
|
||||
if loc.End() != 3 {
|
||||
t.Errorf("expected loc end to be 3, got %d", loc.End())
|
||||
}
|
||||
if loc.Pos() != 1 {
|
||||
t.Errorf("expected loc pos to be 1, got %d", loc.Pos())
|
||||
}
|
||||
if loc.ArrayPositions() != nil {
|
||||
t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions())
|
||||
}
|
||||
}
|
||||
if numLocs != nextPosting.Frequency() {
|
||||
t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs)
|
||||
}
|
||||
|
||||
nextPosting, err = postingsItr.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if count != 1 {
|
||||
t.Errorf("expected count to be 1, got %d", count)
|
||||
}
|
||||
|
||||
// now try a field with array positions
|
||||
dict, err = segment.Dictionary("tag")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if dict == nil {
|
||||
t.Fatal("got nil dict, expected non-nil")
|
||||
}
|
||||
|
||||
postingsList, err = dict.PostingsList("dark", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if postingsList == nil {
|
||||
t.Fatal("got nil postings list, expected non-nil")
|
||||
}
|
||||
|
||||
postingsItr = postingsList.Iterator()
|
||||
if postingsItr == nil {
|
||||
t.Fatal("got nil iterator, expected non-nil")
|
||||
}
|
||||
|
||||
nextPosting, err = postingsItr.Next()
|
||||
for nextPosting != nil && err == nil {
|
||||
|
||||
if nextPosting.Frequency() != 1 {
|
||||
t.Errorf("expected frequency 1, got %d", nextPosting.Frequency())
|
||||
}
|
||||
if nextPosting.Number() != 0 {
|
||||
t.Errorf("expected doc number 0, got %d", nextPosting.Number())
|
||||
}
|
||||
var numLocs uint64
|
||||
for _, loc := range nextPosting.Locations() {
|
||||
numLocs++
|
||||
if loc.Field() != "tag" {
|
||||
t.Errorf("expected loc field to be 'name', got '%s'", loc.Field())
|
||||
}
|
||||
if loc.Start() != 0 {
|
||||
t.Errorf("expected loc start to be 0, got %d", loc.Start())
|
||||
}
|
||||
if loc.End() != 4 {
|
||||
t.Errorf("expected loc end to be 3, got %d", loc.End())
|
||||
}
|
||||
if loc.Pos() != 1 {
|
||||
t.Errorf("expected loc pos to be 1, got %d", loc.Pos())
|
||||
}
|
||||
expectArrayPos := []uint64{1}
|
||||
if !reflect.DeepEqual(loc.ArrayPositions(), expectArrayPos) {
|
||||
t.Errorf("expect loc array pos to be %v, got %v", expectArrayPos, loc.ArrayPositions())
|
||||
}
|
||||
}
|
||||
if numLocs != nextPosting.Frequency() {
|
||||
t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs)
|
||||
}
|
||||
|
||||
nextPosting, err = postingsItr.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// now try and visit a document
|
||||
var fieldValuesSeen int
|
||||
err = segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||
fieldValuesSeen++
|
||||
return true
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if fieldValuesSeen != 5 {
|
||||
t.Errorf("expected 5 field values, got %d", fieldValuesSeen)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenMulti(t *testing.T) {
|
||||
_ = os.RemoveAll("/tmp/scorch.zap")
|
||||
|
||||
memSegment := buildMemSegmentMulti()
|
||||
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
|
||||
if err != nil {
|
||||
t.Fatalf("error persisting segment: %v", err)
|
||||
}
|
||||
|
||||
segment, err := Open("/tmp/scorch.zap")
|
||||
if err != nil {
|
||||
t.Fatalf("error opening segment: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
cerr := segment.Close()
|
||||
if cerr != nil {
|
||||
t.Fatalf("error closing segment: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
if segment.Count() != 2 {
|
||||
t.Errorf("expected count 2, got %d", segment.Count())
|
||||
}
|
||||
|
||||
// check the desc field
|
||||
dict, err := segment.Dictionary("desc")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if dict == nil {
|
||||
t.Fatal("got nil dict, expected non-nil")
|
||||
}
|
||||
|
||||
postingsList, err := dict.PostingsList("thing", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if postingsList == nil {
|
||||
t.Fatal("got nil postings list, expected non-nil")
|
||||
}
|
||||
|
||||
postingsItr := postingsList.Iterator()
|
||||
if postingsItr == nil {
|
||||
t.Fatal("got nil iterator, expected non-nil")
|
||||
}
|
||||
|
||||
count := 0
|
||||
nextPosting, err := postingsItr.Next()
|
||||
for nextPosting != nil && err == nil {
|
||||
count++
|
||||
nextPosting, err = postingsItr.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if count != 2 {
|
||||
t.Errorf("expected count to be 2, got %d", count)
|
||||
}
|
||||
|
||||
// get docnum of a
|
||||
exclude, err := segment.DocNumbers([]string{"a"})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// look for term 'thing' excluding doc 'a'
|
||||
postingsListExcluding, err := dict.PostingsList("thing", exclude)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if postingsList == nil {
|
||||
t.Fatal("got nil postings list, expected non-nil")
|
||||
}
|
||||
|
||||
postingsListExcludingCount := postingsListExcluding.Count()
|
||||
if postingsListExcludingCount != 1 {
|
||||
t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount)
|
||||
}
|
||||
|
||||
postingsItrExcluding := postingsListExcluding.Iterator()
|
||||
if postingsItr == nil {
|
||||
t.Fatal("got nil iterator, expected non-nil")
|
||||
}
|
||||
|
||||
count = 0
|
||||
nextPosting, err = postingsItrExcluding.Next()
|
||||
for nextPosting != nil && err == nil {
|
||||
count++
|
||||
nextPosting, err = postingsItrExcluding.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if count != 1 {
|
||||
t.Errorf("expected count to be 1, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpenMultiWithTwoChunks(t *testing.T) {
|
||||
_ = os.RemoveAll("/tmp/scorch.zap")
|
||||
|
||||
memSegment := buildMemSegmentMulti()
|
||||
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1)
|
||||
if err != nil {
|
||||
t.Fatalf("error persisting segment: %v", err)
|
||||
}
|
||||
|
||||
segment, err := Open("/tmp/scorch.zap")
|
||||
if err != nil {
|
||||
t.Fatalf("error opening segment: %v", err)
|
||||
}
|
||||
defer func() {
|
||||
cerr := segment.Close()
|
||||
if cerr != nil {
|
||||
t.Fatalf("error closing segment: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
if segment.Count() != 2 {
|
||||
t.Errorf("expected count 2, got %d", segment.Count())
|
||||
}
|
||||
|
||||
// check the desc field
|
||||
dict, err := segment.Dictionary("desc")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if dict == nil {
|
||||
t.Fatal("got nil dict, expected non-nil")
|
||||
}
|
||||
|
||||
postingsList, err := dict.PostingsList("thing", nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if postingsList == nil {
|
||||
t.Fatal("got nil postings list, expected non-nil")
|
||||
}
|
||||
|
||||
postingsItr := postingsList.Iterator()
|
||||
if postingsItr == nil {
|
||||
t.Fatal("got nil iterator, expected non-nil")
|
||||
}
|
||||
|
||||
count := 0
|
||||
nextPosting, err := postingsItr.Next()
|
||||
for nextPosting != nil && err == nil {
|
||||
count++
|
||||
nextPosting, err = postingsItr.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if count != 2 {
|
||||
t.Errorf("expected count to be 2, got %d", count)
|
||||
}
|
||||
|
||||
// get docnum of a
|
||||
exclude, err := segment.DocNumbers([]string{"a"})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// look for term 'thing' excluding doc 'a'
|
||||
postingsListExcluding, err := dict.PostingsList("thing", exclude)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if postingsList == nil {
|
||||
t.Fatal("got nil postings list, expected non-nil")
|
||||
}
|
||||
|
||||
postingsItrExcluding := postingsListExcluding.Iterator()
|
||||
if postingsItr == nil {
|
||||
t.Fatal("got nil iterator, expected non-nil")
|
||||
}
|
||||
|
||||
count = 0
|
||||
nextPosting, err = postingsItrExcluding.Next()
|
||||
for nextPosting != nil && err == nil {
|
||||
count++
|
||||
nextPosting, err = postingsItrExcluding.Next()
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if count != 1 {
|
||||
t.Errorf("expected count to be 1, got %d", count)
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue