add initial version of zap file format

Marty Schoch 2017-12-09 14:28:33 -05:00
parent ff2e6b98e4
commit 9781d9b089
16 changed files with 3053 additions and 0 deletions

@@ -0,0 +1,120 @@
# zap file format
## stored fields section
- for each document
- preparation phase:
- produce a slice of metadata bytes and data bytes
- produce these slices in field id order
- field value is appended to the data slice
- metadata slice is govarint encoded with the following values for each field value
- field id (uint16)
- field type (byte)
- field value start offset in uncompressed data slice (uint64)
- field value length (uint64)
- field number of array positions (uint64)
- one additional value for each array position (uint64)
- compress the data slice using snappy
- file writing phase:
- remember the start offset for this document
- write out metadata length (varint uint64)
- write out compressed data length (varint uint64)
- write out the metadata bytes
- write out the compressed data bytes
## stored fields idx
- for each document
- write start offset (remembered from previous section) of stored data (big endian uint64)
With this index and a known document number, we have direct access to all the stored field data.
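
For illustration, a minimal reader-side sketch of that direct access (package and helper names here are hypothetical, not part of this commit):

```go
package zapdoc

import (
	"encoding/binary"

	"github.com/golang/snappy"
)

// readStoredDoc follows the stored fields idx entry for docNum and returns
// the metadata bytes and the snappy-decoded data bytes
func readStoredDoc(data []byte, storedIndexOffset, docNum uint64) (meta, uncompressed []byte, err error) {
	// each idx entry is a big endian uint64 start offset
	indexPos := storedIndexOffset + 8*docNum
	start := binary.BigEndian.Uint64(data[indexPos : indexPos+8])
	var n uint64
	metaLen, read := binary.Uvarint(data[start : start+binary.MaxVarintLen64])
	n += uint64(read)
	var dataLen uint64
	dataLen, read = binary.Uvarint(data[start+n : start+n+binary.MaxVarintLen64])
	n += uint64(read)
	meta = data[start+n : start+n+metaLen]
	compressed := data[start+n+metaLen : start+n+metaLen+dataLen]
	uncompressed, err = snappy.Decode(nil, compressed)
	return meta, uncompressed, err
}
```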
## posting details (freq/norm) section
- for each posting list
- produce a slice containing multiple consecutive chunks (each chunk is a govarint stream)
- produce a slice remembering offsets of where each chunk starts
- preparation phase:
- for each hit in the posting list
- if this hit is in the next chunk, close out encoding of the last chunk and record the start offset of the next
- encode term frequency (uint64)
- encode norm factor (float32)
- file writing phase:
- remember start position for this posting list details
- write out number of chunks that follow (varint uint64)
- write out length of each chunk (each a varint uint64)
- write out the byte slice containing all the chunk data
If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
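
A sketch of that seek, assuming `data` is the raw file bytes and `detailsStart` is where one posting's details begin; the location section below shares this chunk layout (helper name is hypothetical):

```go
package zapdoc

import "encoding/binary"

// seekChunk returns the start offset and length of the chunk holding docNum;
// the caller must ensure docNum belongs to this segment
func seekChunk(data []byte, detailsStart, docNum, chunkFactor uint64) (chunkStart, chunkLen uint64) {
	var n uint64
	numChunks, read := binary.Uvarint(data[detailsStart : detailsStart+binary.MaxVarintLen64])
	n += uint64(read)
	chunkLens := make([]uint64, numChunks)
	for i := range chunkLens {
		chunkLens[i], read = binary.Uvarint(data[detailsStart+n : detailsStart+n+binary.MaxVarintLen64])
		n += uint64(read)
	}
	target := docNum / chunkFactor // the chunk this doc falls in
	chunkStart = detailsStart + n
	for i := uint64(0); i < target; i++ {
		chunkStart += chunkLens[i]
	}
	return chunkStart, chunkLens[target]
}
```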
## posting details (location) section
- for each posting list
- produce a slice containing multiple consecutive chunks (each chunk is a govarint stream)
- produce a slice remembering offsets of where each chunk starts
- preparation phase:
- for each hit in the posting list
- if this hit is in the next chunk, close out encoding of the last chunk and record the start offset of the next
- encode field (uint16)
- encode field pos (uint64)
- encode field start (uint64)
- encode field end (uint64)
- encode number of array positions to follow (uint64)
- encode each array position (each uint64)
- file writing phase:
- remember start position for this posting list details
- write out number of chunks that follow (varint uint64)
- write out length of each chunk (each a varint uint64)
- write out the byte slice containing all the chunk data
If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it.
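
Within a chunk, each hit's locations are decoded from the govarint stream in the order listed above; a sketch (the `location` struct and helper are illustrative, not part of this commit):

```go
package zapdoc

import "github.com/Smerity/govarint"

// location mirrors the values encoded for each hit above
type location struct {
	field, pos, start, end uint64
	arrayPos               []uint64
}

// decodeLocation reads one location record from an open chunk stream
func decodeLocation(dec *govarint.Base128Decoder) (loc location, err error) {
	if loc.field, err = dec.GetU64(); err != nil {
		return loc, err
	}
	if loc.pos, err = dec.GetU64(); err != nil {
		return loc, err
	}
	if loc.start, err = dec.GetU64(); err != nil {
		return loc, err
	}
	if loc.end, err = dec.GetU64(); err != nil {
		return loc, err
	}
	var numAP uint64
	if numAP, err = dec.GetU64(); err != nil {
		return loc, err
	}
	loc.arrayPos = make([]uint64, numAP)
	for k := range loc.arrayPos {
		if loc.arrayPos[k], err = dec.GetU64(); err != nil {
			return loc, err
		}
	}
	return loc, nil
}
```

A chunk's decoder can be built with `govarint.NewU64Base128Decoder(bytes.NewReader(chunkBytes))`, as the iterator code in this commit does.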
## postings list section
- for each posting list
- preparation phase:
- encode roaring bitmap posting list to bytes (so we know the length)
- file writing phase:
- remember the start position for this posting list
- write freq/norm details offset (remembered from previous, as varint uint64)
- write location details offset (remembered from previous, as varint uint64)
- write length of encoded roaring bitmap
- write the serialized roaring bitmap data
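
A sketch of reading one such entry back (helper name hypothetical):

```go
package zapdoc

import (
	"encoding/binary"

	"github.com/RoaringBitmap/roaring"
)

// readPostings parses one postings list entry, returning the offsets of the
// freq/norm and location details plus the roaring bitmap of doc numbers
func readPostings(data []byte, postingsAddr uint64) (freqAddr, locAddr uint64, postings *roaring.Bitmap, err error) {
	var n uint64
	var read int
	freqAddr, read = binary.Uvarint(data[postingsAddr : postingsAddr+binary.MaxVarintLen64])
	n += uint64(read)
	locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
	n += uint64(read)
	var bitmapLen uint64
	bitmapLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
	n += uint64(read)
	postings = roaring.NewBitmap()
	_, err = postings.FromBuffer(data[postingsAddr+n : postingsAddr+n+bitmapLen])
	return freqAddr, locAddr, postings, err
}
```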
## dictionary
- for each field
- preparation phase:
- encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous)
- file writing phase:
- remember the start position of this field's dictionary
- write length of vellum data (varint uint64)
- write out vellum data
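
A sketch of a term lookup against a persisted dictionary (helper name hypothetical):

```go
package zapdoc

import (
	"encoding/binary"

	"github.com/couchbaselabs/vellum"
)

// termPostingsAddr loads the vellum FST for a field (dictStart comes from the
// fields section) and returns the postings list offset for term, if present
func termPostingsAddr(data []byte, dictStart uint64, term string) (uint64, bool, error) {
	vellumLen, read := binary.Uvarint(data[dictStart : dictStart+binary.MaxVarintLen64])
	fstBytes := data[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
	fst, err := vellum.Load(fstBytes)
	if err != nil {
		return 0, false, err
	}
	return fst.Get([]byte(term))
}
```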
## fields section
- for each field
- file writing phase:
- remember start offset for each field
- write 1 if field has location info indexed, 0 if not (varint uint64)
- write dictionary address (remembered from previous) (varint uint64)
- write length of field name (varint uint64)
- write field name bytes
## fields idx
- for each field
- file writing phase:
- write big endian uint64 of start offset for each field
NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size.
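
A sketch of walking this index at load time, given the known footer size (helper name hypothetical):

```go
package zapdoc

import "encoding/binary"

// loadFieldNames walks the fields idx, which runs from fieldsIndexOffset up to
// the footer, decoding each field entry it points at
func loadFieldNames(data []byte, fieldsIndexOffset uint64, footerSize int) []string {
	fieldsIndexEnd := uint64(len(data) - footerSize)
	var names []string
	for pos := fieldsIndexOffset; pos < fieldsIndexEnd; pos += 8 {
		addr := binary.BigEndian.Uint64(data[pos : pos+8])
		var n uint64
		_, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64]) // has-location flag
		n += uint64(read)
		_, read = binary.Uvarint(data[addr+n : addr+n+binary.MaxVarintLen64]) // dictionary address
		n += uint64(read)
		var nameLen uint64
		nameLen, read = binary.Uvarint(data[addr+n : addr+n+binary.MaxVarintLen64])
		n += uint64(read)
		names = append(names, string(data[addr+n:addr+n+nameLen]))
	}
	return names
}
```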
## footer
- file writing phase
- write number of docs (big endian uint64)
- write stored field index location (big endian uint64)
- write field index location (big endian uint64)
- write out chunk factor (big endian uint32)
- write out version (big endian uint32)
- write out file CRC of everything preceding this (big endian uint32)
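
A sketch of decoding the footer by walking backward from the end of the file (struct and helper names hypothetical):

```go
package zapdoc

import "encoding/binary"

// footer holds the fixed-size trailer fields in the order they are written
type footer struct {
	numDocs           uint64
	storedIndexOffset uint64
	fieldIndexOffset  uint64
	chunkFactor       uint32
	version           uint32
	crc               uint32
}

// parseFooter decodes the footer from the tail of the file
func parseFooter(data []byte) footer {
	var f footer
	p := len(data)
	p -= 4
	f.crc = binary.BigEndian.Uint32(data[p : p+4])
	p -= 4
	f.version = binary.BigEndian.Uint32(data[p : p+4])
	p -= 4
	f.chunkFactor = binary.BigEndian.Uint32(data[p : p+4])
	p -= 8
	f.fieldIndexOffset = binary.BigEndian.Uint64(data[p : p+8])
	p -= 8
	f.storedIndexOffset = binary.BigEndian.Uint64(data[p : p+8])
	p -= 8
	f.numDocs = binary.BigEndian.Uint64(data[p : p+8])
	return f
}
```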

@@ -0,0 +1,615 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bufio"
"bytes"
"encoding/binary"
"math"
"os"
"github.com/Smerity/govarint"
"github.com/blevesearch/bleve/index/scorch/segment/mem"
"github.com/couchbaselabs/vellum"
"github.com/golang/snappy"
)
var version uint32
// PersistSegment takes the in-memory segment and persists it to the specified
// path in the zap file format.
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (err error) {
flag := os.O_RDWR | os.O_CREATE
f, err := os.OpenFile(path, flag, 0600)
if err != nil {
return err
}
// buffer the output
br := bufio.NewWriter(f)
// wrap it for counting (tracking offsets)
cr := NewCountHashWriter(br)
var storedIndexOffset uint64
storedIndexOffset, err = persistStored(memSegment, cr)
if err != nil {
return err
}
var freqOffsets, locOffsets []uint64
freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor)
if err != nil {
return err
}
var postingsLocs []uint64
postingsLocs, err = persistPostingsLists(memSegment, cr, freqOffsets, locOffsets)
if err != nil {
return err
}
var dictLocs []uint64
dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
if err != nil {
return err
}
var fieldIndexStart uint64
fieldIndexStart, err = persistFields(memSegment, cr, dictLocs)
if err != nil {
return err
}
err = persistFooter(uint64(len(memSegment.Stored)), storedIndexOffset,
fieldIndexStart, chunkFactor, cr)
if err != nil {
return err
}
err = br.Flush()
if err != nil {
return err
}
err = f.Close()
if err != nil {
return err
}
return nil
}
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) {
var curr int
var metaBuf bytes.Buffer
var data, compressed []byte
docNumOffsets := make(map[int]uint64, len(memSegment.Stored))
for docNum, storedValues := range memSegment.Stored {
if docNum != 0 {
// reset buffer if necessary
metaBuf.Reset()
data = data[:0]
compressed = compressed[:0]
curr = 0
}
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
// encode fields in order
for fieldID := range memSegment.FieldsInv {
if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
// has stored values for this field
num := len(storedFieldValues)
// process each value
for i := 0; i < num; i++ {
// encode field
_, err2 := metaEncoder.PutU64(uint64(fieldID))
if err2 != nil {
return 0, err2
}
// encode type
_, err2 = metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i]))
if err2 != nil {
return 0, err2
}
// encode start offset
_, err2 = metaEncoder.PutU64(uint64(curr))
if err2 != nil {
return 0, err2
}
// encode value length
_, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
if err2 != nil {
return 0, err2
}
// encode number of array pos
_, err2 = metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i])))
if err2 != nil {
return 0, err2
}
// encode all array positions
for j := 0; j < len(memSegment.StoredPos[docNum][uint16(fieldID)][i]); j++ {
_, err2 = metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j])
if err2 != nil {
return 0, err2
}
}
// append data
data = append(data, storedFieldValues[i]...)
// update curr
curr += len(storedFieldValues[i])
}
}
}
metaEncoder.Close()
metaBytes := metaBuf.Bytes()
// compress the data
compressed = snappy.Encode(compressed, data)
// record where we're about to start writing
docNumOffsets[docNum] = uint64(w.Count())
buf := make([]byte, binary.MaxVarintLen64)
// write out the meta length
n := binary.PutUvarint(buf, uint64(len(metaBytes)))
_, err := w.Write(buf[:n])
if err != nil {
return 0, err
}
// write out the compressed data length
n = binary.PutUvarint(buf, uint64(len(compressed)))
_, err = w.Write(buf[:n])
if err != nil {
return 0, err
}
// now write the meta
_, err = w.Write(metaBytes)
if err != nil {
return 0, err
}
// now write the compressed data
_, err = w.Write(compressed)
if err != nil {
return 0, err
}
}
// return value is the start of the stored index
rv := uint64(w.Count())
// now write out the stored doc index
for docNum := range memSegment.Stored {
err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
if err != nil {
return 0, err
}
}
return rv, nil
}
func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) {
var freqOffsets, locOffsets []uint64
for postingID := range memSegment.Postings {
postingsListItr := memSegment.Postings[postingID].Iterator()
total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1
var freqNormBuf []byte
var offset int
var encodingBuf bytes.Buffer
encoder := govarint.NewU64Base128Encoder(&encodingBuf)
chunkLens := make([]uint64, total)
var currChunk uint64
for postingsListItr.HasNext() {
docNum := postingsListItr.Next()
chunk := uint64(docNum) / uint64(chunkFactor)
if chunk != currChunk {
// starting a new chunk
if encoder != nil {
// close out last
encoder.Close()
encodingBytes := encodingBuf.Bytes()
chunkLens[currChunk] = uint64(len(encodingBytes))
freqNormBuf = append(freqNormBuf, encodingBytes...)
encodingBuf.Reset()
encoder = govarint.NewU64Base128Encoder(&encodingBuf)
}
currChunk = chunk
}
// put freq
_, err := encoder.PutU64(memSegment.Freqs[postingID][offset])
if err != nil {
return nil, nil, err
}
// put norm
norm := memSegment.Norms[postingID][offset]
normBits := math.Float32bits(norm)
_, err = encoder.PutU32(normBits)
if err != nil {
return nil, nil, err
}
offset++
}
// close out last chunk
if encoder != nil {
encoder.Close()
encodingBytes := encodingBuf.Bytes()
chunkLens[currChunk] = uint64(len(encodingBytes))
freqNormBuf = append(freqNormBuf, encodingBytes...)
}
// record where this postings freq info starts
freqOffsets = append(freqOffsets, uint64(w.Count()))
buf := make([]byte, binary.MaxVarintLen64)
// write out the number of chunks
n := binary.PutUvarint(buf, uint64(total))
_, err := w.Write(buf[:n])
if err != nil {
return nil, nil, err
}
// write out the chunk lens
for _, chunkLen := range chunkLens {
n := binary.PutUvarint(buf, uint64(chunkLen))
_, err = w.Write(buf[:n])
if err != nil {
return nil, nil, err
}
}
// write out the data
_, err = w.Write(freqNormBuf)
if err != nil {
return nil, nil, err
}
}
// now do it again for the locations
for postingID := range memSegment.Postings {
postingsListItr := memSegment.Postings[postingID].Iterator()
total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1
var locBuf []byte
var offset int
var locOffset int
var encodingBuf bytes.Buffer
encoder := govarint.NewU64Base128Encoder(&encodingBuf)
chunkLens := make([]uint64, total)
var currChunk uint64
for postingsListItr.HasNext() {
docNum := postingsListItr.Next()
chunk := uint64(docNum) / uint64(chunkFactor)
if chunk != currChunk {
// starting a new chunk
if encoder != nil {
// close out last
encoder.Close()
encodingBytes := encodingBuf.Bytes()
chunkLens[currChunk] = uint64(len(encodingBytes))
locBuf = append(locBuf, encodingBytes...)
encodingBuf.Reset()
encoder = govarint.NewU64Base128Encoder(&encodingBuf)
}
currChunk = chunk
}
for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ {
if len(memSegment.Locfields[postingID]) > 0 {
// put field
_, err := encoder.PutU64(uint64(memSegment.Locfields[postingID][locOffset]))
if err != nil {
return nil, nil, err
}
// put pos
_, err = encoder.PutU64(memSegment.Locpos[postingID][locOffset])
if err != nil {
return nil, nil, err
}
// put start
_, err = encoder.PutU64(memSegment.Locstarts[postingID][locOffset])
if err != nil {
return nil, nil, err
}
// put end
_, err = encoder.PutU64(memSegment.Locends[postingID][locOffset])
if err != nil {
return nil, nil, err
}
// put array positions
num := len(memSegment.Locarraypos[postingID][locOffset])
// put the number of array positions to follow
_, err = encoder.PutU64(uint64(num))
if err != nil {
return nil, nil, err
}
// put each array position
for j := 0; j < num; j++ {
_, err = encoder.PutU64(memSegment.Locarraypos[postingID][locOffset][j])
if err != nil {
return nil, nil, err
}
}
}
locOffset++
}
offset++
}
// close out last chunk
if encoder != nil {
encoder.Close()
encodingBytes := encodingBuf.Bytes()
chunkLens[currChunk] = uint64(len(encodingBytes))
locBuf = append(locBuf, encodingBytes...)
}
// record where this postings loc info starts
locOffsets = append(locOffsets, uint64(w.Count()))
buf := make([]byte, binary.MaxVarintLen64)
// write out the number of chunks
n := binary.PutUvarint(buf, uint64(total))
_, err := w.Write(buf[:n])
if err != nil {
return nil, nil, err
}
// write out the chunk lens
for _, chunkLen := range chunkLens {
n := binary.PutUvarint(buf, uint64(chunkLen))
_, err = w.Write(buf[:n])
if err != nil {
return nil, nil, err
}
}
// write out the data
_, err = w.Write(locBuf)
if err != nil {
return nil, nil, err
}
}
return freqOffsets, locOffsets, nil
}
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, freqOffsets, locOffsets []uint64) ([]uint64, error) {
var rv []uint64
var postingsBuf bytes.Buffer
for postingID := range memSegment.Postings {
if postingID != 0 {
postingsBuf.Reset()
}
// record where we start this posting list
rv = append(rv, uint64(w.Count()))
// write out postings list to memory so we know the len
postingsListLen, err := memSegment.Postings[postingID].WriteTo(&postingsBuf)
if err != nil {
return nil, err
}
// write out the start of the term info
buf := make([]byte, binary.MaxVarintLen64)
n := binary.PutUvarint(buf, freqOffsets[postingID])
_, err = w.Write(buf[:n])
if err != nil {
return nil, err
}
// write out the start of the loc info
n = binary.PutUvarint(buf, locOffsets[postingID])
_, err = w.Write(buf[:n])
if err != nil {
return nil, err
}
// write out the length of this postings list
n = binary.PutUvarint(buf, uint64(postingsListLen))
_, err = w.Write(buf[:n])
if err != nil {
return nil, err
}
// write out the postings list itself
_, err = w.Write(postingsBuf.Bytes())
if err != nil {
return nil, err
}
}
return rv, nil
}
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
var rv []uint64
var buffer bytes.Buffer
for fieldID, fieldTerms := range memSegment.DictKeys {
if fieldID != 0 {
buffer.Reset()
}
// start a new vellum for this field
builder, err := vellum.New(&buffer, nil)
if err != nil {
return nil, err
}
dict := memSegment.Dicts[fieldID]
// now walk the dictionary in order of fieldTerms (already sorted)
for i := range fieldTerms {
postingID := dict[fieldTerms[i]] - 1
postingsAddr := postingsLocs[postingID]
err = builder.Insert([]byte(fieldTerms[i]), postingsAddr)
if err != nil {
return nil, err
}
}
err = builder.Close()
if err != nil {
return nil, err
}
// record where this dictionary starts
rv = append(rv, uint64(w.Count()))
vellumData := buffer.Bytes()
// write out the length of the vellum data
buf := make([]byte, binary.MaxVarintLen64)
n := binary.PutUvarint(buf, uint64(len(vellumData)))
_, err = w.Write(buf[:n])
if err != nil {
return nil, err
}
// write this vellum to disk
_, err = w.Write(vellumData)
if err != nil {
return nil, err
}
}
return rv, nil
}
func persistFields(memSegment *mem.Segment, w *CountHashWriter, dictLocs []uint64) (uint64, error) {
var rv uint64
var fieldStarts []uint64
for fieldID, fieldName := range memSegment.FieldsInv {
// record start of this field
fieldStarts = append(fieldStarts, uint64(w.Count()))
buf := make([]byte, binary.MaxVarintLen64)
// write out if the field has indexed locs (0 or 1)
var indexedLoc uint64
if memSegment.FieldsLoc[fieldID] {
indexedLoc = 1
}
n := binary.PutUvarint(buf, indexedLoc)
_, err := w.Write(buf[:n])
if err != nil {
return 0, err
}
// write out dict location for this field
n = binary.PutUvarint(buf, dictLocs[fieldID])
_, err = w.Write(buf[:n])
if err != nil {
return 0, err
}
// write out the length of the field name
n = binary.PutUvarint(buf, uint64(len(fieldName)))
_, err = w.Write(buf[:n])
if err != nil {
return 0, err
}
// write out the field name
_, err = w.Write([]byte(fieldName))
if err != nil {
return 0, err
}
}
// record where the fields index starts
rv = uint64(w.Count())
// now write out the fields index
for fieldID := range memSegment.FieldsInv {
err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID])
if err != nil {
return 0, err
}
}
return rv, nil
}
// NOTE: update if you make the footer bigger
// crc + ver + chunk + field offset + stored offset + num docs
const footerSize = 4 + 4 + 4 + 8 + 8 + 8
func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64,
chunkFactor uint32, w *CountHashWriter) error {
// write out the number of docs
err := binary.Write(w, binary.BigEndian, numDocs)
if err != nil {
return err
}
// write out the stored field index location:
err = binary.Write(w, binary.BigEndian, storedIndexOffset)
if err != nil {
return err
}
// write out the field index location
err = binary.Write(w, binary.BigEndian, fieldIndexOffset)
if err != nil {
return err
}
// write out 32-bit chunk factor
err = binary.Write(w, binary.BigEndian, chunkFactor)
if err != nil {
return err
}
// write out 32-bit version
err = binary.Write(w, binary.BigEndian, version)
if err != nil {
return err
}
// write out CRC-32 of everything up to but not including this CRC
err = binary.Write(w, binary.BigEndian, w.Sum32())
if err != nil {
return err
}
return nil
}

@@ -0,0 +1,288 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"os"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment/mem"
)
func TestBuild(t *testing.T) {
_ = os.RemoveAll("/tmp/scorch.zap")
memSegment := buildMemSegment()
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
if err != nil {
t.Fatal(err)
}
}
func buildMemSegment() *mem.Segment {
doc := &document.Document{
ID: "a",
Fields: []document.Field{
document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil),
document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
},
CompositeFields: []*document.CompositeField{
document.NewCompositeField("_all", true, nil, []string{"_id"}),
},
}
// forge analyzed docs
results := []*index.AnalysisResult{
&index.AnalysisResult{
Document: doc,
Analyzed: []analysis.TokenFrequencies{
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 1,
Position: 1,
Term: []byte("a"),
},
}, nil, false),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 3,
Position: 1,
Term: []byte("wow"),
},
}, nil, true),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 4,
Position: 1,
Term: []byte("some"),
},
&analysis.Token{
Start: 5,
End: 10,
Position: 2,
Term: []byte("thing"),
},
}, nil, true),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 4,
Position: 1,
Term: []byte("cold"),
},
}, []uint64{0}, true),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 4,
Position: 1,
Term: []byte("dark"),
},
}, []uint64{1}, true),
},
Length: []int{
1,
1,
2,
1,
1,
},
},
}
// fix up composite fields
for _, ar := range results {
for i, f := range ar.Document.Fields {
for _, cf := range ar.Document.CompositeFields {
cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i])
}
}
}
return mem.NewFromAnalyzedDocs(results)
}
func buildMemSegmentMulti() *mem.Segment {
doc := &document.Document{
ID: "a",
Fields: []document.Field{
document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil),
document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
},
CompositeFields: []*document.CompositeField{
document.NewCompositeField("_all", true, nil, []string{"_id"}),
},
}
doc2 := &document.Document{
ID: "b",
Fields: []document.Field{
document.NewTextFieldCustom("_id", nil, []byte("b"), document.IndexField|document.StoreField, nil),
document.NewTextFieldCustom("name", nil, []byte("who"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
},
CompositeFields: []*document.CompositeField{
document.NewCompositeField("_all", true, nil, []string{"_id"}),
},
}
// forge analyzed docs
results := []*index.AnalysisResult{
&index.AnalysisResult{
Document: doc,
Analyzed: []analysis.TokenFrequencies{
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 1,
Position: 1,
Term: []byte("a"),
},
}, nil, false),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 3,
Position: 1,
Term: []byte("wow"),
},
}, nil, true),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 4,
Position: 1,
Term: []byte("some"),
},
&analysis.Token{
Start: 5,
End: 10,
Position: 2,
Term: []byte("thing"),
},
}, nil, true),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 4,
Position: 1,
Term: []byte("cold"),
},
}, []uint64{0}, true),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 4,
Position: 1,
Term: []byte("dark"),
},
}, []uint64{1}, true),
},
Length: []int{
1,
1,
2,
1,
1,
},
},
&index.AnalysisResult{
Document: doc2,
Analyzed: []analysis.TokenFrequencies{
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 1,
Position: 1,
Term: []byte("b"),
},
}, nil, false),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 3,
Position: 1,
Term: []byte("who"),
},
}, nil, true),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 4,
Position: 1,
Term: []byte("some"),
},
&analysis.Token{
Start: 5,
End: 10,
Position: 2,
Term: []byte("thing"),
},
}, nil, true),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 4,
Position: 1,
Term: []byte("cold"),
},
}, []uint64{0}, true),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 4,
Position: 1,
Term: []byte("dark"),
},
}, []uint64{1}, true),
},
Length: []int{
1,
1,
2,
1,
1,
},
},
}
// fix up composite fields
for _, ar := range results {
for i, f := range ar.Document.Fields {
for _, cf := range ar.Document.CompositeFields {
cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i])
}
}
}
segment := mem.NewFromAnalyzedDocs(results)
return segment
}

@@ -0,0 +1,3 @@
# zap command line utility
Kind of a hack, just put together quickly to let me debug some issues.
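
For example, against a segment persisted to /tmp/scorch.zap (as the tests do), a hypothetical session:

```
zap footer /tmp/scorch.zap
zap dict /tmp/scorch.zap desc
zap explore /tmp/scorch.zap desc apple
zap stored /tmp/scorch.zap 0
```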

@@ -0,0 +1,72 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cmd
import (
"encoding/binary"
"fmt"
"github.com/couchbaselabs/vellum"
"github.com/spf13/cobra"
)
// dictCmd represents the dict command
var dictCmd = &cobra.Command{
Use: "dict [path] [field]",
Short: "dict prints the term dictionary for the specified field",
Long: `The dict command lets you print the term dictionary for the specified field.`,
RunE: func(cmd *cobra.Command, args []string) error {
if len(args) < 2 {
return fmt.Errorf("must specify field")
}
data := segment.Data()
addr, err := segment.DictAddr(args[1])
if err != nil {
return fmt.Errorf("error determing address: %v", err)
}
fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr)
vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64])
fmt.Printf("vellum length: %d\n", vellumLen)
fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen]
fmt.Printf("raw vellum data % x\n", fstBytes)
fmt.Printf("dictionary:\n\n")
if fstBytes != nil {
fst, err := vellum.Load(fstBytes)
if err != nil {
return fmt.Errorf("dictionary field %s vellum err: %v", args[1], err)
}
itr, err := fst.Iterator(nil, nil)
for err == nil {
currTerm, currVal := itr.Current()
fmt.Printf("%s - %d (%x)\n", currTerm, currVal, currVal)
err = itr.Next()
}
if err != nil && err != vellum.ErrIteratorDone {
return fmt.Errorf("error iterating dictionary: %v", err)
}
}
return nil
},
}
func init() {
RootCmd.AddCommand(dictCmd)
}

@@ -0,0 +1,124 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cmd
import (
"encoding/binary"
"fmt"
"log"
"github.com/couchbaselabs/vellum"
"github.com/spf13/cobra"
)
// exploreCmd represents the explore command
var exploreCmd = &cobra.Command{
Use: "explore [path] [field] <term> <docNum>",
Short: "explores the index by field, then term (optional), and then docNum (optional)",
Long: `The explore command lets you explore the index in order of field, then optionally by term, then optionally again by doc number.`,
RunE: func(cmd *cobra.Command, args []string) error {
if len(args) < 2 {
return fmt.Errorf("must specify field")
}
data := segment.Data()
addr, err := segment.DictAddr(args[1])
if err != nil {
return fmt.Errorf("error determing address: %v", err)
}
fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr)
vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64])
fmt.Printf("vellum length: %d\n", vellumLen)
fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen]
fmt.Printf("raw vellum data % x\n", fstBytes)
if len(args) >= 3 {
if fstBytes != nil {
fst, err := vellum.Load(fstBytes)
if err != nil {
return fmt.Errorf("dictionary field %s vellum err: %v", args[1], err)
}
postingsAddr, exists, err := fst.Get([]byte(args[2]))
if err != nil {
return fmt.Errorf("error looking for term : %v", err)
}
if exists {
fmt.Printf("postings list begins at %d (%x)\n", postingsAddr, postingsAddr)
var n uint64
freqAddr, read := binary.Uvarint(data[postingsAddr : postingsAddr+binary.MaxVarintLen64])
n += uint64(read)
var locAddr uint64
locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
n += uint64(read)
var postingListLen uint64
postingListLen, _ = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64])
fmt.Printf("Posting List Length: %d\n", postingListLen)
fmt.Printf("Freq details at: %d (%x)\n", freqAddr, freqAddr)
numChunks, r2 := binary.Uvarint(data[freqAddr : freqAddr+binary.MaxVarintLen64])
n = uint64(r2)
var freqChunkLens []uint64
for j := uint64(0); j < numChunks; j++ {
chunkLen, r3 := binary.Uvarint(data[freqAddr+n : freqAddr+n+binary.MaxVarintLen64])
n += uint64(r3)
freqChunkLens = append(freqChunkLens, chunkLen)
}
running := freqAddr + n
for k, chunkLen := range freqChunkLens {
fmt.Printf("freq chunk: %d, len %d, start at %d (%x) end %d (%x)\n", k, chunkLen, running, running, running+chunkLen, running+chunkLen)
running += chunkLen
}
fmt.Printf("Loc details at: %d (%x)\n", locAddr, locAddr)
numLChunks, r4 := binary.Uvarint(data[locAddr : locAddr+binary.MaxVarintLen64])
n = uint64(r4)
fmt.Printf("there are %d loc chunks\n", numLChunks)
var locChunkLens []uint64
for j := uint64(0); j < numLChunks; j++ {
log.Printf("reading from %d(%x)\n", locAddr+n, locAddr+n)
log.Printf("data i see here: % x\n", data[locAddr+n:locAddr+n+binary.MaxVarintLen64])
lchunkLen, r4 := binary.Uvarint(data[locAddr+n : locAddr+n+binary.MaxVarintLen64])
n += uint64(r4)
log.Printf("see chunk len %d(%x)\n", lchunkLen, lchunkLen)
locChunkLens = append(locChunkLens, lchunkLen)
}
running2 := locAddr + n
for k, chunkLen := range locChunkLens {
fmt.Printf("loc chunk: %d, len %d(%x), start at %d (%x) end %d (%x)\n", k, chunkLen, chunkLen, running2, running2, running2+chunkLen, running2+chunkLen)
running2 += chunkLen
}
} else {
fmt.Printf("dictionary does not contain term '%s'\n", args[2])
}
}
}
return nil
},
}
func init() {
RootCmd.AddCommand(exploreCmd)
}

@@ -0,0 +1,43 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cmd
import (
"fmt"
"github.com/spf13/cobra"
)
// footerCmd represents the footer command
var footerCmd = &cobra.Command{
Use: "footer [path]",
Short: "prints the contents of the zap footer",
Long: `The footer command will print the contents of the footer.`,
RunE: func(cmd *cobra.Command, args []string) error {
data := segment.Data()
fmt.Printf("Length: %d\n", len(data))
fmt.Printf("CRC: %#x\n", segment.CRC())
fmt.Printf("Version: %d\n", segment.Version())
fmt.Printf("Chunk Factor: %d\n", segment.ChunkFactor())
fmt.Printf("Fields Idx: %d (%#x)\n", segment.FieldsIndexOffset(), segment.FieldsIndexOffset())
fmt.Printf("Stored Idx: %d (%#x)\n", segment.StoredIndexOffset(), segment.StoredIndexOffset())
fmt.Printf("Num Docs: %d\n", segment.NumDocs())
return nil
},
}
func init() {
RootCmd.AddCommand(footerCmd)
}

@@ -0,0 +1,58 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cmd
import (
"fmt"
"os"
"github.com/blevesearch/bleve/index/scorch/segment/zap"
"github.com/spf13/cobra"
)
var segment *zap.Segment
// RootCmd represents the base command when called without any subcommands
var RootCmd = &cobra.Command{
Use: "zap",
Short: "command-line tool to interact with a zap file",
Long: `Zap is a command-line tool to interact with a zap file.`,
PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
if len(args) < 1 {
return fmt.Errorf("must specify path to zap file")
}
segInf, err := zap.Open(args[0])
if err != nil {
return fmt.Errorf("error opening zap file: %v", err)
}
segment = segInf.(*zap.Segment)
return nil
},
PersistentPostRunE: func(cmd *cobra.Command, args []string) error {
return nil
},
}
// Execute adds all child commands to the root command and sets flags appropriately.
// This is called by main.main(). It only needs to happen once to the rootCmd.
func Execute() {
if err := RootCmd.Execute(); err != nil {
fmt.Println(err)
os.Exit(-1)
}
}

@@ -0,0 +1,73 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cmd
import (
"encoding/binary"
"fmt"
"strconv"
"github.com/golang/snappy"
"github.com/spf13/cobra"
)
// storedCmd represents the stored command
var storedCmd = &cobra.Command{
Use: "stored [path] [docNum]",
Short: "prints the stored section for a doc number",
Long: `The stored command will print the raw stored data bytes for the specified document number.`,
RunE: func(cmd *cobra.Command, args []string) error {
if len(args) < 2 {
return fmt.Errorf("must specify doc number")
}
docNum, err := strconv.Atoi(args[1])
if err != nil {
return fmt.Errorf("unable to parse doc number: %v", err)
}
if docNum >= int(segment.NumDocs()) {
return fmt.Errorf("invalid doc number %d (valid 0 - %d)", docNum, segment.NumDocs()-1)
}
data := segment.Data()
storedIdx := segment.StoredIndexOffset()
// read docNum entry in the index
indexPos := storedIdx + (8 * uint64(docNum))
storedStartAddr := binary.BigEndian.Uint64(data[indexPos : indexPos+8])
fmt.Printf("Stored field starts at %d (%#x)\n", storedStartAddr, storedStartAddr)
var n uint64
metaLen, read := binary.Uvarint(data[storedStartAddr : storedStartAddr+binary.MaxVarintLen64])
n += uint64(read)
fmt.Printf("Meta Len: %d\n", metaLen)
var dataLen uint64
dataLen, read = binary.Uvarint(data[storedStartAddr+n : storedStartAddr+n+binary.MaxVarintLen64])
n += uint64(read)
fmt.Printf("Data Len: %d\n", dataLen)
meta := data[storedStartAddr+n : storedStartAddr+n+metaLen]
fmt.Printf("Raw meta: % x\n", meta)
raw := data[storedStartAddr+n+metaLen : storedStartAddr+n+metaLen+dataLen]
fmt.Printf("Raw data (len %d): % x\n", len(raw), raw)
uncompressed, err := snappy.Decode(nil, raw)
if err != nil {
return fmt.Errorf("error decoding snappy data: %v", err)
}
fmt.Printf("Uncompressed data (len %d): % x\n", len(uncompressed), uncompressed)
return nil
},
}
func init() {
RootCmd.AddCommand(storedCmd)
}

@@ -0,0 +1,23 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"github.com/blevesearch/bleve/index/scorch/segment/zap/cmd/zap/cmd"
)
func main() {
cmd.Execute()
}

@@ -0,0 +1,55 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"hash"
"hash/crc32"
"io"
)
// CountHashWriter is a wrapper around a Writer which counts the number of
// bytes written and maintains a running CRC-32 hash of them
type CountHashWriter struct {
w io.Writer
h hash.Hash32
n int
}
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
func NewCountHashWriter(w io.Writer) *CountHashWriter {
return &CountHashWriter{
w: w,
h: crc32.NewIEEE(),
}
}
// Write writes the provided bytes to the wrapped writer and counts the bytes
func (c *CountHashWriter) Write(b []byte) (int, error) {
n, err := c.w.Write(b)
c.n += n
_, _ = c.h.Write(b)
return n, err
}
// Count returns the number of bytes written
func (c *CountHashWriter) Count() int {
return c.n
}
// Sum32 returns the CRC-32 hash of the content written to this writer
func (c *CountHashWriter) Sum32() uint32 {
return c.h.Sum32()
}

@@ -0,0 +1,165 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"encoding/binary"
"fmt"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbaselabs/vellum"
"github.com/couchbaselabs/vellum/regexp"
)
// Dictionary is the zap representation of the term dictionary
type Dictionary struct {
segment *Segment
field string
fieldID uint16
fst *vellum.FST
}
// PostingsList returns the postings list for the specified term
func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
return d.postingsList(term, except)
}
func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*PostingsList, error) {
rv := &PostingsList{
dictionary: d,
term: term,
except: except,
}
if d.fst != nil {
postingsOffset, exists, err := d.fst.Get([]byte(term))
if err != nil {
return nil, fmt.Errorf("vellum err: %v", err)
}
if exists {
rv.postingsOffset = postingsOffset
// read the location of the freq/norm details
var n uint64
var read int
rv.freqOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.locOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
var postingsLen uint64
postingsLen, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
roaringBytes := d.segment.mm[postingsOffset+n : postingsOffset+n+postingsLen]
bitmap := roaring.NewBitmap()
_, err = bitmap.FromBuffer(roaringBytes)
if err != nil {
return nil, fmt.Errorf("error loading roaring bitmap: %v", err)
}
rv.postings = bitmap
}
}
return rv, nil
}
// Iterator returns an iterator for this dictionary
func (d *Dictionary) Iterator() segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
if d.fst != nil {
itr, err := d.fst.Iterator(nil, nil)
if err == nil {
rv.itr = itr
}
}
return rv
}
// PrefixIterator returns an iterator which only visits terms having the
// specified prefix
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
if d.fst != nil {
r, err := regexp.New(prefix + ".*")
if err == nil {
itr, err := d.fst.Search(r, nil, nil)
if err == nil {
rv.itr = itr
}
}
}
return rv
}
// RangeIterator returns an iterator which only visits terms between the
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
rv := &DictionaryIterator{
d: d,
}
// need to increment the end position to be inclusive
// an empty end means no upper bound
var endBytes []byte
if len(end) > 0 {
endBytes = []byte(end)
if endBytes[len(endBytes)-1] < 0xff {
endBytes[len(endBytes)-1]++
} else {
endBytes = append(endBytes, 0xff)
}
}
if d.fst != nil {
itr, err := d.fst.Iterator([]byte(start), endBytes)
if err == nil {
rv.itr = itr
}
}
return rv
}
// DictionaryIterator is an iterator for term dictionary
type DictionaryIterator struct {
d *Dictionary
itr vellum.Iterator
err error
}
// Next returns the next entry in the dictionary
func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
if i.itr == nil || i.err == vellum.ErrIteratorDone {
return nil, nil
} else if i.err != nil {
return nil, i.err
}
term, count := i.itr.Current()
rv := &index.DictEntry{
Term: string(term),
Count: count,
}
i.err = i.itr.Next()
return rv, nil
}

@@ -0,0 +1,183 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"os"
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch/segment/mem"
)
func buildMemSegmentForDict() *mem.Segment {
doc := &document.Document{
ID: "a",
Fields: []document.Field{
document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil),
document.NewTextFieldCustom("desc", nil, []byte("apple ball cat dog egg fish bat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
},
}
// forge analyzed docs
results := []*index.AnalysisResult{
&index.AnalysisResult{
Document: doc,
Analyzed: []analysis.TokenFrequencies{
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 1,
Position: 1,
Term: []byte("a"),
},
}, nil, false),
analysis.TokenFrequency(analysis.TokenStream{
&analysis.Token{
Start: 0,
End: 5,
Position: 1,
Term: []byte("apple"),
},
&analysis.Token{
Start: 6,
End: 10,
Position: 2,
Term: []byte("ball"),
},
&analysis.Token{
Start: 11,
End: 14,
Position: 3,
Term: []byte("cat"),
},
&analysis.Token{
Start: 15,
End: 18,
Position: 4,
Term: []byte("dog"),
},
&analysis.Token{
Start: 19,
End: 22,
Position: 5,
Term: []byte("egg"),
},
&analysis.Token{
Start: 20,
End: 24,
Position: 6,
Term: []byte("fish"),
},
&analysis.Token{
Start: 25,
End: 28,
Position: 7,
Term: []byte("bat"),
},
}, nil, true),
},
Length: []int{
1,
7,
},
},
}
segment := mem.NewFromAnalyzedDocs(results)
return segment
}
func TestDictionary(t *testing.T) {
_ = os.RemoveAll("/tmp/scorch.zap")
memSegment := buildMemSegmentForDict()
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
if err != nil {
t.Fatalf("error persisting segment: %v", err)
}
segment, err := Open("/tmp/scorch.zap")
if err != nil {
t.Fatalf("error opening segment: %v", err)
}
defer func() {
cerr := segment.Close()
if cerr != nil {
t.Fatalf("error closing segment: %v", err)
}
}()
dict, err := segment.Dictionary("desc")
if err != nil {
t.Fatal(err)
}
// test basic full iterator
expected := []string{"apple", "ball", "bat", "cat", "dog", "egg", "fish"}
var got []string
itr := dict.Iterator()
next, err := itr.Next()
for next != nil && err == nil {
got = append(got, next.Term)
next, err = itr.Next()
}
if err != nil {
t.Fatalf("dict itr error: %v", err)
}
if !reflect.DeepEqual(expected, got) {
t.Errorf("expected: %v, got: %v", expected, got)
}
// test prefix iterator
expected = []string{"ball", "bat"}
got = got[:0]
itr = dict.PrefixIterator("b")
next, err = itr.Next()
for next != nil && err == nil {
got = append(got, next.Term)
next, err = itr.Next()
}
if err != nil {
t.Fatalf("dict itr error: %v", err)
}
if !reflect.DeepEqual(expected, got) {
t.Errorf("expected: %v, got: %v", expected, got)
}
// test range iterator
expected = []string{"cat", "dog", "egg"}
got = got[:0]
itr = dict.RangeIterator("cat", "egg")
next, err = itr.Next()
for next != nil && err == nil {
got = append(got, next.Term)
next, err = itr.Next()
}
if err != nil {
t.Fatalf("dict itr error: %v", err)
}
if !reflect.DeepEqual(expected, got) {
t.Errorf("expected: %v, got: %v", expected, got)
}
}

@@ -0,0 +1,362 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"fmt"
"math"
"github.com/RoaringBitmap/roaring"
"github.com/Smerity/govarint"
"github.com/blevesearch/bleve/index/scorch/segment"
)
// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
dictionary *Dictionary
term string
postingsOffset uint64
freqOffset uint64
locOffset uint64
postings *roaring.Bitmap
except *roaring.Bitmap
postingKey []byte
}
// Iterator returns an iterator for this postings list
func (p *PostingsList) Iterator() segment.PostingsIterator {
rv := &PostingsIterator{
postings: p,
}
if p.postings != nil {
// prepare the freq chunk details
var n uint64
var read int
var numFreqChunks uint64
numFreqChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.freqChunkLens = make([]uint64, int(numFreqChunks))
for i := 0; i < int(numFreqChunks); i++ {
rv.freqChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.freqChunkStart = p.freqOffset + n
// prepare the loc chunk details
n = 0
var numLocChunks uint64
numLocChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.locChunkLens = make([]uint64, int(numLocChunks))
for i := 0; i < int(numLocChunks); i++ {
rv.locChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
n += uint64(read)
}
rv.locChunkStart = p.locOffset + n
rv.all = p.postings.Iterator()
if p.except != nil {
allExcept := p.postings.Clone()
allExcept.AndNot(p.except)
rv.actual = allExcept.Iterator()
} else {
rv.actual = p.postings.Iterator()
}
}
return rv
}
// Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 {
var rv uint64
if p.postings != nil {
rv = p.postings.GetCardinality()
if p.except != nil {
except := p.except.GetCardinality()
if except > rv {
// avoid underflow
except = rv
}
rv -= except
}
}
return rv
}
// PostingsIterator provides a way to iterate through the postings list
type PostingsIterator struct {
postings *PostingsList
all roaring.IntIterable
offset int
locoffset int
actual roaring.IntIterable
currChunk uint32
currChunkFreqNorm []byte
currChunkLoc []byte
freqNormDecoder *govarint.Base128Decoder
locDecoder *govarint.Base128Decoder
freqChunkLens []uint64
freqChunkStart uint64
locChunkLens []uint64
locChunkStart uint64
}
func (i *PostingsIterator) loadChunk(chunk int) error {
if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) {
return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens))
}
// load correct chunk bytes
start := i.freqChunkStart
for j := 0; j < chunk; j++ {
start += i.freqChunkLens[j]
}
end := start + i.freqChunkLens[chunk]
i.currChunkFreqNorm = i.postings.dictionary.segment.mm[start:end]
i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm))
start = i.locChunkStart
for j := 0; j < chunk; j++ {
start += i.locChunkLens[j]
}
end = start + i.locChunkLens[chunk]
i.currChunkLoc = i.postings.dictionary.segment.mm[start:end]
i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc))
i.currChunk = uint32(chunk)
return nil
}
func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) {
freq, err := i.freqNormDecoder.GetU64()
if err != nil {
return 0, 0, fmt.Errorf("error reading frequency: %v", err)
}
normBits, err := i.freqNormDecoder.GetU64()
if err != nil {
return 0, 0, fmt.Errorf("error reading norm: %v", err)
}
return freq, normBits, err
}
// readLocation processes all the integers on the stream representing a single
// location. if you care about it, pass in a non-nil location struct, and we
// will fill it. if you don't care about it, pass in nil and we safely consume
// the contents.
func (i *PostingsIterator) readLocation(l *Location) error {
// read off field
fieldID, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location field: %v", err)
}
// read off pos
pos, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location pos: %v", err)
}
// read off start
start, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location start: %v", err)
}
// read off end
end, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location end: %v", err)
}
// read off num array pos
numArrayPos, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading location num array pos: %v", err)
}
// group these together for less branching
if l != nil {
l.field = i.postings.dictionary.segment.fieldsInv[fieldID]
l.pos = pos
l.start = start
l.end = end
if numArrayPos > 0 {
l.ap = make([]uint64, int(numArrayPos))
}
}
// read off array positions
for k := 0; k < int(numArrayPos); k++ {
ap, err := i.locDecoder.GetU64()
if err != nil {
return fmt.Errorf("error reading array position: %v", err)
}
if l != nil {
l.ap[k] = ap
}
}
return nil
}
// Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) Next() (segment.Posting, error) {
if i.actual == nil || !i.actual.HasNext() {
return nil, nil
}
n := i.actual.Next()
nChunk := n / i.postings.dictionary.segment.chunkFactor
allN := i.all.Next()
allNChunk := allN / i.postings.dictionary.segment.chunkFactor
// n is the next actual hit (excluding some postings)
// allN is the next hit in the full postings
// if they don't match, adjust offsets to factor in item we're skipping over
// incr the all iterator, and check again
for allN != n {
// in different chunks, reset offsets
if allNChunk != nChunk {
i.locoffset = 0
i.offset = 0
} else {
if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
err := i.loadChunk(int(nChunk))
if err != nil {
return nil, fmt.Errorf("error loading chunk: %v", err)
}
}
// read off freq/norm even though we don't care about them here
freq, _, err := i.readFreqNorm()
if err != nil {
return nil, err
}
if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
for j := 0; j < int(freq); j++ {
err := i.readLocation(nil)
if err != nil {
return nil, err
}
}
}
// in same chunk, need to account for offsets
i.offset++
}
allN = i.all.Next()
}
if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
err := i.loadChunk(int(nChunk))
if err != nil {
return nil, fmt.Errorf("error loading chunk: %v", err)
}
}
rv := &Posting{
iterator: i,
docNum: uint64(n),
}
var err error
var normBits uint64
rv.freq, normBits, err = i.readFreqNorm()
if err != nil {
return nil, err
}
rv.norm = math.Float32frombits(uint32(normBits))
if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
// read off 'freq' locations
rv.locs = make([]segment.Location, rv.freq)
locs := make([]Location, rv.freq)
for j := 0; j < int(rv.freq); j++ {
err := i.readLocation(&locs[j])
if err != nil {
return nil, err
}
rv.locs[j] = &locs[j]
}
}
return rv, nil
}
// Posting is a single entry in a postings list
type Posting struct {
iterator *PostingsIterator
docNum uint64
freq uint64
norm float32
locs []segment.Location
}
// Number returns the document number of this posting in this segment
func (p *Posting) Number() uint64 {
return p.docNum
}
// Frequency returns the frequency of occurrence of this term in this doc/field
func (p *Posting) Frequency() uint64 {
return p.freq
}
// Norm returns the normalization factor for this posting
func (p *Posting) Norm() float64 {
return float64(p.norm)
}
// Locations returns the location information for each occurrence
func (p *Posting) Locations() []segment.Location {
return p.locs
}
// Location represents the location of a single occurrence
type Location struct {
field string
pos uint64
start uint64
end uint64
ap []uint64
}
// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
return l.field
}
// Start returns the start byte offset of this occurrence
func (l *Location) Start() uint64 {
return l.start
}
// End returns the end byte offset of this occurrence
func (l *Location) End() uint64 {
return l.end
}
// Pos returns the 1-based phrase position of this occurrence
func (l *Location) Pos() uint64 {
return l.pos
}
// ArrayPositions returns the array position vector associated with this occurrence
func (l *Location) ArrayPositions() []uint64 {
return l.ap
}

@@ -0,0 +1,352 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"os"
"github.com/RoaringBitmap/roaring"
"github.com/Smerity/govarint"
"github.com/blevesearch/bleve/index/scorch/segment"
"github.com/couchbaselabs/vellum"
mmap "github.com/edsrzf/mmap-go"
"github.com/golang/snappy"
)
// Open returns a zap impl of a segment
func Open(path string) (segment.Segment, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
mm, err := mmap.Map(f, mmap.RDONLY, 0)
if err != nil {
// mmap failed, try to close the file
_ = f.Close()
return nil, err
}
rv := &Segment{
f: f,
mm: mm,
path: path,
fieldsMap: make(map[string]uint16),
}
err = rv.loadConfig()
if err != nil {
_ = rv.Close()
return nil, err
}
err = rv.loadFields()
if err != nil {
_ = rv.Close()
return nil, err
}
return rv, nil
}
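// A minimal usage sketch (the path is hypothetical; PersistSegment, as used
// by the tests below, writes a compatible file):
//
//	seg, err := Open("/tmp/scorch.zap")
//	if err != nil {
//		// handle error
//	}
//	defer seg.Close()
//	fmt.Println(seg.Count())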
// Segment implements the segment.Segment interface on top of the zap file format
type Segment struct {
f *os.File
mm mmap.MMap
path string
crc uint32
version uint32
chunkFactor uint32
numDocs uint64
storedIndexOffset uint64
fieldsIndexOffset uint64
fieldsMap map[string]uint16
fieldsInv []string
fieldsLoc []bool
fieldsOffsets []uint64
}
func (s *Segment) loadConfig() error {
crcOffset := len(s.mm) - 4
s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4])
verOffset := crcOffset - 4
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
if s.version != version {
return fmt.Errorf("unsupported version %d", s.version)
}
chunkOffset := verOffset - 4
s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4])
fieldsOffset := chunkOffset - 8
s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsOffset : fieldsOffset+8])
storedOffset := fieldsOffset - 8
s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedOffset : storedOffset+8])
docNumOffset := storedOffset - 8
s.numDocs = binary.BigEndian.Uint64(s.mm[docNumOffset : docNumOffset+8])
return nil
}
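// For reference, loadConfig walks a fixed-size footer backwards from the end
// of the mmap'd file; assuming footerSize covers exactly these fields (36
// bytes), the layout at the end of the file is:
//
//	numDocs           uint64 (big endian)
//	storedIndexOffset uint64 (big endian)
//	fieldsIndexOffset uint64 (big endian)
//	chunkFactor       uint32 (big endian)
//	version           uint32 (big endian)
//	crc               uint32 (big endian)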
func (s *Segment) loadFields() error {
// NOTE for now we assume the fields index immediately precedes the footer
// if this changes, we need to adjust accordingly (or store an explicit length)
fieldsIndexEnd := uint64(len(s.mm) - footerSize)
// iterate through fields index
var fieldID uint64
for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
addr := binary.BigEndian.Uint64(s.mm[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8])
var n uint64
hasStoredLoc, read := binary.Uvarint(s.mm[addr:fieldsIndexEnd])
n += uint64(read)
if hasStoredLoc == 1 {
s.fieldsLoc = append(s.fieldsLoc, true)
} else {
s.fieldsLoc = append(s.fieldsLoc, false)
}
var dictLoc uint64
dictLoc, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
n += uint64(read)
s.fieldsOffsets = append(s.fieldsOffsets, dictLoc)
var nameLen uint64
nameLen, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
n += uint64(read)
name := string(s.mm[addr+n : addr+n+nameLen])
s.fieldsInv = append(s.fieldsInv, name)
s.fieldsMap[name] = uint16(fieldID + 1)
fieldID++
}
return nil
}
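// As decoded above, the fields index is one big endian uint64 address per
// field, and the record at each address is:
//
//	hasStoredLoc (uvarint; 1 means the field has location data)
//	dictLoc      (uvarint; offset of the field's term dictionary)
//	nameLen      (uvarint)
//	name         (nameLen bytes)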
// Dictionary returns the term dictionary for the specified field
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
dict, err := s.dictionary(field)
if err == nil && dict == nil {
return &segment.EmptyDictionary{}, nil
}
return dict, err
}
func (s *Segment) dictionary(field string) (*Dictionary, error) {
rv := &Dictionary{
segment: s,
field: field,
}
rv.fieldID = s.fieldsMap[field]
if rv.fieldID > 0 {
rv.fieldID = rv.fieldID - 1
dictStart := s.fieldsOffsets[rv.fieldID]
// read the length of the vellum data
vellumLen, read := binary.Uvarint(s.mm[dictStart : dictStart+binary.MaxVarintLen64])
fstBytes := s.mm[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
if fstBytes != nil {
fst, err := vellum.Load(fstBytes)
if err != nil {
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
}
rv.fst = fst
}
} else {
return nil, nil
}
return rv, nil
}
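// Note: fieldsMap stores the field id plus one, so a zero lookup can signal
// "no such field"; dictionary subtracts one again before indexing
// fieldsOffsets.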
// VisitDocument invokes the DocumentFieldValueVisitor for each stored field
// for the specified doc number
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
// first make sure this is a valid number in this segment
if num < s.numDocs {
docStoredStartAddr := s.storedIndexOffset + (8 * num)
docStoredStart := binary.BigEndian.Uint64(s.mm[docStoredStartAddr : docStoredStartAddr+8])
var n uint64
metaLen, read := binary.Uvarint(s.mm[docStoredStart : docStoredStart+binary.MaxVarintLen64])
n += uint64(read)
var dataLen uint64
dataLen, read = binary.Uvarint(s.mm[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64])
n += uint64(read)
meta := s.mm[docStoredStart+n : docStoredStart+n+metaLen]
data := s.mm[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen]
uncompressed, err := snappy.Decode(nil, data)
if err != nil {
return err
}
// now decode meta and process
reader := bytes.NewReader(meta)
decoder := govarint.NewU64Base128Decoder(reader)
keepGoing := true
for keepGoing {
field, err := decoder.GetU64()
if err == io.EOF {
break
}
if err != nil {
return err
}
typ, err := decoder.GetU64()
if err != nil {
return err
}
offset, err := decoder.GetU64()
if err != nil {
return err
}
l, err := decoder.GetU64()
if err != nil {
return err
}
numap, err := decoder.GetU64()
if err != nil {
return err
}
var arrayPos []uint64
if numap > 0 {
arrayPos = make([]uint64, numap)
for i := 0; i < int(numap); i++ {
ap, err := decoder.GetU64()
if err != nil {
return err
}
arrayPos[i] = ap
}
}
value := uncompressed[offset : offset+l]
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
}
}
return nil
}
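// A minimal visitor sketch (prints every stored field value; the signature
// matches the decode loop above, and seg is a hypothetical open segment):
//
//	err := seg.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool {
//		fmt.Printf("%s (%c): %s %v\n", field, typ, value, pos)
//		return true // returning false stops the visit early
//	})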
// Count returns the number of documents in this segment.
func (s *Segment) Count() uint64 {
return s.numDocs
}
// DocNumbers returns a bitmap corresponding to the doc numbers of all the
// provided _id strings
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
rv := roaring.New()
if len(s.fieldsMap) > 0 {
idDict, err := s.dictionary("_id")
if err != nil {
return nil, err
}
for _, id := range ids {
postings, err := idDict.postingsList(id, nil)
if err != nil {
return nil, err
}
if postings.postings != nil {
rv.Or(postings.postings)
}
}
}
return rv, nil
}
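// A short sketch of the exclusion pattern exercised by the tests below
// (dict is assumed to come from Dictionary above, seg from Open):
//
//	exclude, err := seg.DocNumbers([]string{"a"})
//	if err == nil {
//		pl, _ := dict.PostingsList("thing", exclude)
//		itr := pl.Iterator()
//		_ = itr
//	}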
// Fields returns the field names used in this segment
func (s *Segment) Fields() []string {
return s.fieldsInv
}
// Path returns the path of this segment on disk
func (s *Segment) Path() string {
return s.path
}
// Close releases all resources associated with this segment
func (s *Segment) Close() (err error) {
if s.mm != nil {
err = s.mm.Unmap()
}
// try to close file even if unmap failed
if s.f != nil {
err2 := s.f.Close()
if err == nil {
// try to return first error
err = err2
}
}
return
}
// some helpers I started adding for the command-line utility
// Data returns the underlying mmaped data slice
func (s *Segment) Data() []byte {
return s.mm
}
// CRC returns the CRC value stored in the file footer
func (s *Segment) CRC() uint32 {
return s.crc
}
// Version returns the file version in the file footer
func (s *Segment) Version() uint32 {
return s.version
}
// ChunkFactor returns the chunk factor in the file footer
func (s *Segment) ChunkFactor() uint32 {
return s.chunkFactor
}
// FieldsIndexOffset returns the fields index offset in the file footer
func (s *Segment) FieldsIndexOffset() uint64 {
return s.fieldsIndexOffset
}
// StoredIndexOffset returns the stored value index offset in the file footer
func (s *Segment) StoredIndexOffset() uint64 {
return s.storedIndexOffset
}
// NumDocs returns the number of documents in the file footer
func (s *Segment) NumDocs() uint64 {
return s.numDocs
}
// DictAddr is a helper function to compute the file offset where the
// dictionary is stored for the specified field.
func (s *Segment) DictAddr(field string) (uint64, error) {
var fieldID uint16
var ok bool
if fieldID, ok = s.fieldsMap[field]; !ok {
return 0, fmt.Errorf("no such field '%s'", field)
}
return s.fieldsOffsets[fieldID-1], nil
}

View File

@ -0,0 +1,517 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap
import (
"math"
"os"
"reflect"
"testing"
)
func TestOpen(t *testing.T) {
_ = os.RemoveAll("/tmp/scorch.zap")
memSegment := buildMemSegment()
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
if err != nil {
t.Fatalf("error persisting segment: %v", err)
}
segment, err := Open("/tmp/scorch.zap")
if err != nil {
t.Fatalf("error opening segment: %v", err)
}
defer func() {
cerr := segment.Close()
if cerr != nil {
t.Fatalf("error closing segment: %v", err)
}
}()
expectFields := map[string]struct{}{
"_id": struct{}{},
"_all": struct{}{},
"name": struct{}{},
"desc": struct{}{},
"tag": struct{}{},
}
fields := segment.Fields()
if len(fields) != len(expectFields) {
t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields))
}
for _, field := range fields {
if _, ok := expectFields[field]; !ok {
t.Errorf("got unexpected field: %s", field)
}
}
docCount := segment.Count()
if docCount != 1 {
t.Errorf("expected count 1, got %d", docCount)
}
// check the _id field
dict, err := segment.Dictionary("_id")
if err != nil {
t.Fatal(err)
}
if dict == nil {
t.Fatal("got nil dict, expected non-nil")
}
postingsList, err := dict.PostingsList("a", nil)
if err != nil {
t.Fatal(err)
}
if postingsList == nil {
t.Fatal("got nil postings list, expected non-nil")
}
postingsItr := postingsList.Iterator()
if postingsItr == nil {
t.Fatal("got nil iterator, expected non-nil")
}
count := 0
nextPosting, err := postingsItr.Next()
for nextPosting != nil && err == nil {
count++
if nextPosting.Frequency() != 1 {
t.Errorf("expected frequency 1, got %d", nextPosting.Frequency())
}
if nextPosting.Number() != 0 {
t.Errorf("expected doc number 0, got %d", nextPosting.Number())
}
if nextPosting.Norm() != 1.0 {
t.Errorf("expected norm 1.0, got %f", nextPosting.Norm())
}
nextPosting, err = postingsItr.Next()
}
if err != nil {
t.Fatal(err)
}
if count != 1 {
t.Errorf("expected count to be 1, got %d", count)
}
// check the name field
dict, err = segment.Dictionary("name")
if err != nil {
t.Fatal(err)
}
if dict == nil {
t.Fatal("got nil dict, expected non-nil")
}
postingsList, err = dict.PostingsList("wow", nil)
if err != nil {
t.Fatal(err)
}
if postingsList == nil {
t.Fatal("got nil postings list, expected non-nil")
}
postingsItr = postingsList.Iterator()
if postingsItr == nil {
t.Fatal("got nil iterator, expected non-nil")
}
count = 0
nextPosting, err = postingsItr.Next()
for nextPosting != nil && err == nil {
count++
if nextPosting.Frequency() != 1 {
t.Errorf("expected frequency 1, got %d", nextPosting.Frequency())
}
if nextPosting.Number() != 0 {
t.Errorf("expected doc number 0, got %d", nextPosting.Number())
}
if nextPosting.Norm() != 1.0 {
t.Errorf("expected norm 1.0, got %f", nextPosting.Norm())
}
var numLocs uint64
for _, loc := range nextPosting.Locations() {
numLocs++
if loc.Field() != "name" {
t.Errorf("expected loc field to be 'name', got '%s'", loc.Field())
}
if loc.Start() != 0 {
t.Errorf("expected loc start to be 0, got %d", loc.Start())
}
if loc.End() != 3 {
t.Errorf("expected loc end to be 3, got %d", loc.End())
}
if loc.Pos() != 1 {
t.Errorf("expected loc pos to be 1, got %d", loc.Pos())
}
if loc.ArrayPositions() != nil {
t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions())
}
}
if numLocs != nextPosting.Frequency() {
t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs)
}
nextPosting, err = postingsItr.Next()
}
if err != nil {
t.Fatal(err)
}
if count != 1 {
t.Errorf("expected count to be 1, got %d", count)
}
// check the _all field (composite)
dict, err = segment.Dictionary("_all")
if err != nil {
t.Fatal(err)
}
if dict == nil {
t.Fatal("got nil dict, expected non-nil")
}
postingsList, err = dict.PostingsList("wow", nil)
if err != nil {
t.Fatal(err)
}
if postingsList == nil {
t.Fatal("got nil postings list, expected non-nil")
}
postingsItr = postingsList.Iterator()
if postingsItr == nil {
t.Fatal("got nil iterator, expected non-nil")
}
count = 0
nextPosting, err = postingsItr.Next()
for nextPosting != nil && err == nil {
count++
if nextPosting.Frequency() != 1 {
t.Errorf("expected frequency 1, got %d", nextPosting.Frequency())
}
if nextPosting.Number() != 0 {
t.Errorf("expected doc number 0, got %d", nextPosting.Number())
}
expectedNorm := float32(1.0 / math.Sqrt(float64(5)))
if nextPosting.Norm() != float64(expectedNorm) {
t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm())
}
var numLocs uint64
for _, loc := range nextPosting.Locations() {
numLocs++
if loc.Field() != "name" {
t.Errorf("expected loc field to be 'name', got '%s'", loc.Field())
}
if loc.Start() != 0 {
t.Errorf("expected loc start to be 0, got %d", loc.Start())
}
if loc.End() != 3 {
t.Errorf("expected loc end to be 3, got %d", loc.End())
}
if loc.Pos() != 1 {
t.Errorf("expected loc pos to be 1, got %d", loc.Pos())
}
if loc.ArrayPositions() != nil {
t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions())
}
}
if numLocs != nextPosting.Frequency() {
t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs)
}
nextPosting, err = postingsItr.Next()
}
if err != nil {
t.Fatal(err)
}
if count != 1 {
t.Errorf("expected count to be 1, got %d", count)
}
// now try a field with array positions
dict, err = segment.Dictionary("tag")
if err != nil {
t.Fatal(err)
}
if dict == nil {
t.Fatal("got nil dict, expected non-nil")
}
postingsList, err = dict.PostingsList("dark", nil)
if err != nil {
t.Fatal(err)
}
if postingsList == nil {
t.Fatal("got nil postings list, expected non-nil")
}
postingsItr = postingsList.Iterator()
if postingsItr == nil {
t.Fatal("got nil iterator, expected non-nil")
}
nextPosting, err = postingsItr.Next()
for nextPosting != nil && err == nil {
if nextPosting.Frequency() != 1 {
t.Errorf("expected frequency 1, got %d", nextPosting.Frequency())
}
if nextPosting.Number() != 0 {
t.Errorf("expected doc number 0, got %d", nextPosting.Number())
}
var numLocs uint64
for _, loc := range nextPosting.Locations() {
numLocs++
if loc.Field() != "tag" {
t.Errorf("expected loc field to be 'name', got '%s'", loc.Field())
}
if loc.Start() != 0 {
t.Errorf("expected loc start to be 0, got %d", loc.Start())
}
if loc.End() != 4 {
t.Errorf("expected loc end to be 3, got %d", loc.End())
}
if loc.Pos() != 1 {
t.Errorf("expected loc pos to be 1, got %d", loc.Pos())
}
expectArrayPos := []uint64{1}
if !reflect.DeepEqual(loc.ArrayPositions(), expectArrayPos) {
t.Errorf("expect loc array pos to be %v, got %v", expectArrayPos, loc.ArrayPositions())
}
}
if numLocs != nextPosting.Frequency() {
t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs)
}
nextPosting, err = postingsItr.Next()
}
if err != nil {
t.Fatal(err)
}
// now try and visit a document
var fieldValuesSeen int
err = segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool {
fieldValuesSeen++
return true
})
if err != nil {
t.Fatal(err)
}
if fieldValuesSeen != 5 {
t.Errorf("expected 5 field values, got %d", fieldValuesSeen)
}
}
func TestOpenMulti(t *testing.T) {
_ = os.RemoveAll("/tmp/scorch.zap")
memSegment := buildMemSegmentMulti()
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
if err != nil {
t.Fatalf("error persisting segment: %v", err)
}
segment, err := Open("/tmp/scorch.zap")
if err != nil {
t.Fatalf("error opening segment: %v", err)
}
defer func() {
cerr := segment.Close()
if cerr != nil {
t.Fatalf("error closing segment: %v", err)
}
}()
if segment.Count() != 2 {
t.Errorf("expected count 2, got %d", segment.Count())
}
// check the desc field
dict, err := segment.Dictionary("desc")
if err != nil {
t.Fatal(err)
}
if dict == nil {
t.Fatal("got nil dict, expected non-nil")
}
postingsList, err := dict.PostingsList("thing", nil)
if err != nil {
t.Fatal(err)
}
if postingsList == nil {
t.Fatal("got nil postings list, expected non-nil")
}
postingsItr := postingsList.Iterator()
if postingsItr == nil {
t.Fatal("got nil iterator, expected non-nil")
}
count := 0
nextPosting, err := postingsItr.Next()
for nextPosting != nil && err == nil {
count++
nextPosting, err = postingsItr.Next()
}
if err != nil {
t.Fatal(err)
}
if count != 2 {
t.Errorf("expected count to be 2, got %d", count)
}
// get docnum of a
exclude, err := segment.DocNumbers([]string{"a"})
if err != nil {
t.Fatal(err)
}
// look for term 'thing' excluding doc 'a'
postingsListExcluding, err := dict.PostingsList("thing", exclude)
if err != nil {
t.Fatal(err)
}
if postingsListExcluding == nil {
t.Fatal("got nil postings list, expected non-nil")
}
postingsListExcludingCount := postingsListExcluding.Count()
if postingsListExcludingCount != 1 {
t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount)
}
postingsItrExcluding := postingsListExcluding.Iterator()
if postingsItrExcluding == nil {
t.Fatal("got nil iterator, expected non-nil")
}
count = 0
nextPosting, err = postingsItrExcluding.Next()
for nextPosting != nil && err == nil {
count++
nextPosting, err = postingsItrExcluding.Next()
}
if err != nil {
t.Fatal(err)
}
if count != 1 {
t.Errorf("expected count to be 1, got %d", count)
}
}
func TestOpenMultiWithTwoChunks(t *testing.T) {
_ = os.RemoveAll("/tmp/scorch.zap")
memSegment := buildMemSegmentMulti()
err := PersistSegment(memSegment, "/tmp/scorch.zap", 1)
if err != nil {
t.Fatalf("error persisting segment: %v", err)
}
segment, err := Open("/tmp/scorch.zap")
if err != nil {
t.Fatalf("error opening segment: %v", err)
}
defer func() {
cerr := segment.Close()
if cerr != nil {
t.Fatalf("error closing segment: %v", err)
}
}()
if segment.Count() != 2 {
t.Errorf("expected count 2, got %d", segment.Count())
}
// check the desc field
dict, err := segment.Dictionary("desc")
if err != nil {
t.Fatal(err)
}
if dict == nil {
t.Fatal("got nil dict, expected non-nil")
}
postingsList, err := dict.PostingsList("thing", nil)
if err != nil {
t.Fatal(err)
}
if postingsList == nil {
t.Fatal("got nil postings list, expected non-nil")
}
postingsItr := postingsList.Iterator()
if postingsItr == nil {
t.Fatal("got nil iterator, expected non-nil")
}
count := 0
nextPosting, err := postingsItr.Next()
for nextPosting != nil && err == nil {
count++
nextPosting, err = postingsItr.Next()
}
if err != nil {
t.Fatal(err)
}
if count != 2 {
t.Errorf("expected count to be 2, got %d", count)
}
// get docnum of a
exclude, err := segment.DocNumbers([]string{"a"})
if err != nil {
t.Fatal(err)
}
// look for term 'thing' excluding doc 'a'
postingsListExcluding, err := dict.PostingsList("thing", exclude)
if err != nil {
t.Fatal(err)
}
if postingsListExcluding == nil {
t.Fatal("got nil postings list, expected non-nil")
}
postingsItrExcluding := postingsListExcluding.Iterator()
if postingsItrExcluding == nil {
t.Fatal("got nil iterator, expected non-nil")
}
count = 0
nextPosting, err = postingsItrExcluding.Next()
for nextPosting != nil && err == nil {
count++
nextPosting, err = postingsItrExcluding.Next()
}
if err != nil {
t.Fatal(err)
}
if count != 1 {
t.Errorf("expected count to be 1, got %d", count)
}
}