diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go
new file mode 100644
index 00000000..a682740f
--- /dev/null
+++ b/index/scorch/segment/zap/intcoder.go
@@ -0,0 +1,111 @@
+package zap
+
+import (
+	"bytes"
+	"encoding/binary"
+	"io"
+
+	"github.com/Smerity/govarint"
+)
+
+type chunkedIntCoder struct {
+	final     []byte
+	maxDocNum uint64
+	chunkSize uint64
+	chunkBuf  bytes.Buffer
+	encoder   *govarint.Base128Encoder
+	chunkLens []uint64
+	currChunk uint64
+}
+
+// newChunkedIntCoder returns a new chunked int coder which packs data into
+// chunks based on the provided chunkSize and supports up to the specified
+// maxDocNum.
+func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder {
+	total := maxDocNum/chunkSize + 1
+	rv := &chunkedIntCoder{
+		chunkSize: chunkSize,
+		maxDocNum: maxDocNum,
+		chunkLens: make([]uint64, total),
+	}
+	rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf)
+
+	return rv
+}
+
+// Reset lets you reuse this chunked int coder. Buffers are reset and reused
+// from previous use. You cannot change the chunk size or max doc num.
+func (c *chunkedIntCoder) Reset() {
+	c.final = c.final[:0]
+	c.chunkBuf.Reset()
+	c.currChunk = 0
+	for i := range c.chunkLens {
+		c.chunkLens[i] = 0
+	}
+}
+
+// Add encodes the provided integers into the correct chunk for the provided
+// doc num. You MUST call Add() with increasing docNums.
+func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
+	chunk := docNum / c.chunkSize
+	if chunk != c.currChunk {
+		// starting a new chunk
+		if c.encoder != nil {
+			// close out last
+			c.encoder.Close()
+			encodingBytes := c.chunkBuf.Bytes()
+			c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
+			c.final = append(c.final, encodingBytes...)
+			c.chunkBuf.Reset()
+			c.encoder = govarint.NewU64Base128Encoder(&c.chunkBuf)
+		}
+		c.currChunk = chunk
+	}
+
+	for _, val := range vals {
+		_, err := c.encoder.PutU64(val)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Close indicates you are done calling Add(); this allows the final chunk
+// to be encoded.
+func (c *chunkedIntCoder) Close() {
+	c.encoder.Close()
+	encodingBytes := c.chunkBuf.Bytes()
+	c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
+	c.final = append(c.final, encodingBytes...)
+}
+
+// Write commits all the encoded chunked integers to the provided writer.
+func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
+	var tw int
+	buf := make([]byte, binary.MaxVarintLen64)
+	// write out the number of chunks
+	n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
+	nw, err := w.Write(buf[:n])
+	tw += nw
+	if err != nil {
+		return tw, err
+	}
+	// write out the chunk lens
+	for _, chunkLen := range c.chunkLens {
+		n := binary.PutUvarint(buf, uint64(chunkLen))
+		nw, err = w.Write(buf[:n])
+		tw += nw
+		if err != nil {
+			return tw, err
+		}
+	}
+	// write out the data
+	nw, err = w.Write(c.final)
+	tw += nw
+	if err != nil {
+		return tw, err
+	}
+	return tw, nil
+}
diff --git a/index/scorch/segment/zap/intcoder_test.go b/index/scorch/segment/zap/intcoder_test.go
new file mode 100644
index 00000000..f2623a54
--- /dev/null
+++ b/index/scorch/segment/zap/intcoder_test.go
@@ -0,0 +1,59 @@
+package zap
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+)
+
+func TestChunkIntCoder(t *testing.T) {
+	tests := []struct {
+		maxDocNum uint64
+		chunkSize uint64
+		docNums   []uint64
+		vals      [][]uint64
+		expected  []byte
+	}{
+		{
+			maxDocNum: 0,
+			chunkSize: 1,
+			docNums:   []uint64{0},
+			vals: [][]uint64{
+				[]uint64{3},
+			},
+			// 1 chunk, chunk-0 length 1, value 3
+			expected: []byte{0x1, 0x1, 0x3},
+		},
+		{
+			maxDocNum: 1,
+			chunkSize: 1,
+			docNums:   []uint64{0, 1},
+			vals: [][]uint64{
+				[]uint64{3},
+				[]uint64{7},
+			},
+			// 2 chunks, chunk-0 length 1, chunk-1 length 1, value 3, value 7
+			expected: []byte{0x2, 0x1, 0x1, 0x3, 0x7},
+		},
+	}
+
+	for _, test := range tests {
+
+		cic := newChunkedIntCoder(test.chunkSize, test.maxDocNum)
+		for i, docNum := range test.docNums {
+			err := cic.Add(docNum, test.vals[i]...)
+			if err != nil {
+				t.Fatalf("error adding to intcoder: %v", err)
+			}
+		}
+		cic.Close()
+		var actual bytes.Buffer
+		_, err := cic.Write(&actual)
+		if err != nil {
+			t.Fatalf("error writing: %v", err)
+		}
+		if !reflect.DeepEqual(test.expected, actual.Bytes()) {
+			t.Errorf("got % x, expected % x", actual.Bytes(), test.expected)
+		}
+	}
+}
diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go
new file mode 100644
index 00000000..fe5155d1
--- /dev/null
+++ b/index/scorch/segment/zap/merge.go
@@ -0,0 +1,526 @@
+package zap
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"math"
+	"os"
+
+	"github.com/RoaringBitmap/roaring"
+	"github.com/Smerity/govarint"
+	"github.com/couchbaselabs/vellum"
+	"github.com/golang/snappy"
+)
+
+// Merge takes a slice of zap segments, bit masks describing which documents
+// from them may be dropped, and creates a new segment containing the remaining
+// data. This new segment is built at the specified path, with the provided
+// chunkFactor.
+func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, chunkFactor uint32) error {
+
+	flag := os.O_RDWR | os.O_CREATE
+
+	f, err := os.OpenFile(path, flag, 0600)
+	if err != nil {
+		return err
+	}
+
+	// buffer the output
+	br := bufio.NewWriter(f)
+
+	// wrap it for counting (tracking offsets)
+	cr := NewCountHashWriter(br)
+
+	fieldsInv := mergeFields(segments)
+	fieldsMap := mapFields(fieldsInv)
+
+	newSegDocCount := computeNewDocCount(segments, drops)
+
+	var newDocNums [][]uint64
+	var storedIndexOffset uint64
+	storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
+		fieldsMap, fieldsInv, newSegDocCount, cr)
+	if err != nil {
+		return err
+	}
+
+	// FIXME temp until computed
+	//dictLocs := make([]uint64, len(fieldsInv))
+
+	var dictLocs []uint64
+	dictLocs, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap,
+		newDocNums, newSegDocCount, cr)
+	if err != nil {
+		return err
+	}
+
+	var fieldsIndexOffset uint64
+	fieldsIndexOffset, err = persistMergedFields(fieldsInv, cr, dictLocs)
+	if err != nil {
+		return err
+	}
+
+	err = persistFooter(newSegDocCount, storedIndexOffset,
+		fieldsIndexOffset, chunkFactor, cr)
+	if err != nil {
+		return err
+	}
+
+	err = br.Flush()
+	if err != nil {
+		return err
+	}
+
+	err = f.Sync()
+	if err != nil {
+		return err
+	}
+
+	err = f.Close()
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func mapFields(fields []string) map[string]uint16 {
+	rv := make(map[string]uint16)
+	for i, fieldName := range fields {
+		rv[fieldName] = uint16(i)
+	}
+	return rv
+}
+
+func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 {
+	var newSegDocCount uint64
+	for segI, segment := range segments {
+		segIAfterDrop := segment.NumDocs()
+		if drops[segI] != nil {
+			segIAfterDrop -= drops[segI].GetCardinality()
+		}
+		newSegDocCount += segIAfterDrop
+	}
+	return newSegDocCount
+}
+
+func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer) (int, error) {
+	var buffer bytes.Buffer
+	// write out postings list to memory so we know the len
+	postingsListLen, err := r.WriteTo(&buffer)
+	if err != nil {
+		return 0, err
+	}
+	var tw int
+	// write out the length of this postings list
+	buf := make([]byte, binary.MaxVarintLen64)
+	n := binary.PutUvarint(buf, uint64(postingsListLen))
+	nw, err := w.Write(buf[:n])
+	tw += nw
+	if err != nil {
+		return tw, err
+	}
+
+	// write out the postings list itself
+	nw, err = w.Write(buffer.Bytes())
+	tw += nw
+	if err != nil {
+		return tw, err
+	}
+
+	return tw, nil
+}
+
+func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap,
+	fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64,
+	newSegDocCount uint64, w *CountHashWriter) ([]uint64, error) {
+
+	rv := make([]uint64, len(fieldsInv))
+
+	var vellumBuf bytes.Buffer
+	// for each field
+	for fieldID, fieldName := range fieldsInv {
+		if fieldID != 0 {
+			vellumBuf.Reset()
+		}
+		newVellum, err := vellum.New(&vellumBuf, nil)
+		if err != nil {
+			return nil, err
+		}
+
+		// collect FST iterators from all segments for this field
+		var dicts []*Dictionary
+		var itrs []vellum.Iterator
+		for _, segment := range segments {
+			dict, err2 := segment.dictionary(fieldName)
+			if err2 != nil {
+				return nil, err2
+			}
+			dicts = append(dicts, dict)
+
+			itr, err2 := dict.fst.Iterator(nil, nil)
+			if err2 != nil {
+				return nil, err2
+			}
+			itrs = append(itrs, itr)
+		}
+
+		// create merging iterator
+		mergeItr, err := vellum.NewMergeIterator(itrs, func(postingOffsets []uint64) uint64 {
+			// we don't actually use the merged value
+			return 0
+		})
+
+		tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1)
+		locEncoder := newChunkedIntCoder(1024, newSegDocCount-1)
+		for err == nil {
+			term, _ := mergeItr.Current()
+
+			newRoaring := roaring.NewBitmap()
+			newRoaringLocs := roaring.NewBitmap()
+			tfEncoder.Reset()
+			locEncoder.Reset()
+
+			// now go back and get posting list for this term
+			// but pass in the deleted docs for that segment
+			for dictI, dict := range dicts {
+				postings, err2 := dict.postingsList(string(term), drops[dictI])
+				if err2 != nil {
+					return nil, err2
+				}
+
+				postItr := postings.Iterator()
+				next, err2 := postItr.Next()
+				for next != nil && err2 == nil {
+					hitNewDocNum := newDocNums[dictI][next.Number()]
+					if hitNewDocNum == docDropped {
+						return nil, fmt.Errorf("see hit with dropped doc num")
+					}
+					newRoaring.Add(uint32(hitNewDocNum))
+					// encode norm bits
+					norm := next.Norm()
+					normBits := math.Float32bits(float32(norm))
+					err3 := tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits))
+					if err3 != nil {
+						return nil, err3
+					}
+					locs := next.Locations()
+					if len(locs) > 0 {
+						newRoaringLocs.Add(uint32(hitNewDocNum))
+						for _, loc := range locs {
+							args := make([]uint64, 0, 5+len(loc.ArrayPositions()))
+							args = append(args, uint64(fieldsMap[loc.Field()]))
+							args = append(args, loc.Pos())
+							args = append(args, loc.Start())
+							args = append(args, loc.End())
+							args = append(args, uint64(len(loc.ArrayPositions())))
+							args = append(args, loc.ArrayPositions()...)
+							locEncoder.Add(hitNewDocNum, args...)
+						}
+					}
+					next, err2 = postItr.Next()
+				}
+				if err2 != nil {
+					return nil, err2
+				}
+
+			}
+			tfEncoder.Close()
+			locEncoder.Close()
+
+			if newRoaring.GetCardinality() > 0 {
+				// this field/term actually has hits in the new segment, lets write it down
+				freqOffset := uint64(w.Count())
+				_, err = tfEncoder.Write(w)
+				if err != nil {
+					return nil, err
+				}
+				locOffset := uint64(w.Count())
+				_, err = locEncoder.Write(w)
+				if err != nil {
+					return nil, err
+				}
+				postingLocOffset := uint64(w.Count())
+				_, err = writeRoaringWithLen(newRoaringLocs, w)
+				if err != nil {
+					return nil, err
+				}
+				postingOffset := uint64(w.Count())
+				// write out the start of the term info
+				buf := make([]byte, binary.MaxVarintLen64)
+				n := binary.PutUvarint(buf, freqOffset)
+				_, err = w.Write(buf[:n])
+				if err != nil {
+					return nil, err
+				}
+
+				// write out the start of the loc info
+				n = binary.PutUvarint(buf, locOffset)
+				_, err = w.Write(buf[:n])
+				if err != nil {
+					return nil, err
+				}
+
+				// write out the start of the loc posting list
+				n = binary.PutUvarint(buf, postingLocOffset)
+				_, err = w.Write(buf[:n])
+				if err != nil {
+					return nil, err
+				}
+				_, err = writeRoaringWithLen(newRoaring, w)
+				if err != nil {
+					return nil, err
+				}
+
+				newVellum.Insert(term, postingOffset)
+			}
+
+			err = mergeItr.Next()
+		}
+		if err != nil && err != vellum.ErrIteratorDone {
+			return nil, err
+		}
+
+		dictOffset := uint64(w.Count())
+		err = newVellum.Close()
+		if err != nil {
+			return nil, err
+		}
+		vellumData := vellumBuf.Bytes()
+
+		// write out the length of the vellum data
+		buf := make([]byte, binary.MaxVarintLen64)
+		// encode the length as a uvarint
+		n := binary.PutUvarint(buf, uint64(len(vellumData)))
+		_, err = w.Write(buf[:n])
+		if err != nil {
+			return nil, err
+		}
+
+		// write this vellum to disk
+		_, err = w.Write(vellumData)
+		if err != nil {
+			return nil, err
+		}
+
+		rv[fieldID] = dictOffset
+	}
+
+	return rv, nil
+}
+
+const docDropped = math.MaxUint64
+
+func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap,
+	fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64,
+	w *CountHashWriter) (uint64, [][]uint64, error) {
+	var rv [][]uint64
+	var newDocNum int
+
+	var curr int
+	var metaBuf bytes.Buffer
+	var data, compressed []byte
+
+	docNumOffsets := make([]uint64, newSegDocCount)
+
+	// for each segment
+	for segI, segment := range segments {
+		var segNewDocNums []uint64
+
+		// for each doc num
+		for docNum := uint64(0); docNum < segment.numDocs; docNum++ {
+			metaBuf.Reset()
+			data = data[:0]
+			compressed = compressed[:0]
+			curr = 0
+
+			metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
+
+			if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) {
+				segNewDocNums = append(segNewDocNums, docDropped)
+			} else {
+				segNewDocNums = append(segNewDocNums, uint64(newDocNum))
+				// collect all the data
+				vals := make(map[uint16][][]byte)
+				typs := make(map[uint16][]byte)
+				poss := make(map[uint16][][]uint64)
+				err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
+					fieldID := fieldsMap[field]
+					vals[fieldID] = append(vals[fieldID], value)
+					typs[fieldID] = append(typs[fieldID], typ)
+					poss[fieldID] = append(poss[fieldID], pos)
+					return true
+				})
+				if err != nil {
+					return 0, nil, err
+				}
+
+				// now walk the fields in order
+				for fieldID := range fieldsInv {
+
+					if storedFieldValues, ok := vals[uint16(fieldID)]; ok {
+
+						// has stored values for this field
+						num := len(storedFieldValues)
+
+						// process each value
+						for i := 0; i < num; i++ {
+							// encode field
+							_, err2 := metaEncoder.PutU64(uint64(fieldID))
+							if err2 != nil {
+								return 0, nil, err2
+							}
+							// encode type
+							_, err2 = metaEncoder.PutU64(uint64(typs[uint16(fieldID)][i]))
+							if err2 != nil {
+								return 0, nil, err2
+							}
+							// encode start offset
+							_, err2 = metaEncoder.PutU64(uint64(curr))
+							if err2 != nil {
+								return 0, nil, err2
+							}
+							// encode len
+							_, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
+							if err2 != nil {
+								return 0, nil, err2
+							}
+							// encode number of array pos
+							_, err2 = metaEncoder.PutU64(uint64(len(poss[uint16(fieldID)][i])))
+							if err2 != nil {
+								return 0, nil, err2
+							}
+							// encode all array positions
+							for j := 0; j < len(poss[uint16(fieldID)][i]); j++ {
+								_, err2 = metaEncoder.PutU64(poss[uint16(fieldID)][i][j])
+								if err2 != nil {
+									return 0, nil, err2
+								}
+							}
+							// append data
+							data = append(data, storedFieldValues[i]...)
+							// update curr
+							curr += len(storedFieldValues[i])
+
+						}
+					}
+				}
+
+				metaEncoder.Close()
+				metaBytes := metaBuf.Bytes()
+				compressed = snappy.Encode(compressed, data)
+				// record where we're about to start writing
+				docNumOffsets[newDocNum] = uint64(w.Count())
+
+				buf := make([]byte, binary.MaxVarintLen64)
+				// write out the meta length
+				n := binary.PutUvarint(buf, uint64(len(metaBytes)))
+				_, err = w.Write(buf[:n])
+				if err != nil {
+					return 0, nil, err
+				}
+				// write out the compressed data length
+				n = binary.PutUvarint(buf, uint64(len(compressed)))
+				_, err = w.Write(buf[:n])
+				if err != nil {
+					return 0, nil, err
+				}
+				// now write the meta
+				_, err = w.Write(metaBytes)
+				if err != nil {
+					return 0, nil, err
+				}
+				// now write the compressed data
+				_, err = w.Write(compressed)
+				if err != nil {
+					return 0, nil, err
+				}
+
+				newDocNum++
+			}
+		}
+		rv = append(rv, segNewDocNums)
+	}
+
+	// return value is the start of the stored index
+	offset := uint64(w.Count())
+	// now write out the stored doc index
+	for docNum := range docNumOffsets {
+		err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
+		if err != nil {
+			return 0, nil, err
+		}
+	}
+
+	return offset, rv, nil
+}
+
+// mergeFields builds a unified list of fields used across all the input segments
+func mergeFields(segments []*Segment) []string {
+	fieldsMap := map[string]struct{}{}
+
+	for _, segment := range segments {
+		fields := segment.Fields()
+		for _, field := range fields {
+			fieldsMap[field] = struct{}{}
+		}
+	}
+	rv := make([]string, 0, len(fieldsMap))
+	// ensure _id stays first
+	rv = append(rv, "_id")
+	for k := range fieldsMap {
+		if k != "_id" {
+			rv = append(rv, k)
+		}
+	}
+
+	return rv
+}
+
+func persistMergedFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) {
+	var rv uint64
+
+	var fieldStarts []uint64
+	for fieldID, fieldName := range fieldsInv {
+
+		// record start of this field
+		fieldStarts = append(fieldStarts, uint64(w.Count()))
+
+		buf := make([]byte, binary.MaxVarintLen64)
+		// write out dict location for this field
+		n := binary.PutUvarint(buf, dictLocs[fieldID])
+		_, err := w.Write(buf[:n])
+		if err != nil {
+			return 0, err
+		}
+
+		// write out the length of the field name
+		n = binary.PutUvarint(buf, uint64(len(fieldName)))
+		_, err = w.Write(buf[:n])
+		if err != nil {
+			return 0, err
+		}
+
+		// write out the field name
+		_, err = w.Write([]byte(fieldName))
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	// now write out the fields index
+	rv = uint64(w.Count())
+
+	// now write out the field starts
+	for fieldID := range fieldsInv {
+		err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID])
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	return rv, nil
+}
diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go
new file mode 100644
index 00000000..53bcde7f
--- /dev/null
+++ b/index/scorch/segment/zap/merge_test.go
@@ -0,0 +1,280 @@
+package zap
+
+import (
+	"os"
+	"testing"
+
+	"github.com/RoaringBitmap/roaring"
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/document"
+	"github.com/blevesearch/bleve/index"
+	"github.com/blevesearch/bleve/index/scorch/segment/mem"
+)
+
+func TestMerge(t *testing.T) {
+	_ = os.RemoveAll("/tmp/scorch.zap")
+	_ = os.RemoveAll("/tmp/scorch2.zap")
+	_ = os.RemoveAll("/tmp/scorch3.zap")
+
+	memSegment := buildMemSegmentMulti()
+	err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	memSegment2 := buildMemSegmentMulti2()
+	err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	segment, err := Open("/tmp/scorch.zap")
+	if err != nil {
+		t.Fatalf("error opening segment: %v", err)
+	}
+	defer func() {
+		cerr := segment.Close()
+		if cerr != nil {
+			t.Fatalf("error closing segment: %v", cerr)
+		}
+	}()
+
+	segment2, err := Open("/tmp/scorch2.zap")
+	if err != nil {
+		t.Fatalf("error opening segment: %v", err)
+	}
+	defer func() {
+		cerr := segment2.Close()
+		if cerr != nil {
+			t.Fatalf("error closing segment: %v", cerr)
+		}
+	}()
+
+	segsToMerge := make([]*Segment, 2)
+	segsToMerge[0] = segment.(*Segment)
+	segsToMerge[1] = segment2.(*Segment)
+
+	err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestMergeAndDrop(t *testing.T) {
+	_ = os.RemoveAll("/tmp/scorch.zap")
+	_ = os.RemoveAll("/tmp/scorch2.zap")
+	_ = os.RemoveAll("/tmp/scorch3.zap")
+
+	memSegment := buildMemSegmentMulti()
+	err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	memSegment2 := buildMemSegmentMulti2()
+	err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	segment, err := Open("/tmp/scorch.zap")
+	if err != nil {
+		t.Fatalf("error opening segment: %v", err)
+	}
+	defer func() {
+		cerr := segment.Close()
+		if cerr != nil {
+			t.Fatalf("error closing segment: %v", cerr)
+		}
+	}()
+
+	segment2, err := Open("/tmp/scorch2.zap")
+	if err != nil {
+		t.Fatalf("error opening segment: %v", err)
+	}
+	defer func() {
+		cerr := segment2.Close()
+		if cerr != nil {
+			t.Fatalf("error closing segment: %v", cerr)
+		}
+	}()
+
+	segsToMerge := make([]*Segment, 2)
+	segsToMerge[0] = segment.(*Segment)
+	segsToMerge[1] = segment2.(*Segment)
+
+	docsToDrop := make([]*roaring.Bitmap, 2)
+	docsToDrop[0] = roaring.NewBitmap()
+	docsToDrop[0].AddInt(1)
+	docsToDrop[1] = roaring.NewBitmap()
+	docsToDrop[1].AddInt(1)
+
+	err = Merge(segsToMerge, docsToDrop, "/tmp/scorch3.zap", 1024)
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+func buildMemSegmentMulti2() *mem.Segment {
+
+	doc := &document.Document{
+		ID: "c",
+		Fields: []document.Field{
+			document.NewTextFieldCustom("_id", nil, []byte("c"), document.IndexField|document.StoreField, nil),
+			document.NewTextFieldCustom("name", nil, []byte("mat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
+			document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
+			document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
+			document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
+		},
+		CompositeFields: []*document.CompositeField{
+			document.NewCompositeField("_all", true, nil, []string{"_id"}),
+		},
+	}
+
+	doc2 := &document.Document{
+		ID: "d",
+		Fields: []document.Field{
+			document.NewTextFieldCustom("_id", nil, []byte("d"), document.IndexField|document.StoreField, nil),
+			document.NewTextFieldCustom("name", nil, []byte("joa"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
+			document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
+			document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
+			document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
+		},
+		CompositeFields: []*document.CompositeField{
+			document.NewCompositeField("_all", true, nil, []string{"_id"}),
+		},
+	}
+
+	// forge analyzed docs
+	results := []*index.AnalysisResult{
+		&index.AnalysisResult{
+			Document: doc,
+			Analyzed: []analysis.TokenFrequencies{
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      1,
+						Position: 1,
+						Term:     []byte("c"),
+					},
+				}, nil, false),
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      3,
+						Position: 1,
+						Term:     []byte("mat"),
+					},
+				}, nil, true),
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      4,
+						Position: 1,
+						Term:     []byte("some"),
+					},
+					&analysis.Token{
+						Start:    5,
+						End:      10,
+						Position: 2,
+						Term:     []byte("thing"),
+					},
+				}, nil, true),
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      4,
+						Position: 1,
+						Term:     []byte("cold"),
+					},
+				}, []uint64{0}, true),
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      4,
+						Position: 1,
+						Term:     []byte("dark"),
+					},
+				}, []uint64{1}, true),
+			},
+			Length: []int{
+				1,
+				1,
+				2,
+				1,
+				1,
+			},
+		},
+		&index.AnalysisResult{
+			Document: doc2,
+			Analyzed: []analysis.TokenFrequencies{
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      1,
+						Position: 1,
+						Term:     []byte("d"),
+					},
+				}, nil, false),
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      3,
+						Position: 1,
+						Term:     []byte("joa"),
+					},
+				}, nil, true),
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      4,
+						Position: 1,
+						Term:     []byte("some"),
+					},
+					&analysis.Token{
+						Start:    5,
+						End:      10,
+						Position: 2,
+						Term:     []byte("thing"),
+					},
+				}, nil, true),
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      4,
+						Position: 1,
+						Term:     []byte("cold"),
+					},
+				}, []uint64{0}, true),
+				analysis.TokenFrequency(analysis.TokenStream{
+					&analysis.Token{
+						Start:    0,
+						End:      4,
+						Position: 1,
+						Term:     []byte("dark"),
+					},
+				}, []uint64{1}, true),
+			},
+			Length: []int{
+				1,
+				1,
+				2,
+				1,
+				1,
+			},
+		},
+	}
+
+	// fix up composite fields
+	for _, ar := range results {
+		for i, f := range ar.Document.Fields {
+			for _, cf := range ar.Document.CompositeFields {
+				cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i])
+			}
+		}
+	}
+
+	segment := mem.NewFromAnalyzedDocs(results)
+
+	return segment
+}
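
Reviewer note (not part of the patch): chunkedIntCoder.Write, as seen in the diff above and exercised by TestChunkIntCoder, emits a uvarint chunk count, then one uvarint length per chunk, then the concatenated chunk payloads. The following is a minimal, standalone sketch of reading that layout back; the name readChunkedInts is hypothetical and does not exist in the codebase, and it only splits the payloads rather than varint-decoding the values inside them.

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
)

// readChunkedInts splits the output of chunkedIntCoder.Write back into
// per-chunk payloads: uvarint count, per-chunk uvarint lengths, then data.
func readChunkedInts(data []byte) ([][]byte, error) {
	r := bytes.NewReader(data)
	// uvarint: number of chunks
	numChunks, err := binary.ReadUvarint(r)
	if err != nil {
		return nil, err
	}
	// one uvarint length per chunk
	lens := make([]uint64, numChunks)
	for i := range lens {
		if lens[i], err = binary.ReadUvarint(r); err != nil {
			return nil, err
		}
	}
	// the concatenated chunk payloads
	chunks := make([][]byte, numChunks)
	for i, l := range lens {
		chunks[i] = make([]byte, l)
		if _, err = io.ReadFull(r, chunks[i]); err != nil {
			return nil, err
		}
	}
	return chunks, nil
}

func main() {
	// bytes from the first TestChunkIntCoder case: 1 chunk, length 1, payload 0x3
	chunks, err := readChunkedInts([]byte{0x1, 0x1, 0x3})
	fmt.Println(chunks, err) // [[3]] <nil>
}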