// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zap

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"sync"

	"github.com/RoaringBitmap/roaring"
	"github.com/Smerity/govarint"
	"github.com/blevesearch/bleve/index/scorch/segment"
	"github.com/couchbase/vellum"
	mmap "github.com/edsrzf/mmap-go"
	"github.com/golang/snappy"
)

// Open returns a zap impl of a segment
func Open(path string) (segment.Segment, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	mm, err := mmap.Map(f, mmap.RDONLY, 0)
	if err != nil {
		// mmap failed, try to close the file
		_ = f.Close()
		return nil, err
	}

	rv := &Segment{
		f:              f,
		mm:             mm,
		path:           path,
		fieldsMap:      make(map[string]uint16),
		fieldDvIterMap: make(map[uint16]*docValueIterator),
		refs:           1,
	}

	err = rv.loadConfig()
	if err != nil {
		_ = rv.Close()
		return nil, err
	}

	err = rv.loadFields()
	if err != nil {
		_ = rv.Close()
		return nil, err
	}

	err = rv.loadDvIterators()
	if err != nil {
		_ = rv.Close()
		return nil, err
	}

	return rv, nil
}

// Segment implements the segment.Segment interface over top of the zap file format
type Segment struct {
	f                 *os.File
	mm                mmap.MMap
	path              string
	crc               uint32
	version           uint32
	chunkFactor       uint32
	numDocs           uint64
	storedIndexOffset uint64
	fieldsIndexOffset uint64
	fieldsMap         map[string]uint16
	fieldsInv         []string
	fieldsOffsets     []uint64

	docValueOffset uint64
	fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field

	m    sync.Mutex // Protects the fields that follow.
	refs int64
}
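// A minimal usage sketch (hypothetical caller code, not part of this
// package): open a segment, ask for its document count, and release
// the reference when done. The path is assumed to point at a file
// previously written by this package's segment writer.
//
//	seg, err := zap.Open("/tmp/example.zap")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer func() { _ = seg.Close() }()
//	fmt.Println("docs:", seg.Count())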
func (s *Segment) SizeInBytes() uint64 {
	// 4 /* size of crc -> uint32 */ +
	// 4 /* size of version -> uint32 */ +
	// 4 /* size of chunkFactor -> uint32 */ +
	// 8 /* size of numDocs -> uint64 */ +
	// 8 /* size of storedIndexOffset -> uint64 */ +
	// 8 /* size of fieldsIndexOffset -> uint64 */
	sizeOfUints := 36

	sizeInBytes := len(s.mm) + len(s.path) + sizeOfUints

	for k := range s.fieldsMap {
		sizeInBytes += len(k) + 2 /* size of uint16 */
	}
	for _, entry := range s.fieldsInv {
		sizeInBytes += len(entry)
	}
	sizeInBytes += len(s.fieldsOffsets) * 8 /* size of uint64 */

	sizeInBytes += 8 /* size of refs -> int64 */

	return uint64(sizeInBytes)
}

func (s *Segment) AddRef() {
	s.m.Lock()
	s.refs++
	s.m.Unlock()
}

func (s *Segment) DecRef() (err error) {
	s.m.Lock()
	s.refs--
	if s.refs == 0 {
		err = s.closeActual()
	}
	s.m.Unlock()
	return err
}

// loadConfig reads the fixed-size footer from the end of the mmap'd file,
// working backwards: crc (4), version (4), chunkFactor (4),
// docValueOffset (8), fieldsIndexOffset (8), storedIndexOffset (8),
// numDocs (8), all big-endian.
func (s *Segment) loadConfig() error {
	crcOffset := len(s.mm) - 4
	s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4])

	verOffset := crcOffset - 4
	s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
	if s.version != version {
		return fmt.Errorf("unsupported version %d", s.version)
	}

	chunkOffset := verOffset - 4
	s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4])

	docValueOffset := chunkOffset - 8
	s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8])

	fieldsOffset := docValueOffset - 8
	s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsOffset : fieldsOffset+8])

	storedOffset := fieldsOffset - 8
	s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedOffset : storedOffset+8])

	docNumOffset := storedOffset - 8
	s.numDocs = binary.BigEndian.Uint64(s.mm[docNumOffset : docNumOffset+8])
	return nil
}

func (s *Segment) loadFields() error {
	// NOTE for now we assume the fields index immediately precedes the footer
	// if this changes, need to adjust accordingly (or store an explicit length)
	fieldsIndexEnd := uint64(len(s.mm) - FooterSize)

	// iterate through fields index
	var fieldID uint64
	for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
		addr := binary.BigEndian.Uint64(s.mm[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8])
		var n uint64
		dictLoc, read := binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
		n += uint64(read)
		s.fieldsOffsets = append(s.fieldsOffsets, dictLoc)

		var nameLen uint64
		nameLen, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
		n += uint64(read)

		name := string(s.mm[addr+n : addr+n+nameLen])
		s.fieldsInv = append(s.fieldsInv, name)
		s.fieldsMap[name] = uint16(fieldID + 1)

		fieldID++
	}
	return nil
}

// Dictionary returns the term dictionary for the specified field
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
	dict, err := s.dictionary(field)
	if err == nil && dict == nil {
		return &segment.EmptyDictionary{}, nil
	}
	return dict, err
}

func (s *Segment) dictionary(field string) (rv *Dictionary, err error) {
	rv = &Dictionary{
		segment: s,
		field:   field,
	}

	rv.fieldID = s.fieldsMap[field]
	if rv.fieldID > 0 {
		rv.fieldID = rv.fieldID - 1

		dictStart := s.fieldsOffsets[rv.fieldID]
		if dictStart > 0 {
			// read the length of the vellum data
			vellumLen, read := binary.Uvarint(s.mm[dictStart : dictStart+binary.MaxVarintLen64])
			fstBytes := s.mm[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
			if fstBytes != nil {
				rv.fst, err = vellum.Load(fstBytes)
				if err != nil {
					return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
				}
			}
		}
	} else {
		return nil, nil
	}

	return rv, nil
}
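// A dictionary lookup sketch (hypothetical caller; the exact
// segment.TermDictionary methods are assumptions based on the scorch
// segment API, and "body" is a made-up field name):
//
//	dict, err := seg.Dictionary("body")
//	if err != nil {
//		log.Fatal(err)
//	}
//	postings, err := dict.PostingsList("term", nil)
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println("matching docs:", postings.Count())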
// VisitDocument invokes the DocumentFieldValueVisitor for each stored field
// for the specified doc number
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
	// first make sure this is a valid number in this segment
	if num < s.numDocs {
		meta, compressed := s.getStoredMetaAndCompressed(num)
		uncompressed, err := snappy.Decode(nil, compressed)
		if err != nil {
			return err
		}
		// now decode meta and process
		reader := bytes.NewReader(meta)
		decoder := govarint.NewU64Base128Decoder(reader)

		keepGoing := true
		for keepGoing {
			field, err := decoder.GetU64()
			if err == io.EOF {
				break
			}
			if err != nil {
				return err
			}
			typ, err := decoder.GetU64()
			if err != nil {
				return err
			}
			offset, err := decoder.GetU64()
			if err != nil {
				return err
			}
			l, err := decoder.GetU64()
			if err != nil {
				return err
			}
			numap, err := decoder.GetU64()
			if err != nil {
				return err
			}
			var arrayPos []uint64
			if numap > 0 {
				arrayPos = make([]uint64, numap)
				for i := 0; i < int(numap); i++ {
					ap, err := decoder.GetU64()
					if err != nil {
						return err
					}
					arrayPos[i] = ap
				}
			}

			value := uncompressed[offset : offset+l]
			keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
		}
	}
	return nil
}

// Count returns the number of documents in this segment.
func (s *Segment) Count() uint64 {
	return s.numDocs
}

// DocNumbers returns a bitset corresponding to the doc numbers of all the
// provided _id strings
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
	rv := roaring.New()

	if len(s.fieldsMap) > 0 {
		idDict, err := s.dictionary("_id")
		if err != nil {
			return nil, err
		}

		for _, id := range ids {
			postings, err := idDict.postingsList(id, nil)
			if err != nil {
				return nil, err
			}
			if postings.postings != nil {
				rv.Or(postings.postings)
			}
		}
	}

	return rv, nil
}

// Fields returns the field names used in this segment
func (s *Segment) Fields() []string {
	return s.fieldsInv
}

// Path returns the path of this segment on disk
func (s *Segment) Path() string {
	return s.path
}

// Close releases all resources associated with this segment
func (s *Segment) Close() (err error) {
	return s.DecRef()
}

func (s *Segment) closeActual() (err error) {
	if s.mm != nil {
		err = s.mm.Unmap()
	}
	// try to close file even if unmap failed
	if s.f != nil {
		err2 := s.f.Close()
		if err == nil {
			// try to return first error
			err = err2
		}
	}
	return
}

// Some helpers originally added for the command-line utility:

// Data returns the underlying mmapped data slice
func (s *Segment) Data() []byte {
	return s.mm
}

// CRC returns the CRC value stored in the file footer
func (s *Segment) CRC() uint32 {
	return s.crc
}

// Version returns the file version in the file footer
func (s *Segment) Version() uint32 {
	return s.version
}

// ChunkFactor returns the chunk factor in the file footer
func (s *Segment) ChunkFactor() uint32 {
	return s.chunkFactor
}

// FieldsIndexOffset returns the fields index offset in the file footer
func (s *Segment) FieldsIndexOffset() uint64 {
	return s.fieldsIndexOffset
}

// StoredIndexOffset returns the stored value index offset in the file footer
func (s *Segment) StoredIndexOffset() uint64 {
	return s.storedIndexOffset
}

// DocValueOffset returns the docValue offset in the file footer
func (s *Segment) DocValueOffset() uint64 {
	return s.docValueOffset
}

// NumDocs returns the number of documents in the file footer
func (s *Segment) NumDocs() uint64 {
	return s.numDocs
}
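// A stored-field visit sketch (hypothetical caller; doc number 0 is
// assumed to exist). The visitor signature mirrors the call made in
// VisitDocument above; returning true keeps the iteration going.
//
//	err := seg.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool {
//		fmt.Printf("field %s (type %c): %s\n", field, typ, value)
//		return true
//	})
//	if err != nil {
//		log.Fatal(err)
//	}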
// DictAddr is a helper function to compute the file offset where the
// dictionary is stored for the specified field.
func (s *Segment) DictAddr(field string) (uint64, error) {
	var fieldID uint16
	var ok bool
	if fieldID, ok = s.fieldsMap[field]; !ok {
		return 0, fmt.Errorf("no such field '%s'", field)
	}

	return s.fieldsOffsets[fieldID-1], nil
}

func (s *Segment) loadDvIterators() error {
	if s.docValueOffset == fieldNotUninverted || s.docValueOffset == 0 {
		return nil
	}

	var read uint64
	for fieldID, field := range s.fieldsInv {
		fieldLoc, n := binary.Uvarint(s.mm[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
		if n <= 0 {
			return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID)
		}
		s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc)
		read += uint64(n)
	}
	return nil
}
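// A doc-number lookup sketch (hypothetical caller, assuming the
// external document IDs were indexed under the "_id" field as the
// scorch indexer does). The returned bitmap is iterated with the
// standard RoaringBitmap iterator:
//
//	bm, err := seg.DocNumbers([]string{"user-123", "user-456"})
//	if err != nil {
//		log.Fatal(err)
//	}
//	itr := bm.Iterator()
//	for itr.HasNext() {
//		fmt.Println("internal doc number:", itr.Next())
//	}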