refactor to make mem segment contents exported
This commit is contained in:
parent
f521d80835
commit
395458ce83
|
@ -23,7 +23,7 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
|
|||
}
|
||||
|
||||
// go back and sort the dictKeys
|
||||
for _, dict := range s.dictKeys {
|
||||
for _, dict := range s.DictKeys {
|
||||
sort.Strings(dict)
|
||||
}
|
||||
|
||||
|
@ -81,9 +81,9 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
|
|||
}
|
||||
|
||||
storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
|
||||
s.stored[docNum][field] = append(s.stored[docNum][field], val)
|
||||
s.storedTypes[docNum][field] = append(s.storedTypes[docNum][field], typ)
|
||||
s.storedPos[docNum][field] = append(s.storedPos[docNum][field], pos)
|
||||
s.Stored[docNum][field] = append(s.Stored[docNum][field], val)
|
||||
s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ)
|
||||
s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos)
|
||||
}
|
||||
|
||||
// walk each composite field
|
||||
|
@ -107,7 +107,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
|
|||
// now that it's been rolled up into docMap, walk that
|
||||
for fieldID, tokenFrequencies := range docMap {
|
||||
for term, tokenFreq := range tokenFrequencies {
|
||||
fieldTermPostings := s.dicts[fieldID][term]
|
||||
fieldTermPostings := s.Dicts[fieldID][term]
|
||||
|
||||
// FIXME this if/else block has duplicate code that has resulted in
|
||||
// bugs fixed/missed more than once, need to refactor
|
||||
|
@ -116,12 +116,12 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
|
|||
bs := roaring.New()
|
||||
bs.AddInt(int(docNum))
|
||||
|
||||
newPostingID := uint64(len(s.postings) + 1)
|
||||
newPostingID := uint64(len(s.Postings) + 1)
|
||||
// add this new bitset to the postings slice
|
||||
s.postings = append(s.postings, bs)
|
||||
s.Postings = append(s.Postings, bs)
|
||||
// add this to the details slice
|
||||
s.freqs = append(s.freqs, []uint64{uint64(tokenFreq.Frequency())})
|
||||
s.norms = append(s.norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))})
|
||||
s.Freqs = append(s.Freqs, []uint64{uint64(tokenFreq.Frequency())})
|
||||
s.Norms = append(s.Norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))})
|
||||
// add to locations
|
||||
var locfields []uint16
|
||||
var locstarts []uint64
|
||||
|
@ -143,35 +143,35 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
|
|||
locarraypos = append(locarraypos, nil)
|
||||
}
|
||||
}
|
||||
s.locfields = append(s.locfields, locfields)
|
||||
s.locstarts = append(s.locstarts, locstarts)
|
||||
s.locends = append(s.locends, locends)
|
||||
s.locpos = append(s.locpos, locpos)
|
||||
s.locarraypos = append(s.locarraypos, locarraypos)
|
||||
s.Locfields = append(s.Locfields, locfields)
|
||||
s.Locstarts = append(s.Locstarts, locstarts)
|
||||
s.Locends = append(s.Locends, locends)
|
||||
s.Locpos = append(s.Locpos, locpos)
|
||||
s.Locarraypos = append(s.Locarraypos, locarraypos)
|
||||
// record it
|
||||
s.dicts[fieldID][term] = newPostingID
|
||||
s.Dicts[fieldID][term] = newPostingID
|
||||
// this term was new for this field, add it to dictKeys
|
||||
s.dictKeys[fieldID] = append(s.dictKeys[fieldID], term)
|
||||
s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
|
||||
} else {
|
||||
// posting already started for this field/term
|
||||
// the actual offset is fieldTermPostings - 1, because 0 is the zero value
|
||||
bs := s.postings[fieldTermPostings-1]
|
||||
bs := s.Postings[fieldTermPostings-1]
|
||||
bs.AddInt(int(docNum))
|
||||
s.freqs[fieldTermPostings-1] = append(s.freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency()))
|
||||
s.norms[fieldTermPostings-1] = append(s.norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
|
||||
s.Freqs[fieldTermPostings-1] = append(s.Freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency()))
|
||||
s.Norms[fieldTermPostings-1] = append(s.Norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
|
||||
for _, loc := range tokenFreq.Locations {
|
||||
var locf = fieldID
|
||||
if loc.Field != "" {
|
||||
locf = uint16(s.getOrDefineField(loc.Field, false))
|
||||
}
|
||||
s.locfields[fieldTermPostings-1] = append(s.locfields[fieldTermPostings-1], locf)
|
||||
s.locstarts[fieldTermPostings-1] = append(s.locstarts[fieldTermPostings-1], uint64(loc.Start))
|
||||
s.locends[fieldTermPostings-1] = append(s.locends[fieldTermPostings-1], uint64(loc.End))
|
||||
s.locpos[fieldTermPostings-1] = append(s.locpos[fieldTermPostings-1], uint64(loc.Position))
|
||||
s.Locfields[fieldTermPostings-1] = append(s.Locfields[fieldTermPostings-1], locf)
|
||||
s.Locstarts[fieldTermPostings-1] = append(s.Locstarts[fieldTermPostings-1], uint64(loc.Start))
|
||||
s.Locends[fieldTermPostings-1] = append(s.Locends[fieldTermPostings-1], uint64(loc.End))
|
||||
s.Locpos[fieldTermPostings-1] = append(s.Locpos[fieldTermPostings-1], uint64(loc.Position))
|
||||
if len(loc.ArrayPositions) > 0 {
|
||||
s.locarraypos[fieldTermPostings-1] = append(s.locarraypos[fieldTermPostings-1], loc.ArrayPositions)
|
||||
s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], loc.ArrayPositions)
|
||||
} else {
|
||||
s.locarraypos[fieldTermPostings-1] = append(s.locarraypos[fieldTermPostings-1], nil)
|
||||
s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -180,23 +180,23 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
|
|||
}
|
||||
|
||||
func (s *Segment) getOrDefineField(name string, hasLoc bool) int {
|
||||
fieldID, ok := s.fields[name]
|
||||
fieldID, ok := s.FieldsMap[name]
|
||||
if !ok {
|
||||
fieldID = uint16(len(s.fieldsInv) + 1)
|
||||
s.fields[name] = fieldID
|
||||
s.fieldsInv = append(s.fieldsInv, name)
|
||||
s.fieldsLoc = append(s.fieldsLoc, hasLoc)
|
||||
s.dicts = append(s.dicts, make(map[string]uint64))
|
||||
s.dictKeys = append(s.dictKeys, make([]string, 0))
|
||||
fieldID = uint16(len(s.FieldsInv) + 1)
|
||||
s.FieldsMap[name] = fieldID
|
||||
s.FieldsInv = append(s.FieldsInv, name)
|
||||
s.FieldsLoc = append(s.FieldsLoc, hasLoc)
|
||||
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
||||
s.DictKeys = append(s.DictKeys, make([]string, 0))
|
||||
}
|
||||
return int(fieldID - 1)
|
||||
}
|
||||
|
||||
func (s *Segment) addDocument() int {
|
||||
docNum := len(s.stored)
|
||||
s.stored = append(s.stored, map[uint16][][]byte{})
|
||||
s.storedTypes = append(s.storedTypes, map[uint16][]byte{})
|
||||
s.storedPos = append(s.storedPos, map[uint16][][]uint64{})
|
||||
docNum := len(s.Stored)
|
||||
s.Stored = append(s.Stored, map[uint16][][]byte{})
|
||||
s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{})
|
||||
s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{})
|
||||
return docNum
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) segment.P
|
|||
return &PostingsList{
|
||||
dictionary: d,
|
||||
term: term,
|
||||
postingsID: d.segment.dicts[d.fieldID][term],
|
||||
postingsID: d.segment.Dicts[d.fieldID][term],
|
||||
except: except,
|
||||
}
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator {
|
|||
// PrefixIterator returns an iterator which only visits terms having the
|
||||
// specified prefix
|
||||
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.dictKeys[d.fieldID], prefix)
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
prefix: prefix,
|
||||
|
@ -47,7 +47,7 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
|
|||
// RangeIterator returns an iterator which only visits terms between the
|
||||
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
|
||||
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.dictKeys[d.fieldID], start)
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
offset: offset,
|
||||
|
@ -65,10 +65,10 @@ type DictionaryIterator struct {
|
|||
|
||||
// Next returns the next entry in the dictionary
|
||||
func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
if d.offset > len(d.d.segment.dictKeys[d.d.fieldID])-1 {
|
||||
if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 {
|
||||
return nil, nil
|
||||
}
|
||||
next := d.d.segment.dictKeys[d.d.fieldID][d.offset]
|
||||
next := d.d.segment.DictKeys[d.d.fieldID][d.offset]
|
||||
// check prefix
|
||||
if d.prefix != "" && !strings.HasPrefix(next, d.prefix) {
|
||||
return nil, nil
|
||||
|
@ -79,9 +79,9 @@ func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
|
|||
}
|
||||
|
||||
d.offset++
|
||||
postingID := d.d.segment.dicts[d.d.fieldID][next]
|
||||
postingID := d.d.segment.Dicts[d.d.fieldID][next]
|
||||
return &index.DictEntry{
|
||||
Term: next,
|
||||
Count: d.d.segment.postings[postingID-1].GetCardinality(),
|
||||
Count: d.d.segment.Postings[postingID-1].GetCardinality(),
|
||||
}, nil
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@ type PostingsList struct {
|
|||
func (p *PostingsList) Count() uint64 {
|
||||
var rv uint64
|
||||
if p.postingsID > 0 {
|
||||
rv = p.dictionary.segment.postings[p.postingsID-1].GetCardinality()
|
||||
rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality()
|
||||
if p.except != nil {
|
||||
except := p.except.GetCardinality()
|
||||
if except > rv {
|
||||
|
@ -36,7 +36,7 @@ func (p *PostingsList) Iterator() segment.PostingsIterator {
|
|||
postings: p,
|
||||
}
|
||||
if p.postingsID > 0 {
|
||||
allbits := p.dictionary.segment.postings[p.postingsID-1]
|
||||
allbits := p.dictionary.segment.Postings[p.postingsID-1]
|
||||
rv.all = allbits.Iterator()
|
||||
if p.except != nil {
|
||||
allExcept := allbits.Clone()
|
||||
|
@ -72,7 +72,7 @@ func (i *PostingsIterator) Next() segment.Posting {
|
|||
// if they don't match, adjust offsets to factor in item we're skipping over
|
||||
// incr the all iterator, and check again
|
||||
for allN != n {
|
||||
i.locoffset += int(i.postings.dictionary.segment.freqs[i.postings.postingsID-1][i.offset])
|
||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
||||
i.offset++
|
||||
allN = i.all.Next()
|
||||
}
|
||||
|
@ -83,7 +83,7 @@ func (i *PostingsIterator) Next() segment.Posting {
|
|||
locoffset: i.locoffset,
|
||||
}
|
||||
|
||||
i.locoffset += int(i.postings.dictionary.segment.freqs[i.postings.postingsID-1][i.offset])
|
||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
||||
i.offset++
|
||||
return rv
|
||||
}
|
||||
|
@ -103,17 +103,17 @@ func (p *Posting) Number() uint64 {
|
|||
|
||||
// Frequency returns the frequency of occurrence of this term in this doc/field
|
||||
func (p *Posting) Frequency() uint64 {
|
||||
return p.iterator.postings.dictionary.segment.freqs[p.iterator.postings.postingsID-1][p.offset]
|
||||
return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset]
|
||||
}
|
||||
|
||||
// Norm returns the normalization factor for this posting
|
||||
func (p *Posting) Norm() float64 {
|
||||
return float64(p.iterator.postings.dictionary.segment.norms[p.iterator.postings.postingsID-1][p.offset])
|
||||
return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset])
|
||||
}
|
||||
|
||||
// Locations returns the location information for each occurrence
|
||||
func (p *Posting) Locations() []segment.Location {
|
||||
if !p.iterator.postings.dictionary.segment.fieldsLoc[p.iterator.postings.dictionary.fieldID] {
|
||||
if !p.iterator.postings.dictionary.segment.FieldsLoc[p.iterator.postings.dictionary.fieldID] {
|
||||
return nil
|
||||
}
|
||||
freq := int(p.Frequency())
|
||||
|
@ -136,25 +136,25 @@ type Location struct {
|
|||
// Field returns the name of the field (useful in composite fields to know
|
||||
// which original field the value came from)
|
||||
func (l *Location) Field() string {
|
||||
return l.p.iterator.postings.dictionary.segment.fieldsInv[l.p.iterator.postings.dictionary.segment.locfields[l.p.iterator.postings.postingsID-1][l.offset]]
|
||||
return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]]
|
||||
}
|
||||
|
||||
// Start returns the start byte offset of this occurrence
|
||||
func (l *Location) Start() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.locstarts[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// End returns the end byte offset of this occurrence
|
||||
func (l *Location) End() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.locends[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// Pos returns the 1-based phrase position of this occurrence
|
||||
func (l *Location) Pos() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.locpos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// ArrayPositions returns the array position vector associated with this occurrence
|
||||
func (l *Location) ArrayPositions() []uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
|
|
@ -22,77 +22,77 @@ import (
|
|||
// Segment is an in memory implementation of scorch.Segment
|
||||
type Segment struct {
|
||||
|
||||
// fields name -> id+1
|
||||
fields map[string]uint16
|
||||
// FieldsMap name -> id+1
|
||||
FieldsMap map[string]uint16
|
||||
// fields id -> name
|
||||
fieldsInv []string
|
||||
FieldsInv []string
|
||||
// field id -> has location info
|
||||
fieldsLoc []bool
|
||||
FieldsLoc []bool
|
||||
|
||||
// term dictionary
|
||||
// field id -> term -> posting id + 1
|
||||
dicts []map[string]uint64
|
||||
Dicts []map[string]uint64
|
||||
|
||||
// term dictionary keys
|
||||
// field id -> []dictionary keys
|
||||
dictKeys [][]string
|
||||
DictKeys [][]string
|
||||
|
||||
// postings list
|
||||
// postings list id -> postings bitmap
|
||||
postings []*roaring.Bitmap
|
||||
// Postings list
|
||||
// Postings list id -> Postings bitmap
|
||||
Postings []*roaring.Bitmap
|
||||
|
||||
// term frequencies
|
||||
// postings list id -> freqs (one for each hit in bitmap)
|
||||
freqs [][]uint64
|
||||
// postings list id -> Freqs (one for each hit in bitmap)
|
||||
Freqs [][]uint64
|
||||
|
||||
// field norms
|
||||
// postings list id -> norms (one for each hit in bitmap)
|
||||
norms [][]float32
|
||||
// field Norms
|
||||
// postings list id -> Norms (one for each hit in bitmap)
|
||||
Norms [][]float32
|
||||
|
||||
// field/start/end/pos/locarraypos
|
||||
// postings list id -> start/end/pos/locarraypos (one for each freq)
|
||||
locfields [][]uint16
|
||||
locstarts [][]uint64
|
||||
locends [][]uint64
|
||||
locpos [][]uint64
|
||||
locarraypos [][][]uint64
|
||||
Locfields [][]uint16
|
||||
Locstarts [][]uint64
|
||||
Locends [][]uint64
|
||||
Locpos [][]uint64
|
||||
Locarraypos [][][]uint64
|
||||
|
||||
// stored field values
|
||||
// Stored field values
|
||||
// docNum -> field id -> slice of values (each value []byte)
|
||||
stored []map[uint16][][]byte
|
||||
Stored []map[uint16][][]byte
|
||||
|
||||
// stored field types
|
||||
// docNum -> field id -> slice of types (each type byte)
|
||||
storedTypes []map[uint16][]byte
|
||||
StoredTypes []map[uint16][]byte
|
||||
|
||||
// stored field array positions
|
||||
// docNum -> field id -> slice of array positions (each is []uint64)
|
||||
storedPos []map[uint16][][]uint64
|
||||
StoredPos []map[uint16][][]uint64
|
||||
}
|
||||
|
||||
// New builds a new empty Segment
|
||||
func New() *Segment {
|
||||
return &Segment{
|
||||
fields: map[string]uint16{},
|
||||
FieldsMap: map[string]uint16{},
|
||||
}
|
||||
}
|
||||
|
||||
// Fields returns the field names used in this segment
|
||||
func (s *Segment) Fields() []string {
|
||||
return s.fieldsInv
|
||||
return s.FieldsInv
|
||||
}
|
||||
|
||||
// VisitDocument invokes the DocumentFieldValueVisitor for each stored field
|
||||
// for the specified doc number
|
||||
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
||||
// ensure document number exists
|
||||
if int(num) > len(s.stored)-1 {
|
||||
if int(num) > len(s.Stored)-1 {
|
||||
return nil
|
||||
}
|
||||
docFields := s.stored[int(num)]
|
||||
docFields := s.Stored[int(num)]
|
||||
for field, values := range docFields {
|
||||
for i, value := range values {
|
||||
keepGoing := visitor(s.fieldsInv[field], s.storedTypes[int(num)][field][i], value, s.storedPos[int(num)][field][i])
|
||||
keepGoing := visitor(s.FieldsInv[field], s.StoredTypes[int(num)][field][i], value, s.StoredPos[int(num)][field][i])
|
||||
if !keepGoing {
|
||||
return nil
|
||||
}
|
||||
|
@ -113,19 +113,19 @@ func (s *Segment) Dictionary(field string) segment.TermDictionary {
|
|||
// Count returns the number of documents in this segment
|
||||
// (this has no notion of deleted docs)
|
||||
func (s *Segment) Count() uint64 {
|
||||
return uint64(len(s.stored))
|
||||
return uint64(len(s.Stored))
|
||||
}
|
||||
|
||||
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
||||
// provided _id strings
|
||||
func (s *Segment) DocNumbers(ids []string) *roaring.Bitmap {
|
||||
|
||||
idDictionary := s.dicts[s.getOrDefineField("_id", false)]
|
||||
idDictionary := s.Dicts[s.getOrDefineField("_id", false)]
|
||||
rv := roaring.New()
|
||||
for _, id := range ids {
|
||||
postingID := idDictionary[id]
|
||||
if postingID > 0 {
|
||||
rv.Or(s.postings[postingID-1])
|
||||
rv.Or(s.Postings[postingID-1])
|
||||
}
|
||||
}
|
||||
return rv
|
||||
|
|
Loading…
Reference in New Issue