0
0
Fork 0

refactor to make mem segment contents exported

This commit is contained in:
Marty Schoch 2017-12-01 07:26:47 -05:00
parent f521d80835
commit 395458ce83
4 changed files with 86 additions and 86 deletions

View File

@ -23,7 +23,7 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
}
// go back and sort the dictKeys
for _, dict := range s.dictKeys {
for _, dict := range s.DictKeys {
sort.Strings(dict)
}
@ -81,9 +81,9 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
}
storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
s.stored[docNum][field] = append(s.stored[docNum][field], val)
s.storedTypes[docNum][field] = append(s.storedTypes[docNum][field], typ)
s.storedPos[docNum][field] = append(s.storedPos[docNum][field], pos)
s.Stored[docNum][field] = append(s.Stored[docNum][field], val)
s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ)
s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos)
}
// walk each composite field
@ -107,7 +107,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
// now that it's been rolled up into docMap, walk that
for fieldID, tokenFrequencies := range docMap {
for term, tokenFreq := range tokenFrequencies {
fieldTermPostings := s.dicts[fieldID][term]
fieldTermPostings := s.Dicts[fieldID][term]
// FIXME this if/else block has duplicate code that has resulted in
// bugs fixed/missed more than once, need to refactor
@ -116,12 +116,12 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
bs := roaring.New()
bs.AddInt(int(docNum))
newPostingID := uint64(len(s.postings) + 1)
newPostingID := uint64(len(s.Postings) + 1)
// add this new bitset to the postings slice
s.postings = append(s.postings, bs)
s.Postings = append(s.Postings, bs)
// add this to the details slice
s.freqs = append(s.freqs, []uint64{uint64(tokenFreq.Frequency())})
s.norms = append(s.norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))})
s.Freqs = append(s.Freqs, []uint64{uint64(tokenFreq.Frequency())})
s.Norms = append(s.Norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))})
// add to locations
var locfields []uint16
var locstarts []uint64
@ -143,35 +143,35 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
locarraypos = append(locarraypos, nil)
}
}
s.locfields = append(s.locfields, locfields)
s.locstarts = append(s.locstarts, locstarts)
s.locends = append(s.locends, locends)
s.locpos = append(s.locpos, locpos)
s.locarraypos = append(s.locarraypos, locarraypos)
s.Locfields = append(s.Locfields, locfields)
s.Locstarts = append(s.Locstarts, locstarts)
s.Locends = append(s.Locends, locends)
s.Locpos = append(s.Locpos, locpos)
s.Locarraypos = append(s.Locarraypos, locarraypos)
// record it
s.dicts[fieldID][term] = newPostingID
s.Dicts[fieldID][term] = newPostingID
// this term was new for this field, add it to dictKeys
s.dictKeys[fieldID] = append(s.dictKeys[fieldID], term)
s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
} else {
// posting already started for this field/term
// the actual offset is - 1, because 0 is zero value
bs := s.postings[fieldTermPostings-1]
bs := s.Postings[fieldTermPostings-1]
bs.AddInt(int(docNum))
s.freqs[fieldTermPostings-1] = append(s.freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency()))
s.norms[fieldTermPostings-1] = append(s.norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
s.Freqs[fieldTermPostings-1] = append(s.Freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency()))
s.Norms[fieldTermPostings-1] = append(s.Norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
for _, loc := range tokenFreq.Locations {
var locf = fieldID
if loc.Field != "" {
locf = uint16(s.getOrDefineField(loc.Field, false))
}
s.locfields[fieldTermPostings-1] = append(s.locfields[fieldTermPostings-1], locf)
s.locstarts[fieldTermPostings-1] = append(s.locstarts[fieldTermPostings-1], uint64(loc.Start))
s.locends[fieldTermPostings-1] = append(s.locends[fieldTermPostings-1], uint64(loc.End))
s.locpos[fieldTermPostings-1] = append(s.locpos[fieldTermPostings-1], uint64(loc.Position))
s.Locfields[fieldTermPostings-1] = append(s.Locfields[fieldTermPostings-1], locf)
s.Locstarts[fieldTermPostings-1] = append(s.Locstarts[fieldTermPostings-1], uint64(loc.Start))
s.Locends[fieldTermPostings-1] = append(s.Locends[fieldTermPostings-1], uint64(loc.End))
s.Locpos[fieldTermPostings-1] = append(s.Locpos[fieldTermPostings-1], uint64(loc.Position))
if len(loc.ArrayPositions) > 0 {
s.locarraypos[fieldTermPostings-1] = append(s.locarraypos[fieldTermPostings-1], loc.ArrayPositions)
s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], loc.ArrayPositions)
} else {
s.locarraypos[fieldTermPostings-1] = append(s.locarraypos[fieldTermPostings-1], nil)
s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], nil)
}
}
}
@ -180,23 +180,23 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
}
func (s *Segment) getOrDefineField(name string, hasLoc bool) int {
fieldID, ok := s.fields[name]
fieldID, ok := s.FieldsMap[name]
if !ok {
fieldID = uint16(len(s.fieldsInv) + 1)
s.fields[name] = fieldID
s.fieldsInv = append(s.fieldsInv, name)
s.fieldsLoc = append(s.fieldsLoc, hasLoc)
s.dicts = append(s.dicts, make(map[string]uint64))
s.dictKeys = append(s.dictKeys, make([]string, 0))
fieldID = uint16(len(s.FieldsInv) + 1)
s.FieldsMap[name] = fieldID
s.FieldsInv = append(s.FieldsInv, name)
s.FieldsLoc = append(s.FieldsLoc, hasLoc)
s.Dicts = append(s.Dicts, make(map[string]uint64))
s.DictKeys = append(s.DictKeys, make([]string, 0))
}
return int(fieldID - 1)
}
func (s *Segment) addDocument() int {
docNum := len(s.stored)
s.stored = append(s.stored, map[uint16][][]byte{})
s.storedTypes = append(s.storedTypes, map[uint16][]byte{})
s.storedPos = append(s.storedPos, map[uint16][][]uint64{})
docNum := len(s.Stored)
s.Stored = append(s.Stored, map[uint16][][]byte{})
s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{})
s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{})
return docNum
}

View File

@ -21,7 +21,7 @@ func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) segment.P
return &PostingsList{
dictionary: d,
term: term,
postingsID: d.segment.dicts[d.fieldID][term],
postingsID: d.segment.Dicts[d.fieldID][term],
except: except,
}
}
@ -36,7 +36,7 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator {
// PrefixIterator returns an iterator which only visits terms having the
// specified prefix
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
offset := sort.SearchStrings(d.segment.dictKeys[d.fieldID], prefix)
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix)
return &DictionaryIterator{
d: d,
prefix: prefix,
@ -47,7 +47,7 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
// RangeIterator returns an iterator which only visits terms between the
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
offset := sort.SearchStrings(d.segment.dictKeys[d.fieldID], start)
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start)
return &DictionaryIterator{
d: d,
offset: offset,
@ -65,10 +65,10 @@ type DictionaryIterator struct {
// Next returns the next entry in the dictionary
func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
if d.offset > len(d.d.segment.dictKeys[d.d.fieldID])-1 {
if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 {
return nil, nil
}
next := d.d.segment.dictKeys[d.d.fieldID][d.offset]
next := d.d.segment.DictKeys[d.d.fieldID][d.offset]
// check prefix
if d.prefix != "" && !strings.HasPrefix(next, d.prefix) {
return nil, nil
@ -79,9 +79,9 @@ func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
}
d.offset++
postingID := d.d.segment.dicts[d.d.fieldID][next]
postingID := d.d.segment.Dicts[d.d.fieldID][next]
return &index.DictEntry{
Term: next,
Count: d.d.segment.postings[postingID-1].GetCardinality(),
Count: d.d.segment.Postings[postingID-1].GetCardinality(),
}, nil
}

View File

@ -17,7 +17,7 @@ type PostingsList struct {
func (p *PostingsList) Count() uint64 {
var rv uint64
if p.postingsID > 0 {
rv = p.dictionary.segment.postings[p.postingsID-1].GetCardinality()
rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality()
if p.except != nil {
except := p.except.GetCardinality()
if except > rv {
@ -36,7 +36,7 @@ func (p *PostingsList) Iterator() segment.PostingsIterator {
postings: p,
}
if p.postingsID > 0 {
allbits := p.dictionary.segment.postings[p.postingsID-1]
allbits := p.dictionary.segment.Postings[p.postingsID-1]
rv.all = allbits.Iterator()
if p.except != nil {
allExcept := allbits.Clone()
@ -72,7 +72,7 @@ func (i *PostingsIterator) Next() segment.Posting {
// if they don't match, adjust offsets to factor in item we're skipping over
// incr the all iterator, and check again
for allN != n {
i.locoffset += int(i.postings.dictionary.segment.freqs[i.postings.postingsID-1][i.offset])
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
i.offset++
allN = i.all.Next()
}
@ -83,7 +83,7 @@ func (i *PostingsIterator) Next() segment.Posting {
locoffset: i.locoffset,
}
i.locoffset += int(i.postings.dictionary.segment.freqs[i.postings.postingsID-1][i.offset])
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
i.offset++
return rv
}
@ -103,17 +103,17 @@ func (p *Posting) Number() uint64 {
// Frequency returns the frequency of occurrence of this term in this doc/field
func (p *Posting) Frequency() uint64 {
return p.iterator.postings.dictionary.segment.freqs[p.iterator.postings.postingsID-1][p.offset]
return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset]
}
// Norm returns the normalization factor for this posting
func (p *Posting) Norm() float64 {
return float64(p.iterator.postings.dictionary.segment.norms[p.iterator.postings.postingsID-1][p.offset])
return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset])
}
// Locations returns the location information for each occurrence
func (p *Posting) Locations() []segment.Location {
if !p.iterator.postings.dictionary.segment.fieldsLoc[p.iterator.postings.dictionary.fieldID] {
if !p.iterator.postings.dictionary.segment.FieldsLoc[p.iterator.postings.dictionary.fieldID] {
return nil
}
freq := int(p.Frequency())
@ -136,25 +136,25 @@ type Location struct {
// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
return l.p.iterator.postings.dictionary.segment.fieldsInv[l.p.iterator.postings.dictionary.segment.locfields[l.p.iterator.postings.postingsID-1][l.offset]]
return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]]
}
// Start returns the start byte offset of this occurrence
func (l *Location) Start() uint64 {
return l.p.iterator.postings.dictionary.segment.locstarts[l.p.iterator.postings.postingsID-1][l.offset]
return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset]
}
// End returns the end byte offset of this occurrence
func (l *Location) End() uint64 {
return l.p.iterator.postings.dictionary.segment.locends[l.p.iterator.postings.postingsID-1][l.offset]
return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset]
}
// Pos returns the 1-based phrase position of this occurrence
func (l *Location) Pos() uint64 {
return l.p.iterator.postings.dictionary.segment.locpos[l.p.iterator.postings.postingsID-1][l.offset]
return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset]
}
// ArrayPositions returns the array position vector associated with this occurrence
func (l *Location) ArrayPositions() []uint64 {
return l.p.iterator.postings.dictionary.segment.locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
}

View File

@ -22,77 +22,77 @@ import (
// Segment is an in memory implementation of scorch.Segment
type Segment struct {
// fields name -> id+1
fields map[string]uint16
// FieldsMap name -> id+1
FieldsMap map[string]uint16
// fields id -> name
fieldsInv []string
FieldsInv []string
// field id -> has location info
fieldsLoc []bool
FieldsLoc []bool
// term dictionary
// field id -> term -> posting id + 1
dicts []map[string]uint64
Dicts []map[string]uint64
// term dictionary keys
// field id -> []dictionary keys
dictKeys [][]string
DictKeys [][]string
// postings list
// postings list id -> postings bitmap
postings []*roaring.Bitmap
// Postings list
// Postings list id -> Postings bitmap
Postings []*roaring.Bitmap
// term frequencies
// postings list id -> freqs (one for each hit in bitmap)
freqs [][]uint64
// postings list id -> Freqs (one for each hit in bitmap)
Freqs [][]uint64
// field norms
// postings list id -> norms (one for each hit in bitmap)
norms [][]float32
// field Norms
// postings list id -> Norms (one for each hit in bitmap)
Norms [][]float32
// field/start/end/pos/locarraypos
// postings list id -> start/end/pos/locarraypos (one for each freq)
locfields [][]uint16
locstarts [][]uint64
locends [][]uint64
locpos [][]uint64
locarraypos [][][]uint64
Locfields [][]uint16
Locstarts [][]uint64
Locends [][]uint64
Locpos [][]uint64
Locarraypos [][][]uint64
// stored field values
// Stored field values
// docNum -> field id -> slice of values (each value []byte)
stored []map[uint16][][]byte
Stored []map[uint16][][]byte
// stored field types
// docNum -> field id -> slice of types (each type byte)
storedTypes []map[uint16][]byte
StoredTypes []map[uint16][]byte
// stored field array positions
// docNum -> field id -> slice of array positions (each is []uint64)
storedPos []map[uint16][][]uint64
StoredPos []map[uint16][][]uint64
}
// New builds a new empty Segment
func New() *Segment {
return &Segment{
fields: map[string]uint16{},
FieldsMap: map[string]uint16{},
}
}
// Fields returns the field names used in this segment
func (s *Segment) Fields() []string {
return s.fieldsInv
return s.FieldsInv
}
// VisitDocument invokes the DocumentFieldValueVisitor for each stored field
// for the specified doc number
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
// ensure document number exists
if int(num) > len(s.stored)-1 {
if int(num) > len(s.Stored)-1 {
return nil
}
docFields := s.stored[int(num)]
docFields := s.Stored[int(num)]
for field, values := range docFields {
for i, value := range values {
keepGoing := visitor(s.fieldsInv[field], s.storedTypes[int(num)][field][i], value, s.storedPos[int(num)][field][i])
keepGoing := visitor(s.FieldsInv[field], s.StoredTypes[int(num)][field][i], value, s.StoredPos[int(num)][field][i])
if !keepGoing {
return nil
}
@ -113,19 +113,19 @@ func (s *Segment) Dictionary(field string) segment.TermDictionary {
// Count returns the number of documents in this segment
// (this has no notion of deleted docs)
func (s *Segment) Count() uint64 {
return uint64(len(s.stored))
return uint64(len(s.Stored))
}
// DocNumbers returns a bitset corresponding to the doc numbers of all the
// provided _id strings
func (s *Segment) DocNumbers(ids []string) *roaring.Bitmap {
idDictionary := s.dicts[s.getOrDefineField("_id", false)]
idDictionary := s.Dicts[s.getOrDefineField("_id", false)]
rv := roaring.New()
for _, id := range ids {
postingID := idDictionary[id]
if postingID > 0 {
rv.Or(s.postings[postingID-1])
rv.Or(s.Postings[postingID-1])
}
}
return rv