From 522f9d5cc7cf698cf9e0d288ea4a04b45489d4fd Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 10 Mar 2015 16:22:19 -0400 Subject: [PATCH] significant change to index format, support dictionary rows this introduces disk format v4 now the summary rows for a term are stored in their own "dictionary row" format, previously the same information was stored in special term frequency rows this now allows us to easily iterate all the terms for a field in sorted order (useful for many other fuzzy data structures) at the top-level of bleve you can now browse terms within a field using the following api on the Index interface: FieldDict(field string) (index.FieldDict, error) FieldDictRange(field string, startTerm []byte, endTerm []byte) (index.FieldDict, error) FieldDictPrefix(field string, termPrefix []byte) (index.FieldDict, error) fixes #127 --- index.go | 4 + index/index.go | 13 +- .../{field_reader.go => field_dict.go} | 34 +++--- ...ield_reader_test.go => field_dict_test.go} | 54 ++++++--- index/upside_down/index_reader.go | 14 ++- index/upside_down/reader.go | 112 ++++++++++-------- index/upside_down/row.go | 97 +++++++++++++-- index/upside_down/row_merge.go | 36 +++--- index/upside_down/row_test.go | 5 + index/upside_down/upside_down.go | 10 +- index_alias_impl.go | 92 ++++++++++++++ index_alias_impl_test.go | 12 ++ index_impl.go | 100 ++++++++++++++++ index_test.go | 64 ++++++++++ search/searchers/search_fuzzy.go | 12 +- search/searchers/search_term_prefix.go | 6 +- 16 files changed, 541 insertions(+), 124 deletions(-) rename index/upside_down/{field_reader.go => field_dict.go} (60%) rename index/upside_down/{field_reader_test.go => field_dict_test.go} (74%) diff --git a/index.go b/index.go index cf2ecd89..4fbd107b 100644 --- a/index.go +++ b/index.go @@ -78,6 +78,10 @@ type Index interface { Fields() ([]string, error) + FieldDict(field string) (index.FieldDict, error) + FieldDictRange(field string, startTerm []byte, endTerm []byte) (index.FieldDict, error) + FieldDictPrefix(field string, termPrefix []byte) (index.FieldDict, error) + DumpAll() chan interface{} DumpDoc(id string) chan interface{} DumpFields() chan interface{} diff --git a/index/index.go b/index/index.go index 6f2b3f19..6ffd7909 100644 --- a/index/index.go +++ b/index/index.go @@ -41,7 +41,9 @@ type IndexReader interface { TermFieldReader(term []byte, field string) (TermFieldReader, error) DocIDReader(start, end string) (DocIDReader, error) - FieldReader(field string, startTerm []byte, endTerm []byte) (FieldReader, error) + FieldDict(field string) (FieldDict, error) + FieldDictRange(field string, startTerm []byte, endTerm []byte) (FieldDict, error) + FieldDictPrefix(field string, termPrefix []byte) (FieldDict, error) Document(id string) (*document.Document, error) DocumentFieldTerms(id string) (FieldTerms, error) @@ -79,8 +81,13 @@ type TermFieldReader interface { Close() error } -type FieldReader interface { - Next() (*TermFieldDoc, error) +type DictEntry struct { + Term string + Count uint64 +} + +type FieldDict interface { + Next() (*DictEntry, error) Close() error } diff --git a/index/upside_down/field_reader.go b/index/upside_down/field_dict.go similarity index 60% rename from index/upside_down/field_reader.go rename to index/upside_down/field_dict.go index 27a249aa..f5feb120 100644 --- a/index/upside_down/field_reader.go +++ b/index/upside_down/field_dict.go @@ -17,23 +17,24 @@ import ( "github.com/blevesearch/bleve/index/store" ) -type UpsideDownCouchFieldReader struct { +type UpsideDownCouchFieldDict struct { indexReader *IndexReader iterator store.KVIterator endKey []byte field uint16 } -func newUpsideDownCouchFieldReader(indexReader *IndexReader, field uint16, startTerm, endTerm []byte) (*UpsideDownCouchFieldReader, error) { +func newUpsideDownCouchFieldDict(indexReader *IndexReader, field uint16, startTerm, endTerm []byte) (*UpsideDownCouchFieldDict, error) { - startRow := NewTermFrequencyRow(startTerm, field, "", 0, 0) - startKey := startRow.ScanPrefixForFieldTermPrefix() - - endKey := NewTermFrequencyRow(endTerm, field, "", 0, 0).Key() + startKey := NewDictionaryRow(startTerm, field, 0).Key() + if endTerm == nil { + endTerm = []byte{ByteSeparator} + } + endKey := NewDictionaryRow(endTerm, field, 0).Key() it := indexReader.kvreader.Iterator(startKey) - return &UpsideDownCouchFieldReader{ + return &UpsideDownCouchFieldDict{ indexReader: indexReader, iterator: it, field: field, @@ -42,7 +43,7 @@ func newUpsideDownCouchFieldReader(indexReader *IndexReader, field uint16, start } -func (r *UpsideDownCouchFieldReader) Next() (*index.TermFieldDoc, error) { +func (r *UpsideDownCouchFieldDict) Next() (*index.DictEntry, error) { key, val, valid := r.iterator.Current() if !valid { return nil, nil @@ -53,24 +54,21 @@ func (r *UpsideDownCouchFieldReader) Next() (*index.TermFieldDoc, error) { return nil, nil } - currRow, err := NewTermFrequencyRowKV(key, val) + currRow, err := NewDictionaryRowKV(key, val) if err != nil { - return nil, fmt.Errorf("unexpected error parsing term freq row kv: %v", err) + return nil, fmt.Errorf("unexpected error parsing dictionary row kv: %v", err) } - rv := index.TermFieldDoc{ - Term: string(currRow.term), - Freq: currRow.freq, + rv := index.DictEntry{ + Term: string(currRow.term), + Count: currRow.count, } // advance the iterator to the next term - // by using invalid doc id (higher sorting) - nextTerm := incrementBytes(currRow.term) - nextRow := NewTermFrequencyRow(nextTerm, r.field, "", 0, 0) - r.iterator.Seek(nextRow.ScanPrefixForFieldTermPrefix()) + r.iterator.Next() return &rv, nil } -func (r *UpsideDownCouchFieldReader) Close() error { +func (r *UpsideDownCouchFieldDict) Close() error { return r.iterator.Close() } diff --git a/index/upside_down/field_reader_test.go b/index/upside_down/field_dict_test.go similarity index 74% rename from index/upside_down/field_reader_test.go rename to index/upside_down/field_dict_test.go index e39e78eb..e01a73e3 100644 --- a/index/upside_down/field_reader_test.go +++ b/index/upside_down/field_dict_test.go @@ -18,7 +18,7 @@ import ( "github.com/blevesearch/bleve/index/store/boltdb" ) -func TestIndexFieldReader(t *testing.T) { +func TestIndexFieldDict(t *testing.T) { defer os.RemoveAll("test") store, err := boltdb.Open("test", "bleve") @@ -54,38 +54,39 @@ func TestIndexFieldReader(t *testing.T) { t.Error(err) } defer indexReader.Close() - reader, err := indexReader.FieldReader("name", nil, nil) + + dict, err := indexReader.FieldDict("name") if err != nil { t.Errorf("error creating reader: %v", err) } - defer reader.Close() + defer dict.Close() termCount := 0 - curr, err := reader.Next() + curr, err := dict.Next() for err == nil && curr != nil { termCount++ if curr.Term != "test" { t.Errorf("expected term to be 'test', got '%s'", curr.Term) } - curr, err = reader.Next() + curr, err = dict.Next() } if termCount != 1 { t.Errorf("expected 1 term for this field, got %d", termCount) } - reader, err = indexReader.FieldReader("desc", nil, nil) + dict, err = indexReader.FieldDict("desc") if err != nil { t.Errorf("error creating reader: %v", err) } - defer reader.Close() + defer dict.Close() termCount = 0 terms := make([]string, 0) - curr, err = reader.Next() + curr, err = dict.Next() for err == nil && curr != nil { termCount++ terms = append(terms, curr.Term) - curr, err = reader.Next() + curr, err = dict.Next() } if termCount != 3 { t.Errorf("expected 3 term for this field, got %d", termCount) @@ -95,25 +96,48 @@ func TestIndexFieldReader(t *testing.T) { t.Errorf("expected %#v, got %#v", expectedTerms, terms) } - // test use case for prefix - reader, err = indexReader.FieldReader("prefix", []byte("cat"), []byte("cat")) + // test start and end range + dict, err = indexReader.FieldDictRange("desc", []byte("fun"), []byte("nice")) if err != nil { t.Errorf("error creating reader: %v", err) } - defer reader.Close() + defer dict.Close() termCount = 0 terms = make([]string, 0) - curr, err = reader.Next() + curr, err = dict.Next() for err == nil && curr != nil { termCount++ terms = append(terms, curr.Term) - curr, err = reader.Next() + curr, err = dict.Next() + } + if termCount != 1 { + t.Errorf("expected 1 term for this field, got %d", termCount) + } + expectedTerms = []string{"more"} + if !reflect.DeepEqual(expectedTerms, terms) { + t.Errorf("expected %#v, got %#v", expectedTerms, terms) + } + + // test use case for prefix + dict, err = indexReader.FieldDictPrefix("prefix", []byte("cat")) + if err != nil { + t.Errorf("error creating reader: %v", err) + } + defer dict.Close() + + termCount = 0 + terms = make([]string, 0) + curr, err = dict.Next() + for err == nil && curr != nil { + termCount++ + terms = append(terms, curr.Term) + curr, err = dict.Next() } if termCount != 3 { t.Errorf("expected 3 term for this field, got %d", termCount) } - expectedTerms = []string{"cats", "catting", "cat"} + expectedTerms = []string{"cat", "cats", "catting"} if !reflect.DeepEqual(expectedTerms, terms) { t.Errorf("expected %#v, got %#v", expectedTerms, terms) } diff --git a/index/upside_down/index_reader.go b/index/upside_down/index_reader.go index e9ccc40f..f620cbed 100644 --- a/index/upside_down/index_reader.go +++ b/index/upside_down/index_reader.go @@ -31,12 +31,20 @@ func (i *IndexReader) TermFieldReader(term []byte, fieldName string) (index.Term return newUpsideDownCouchTermFieldReader(i, []byte{ByteSeparator}, ^uint16(0)) } -func (i *IndexReader) FieldReader(fieldName string, startTerm []byte, endTerm []byte) (index.FieldReader, error) { +func (i *IndexReader) FieldDict(fieldName string) (index.FieldDict, error) { + return i.FieldDictRange(fieldName, nil, nil) +} + +func (i *IndexReader) FieldDictRange(fieldName string, startTerm []byte, endTerm []byte) (index.FieldDict, error) { fieldIndex, fieldExists := i.index.fieldIndexCache.FieldExists(fieldName) if fieldExists { - return newUpsideDownCouchFieldReader(i, uint16(fieldIndex), startTerm, endTerm) + return newUpsideDownCouchFieldDict(i, uint16(fieldIndex), startTerm, endTerm) } - return newUpsideDownCouchTermFieldReader(i, []byte{ByteSeparator}, ^uint16(0)) + return newUpsideDownCouchFieldDict(i, ^uint16(0), []byte{ByteSeparator}, []byte{}) +} + +func (i *IndexReader) FieldDictPrefix(fieldName string, termPrefix []byte) (index.FieldDict, error) { + return i.FieldDictRange(fieldName, termPrefix, incrementBytes(termPrefix)) } func (i *IndexReader) DocIDReader(start, end string) (index.DocIDReader, error) { diff --git a/index/upside_down/reader.go b/index/upside_down/reader.go index f56a9890..ab78685d 100644 --- a/index/upside_down/reader.go +++ b/index/upside_down/reader.go @@ -25,25 +25,31 @@ type UpsideDownCouchTermFieldReader struct { } func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16) (*UpsideDownCouchTermFieldReader, error) { + dictionaryRow := NewDictionaryRow(term, field, 0) + val, err := indexReader.kvreader.Get(dictionaryRow.Key()) + if err != nil { + return nil, err + } + if val == nil { + return &UpsideDownCouchTermFieldReader{ + count: 0, + term: term, + field: field, + }, nil + } + + err = dictionaryRow.parseDictionaryV(val) + if err != nil { + return nil, err + } + tfr := NewTermFrequencyRow(term, field, "", 0, 0) it := indexReader.kvreader.Iterator(tfr.Key()) - var count uint64 - key, val, valid := it.Current() - if valid { - if bytes.Equal(key, tfr.Key()) { - tfr, err := NewTermFrequencyRowKV(key, val) - if err != nil { - return nil, err - } - count = tfr.freq - } - } - return &UpsideDownCouchTermFieldReader{ indexReader: indexReader, iterator: it, - count: count, + count: dictionaryRow.count, term: term, field: field, }, nil @@ -54,54 +60,62 @@ func (r *UpsideDownCouchTermFieldReader) Count() uint64 { } func (r *UpsideDownCouchTermFieldReader) Next() (*index.TermFieldDoc, error) { - r.iterator.Next() - key, val, valid := r.iterator.Current() - if valid { - testfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0) - if !bytes.HasPrefix(key, testfr.Key()) { - // end of the line - return nil, nil + if r.iterator != nil { + key, val, valid := r.iterator.Current() + if valid { + testfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0) + if !bytes.HasPrefix(key, testfr.Key()) { + // end of the line + return nil, nil + } + tfr, err := NewTermFrequencyRowKV(key, val) + if err != nil { + return nil, err + } + r.iterator.Next() + return &index.TermFieldDoc{ + ID: string(tfr.doc), + Freq: tfr.freq, + Norm: float64(tfr.norm), + Vectors: r.indexReader.index.termFieldVectorsFromTermVectors(tfr.vectors), + }, nil } - tfr, err := NewTermFrequencyRowKV(key, val) - if err != nil { - return nil, err - } - return &index.TermFieldDoc{ - ID: string(tfr.doc), - Freq: tfr.freq, - Norm: float64(tfr.norm), - Vectors: r.indexReader.index.termFieldVectorsFromTermVectors(tfr.vectors), - }, nil } return nil, nil } func (r *UpsideDownCouchTermFieldReader) Advance(docID string) (*index.TermFieldDoc, error) { - tfr := NewTermFrequencyRow(r.term, r.field, docID, 0, 0) - r.iterator.Seek(tfr.Key()) - key, val, valid := r.iterator.Current() - if valid { - testfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0) - if !bytes.HasPrefix(key, testfr.Key()) { - // end of the line - return nil, nil + if r.iterator != nil { + tfr := NewTermFrequencyRow(r.term, r.field, docID, 0, 0) + r.iterator.Seek(tfr.Key()) + key, val, valid := r.iterator.Current() + if valid { + testfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0) + if !bytes.HasPrefix(key, testfr.Key()) { + // end of the line + return nil, nil + } + tfr, err := NewTermFrequencyRowKV(key, val) + if err != nil { + return nil, err + } + r.iterator.Next() + return &index.TermFieldDoc{ + ID: string(tfr.doc), + Freq: tfr.freq, + Norm: float64(tfr.norm), + Vectors: r.indexReader.index.termFieldVectorsFromTermVectors(tfr.vectors), + }, nil } - tfr, err := NewTermFrequencyRowKV(key, val) - if err != nil { - return nil, err - } - return &index.TermFieldDoc{ - ID: string(tfr.doc), - Freq: tfr.freq, - Norm: float64(tfr.norm), - Vectors: r.indexReader.index.termFieldVectorsFromTermVectors(tfr.vectors), - }, nil } return nil, nil } func (r *UpsideDownCouchTermFieldReader) Close() error { - return r.iterator.Close() + if r.iterator != nil { + return r.iterator.Close() + } + return nil } type UpsideDownCouchDocIDReader struct { diff --git a/index/upside_down/row.go b/index/upside_down/row.go index 9d318b85..0784e428 100644 --- a/index/upside_down/row.go +++ b/index/upside_down/row.go @@ -35,6 +35,8 @@ func ParseFromKeyValue(key, value []byte) (UpsideDownCouchRow, error) { return NewVersionRowKV(key, value) case 'f': return NewFieldRowKV(key, value) + case 'd': + return NewDictionaryRowKV(key, value) case 't': return NewTermFrequencyRowKV(key, value) case 'b': @@ -171,6 +173,91 @@ func NewFieldRowKV(key, value []byte) (*FieldRow, error) { return &rv, nil } +// DICTIONARY + +type DictionaryRow struct { + field uint16 + term []byte + count uint64 +} + +func (dr *DictionaryRow) Key() []byte { + buf := make([]byte, 3+len(dr.term)) + buf[0] = 'd' + binary.LittleEndian.PutUint16(buf[1:3], dr.field) + copy(buf[3:], dr.term) + return buf +} + +func (dr *DictionaryRow) Value() []byte { + used := 0 + buf := make([]byte, 8) + + used += binary.PutUvarint(buf[used:used+8], dr.count) + + return buf[0:used] +} + +func (dr *DictionaryRow) String() string { + return fmt.Sprintf("Dictionary Term: `%s` Field: %d Count: %d ", string(dr.term), dr.field, dr.count) +} + +func NewDictionaryRow(term []byte, field uint16, count uint64) *DictionaryRow { + return &DictionaryRow{ + term: term, + field: field, + count: count, + } +} + +func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) { + rv, err := NewDictionaryRowK(key) + if err != nil { + return nil, err + } + + err = rv.parseDictionaryV(value) + if err != nil { + return nil, err + } + return rv, nil + +} + +func NewDictionaryRowK(key []byte) (*DictionaryRow, error) { + rv := DictionaryRow{} + buf := bytes.NewBuffer(key) + _, err := buf.ReadByte() // type + if err != nil { + return nil, err + } + + err = binary.Read(buf, binary.LittleEndian, &rv.field) + if err != nil { + return nil, err + } + + rv.term, err = buf.ReadBytes(ByteSeparator) + // there is no separator expected here, should get EOF + if err != io.EOF { + return nil, err + } + + return &rv, nil +} + +func (dr *DictionaryRow) parseDictionaryV(value []byte) error { + buf := bytes.NewBuffer((value)) + + count, err := binary.ReadUvarint(buf) + if err != nil { + return err + } + dr.count = count + + return nil +} + // TERM FIELD FREQUENCY type TermVector struct { @@ -227,13 +314,9 @@ func (tfr *TermFrequencyRow) Key() []byte { return buf } -func (tfr *TermFrequencyRow) SummaryKey() []byte { - buf := make([]byte, 3+len(tfr.term)+1) - buf[0] = 't' - binary.LittleEndian.PutUint16(buf[1:3], tfr.field) - termLen := copy(buf[3:], tfr.term) - buf[3+termLen] = ByteSeparator - return buf +func (tfr *TermFrequencyRow) DictionaryRowKey() []byte { + dr := NewDictionaryRow(tfr.term, tfr.field, 0) + return dr.Key() } func (tfr *TermFrequencyRow) Value() []byte { diff --git a/index/upside_down/row_merge.go b/index/upside_down/row_merge.go index f9d7ef9d..1721db59 100644 --- a/index/upside_down/row_merge.go +++ b/index/upside_down/row_merge.go @@ -9,45 +9,45 @@ package upside_down -type termSummaryIncr struct{} +type dictionaryTermIncr struct{} -func newTermSummaryIncr() *termSummaryIncr { - return &termSummaryIncr{} +func newDictionaryTermIncr() *dictionaryTermIncr { + return &dictionaryTermIncr{} } -func (t *termSummaryIncr) Merge(key, existing []byte) ([]byte, error) { +func (t *dictionaryTermIncr) Merge(key, existing []byte) ([]byte, error) { if len(existing) > 0 { - tfr, err := NewTermFrequencyRowKV(key, existing) + dr, err := NewDictionaryRowKV(key, existing) if err != nil { return nil, err } - tfr.freq++ - return tfr.Value(), nil + dr.count++ + return dr.Value(), nil } else { - tfr, err := NewTermFrequencyRowK(key) + dr, err := NewDictionaryRowK(key) if err != nil { return nil, err } - tfr.freq = 1 - return tfr.Value(), nil + dr.count = 1 + return dr.Value(), nil } } -type termSummaryDecr struct{} +type dictionaryTermDecr struct{} -func newTermSummaryDecr() *termSummaryDecr { - return &termSummaryDecr{} +func newDictionaryTermDecr() *dictionaryTermDecr { + return &dictionaryTermDecr{} } -func (t *termSummaryDecr) Merge(key, existing []byte) ([]byte, error) { +func (t *dictionaryTermDecr) Merge(key, existing []byte) ([]byte, error) { if len(existing) > 0 { - tfr, err := NewTermFrequencyRowKV(key, existing) + dr, err := NewDictionaryRowKV(key, existing) if err != nil { return nil, err } - tfr.freq-- - if tfr.freq > 0 { - return tfr.Value(), nil + dr.count-- + if dr.count > 0 { + return dr.Value(), nil } } return nil, nil diff --git a/index/upside_down/row_test.go b/index/upside_down/row_test.go index b9cf0663..9c9cf66f 100644 --- a/index/upside_down/row_test.go +++ b/index/upside_down/row_test.go @@ -42,6 +42,11 @@ func TestRows(t *testing.T) { []byte{'f', 1, 2}, []byte{'s', 't', 'y', 'l', 'e', ByteSeparator}, }, + { + NewDictionaryRow([]byte{'b', 'e', 'e', 'r'}, 0, 27), + []byte{'d', 0, 0, 'b', 'e', 'e', 'r'}, + []byte{27}, + }, { NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "", 3, 3.14), []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator}, diff --git a/index/upside_down/upside_down.go b/index/upside_down/upside_down.go index 21689bff..0a41d9e2 100644 --- a/index/upside_down/upside_down.go +++ b/index/upside_down/upside_down.go @@ -27,7 +27,7 @@ import ( var VersionKey = []byte{'v'} -const Version uint8 = 3 +const Version uint8 = 4 var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version) @@ -111,8 +111,8 @@ func (udc *UpsideDownCouch) batchRows(writer store.KVWriter, addRows []UpsideDow tfr, ok := row.(*TermFrequencyRow) if ok { // need to increment counter - summaryKey := tfr.SummaryKey() - wb.Merge(summaryKey, newTermSummaryIncr()) + dictionaryKey := tfr.DictionaryRowKey() + wb.Merge(dictionaryKey, newDictionaryTermIncr()) } wb.Set(row.Key(), row.Value()) } @@ -127,8 +127,8 @@ func (udc *UpsideDownCouch) batchRows(writer store.KVWriter, addRows []UpsideDow tfr, ok := row.(*TermFrequencyRow) if ok { // need to decrement counter - summaryKey := tfr.SummaryKey() - wb.Merge(summaryKey, newTermSummaryDecr()) + dictionaryKey := tfr.DictionaryRowKey() + wb.Merge(dictionaryKey, newDictionaryTermDecr()) } wb.Delete(row.Key()) } diff --git a/index_alias_impl.go b/index_alias_impl.go index 617853a7..a11f330b 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -165,6 +165,84 @@ func (i *indexAliasImpl) Fields() ([]string, error) { return i.indexes[0].Fields() } +func (i *indexAliasImpl) FieldDict(field string) (index.FieldDict, error) { + i.mutex.RLock() + + if !i.open { + i.mutex.RUnlock() + return nil, ErrorIndexClosed + } + + err := i.isAliasToSingleIndex() + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + fieldDict, err := i.indexes[0].FieldDict(field) + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + return &indexAliasImplFieldDict{ + index: i, + fieldDict: fieldDict, + }, nil +} + +func (i *indexAliasImpl) FieldDictRange(field string, startTerm []byte, endTerm []byte) (index.FieldDict, error) { + i.mutex.RLock() + + if !i.open { + i.mutex.RUnlock() + return nil, ErrorIndexClosed + } + + err := i.isAliasToSingleIndex() + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + fieldDict, err := i.indexes[0].FieldDictRange(field, startTerm, endTerm) + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + return &indexAliasImplFieldDict{ + index: i, + fieldDict: fieldDict, + }, nil +} + +func (i *indexAliasImpl) FieldDictPrefix(field string, termPrefix []byte) (index.FieldDict, error) { + i.mutex.RLock() + + if !i.open { + i.mutex.RUnlock() + return nil, ErrorIndexClosed + } + + err := i.isAliasToSingleIndex() + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + fieldDict, err := i.indexes[0].FieldDictPrefix(field, termPrefix) + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + return &indexAliasImplFieldDict{ + index: i, + fieldDict: fieldDict, + }, nil +} + func (i *indexAliasImpl) DumpAll() chan interface{} { i.mutex.RLock() defer i.mutex.RUnlock() @@ -482,3 +560,17 @@ func (i *indexAliasImpl) NewBatch() *Batch { return i.indexes[0].NewBatch() } + +type indexAliasImplFieldDict struct { + index *indexAliasImpl + fieldDict index.FieldDict +} + +func (f *indexAliasImplFieldDict) Next() (*index.DictEntry, error) { + return f.fieldDict.Next() +} + +func (f *indexAliasImplFieldDict) Close() error { + defer f.index.mutex.RUnlock() + return f.fieldDict.Close() +} diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go index 10ae0011..de84e9f9 100644 --- a/index_alias_impl_test.go +++ b/index_alias_impl_test.go @@ -730,6 +730,18 @@ func (i *stubIndex) Fields() ([]string, error) { return nil, i.err } +func (i *stubIndex) FieldDict(field string) (index.FieldDict, error) { + return nil, i.err +} + +func (i *stubIndex) FieldDictRange(field string, startTerm []byte, endTerm []byte) (index.FieldDict, error) { + return nil, i.err +} + +func (i *stubIndex) FieldDictPrefix(field string, termPrefix []byte) (index.FieldDict, error) { + return nil, i.err +} + func (i *stubIndex) DumpAll() chan interface{} { return nil } diff --git a/index_impl.go b/index_impl.go index cfbf2c17..f8e1be9d 100644 --- a/index_impl.go +++ b/index_impl.go @@ -498,6 +498,87 @@ func (i *indexImpl) Fields() ([]string, error) { return indexReader.Fields() } +func (i *indexImpl) FieldDict(field string) (index.FieldDict, error) { + i.mutex.RLock() + + if !i.open { + i.mutex.RUnlock() + return nil, ErrorIndexClosed + } + + indexReader, err := i.i.Reader() + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + fieldDict, err := indexReader.FieldDict(field) + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + return &indexImplFieldDict{ + index: i, + indexReader: indexReader, + fieldDict: fieldDict, + }, nil +} + +func (i *indexImpl) FieldDictRange(field string, startTerm []byte, endTerm []byte) (index.FieldDict, error) { + i.mutex.RLock() + + if !i.open { + i.mutex.RUnlock() + return nil, ErrorIndexClosed + } + + indexReader, err := i.i.Reader() + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + fieldDict, err := indexReader.FieldDictRange(field, startTerm, endTerm) + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + return &indexImplFieldDict{ + index: i, + indexReader: indexReader, + fieldDict: fieldDict, + }, nil +} + +func (i *indexImpl) FieldDictPrefix(field string, termPrefix []byte) (index.FieldDict, error) { + i.mutex.RLock() + + if !i.open { + i.mutex.RUnlock() + return nil, ErrorIndexClosed + } + + indexReader, err := i.i.Reader() + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + fieldDict, err := indexReader.FieldDictPrefix(field, termPrefix) + if err != nil { + i.mutex.RUnlock() + return nil, err + } + + return &indexImplFieldDict{ + index: i, + indexReader: indexReader, + fieldDict: fieldDict, + }, nil +} + // DumpAll writes all index rows to a channel. // INTERNAL: do not rely on this function, it is // only intended to be used by the debug utilities @@ -586,3 +667,22 @@ func (i *indexImpl) NewBatch() *Batch { internal: index.NewBatch(), } } + +type indexImplFieldDict struct { + index *indexImpl + indexReader index.IndexReader + fieldDict index.FieldDict +} + +func (f *indexImplFieldDict) Next() (*index.DictEntry, error) { + return f.fieldDict.Next() +} + +func (f *indexImplFieldDict) Close() error { + defer f.index.mutex.RUnlock() + err := f.fieldDict.Close() + if err != nil { + return err + } + return f.indexReader.Close() +} diff --git a/index_test.go b/index_test.go index 6cb192d3..232d33b7 100644 --- a/index_test.go +++ b/index_test.go @@ -13,6 +13,7 @@ import ( "io/ioutil" "log" "os" + "reflect" "testing" "time" ) @@ -363,3 +364,66 @@ func TestStoredFieldPreserved(t *testing.T) { } } + +func TestDict(t *testing.T) { + defer os.RemoveAll("testidx") + + index, err := New("testidx", NewIndexMapping()) + if err != nil { + t.Fatal(err) + } + + doca := map[string]interface{}{ + "name": "marty", + "desc": "gophercon india", + } + err = index.Index("a", doca) + if err != nil { + t.Error(err) + } + + docy := map[string]interface{}{ + "name": "jasper", + "desc": "clojure", + } + err = index.Index("y", docy) + if err != nil { + t.Error(err) + } + + docx := map[string]interface{}{ + "name": "rose", + "desc": "googler", + } + err = index.Index("x", docx) + if err != nil { + t.Error(err) + } + + dict, err := index.FieldDict("name") + if err != nil { + t.Error(err) + } + + terms := []string{} + de, err := dict.Next() + for err == nil && de != nil { + terms = append(terms, string(de.Term)) + de, err = dict.Next() + } + + expectedTerms := []string{"jasper", "marty", "rose"} + if !reflect.DeepEqual(terms, expectedTerms) { + t.Errorf("expected %v, got %v", expectedTerms, terms) + } + + err = dict.Close() + if err != nil { + t.Fatal(err) + } + + err = index.Close() + if err != nil { + t.Fatal(err) + } +} diff --git a/search/searchers/search_fuzzy.go b/search/searchers/search_fuzzy.go index 8d233547..111e302f 100644 --- a/search/searchers/search_fuzzy.go +++ b/search/searchers/search_fuzzy.go @@ -33,17 +33,23 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, prefix, fuzzin } // find the terms with this prefix - fieldReader, err := indexReader.FieldReader(field, []byte(prefixTerm), []byte(prefixTerm)) + var fieldDict index.FieldDict + var err error + if len(prefixTerm) > 0 { + fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm)) + } else { + fieldDict, err = indexReader.FieldDict(field) + } // enumerate terms and check levenshtein distance candidateTerms := make([]string, 0) - tfd, err := fieldReader.Next() + tfd, err := fieldDict.Next() for err == nil && tfd != nil { ld, exceeded := search.LevenshteinDistanceMax(&term, &tfd.Term, fuzziness) if !exceeded && ld <= fuzziness { candidateTerms = append(candidateTerms, tfd.Term) } - tfd, err = fieldReader.Next() + tfd, err = fieldDict.Next() } // enumerate all the terms in the range diff --git a/search/searchers/search_term_prefix.go b/search/searchers/search_term_prefix.go index 13d588fc..d2d562b0 100644 --- a/search/searchers/search_term_prefix.go +++ b/search/searchers/search_term_prefix.go @@ -24,18 +24,18 @@ type TermPrefixSearcher struct { func NewTermPrefixSearcher(indexReader index.IndexReader, prefix string, field string, boost float64, explain bool) (*TermPrefixSearcher, error) { // find the terms with this prefix - fieldReader, err := indexReader.FieldReader(field, []byte(prefix), []byte(prefix)) + fieldDict, err := indexReader.FieldDictPrefix(field, []byte(prefix)) // enumerate all the terms in the range qsearchers := make([]search.Searcher, 0, 25) - tfd, err := fieldReader.Next() + tfd, err := fieldDict.Next() for err == nil && tfd != nil { qsearcher, err := NewTermSearcher(indexReader, string(tfd.Term), field, 1.0, explain) if err != nil { return nil, err } qsearchers = append(qsearchers, qsearcher) - tfd, err = fieldReader.Next() + tfd, err = fieldDict.Next() } // build disjunction searcher of these ranges searcher, err := NewDisjunctionSearcher(indexReader, qsearchers, 0, explain)