0
0
Fork 0

implemented prefix search

closes #4
This commit is contained in:
Marty Schoch 2014-08-07 13:45:39 -04:00
parent b16c1d7f79
commit 292af78b9e
8 changed files with 381 additions and 1 deletions

View File

@ -22,6 +22,8 @@ type Index interface {
TermFieldReader(term []byte, field string) (TermFieldReader, error)
DocIdReader(start, end string) (DocIdReader, error)
FieldReader(field string, startTerm []byte, endTerm []byte) (FieldReader, error)
DocCount() uint64
Document(id string) (*document.Document, error)
@ -41,6 +43,7 @@ type TermFieldVector struct {
}
type TermFieldDoc struct {
Term string
ID string
Freq uint64
Norm float64
@ -54,6 +57,11 @@ type TermFieldReader interface {
Close()
}
// FieldReader enumerates term entries for a field, as returned by
// Index.FieldReader(field, startTerm, endTerm).
type FieldReader interface {
// Next returns the next term entry. The implementation and callers in
// this change treat a nil *TermFieldDoc together with a nil error as the
// end of the enumeration.
Next() (*TermFieldDoc, error)
// Close is called when the caller is finished with the reader.
Close()
}
type DocIdReader interface {
Next() (string, error)
Advance(ID string) (string, error)

View File

@ -0,0 +1,87 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package upside_down
import (
"bytes"
"fmt"
"github.com/couchbaselabs/bleve/index"
"github.com/couchbaselabs/bleve/index/store"
)
// UpsideDownCouchFieldReader enumerates the terms of a single field by
// walking term frequency rows through a store iterator.
type UpsideDownCouchFieldReader struct {
// index is the index this reader was created from.
index *UpsideDownCouch
// iterator is positioned on the row that the next call to Next examines.
iterator store.KVIterator
// endKey is the upper bound: Next reports the end of the enumeration once
// the iterator's current key compares greater than this key.
endKey []byte
// field is the numeric field id used when building seek keys.
field uint16
}
// newUpsideDownCouchFieldReader creates a reader over the term frequency rows
// of the given field.
//
// The store iterator is opened at the key prefix built from field and
// startTerm, and the full row key built from field, endTerm and an empty doc
// id is remembered as the upper bound that Next compares against.
//
// NOTE(review): the upper bound is built with an empty doc id, while rows for
// endTerm itself presumably carry a non-empty doc id after the separator;
// confirm against Key() whether endTerm is meant to be inclusive.
//
// The returned error is always nil.
func newUpsideDownCouchFieldReader(index *UpsideDownCouch, field uint16, startTerm, endTerm []byte) (*UpsideDownCouchFieldReader, error) {
	seekKey := NewTermFrequencyRow(startTerm, field, "", 0, 0).ScanPrefixForFieldTermPrefix()
	upperBound := NewTermFrequencyRow(endTerm, field, "", 0, 0).Key()

	reader := &UpsideDownCouchFieldReader{
		index:  index,
		field:  field,
		endKey: upperBound,
	}
	reader.iterator = index.store.Iterator(seekKey)
	return reader, nil
}
// Next returns the term found at the iterator's current position and then
// moves the iterator on towards the following term.
//
// It returns (nil, nil) when the iterator has no current entry or when the
// current key compares greater than the reader's end key. Only Term and Freq
// are set on the result; Freq comes from the single row that was read.
// A row that cannot be parsed is reported as an error.
func (r *UpsideDownCouchFieldReader) Next() (*index.TermFieldDoc, error) {
	k, v, ok := r.iterator.Current()
	if !ok || bytes.Compare(k, r.endKey) > 0 {
		// iterator exhausted, or already past the end term
		return nil, nil
	}

	row, err := NewTermFrequencyRowKV(k, v)
	if err != nil {
		return nil, fmt.Errorf("unexpected error parsing term freq row kv: %v", err)
	}

	// capture the result before the iterator is repositioned
	result := &index.TermFieldDoc{
		Term: string(row.term),
		Freq: row.freq,
	}

	// skip whatever other rows exist for this term: seek to the key prefix
	// built from the byte-wise successor of the current term
	successor := incrementBytes(row.term)
	seekKey := NewTermFrequencyRow(successor, r.field, "", 0, 0).ScanPrefixForFieldTermPrefix()
	r.iterator.Seek(seekKey)

	return result, nil
}
// Close closes the underlying store iterator.
func (r *UpsideDownCouchFieldReader) Close() {
r.iterator.Close()
}
// incrementBytes returns a copy of in, treated as a big-endian number of
// fixed width, with one added to it. The input slice is left untouched.
//
// The carry travels from the last byte towards the first. If every byte
// overflows (all 0xff) the result wraps around to all zero bytes of the same
// length; an empty input yields an empty result.
func incrementBytes(in []byte) []byte {
	out := append(make([]byte, 0, len(in)), in...)
	for pos := len(out); pos > 0; pos-- {
		out[pos-1]++
		if out[pos-1] != 0 {
			// didn't overflow, so the carry stops here
			return out
		}
	}
	return out
}

View File

@ -0,0 +1,113 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package upside_down
import (
"os"
"reflect"
"testing"
"github.com/couchbaselabs/bleve/document"
"github.com/couchbaselabs/bleve/index/store/leveldb"
)
// TestIndexFieldReader indexes two documents and checks which terms a
// FieldReader enumerates for a field: over the whole field ("name", "desc")
// and over a start/end term range used for prefix lookups ("prefix").
//
// Setup failures are fatal because every later step depends on them, and
// each reader is closed as soon as its terms have been collected.
func TestIndexFieldReader(t *testing.T) {
	defer os.RemoveAll("test")

	store, err := leveldb.Open("test", true)
	if err != nil {
		t.Fatalf("error opening store: %v", err)
	}
	idx := NewUpsideDownCouch(store)
	err = idx.Open()
	if err != nil {
		t.Fatalf("error opening index: %v", err)
	}
	defer idx.Close()

	doc := document.NewDocument("1")
	doc.AddField(document.NewTextField("name", []byte("test")))
	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error updating index: %v", err)
	}

	doc = document.NewDocument("2")
	doc.AddField(document.NewTextFieldWithAnalyzer("name", []byte("test test test"), testAnalyzer))
	doc.AddField(document.NewTextFieldCustom("desc", []byte("eat more rice"), document.INDEX_FIELD|document.INCLUDE_TERM_VECTORS, testAnalyzer))
	doc.AddField(document.NewTextFieldCustom("prefix", []byte("bob cat cats catting dog doggy zoo"), document.INDEX_FIELD|document.INCLUDE_TERM_VECTORS, testAnalyzer))
	err = idx.Update(doc)
	if err != nil {
		t.Errorf("Error updating index: %v", err)
	}

	// collectTerms drains a field reader for the given field and term range
	// and returns the terms in the order they were produced.
	collectTerms := func(field string, startTerm, endTerm []byte) []string {
		reader, err := idx.FieldReader(field, startTerm, endTerm)
		if err != nil {
			// a failed reader cannot be iterated, so stop here
			t.Fatalf("error creating reader: %v", err)
		}
		defer reader.Close()

		terms := make([]string, 0)
		curr, err := reader.Next()
		for err == nil && curr != nil {
			terms = append(terms, curr.Term)
			curr, err = reader.Next()
		}
		if err != nil {
			t.Fatalf("error reading next term: %v", err)
		}
		return terms
	}

	// both documents contain only the term "test" in the name field
	terms := collectTerms("name", nil, nil)
	for _, term := range terms {
		if term != "test" {
			t.Errorf("expected term to be 'test', got '%s'", term)
		}
	}
	if len(terms) != 1 {
		t.Errorf("expected 1 term for this field, got %d", len(terms))
	}

	terms = collectTerms("desc", nil, nil)
	if len(terms) != 3 {
		t.Errorf("expected 3 term for this field, got %d", len(terms))
	}
	expectedTerms := []string{"eat", "more", "rice"}
	if !reflect.DeepEqual(expectedTerms, terms) {
		t.Errorf("expected %#v, got %#v", expectedTerms, terms)
	}

	// test use case for prefix
	terms = collectTerms("prefix", []byte("cat"), []byte("cat"))
	if len(terms) != 3 {
		t.Errorf("expected 3 term for this field, got %d", len(terms))
	}
	expectedTerms = []string{"cats", "catting", "cat"}
	if !reflect.DeepEqual(expectedTerms, terms) {
		t.Errorf("expected %#v, got %#v", expectedTerms, terms)
	}
}

View File

@ -149,6 +149,30 @@ type TermFrequencyRow struct {
vectors []*TermVector
}
// ScanPrefixForField returns the 3-byte key prefix for this row's field:
// the byte 't' followed by the field id as a little-endian uint16.
func (tfr *TermFrequencyRow) ScanPrefixForField() []byte {
	prefix := []byte{'t', 0, 0}
	binary.LittleEndian.PutUint16(prefix[1:], tfr.field)
	return prefix
}
// ScanPrefixForFieldTermPrefix returns the key prefix made of the byte 't',
// the field id as a little-endian uint16, and the raw bytes of this row's
// term. No separator is appended after the term.
func (tfr *TermFrequencyRow) ScanPrefixForFieldTermPrefix() []byte {
	prefix := make([]byte, 3, 3+len(tfr.term))
	prefix[0] = 't'
	binary.LittleEndian.PutUint16(prefix[1:3], tfr.field)
	return append(prefix, tfr.term...)
}
// ScanPrefixForFieldTerm returns the key prefix made of the byte 't', the
// field id as a little-endian uint16, the bytes of this row's term, and a
// trailing BYTE_SEPARATOR.
func (tfr *TermFrequencyRow) ScanPrefixForFieldTerm() []byte {
	prefix := make([]byte, 3, 3+len(tfr.term)+1)
	prefix[0] = 't'
	binary.LittleEndian.PutUint16(prefix[1:3], tfr.field)
	prefix = append(prefix, tfr.term...)
	return append(prefix, BYTE_SEPARATOR)
}
func (tfr *TermFrequencyRow) Key() []byte {
buf := make([]byte, 3+len(tfr.term)+1+len(tfr.doc))
buf[0] = 't'

View File

@ -598,7 +598,15 @@ func (udc *UpsideDownCouch) TermFieldReader(term []byte, fieldName string) (inde
if fieldExists {
return newUpsideDownCouchTermFieldReader(udc, term, uint16(fieldIndex))
}
return newUpsideDownCouchTermFieldReader(udc, []byte{BYTE_SEPARATOR}, 0)
return newUpsideDownCouchTermFieldReader(udc, []byte{BYTE_SEPARATOR}, ^uint16(0))
}
// FieldReader returns a reader over the terms of the named field, bounded by
// startTerm and endTerm.
//
// When the field name is not present in udc.fieldIndexes, a term field reader
// for the term {BYTE_SEPARATOR} on field id ^uint16(0) is returned instead.
// NOTE(review): presumably that combination never matches any row, so the
// result acts as an empty reader; confirm.
func (udc *UpsideDownCouch) FieldReader(fieldName string, startTerm []byte, endTerm []byte) (index.FieldReader, error) {
	if fieldIndex, known := udc.fieldIndexes[fieldName]; known {
		return newUpsideDownCouchFieldReader(udc, uint16(fieldIndex), startTerm, endTerm)
	}
	return newUpsideDownCouchTermFieldReader(udc, []byte{BYTE_SEPARATOR}, ^uint16(0))
}
func (udc *UpsideDownCouch) DocIdReader(start, end string) (index.DocIdReader, error) {

View File

@ -106,5 +106,14 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, hasPrefix := tmp["prefix"]
if hasPrefix {
var rv PrefixQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
return nil, fmt.Errorf("Unrecognized query")
}

56
query_prefix.go Normal file
View File

@ -0,0 +1,56 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package bleve
import (
"github.com/couchbaselabs/bleve/search"
)
// PrefixQuery is a query for terms that begin with Prefix.
//
// It is (un)marshalled as JSON with the keys "prefix", "field" and "boost";
// the latter two are omitted when empty.
type PrefixQuery struct {
	// Prefix is the leading text that terms are matched against.
	Prefix string `json:"prefix"`
	// FieldVal names the field to search; empty means no field was chosen.
	FieldVal string `json:"field,omitempty"`
	// BoostVal is the boost value carried by the query.
	BoostVal float64 `json:"boost,omitempty"`
}

// NewPrefixQuery returns a PrefixQuery for the given prefix with a boost of
// 1.0 and no field set.
func NewPrefixQuery(prefix string) *PrefixQuery {
	q := new(PrefixQuery)
	q.Prefix = prefix
	q.BoostVal = 1.0
	return q
}

// Boost returns the query's boost value.
func (q *PrefixQuery) Boost() float64 {
	return q.BoostVal
}

// SetBoost stores b as the query's boost and returns the query so calls can
// be chained.
func (q *PrefixQuery) SetBoost(b float64) *PrefixQuery {
	q.BoostVal = b
	return q
}

// Field returns the name of the field the query is restricted to, or the
// empty string if none was set.
func (q *PrefixQuery) Field() string {
	return q.FieldVal
}

// SetField stores f as the query's field and returns the query so calls can
// be chained.
func (q *PrefixQuery) SetField(f string) *PrefixQuery {
	q.FieldVal = f
	return q
}
// Searcher builds the term prefix searcher that executes this query against
// the given index. When the query has no field set, the index mapping's
// default field is used.
func (q *PrefixQuery) Searcher(i *indexImpl, explain bool) (search.Searcher, error) {
	target := q.FieldVal
	if target == "" {
		target = i.m.defaultField()
	}
	return search.NewTermPrefixSearcher(i.i, q.Prefix, target, q.BoostVal, explain)
}
// Validate always returns nil; no checks are performed on a PrefixQuery.
func (q *PrefixQuery) Validate() error {
return nil
}

View File

@ -0,0 +1,75 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package search
import (
"github.com/couchbaselabs/bleve/index"
)
// TermPrefixSearcher searches for documents containing terms that begin with
// a prefix. Its searching methods are carried out by a DisjunctionSearcher
// built by NewTermPrefixSearcher from one term searcher per enumerated term.
type TermPrefixSearcher struct {
// index is the index the searcher was built against.
index index.Index
// prefix is the term prefix that was requested.
prefix string
// field is the name of the field that was searched for matching terms.
field string
// explain is the explain flag handed to the underlying searchers.
explain bool
// searcher is the disjunction that the methods of this type delegate to.
searcher *DisjunctionSearcher
}
// NewTermPrefixSearcher builds a searcher for documents that contain, in the
// given field, a term enumerated by the index's field reader for prefix.
//
// The terms are enumerated once up front; a term searcher is created for each
// and all of them are combined in a disjunction searcher with a minimum of 0.
//
// An error is returned if the field reader cannot be created, if enumerating
// the terms fails, or if any of the underlying searchers cannot be built.
//
// NOTE(review): boost is accepted but not applied; every term searcher is
// created with a boost of 1.0. Confirm whether that is intended.
func NewTermPrefixSearcher(index index.Index, prefix string, field string, boost float64, explain bool) (*TermPrefixSearcher, error) {
	// find the terms with this prefix
	fieldReader, err := index.FieldReader(field, []byte(prefix), []byte(prefix))
	if err != nil {
		return nil, err
	}
	// the reader is only needed while the terms are enumerated
	defer fieldReader.Close()

	// enumerate all the terms in the range
	qsearchers := make([]Searcher, 0, 25)
	tfd, err := fieldReader.Next()
	for err == nil && tfd != nil {
		// use a separate error variable here so that the assignment from
		// fieldReader.Next below updates the err tested by the loop
		// condition instead of a shadowed copy
		qsearcher, serr := NewTermSearcher(index, tfd.Term, field, 1.0, explain)
		if serr != nil {
			return nil, serr
		}
		qsearchers = append(qsearchers, qsearcher)
		tfd, err = fieldReader.Next()
	}
	if err != nil {
		// enumeration stopped because of a failure, not because it finished
		return nil, err
	}

	// build disjunction searcher of these ranges
	searcher, err := NewDisjunctionSearcher(index, qsearchers, 0, explain)
	if err != nil {
		return nil, err
	}

	return &TermPrefixSearcher{
		index:    index,
		prefix:   prefix,
		field:    field,
		explain:  explain,
		searcher: searcher,
	}, nil
}
// Count returns the count reported by the underlying disjunction searcher.
func (s *TermPrefixSearcher) Count() uint64 {
return s.searcher.Count()
}
// Weight returns the weight reported by the underlying disjunction searcher.
func (s *TermPrefixSearcher) Weight() float64 {
return s.searcher.Weight()
}
// SetQueryNorm passes qnorm on to the underlying disjunction searcher.
func (s *TermPrefixSearcher) SetQueryNorm(qnorm float64) {
s.searcher.SetQueryNorm(qnorm)
}
// Next returns the next match produced by the underlying disjunction
// searcher.
func (s *TermPrefixSearcher) Next() (*DocumentMatch, error) {
return s.searcher.Next()
}
// Advance asks the underlying disjunction searcher to advance to the given
// document ID and returns the match it reports.
//
// The request is delegated to the disjunction's Advance so that ID is
// honoured, rather than simply stepping to the next match.
func (s *TermPrefixSearcher) Advance(ID string) (*DocumentMatch, error) {
	return s.searcher.Advance(ID)
}
// Close closes the underlying disjunction searcher.
func (s *TermPrefixSearcher) Close() {
s.searcher.Close()
}